From 79e1ae3e54e34c8dd8ef71746de1830d9958175e Mon Sep 17 00:00:00 2001 From: Stenzek Date: Wed, 4 Oct 2023 00:19:17 +1000 Subject: [PATCH] CPU/CodeCache: Rewrite using new-rec's block management --- src/common/CMakeLists.txt | 2 + src/common/common.vcxproj | 2 + src/common/common.vcxproj.filters | 2 + src/common/intrin.h | 2 +- src/common/perf_scope.cpp | 198 ++ src/common/perf_scope.h | 20 + src/core/CMakeLists.txt | 1 + src/core/bus.h | 4 +- src/core/core.vcxproj | 4 + src/core/core.vcxproj.filters | 1 + src/core/cpu_code_cache.cpp | 2346 +++++++++-------- src/core/cpu_code_cache.h | 158 +- src/core/cpu_code_cache_private.h | 279 ++ src/core/cpu_core.cpp | 32 +- src/core/cpu_recompiler_code_generator.cpp | 857 +++--- src/core/cpu_recompiler_code_generator.h | 126 +- .../cpu_recompiler_code_generator_aarch32.cpp | 648 +++-- .../cpu_recompiler_code_generator_aarch64.cpp | 698 +++-- .../cpu_recompiler_code_generator_generic.cpp | 32 +- .../cpu_recompiler_code_generator_x64.cpp | 607 +++-- src/core/cpu_recompiler_register_cache.h | 61 +- src/core/cpu_recompiler_thunks.h | 14 +- src/core/cpu_recompiler_types.h | 210 +- src/core/hotkeys.cpp | 6 +- src/core/imgui_overlays.cpp | 5 +- src/core/settings.h | 8 - src/core/system.cpp | 38 +- src/util/page_fault_handler.cpp | 22 +- src/util/page_fault_handler.h | 2 +- 29 files changed, 3865 insertions(+), 2520 deletions(-) create mode 100644 src/common/perf_scope.cpp create mode 100644 src/common/perf_scope.h create mode 100644 src/core/cpu_code_cache_private.h diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 840798ce3..1fb188b85 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -38,6 +38,8 @@ add_library(common minizip_helpers.cpp minizip_helpers.h path.h + perf_scope.cpp + perf_scope.h progress_callback.cpp progress_callback.h rectangle.h diff --git a/src/common/common.vcxproj b/src/common/common.vcxproj index df252a05f..e8a58bd0c 100644 --- a/src/common/common.vcxproj +++ b/src/common/common.vcxproj @@ -28,6 +28,7 @@ + @@ -59,6 +60,7 @@ + diff --git a/src/common/common.vcxproj.filters b/src/common/common.vcxproj.filters index 4a147b3a9..1fb457f4c 100644 --- a/src/common/common.vcxproj.filters +++ b/src/common/common.vcxproj.filters @@ -43,6 +43,7 @@ + @@ -69,6 +70,7 @@ + diff --git a/src/common/intrin.h b/src/common/intrin.h index b75ea45ba..7d5f18968 100644 --- a/src/common/intrin.h +++ b/src/common/intrin.h @@ -28,7 +28,7 @@ #endif template -static inline void MemsetPtrs(T* ptr, T value, u32 count) +ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count) { static_assert(std::is_pointer_v, "T is pointer type"); static_assert(sizeof(T) == sizeof(void*), "T isn't a fat pointer"); diff --git a/src/common/perf_scope.cpp b/src/common/perf_scope.cpp new file mode 100644 index 000000000..ad679dabd --- /dev/null +++ b/src/common/perf_scope.cpp @@ -0,0 +1,198 @@ + +// SPDX-FileCopyrightText: 2023 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#include "perf_scope.h" +#include "assert.h" +#include "string_util.h" + +#include +#include + +#ifdef __linux__ +#include +#include +#include +#include +#include +#include +#include +#endif + +// #define ProfileWithPerf +// #define ProfileWithPerfJitDump + +// Perf is only supported on linux +#if defined(__linux__) && defined(ProfileWithPerf) + +static std::FILE* s_map_file = nullptr; +static bool s_map_file_opened = false; +static std::mutex s_mutex; +static void RegisterMethod(const void* ptr, size_t size, const char* symbol) 
+{ + std::unique_lock lock(s_mutex); + + if (!s_map_file) + { + if (s_map_file_opened) + return; + + char file[256]; + snprintf(file, std::size(file), "/tmp/perf-%d.map", getpid()); + s_map_file = std::fopen(file, "wb"); + s_map_file_opened = true; + if (!s_map_file) + return; + } + + std::fprintf(s_map_file, "%" PRIx64 " %zx %s\n", static_cast(reinterpret_cast(ptr)), size, symbol); + std::fflush(s_map_file); +} + +#elif defined(__linux__) && defined(ProfileWithPerfJitDump) +enum : u32 +{ + JIT_CODE_LOAD = 0, + JIT_CODE_MOVE = 1, + JIT_CODE_DEBUG_INFO = 2, + JIT_CODE_CLOSE = 3, + JIT_CODE_UNWINDING_INFO = 4 +}; + +#pragma pack(push, 1) +struct JITDUMP_HEADER +{ + u32 magic = 0x4A695444; // JiTD + u32 version = 1; + u32 header_size = sizeof(JITDUMP_HEADER); + u32 elf_mach; + u32 pad1 = 0; + u32 pid; + u64 timestamp; + u64 flags = 0; +}; +struct JITDUMP_RECORD_HEADER +{ + u32 id; + u32 total_size; + u64 timestamp; +}; +struct JITDUMP_CODE_LOAD +{ + JITDUMP_RECORD_HEADER header; + u32 pid; + u32 tid; + u64 vma; + u64 code_addr; + u64 code_size; + u64 code_index; + // name +}; +#pragma pack(pop) + +static u64 JitDumpTimestamp() +{ + struct timespec ts = {}; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (static_cast(ts.tv_sec) * 1000000000ULL) + static_cast(ts.tv_nsec); +} + +static FILE* s_jitdump_file = nullptr; +static bool s_jitdump_file_opened = false; +static std::mutex s_jitdump_mutex; +static u32 s_jitdump_record_id; + +static void RegisterMethod(const void* ptr, size_t size, const char* symbol) +{ + const u32 namelen = std::strlen(symbol) + 1; + + std::unique_lock lock(s_jitdump_mutex); + if (!s_jitdump_file) + { + if (!s_jitdump_file_opened) + { + char file[256]; + snprintf(file, std::size(file), "jit-%d.dump", getpid()); + s_jitdump_file = fopen(file, "w+b"); + s_jitdump_file_opened = true; + if (!s_jitdump_file) + return; + } + + void* perf_marker = mmap(nullptr, 4096, PROT_READ | PROT_EXEC, MAP_PRIVATE, fileno(s_jitdump_file), 0); + AssertMsg(perf_marker != MAP_FAILED, "Map perf marker"); + + JITDUMP_HEADER jh = {}; +#if defined(__aarch64__) + jh.elf_mach = EM_AARCH64; +#else + jh.elf_mach = EM_X86_64; +#endif + jh.pid = getpid(); + jh.timestamp = JitDumpTimestamp(); + std::fwrite(&jh, sizeof(jh), 1, s_jitdump_file); + } + + JITDUMP_CODE_LOAD cl = {}; + cl.header.id = JIT_CODE_LOAD; + cl.header.total_size = sizeof(cl) + namelen + static_cast(size); + cl.header.timestamp = JitDumpTimestamp(); + cl.pid = getpid(); + cl.tid = syscall(SYS_gettid); + cl.vma = 0; + cl.code_addr = static_cast(reinterpret_cast(ptr)); + cl.code_size = static_cast(size); + cl.code_index = s_jitdump_record_id++; + std::fwrite(&cl, sizeof(cl), 1, s_jitdump_file); + std::fwrite(symbol, namelen, 1, s_jitdump_file); + std::fwrite(ptr, size, 1, s_jitdump_file); + std::fflush(s_jitdump_file); +} + +#endif + +#if defined(__linux__) && (defined(ProfileWithPerf) || defined(ProfileWithPerfJitDump)) + +void PerfScope::Register(const void* ptr, size_t size, const char* symbol) +{ + char full_symbol[128]; + if (HasPrefix()) + std::snprintf(full_symbol, std::size(full_symbol), "%s_%s", m_prefix, symbol); + else + StringUtil::Strlcpy(full_symbol, symbol, std::size(full_symbol)); + RegisterMethod(ptr, size, full_symbol); +} + +void PerfScope::RegisterPC(const void* ptr, size_t size, u32 pc) +{ + char full_symbol[128]; + if (HasPrefix()) + std::snprintf(full_symbol, std::size(full_symbol), "%s_%08X", m_prefix, pc); + else + std::snprintf(full_symbol, std::size(full_symbol), "%08X", pc); + RegisterMethod(ptr, size, 
full_symbol); +} + +void PerfScope::RegisterKey(const void* ptr, size_t size, const char* prefix, u64 key) +{ + char full_symbol[128]; + if (HasPrefix()) + std::snprintf(full_symbol, std::size(full_symbol), "%s_%s%016" PRIX64, m_prefix, prefix, key); + else + std::snprintf(full_symbol, std::size(full_symbol), "%s%016" PRIX64, prefix, key); + RegisterMethod(ptr, size, full_symbol); +} + +#else + +void PerfScope::Register(const void* ptr, size_t size, const char* symbol) +{ +} +void PerfScope::RegisterPC(const void* ptr, size_t size, u32 pc) +{ +} +void PerfScope::RegisterKey(const void* ptr, size_t size, const char* prefix, u64 key) +{ +} + +#endif diff --git a/src/common/perf_scope.h b/src/common/perf_scope.h new file mode 100644 index 000000000..803b62297 --- /dev/null +++ b/src/common/perf_scope.h @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#pragma once + +#include "types.h" + +class PerfScope +{ +public: + constexpr PerfScope(const char* prefix) : m_prefix(prefix) {} + bool HasPrefix() const { return (m_prefix && m_prefix[0]); } + + void Register(const void* ptr, size_t size, const char* symbol); + void RegisterPC(const void* ptr, size_t size, u32 pc); + void RegisterKey(const void* ptr, size_t size, const char* prefix, u64 key); + +private: + const char* m_prefix; +}; diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index fcd06ca9f..d2e070ddb 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -19,6 +19,7 @@ add_library(core controller.h cpu_code_cache.cpp cpu_code_cache.h + cpu_code_cache_private.h cpu_core.cpp cpu_core.h cpu_core_private.h diff --git a/src/core/bus.h b/src/core/bus.h index 88c4b38ce..59a8c06ae 100644 --- a/src/core/bus.h +++ b/src/core/bus.h @@ -85,8 +85,8 @@ enum : TickCount enum : u32 { - RAM_2MB_CODE_PAGE_COUNT = (RAM_2MB_SIZE + (HOST_PAGE_SIZE + 1)) / HOST_PAGE_SIZE, - RAM_8MB_CODE_PAGE_COUNT = (RAM_8MB_SIZE + (HOST_PAGE_SIZE + 1)) / HOST_PAGE_SIZE, + RAM_2MB_CODE_PAGE_COUNT = (RAM_2MB_SIZE + (HOST_PAGE_SIZE - 1)) / HOST_PAGE_SIZE, + RAM_8MB_CODE_PAGE_COUNT = (RAM_8MB_SIZE + (HOST_PAGE_SIZE - 1)) / HOST_PAGE_SIZE, MEMORY_LUT_PAGE_SIZE = 4096, MEMORY_LUT_PAGE_SHIFT = 12, diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj index 1b9275f2f..6366658d7 100644 --- a/src/core/core.vcxproj +++ b/src/core/core.vcxproj @@ -85,6 +85,7 @@ + @@ -176,6 +177,9 @@ {73ee0c55-6ffe-44e7-9c12-baa52434a797} + + {c51a346a-86b2-46df-9bb3-d0aa7e5d8699} + {075ced82-6a20-46df-94c7-9624ac9ddbeb} diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters index fb7699b44..f0bd545d4 100644 --- a/src/core/core.vcxproj.filters +++ b/src/core/core.vcxproj.filters @@ -124,5 +124,6 @@ + \ No newline at end of file diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 3af3a84a8..45ba18bc7 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -1,10 +1,8 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) -#include "cpu_code_cache.h" #include "bus.h" -#include "common/assert.h" -#include "common/log.h" +#include "cpu_code_cache_private.h" #include "cpu_core.h" #include "cpu_core_private.h" #include "cpu_disasm.h" @@ -12,27 +10,110 @@ #include "settings.h" #include "system.h" #include "timing_event.h" + +#include "common/assert.h" +#include "common/intrin.h" +#include "common/log.h" + 
Log_SetChannel(CPU::CodeCache); #ifdef ENABLE_RECOMPILER #include "cpu_recompiler_code_generator.h" #endif +#include #include namespace CPU::CodeCache { -static constexpr bool USE_BLOCK_LINKING = true; +using LUTRangeList = std::array, 9>; +using PageProtectionArray = std::array; +using BlockInstructionInfoPair = std::pair; +using BlockInstructionList = std::vector; -// Fall blocks back to interpreter if we recompile more than 20 times within 100 frames. -static constexpr u32 RECOMPILE_FRAMES_TO_FALL_BACK_TO_INTERPRETER = 100; -static constexpr u32 RECOMPILE_COUNT_TO_FALL_BACK_TO_INTERPRETER = 20; -static constexpr u32 INVALIDATE_THRESHOLD_TO_DISABLE_LINKING = 10; +// Switch to manual protection if we invalidate more than 4 times within 20 frames. +// Fall blocks back to interpreter if we recompile more than 3 times within 15 frames. +// The interpreter fallback is set before the manual protection switch, so that if it's just a single block +// which is constantly getting mutated, we won't hurt the performance of the rest in the page. +static constexpr u32 RECOMPILE_COUNT_FOR_INTERPRETER_FALLBACK = 3; +static constexpr u32 RECOMPILE_FRAMES_FOR_INTERPRETER_FALLBACK = 15; +static constexpr u32 INVALIDATE_COUNT_FOR_MANUAL_PROTECTION = 4; +static constexpr u32 INVALIDATE_FRAMES_FOR_MANUAL_PROTECTION = 20; -#ifdef ENABLE_RECOMPILER +static CodeLUT DecodeCodeLUTPointer(u32 slot, CodeLUT ptr); +static CodeLUT EncodeCodeLUTPointer(u32 slot, CodeLUT ptr); +static CodeLUT OffsetCodeLUTPointer(CodeLUT fake_ptr, u32 pc); -// Currently remapping the code buffer doesn't work in macOS or Haiku. -#if !defined(__HAIKU__) && !defined(__APPLE__) +static void AllocateLUTs(); +static void DeallocateLUTs(); +static void ResetCodeLUT(); +static void SetCodeLUT(u32 pc, const void* function); +static void InvalidateBlock(Block* block, BlockState new_state); +static void ClearBlocks(); + +static Block* LookupBlock(u32 pc); +static Block* CreateBlock(u32 pc, const BlockInstructionList& instructions, const BlockMetadata& metadata); +static bool IsBlockCodeCurrent(const Block* block); +static bool RevalidateBlock(Block* block); +PageProtectionMode GetProtectionModeForPC(u32 pc); +PageProtectionMode GetProtectionModeForBlock(const Block* block); +static bool ReadBlockInstructions(u32 start_pc, BlockInstructionList* instructions, BlockMetadata* metadata); +static void FillBlockRegInfo(Block* block); +static void CopyRegInfo(InstructionInfo* dst, const InstructionInfo* src); +static void SetRegAccess(InstructionInfo* inst, Reg reg, bool write); +static void AddBlockToPageList(Block* block); + +static Common::PageFaultHandler::HandlerResult ExceptionHandler(void* exception_pc, void* fault_address, bool is_write); + +static Block* CreateCachedInterpreterBlock(u32 pc); +[[noreturn]] static void ExecuteCachedInterpreter(); +template +[[noreturn]] static void ExecuteCachedInterpreterImpl(); + +// Fast map provides lookup from PC to function +// Function pointers are offset so that you don't need to subtract +CodeLUTArray g_code_lut; +static BlockLUTArray s_block_lut; +static std::unique_ptr s_lut_code_pointers; +static std::unique_ptr s_lut_block_pointers; +static PageProtectionArray s_page_protection = {}; +static std::vector s_blocks; + +// for compiling - reuse to avoid allocations +static BlockInstructionList s_block_instructions; + +#ifdef ENABLE_RECOMPILER_SUPPORT + +static void BacklinkBlocks(u32 pc, const void* dst); +static void UnlinkBlockExits(Block* block); + +static void ClearASMFunctions(); +static void 
CompileASMFunctions(); +static bool CompileBlock(Block* block); +static Common::PageFaultHandler::HandlerResult HandleFastmemException(void* exception_pc, void* fault_address, + bool is_write); +static void BackpatchLoadStore(void* host_pc, const LoadstoreBackpatchInfo& info); + +static BlockLinkMap s_block_links; +static std::unordered_map s_fastmem_backpatch_info; +static std::unordered_set s_fastmem_faulting_pcs; + +NORETURN_FUNCTION_POINTER void (*g_enter_recompiler)(); +const void* g_compile_or_revalidate_block; +const void* g_check_events_and_dispatch; +const void* g_run_events_and_dispatch; +const void* g_dispatcher; +const void* g_interpret_block; +const void* g_discard_and_recompile_block; + +#ifdef ENABLE_RECOMPILER_PROFILING + +PerfScope MIPSPerfScope("MIPS"); + +#endif + +// Currently remapping the code buffer doesn't work in macOS. TODO: Make dynamic instead... +#ifndef __APPLE__ #define USE_STATIC_CODE_BUFFER 1 #endif @@ -44,75 +125,137 @@ static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 8 * 1024 * 1024; static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 32 * 1024 * 1024; static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 16 * 1024 * 1024; #endif -static constexpr u32 CODE_WRITE_FAULT_THRESHOLD_FOR_SLOWMEM = 10; #ifdef USE_STATIC_CODE_BUFFER static constexpr u32 RECOMPILER_GUARD_SIZE = 4096; -alignas(Recompiler::CODE_STORAGE_ALIGNMENT) static u8 - s_code_storage[RECOMPILER_CODE_CACHE_SIZE + RECOMPILER_FAR_CODE_CACHE_SIZE]; +alignas(HOST_PAGE_SIZE) static u8 s_code_storage[RECOMPILER_CODE_CACHE_SIZE + RECOMPILER_FAR_CODE_CACHE_SIZE]; #endif static JitCodeBuffer s_code_buffer; +#ifdef _DEBUG +static u32 s_total_instructions_compiled = 0; +static u32 s_total_host_instructions_emitted = 0; #endif -#ifdef ENABLE_RECOMPILER -static FastMapTable s_fast_map[FAST_MAP_TABLE_COUNT]; -static std::unique_ptr s_fast_map_pointers; +#endif // ENABLE_RECOMPILER_SUPPORT +} // namespace CPU::CodeCache -DispatcherFunction s_asm_dispatcher; -SingleBlockDispatcherFunction s_single_block_asm_dispatcher; - -static FastMapTable DecodeFastMapPointer(u32 slot, FastMapTable ptr) +bool CPU::CodeCache::IsUsingAnyRecompiler() { - if constexpr (sizeof(void*) == 8) - return reinterpret_cast(reinterpret_cast(ptr) + (static_cast(slot) << 17)); - else - return reinterpret_cast(reinterpret_cast(ptr) + (slot << 16)); +#ifdef ENABLE_RECOMPILER_SUPPORT + return g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler; +#else + return false; +#endif } -static FastMapTable EncodeFastMapPointer(u32 slot, FastMapTable ptr) +bool CPU::CodeCache::IsUsingFastmem() { - if constexpr (sizeof(void*) == 8) - return reinterpret_cast(reinterpret_cast(ptr) - (static_cast(slot) << 17)); - else - return reinterpret_cast(reinterpret_cast(ptr) - (slot << 16)); + return IsUsingAnyRecompiler() && g_settings.cpu_fastmem_mode != CPUFastmemMode::Disabled; } -static CodeBlock::HostCodePointer* OffsetFastMapPointer(FastMapTable fake_ptr, u32 pc) +void CPU::CodeCache::ProcessStartup() { - u8* fake_byte_ptr = reinterpret_cast(fake_ptr); - if constexpr (sizeof(void*) == 8) - return reinterpret_cast(fake_byte_ptr + (static_cast(pc) << 1)); - else - return reinterpret_cast(fake_byte_ptr + pc); -} + AllocateLUTs(); -static void CompileDispatcher(); -static void FastCompileBlockFunction(); -static void InvalidCodeFunction(); - -static constexpr u32 GetTableCount(u32 start, u32 end) -{ - return ((end >> FAST_MAP_TABLE_SHIFT) - (start >> FAST_MAP_TABLE_SHIFT)) + 1; -} - -static void AllocateFastMapTables(u32 start, u32 end, FastMapTable& 
table_ptr) -{ - const u32 start_slot = start >> FAST_MAP_TABLE_SHIFT; - const u32 count = GetTableCount(start, end); - for (u32 i = 0; i < count; i++) +#ifdef ENABLE_RECOMPILER_SUPPORT +#ifdef USE_STATIC_CODE_BUFFER + const bool has_buffer = s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), + RECOMPILER_FAR_CODE_CACHE_SIZE, RECOMPILER_GUARD_SIZE); +#else + const bool has_buffer = false; +#endif + if (!has_buffer && !s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) { - const u32 slot = start_slot + i; - - s_fast_map[slot] = EncodeFastMapPointer(slot, table_ptr); - table_ptr += FAST_MAP_TABLE_SIZE; + Panic("Failed to initialize code space"); } +#endif + + if (!Common::PageFaultHandler::InstallHandler(&s_block_lut, &ExceptionHandler)) + Panic("Failed to install page fault handler"); } -static void AllocateFastMap() +void CPU::CodeCache::ProcessShutdown() { - static constexpr VirtualMemoryAddress ranges[][2] = { + Common::PageFaultHandler::RemoveHandler(&s_block_lut); + +#ifdef ENABLE_RECOMPILER_SUPPORT + s_code_buffer.Destroy(); +#endif + + DeallocateLUTs(); +} + +void CPU::CodeCache::Initialize() +{ + Assert(s_blocks.empty()); + +#ifdef ENABLE_RECOMPILER_SUPPORT + if (IsUsingAnyRecompiler()) + { + s_code_buffer.Reset(); + CompileASMFunctions(); + ResetCodeLUT(); + } +#endif + + Bus::UpdateFastmemViews(IsUsingAnyRecompiler() ? g_settings.cpu_fastmem_mode : CPUFastmemMode::Disabled); + CPU::UpdateMemoryPointers(); +} + +void CPU::CodeCache::Shutdown() +{ + ClearBlocks(); + +#ifdef ENABLE_RECOMPILER_SUPPORT + ClearASMFunctions(); +#endif + + Bus::UpdateFastmemViews(CPUFastmemMode::Disabled); + CPU::UpdateMemoryPointers(); +} + +void CPU::CodeCache::Reset() +{ + ClearBlocks(); + +#ifdef ENABLE_RECOMPILER_SUPPORT + if (IsUsingAnyRecompiler()) + { + ClearASMFunctions(); + s_code_buffer.Reset(); + CompileASMFunctions(); + ResetCodeLUT(); + } +#endif +} + +void CPU::CodeCache::Execute() +{ +#ifdef ENABLE_RECOMPILER_SUPPORT + if (IsUsingAnyRecompiler()) + g_enter_recompiler(); + else + ExecuteCachedInterpreter(); +#else + ExecuteCachedInterpreter(); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// MARK: - Block Management +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace CPU::CodeCache { +static constexpr u32 GetLUTTableCount(u32 start, u32 end) +{ + return ((end >> LUT_TABLE_SHIFT) - (start >> LUT_TABLE_SHIFT)) + 1; +} + +static constexpr LUTRangeList GetLUTRanges() +{ + const LUTRangeList ranges = {{ {0x00000000, 0x00800000}, // RAM {0x1F000000, 0x1F800000}, // EXP1 {0x1FC00000, 0x1FC80000}, // BIOS @@ -124,418 +267,524 @@ static void AllocateFastMap() {0xA0000000, 0xA0800000}, // RAM {0xBF000000, 0xBF800000}, // EXP1 {0xBFC00000, 0xBFC80000} // BIOS - }; + }}; + return ranges; +} - u32 num_tables = 1; // unreachable table - for (u32 i = 0; i < countof(ranges); i++) - num_tables += GetTableCount(ranges[i][0], ranges[i][1]); +static constexpr u32 GetLUTSlotCount(bool include_unreachable) +{ + u32 tables = include_unreachable ? 
1 : 0; // unreachable table + for (const auto& [start, end] : GetLUTRanges()) + tables += GetLUTTableCount(start, end); - const u32 num_slots = FAST_MAP_TABLE_SIZE * num_tables; - if (!s_fast_map_pointers) - s_fast_map_pointers = std::make_unique(num_slots); + return tables * LUT_TABLE_SIZE; +} +} // namespace CPU::CodeCache - FastMapTable table_ptr = s_fast_map_pointers.get(); - FastMapTable table_ptr_end = table_ptr + num_slots; +CPU::CodeCache::CodeLUT CPU::CodeCache::DecodeCodeLUTPointer(u32 slot, CodeLUT ptr) +{ + if constexpr (sizeof(void*) == 8) + return reinterpret_cast(reinterpret_cast(ptr) + (static_cast(slot) << 17)); + else + return reinterpret_cast(reinterpret_cast(ptr) + (slot << 16)); +} - // Fill the first table with invalid/unreachable. - for (u32 i = 0; i < FAST_MAP_TABLE_SIZE; i++) - table_ptr[i] = InvalidCodeFunction; +CPU::CodeCache::CodeLUT CPU::CodeCache::EncodeCodeLUTPointer(u32 slot, CodeLUT ptr) +{ + if constexpr (sizeof(void*) == 8) + return reinterpret_cast(reinterpret_cast(ptr) - (static_cast(slot) << 17)); + else + return reinterpret_cast(reinterpret_cast(ptr) - (slot << 16)); +} - // And the remaining with block compile pointers. - for (u32 i = FAST_MAP_TABLE_SIZE; i < num_slots; i++) - table_ptr[i] = FastCompileBlockFunction; +CPU::CodeCache::CodeLUT CPU::CodeCache::OffsetCodeLUTPointer(CodeLUT fake_ptr, u32 pc) +{ + u8* fake_byte_ptr = reinterpret_cast(fake_ptr); + if constexpr (sizeof(void*) == 8) + return reinterpret_cast(fake_byte_ptr + (static_cast(pc) << 1)); + else + return reinterpret_cast(fake_byte_ptr + pc); +} + +void CPU::CodeCache::AllocateLUTs() +{ + constexpr u32 num_code_slots = GetLUTSlotCount(true); + constexpr u32 num_block_slots = GetLUTSlotCount(false); + + Assert(!s_lut_code_pointers && !s_lut_block_pointers); + s_lut_code_pointers = std::make_unique(num_code_slots); + s_lut_block_pointers = std::make_unique(num_block_slots); + std::memset(s_lut_block_pointers.get(), 0, sizeof(Block*) * num_block_slots); + + CodeLUT code_table_ptr = s_lut_code_pointers.get(); + Block** block_table_ptr = s_lut_block_pointers.get(); + CodeLUT const code_table_ptr_end = code_table_ptr + num_code_slots; + Block** const block_table_ptr_end = block_table_ptr + num_block_slots; + + // Make the unreachable table jump to the invalid code callback. + MemsetPtrs(code_table_ptr, static_cast(nullptr), LUT_TABLE_COUNT); // Mark everything as unreachable to begin with. - for (u32 i = 0; i < FAST_MAP_TABLE_COUNT; i++) - s_fast_map[i] = EncodeFastMapPointer(i, table_ptr); - table_ptr += FAST_MAP_TABLE_SIZE; + for (u32 i = 0; i < LUT_TABLE_COUNT; i++) + { + g_code_lut[i] = EncodeCodeLUTPointer(i, code_table_ptr); + s_block_lut[i] = nullptr; + } + code_table_ptr += LUT_TABLE_SIZE; // Allocate ranges. 
- for (u32 i = 0; i < countof(ranges); i++) - AllocateFastMapTables(ranges[i][0], ranges[i][1], table_ptr); - - Assert(table_ptr == table_ptr_end); -} - -static void ResetFastMap() -{ - if (!s_fast_map_pointers) - return; - - for (u32 i = 0; i < FAST_MAP_TABLE_COUNT; i++) + for (const auto& [start, end] : GetLUTRanges()) { - FastMapTable ptr = DecodeFastMapPointer(i, s_fast_map[i]); - if (ptr == s_fast_map_pointers.get()) - continue; - - for (u32 j = 0; j < FAST_MAP_TABLE_SIZE; j++) - ptr[j] = FastCompileBlockFunction; - } -} - -static void FreeFastMap() -{ - std::memset(s_fast_map, 0, sizeof(s_fast_map)); - s_fast_map_pointers.reset(); -} - -static void SetFastMap(u32 pc, CodeBlock::HostCodePointer function) -{ - if (!s_fast_map_pointers) - return; - - const u32 slot = pc >> FAST_MAP_TABLE_SHIFT; - FastMapTable encoded_ptr = s_fast_map[slot]; - - const FastMapTable table_ptr = DecodeFastMapPointer(slot, encoded_ptr); - Assert(table_ptr != nullptr && table_ptr != s_fast_map_pointers.get()); - - CodeBlock::HostCodePointer* ptr = OffsetFastMapPointer(encoded_ptr, pc); - *ptr = function; -} - -#endif - -using BlockMap = std::unordered_map; -using HostCodeMap = std::map; - -void LogCurrentState(); - -/// Returns the block key for the current execution state. -static CodeBlockKey GetNextBlockKey(); - -/// Looks up the block in the cache if it's already been compiled. -static CodeBlock* LookupBlock(CodeBlockKey key, bool allow_flush); - -/// Can the current block execute? This will re-validate the block if necessary. -/// The block can also be flushed if recompilation failed, so ignore the pointer if false is returned. -static bool RevalidateBlock(CodeBlock* block, bool allow_flush); - -static bool CompileBlock(CodeBlock* block, bool allow_flush); -static void RemoveReferencesToBlock(CodeBlock* block); -static void AddBlockToPageMap(CodeBlock* block); -static void RemoveBlockFromPageMap(CodeBlock* block); - -/// Link block from to to. Returns the successor index. -static void LinkBlock(CodeBlock* from, CodeBlock* to, void* host_pc, void* host_resolve_pc, u32 host_pc_size); - -/// Unlink all blocks which point to this block, and any that this block links to. 
-static void UnlinkBlock(CodeBlock* block); - -static void ClearState(); - -static BlockMap s_blocks; -static std::array, Bus::RAM_8MB_CODE_PAGE_COUNT> m_ram_block_map; - -#ifdef ENABLE_RECOMPILER -static HostCodeMap s_host_code_map; - -static void AddBlockToHostCodeMap(CodeBlock* block); -static void RemoveBlockFromHostCodeMap(CodeBlock* block); - -static bool InitializeFastmem(); -static void ShutdownFastmem(); -static Common::PageFaultHandler::HandlerResult LUTPageFaultHandler(void* exception_pc, void* fault_address, - bool is_write); -#ifdef ENABLE_MMAP_FASTMEM -static Common::PageFaultHandler::HandlerResult MMapPageFaultHandler(void* exception_pc, void* fault_address, - bool is_write); -#endif -#endif // ENABLE_RECOMPILER - -void Initialize() -{ - Assert(s_blocks.empty()); - -#ifdef ENABLE_RECOMPILER - if (g_settings.IsUsingRecompiler()) - { -#ifdef USE_STATIC_CODE_BUFFER - const bool has_buffer = s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), - RECOMPILER_FAR_CODE_CACHE_SIZE, RECOMPILER_GUARD_SIZE); -#else - const bool has_buffer = false; -#endif - if (!has_buffer && !s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) + const u32 start_slot = start >> LUT_TABLE_SHIFT; + const u32 count = GetLUTTableCount(start, end); + for (u32 i = 0; i < count; i++) { - Panic("Failed to initialize code space"); + const u32 slot = start_slot + i; + + g_code_lut[slot] = EncodeCodeLUTPointer(slot, code_table_ptr); + code_table_ptr += LUT_TABLE_SIZE; + + s_block_lut[slot] = block_table_ptr; + block_table_ptr += LUT_TABLE_SIZE; } } + + Assert(code_table_ptr == code_table_ptr_end); + Assert(block_table_ptr == block_table_ptr_end); +} + +void CPU::CodeCache::DeallocateLUTs() +{ + s_lut_block_pointers.reset(); + s_lut_code_pointers.reset(); +} + +void CPU::CodeCache::ResetCodeLUT() +{ + if (!s_lut_code_pointers) + return; + + // Make the unreachable table jump to the invalid code callback. 
+ MemsetPtrs(s_lut_code_pointers.get(), g_interpret_block, LUT_TABLE_COUNT); + + for (u32 i = 0; i < LUT_TABLE_COUNT; i++) + { + CodeLUT ptr = DecodeCodeLUTPointer(i, g_code_lut[i]); + if (ptr == s_lut_code_pointers.get()) + continue; + + MemsetPtrs(ptr, g_compile_or_revalidate_block, LUT_TABLE_SIZE); + } +} + +void CPU::CodeCache::SetCodeLUT(u32 pc, const void* function) +{ + if (!s_lut_code_pointers) + return; + + const u32 table = pc >> LUT_TABLE_SHIFT; + CodeLUT encoded_ptr = g_code_lut[table]; + +#ifdef _DEBUG + const CodeLUT table_ptr = DecodeCodeLUTPointer(table, encoded_ptr); + DebugAssert(table_ptr != nullptr && table_ptr != s_lut_code_pointers.get()); #endif - AllocateFastMap(); + *OffsetCodeLUTPointer(encoded_ptr, pc) = function; +} -#ifdef ENABLE_RECOMPILER - if (g_settings.IsUsingRecompiler()) +CPU::CodeCache::Block* CPU::CodeCache::LookupBlock(u32 pc) +{ + const u32 table = pc >> LUT_TABLE_SHIFT; + if (!s_block_lut[table]) + return nullptr; + + const u32 idx = (pc & 0xFFFF) >> 2; + return s_block_lut[table][idx]; +} + +CPU::CodeCache::Block* CPU::CodeCache::CreateBlock(u32 pc, const BlockInstructionList& instructions, + const BlockMetadata& metadata) +{ + const u32 size = static_cast(instructions.size()); + const u32 table = pc >> LUT_TABLE_SHIFT; + Assert(s_block_lut[table]); + + // retain from old block + const u32 frame_number = System::GetFrameNumber(); + u32 recompile_frame = System::GetFrameNumber(); + u8 recompile_count = 0; + + const u32 idx = (pc & 0xFFFF) >> 2; + Block* block = s_block_lut[table][idx]; + if (block) { - if (g_settings.IsUsingFastmem() && !InitializeFastmem()) - Panic("Failed to initialize fastmem"); + // shouldn't be in the page list.. since we should come here after invalidating + Assert(!block->next_block_in_page); - AllocateFastMap(); - CompileDispatcher(); - ResetFastMap(); + // keep recompile stats before resetting, that way we actually count recompiles + recompile_frame = block->compile_frame; + recompile_count = block->compile_count; + + // if it has the same number of instructions, we can reuse it + if (block->size != size) + { + // this sucks.. hopefully won't happen very often + // TODO: allocate max size, allow shrink but not grow + auto it = std::find(s_blocks.begin(), s_blocks.end(), block); + Assert(it != s_blocks.end()); + s_blocks.erase(it); + + std::free(block); + block = nullptr; + } + } + + if (!block) + { + block = + static_cast(std::malloc(sizeof(Block) + (sizeof(Instruction) * size) + (sizeof(InstructionInfo) * size))); + Assert(block); + s_blocks.push_back(block); + } + + block->pc = pc; + block->size = size; + block->host_code = nullptr; + block->next_block_in_page = nullptr; + block->num_exit_links = 0; + block->state = BlockState::Valid; + block->flags = metadata.flags; + block->protection = GetProtectionModeForBlock(block); + block->uncached_fetch_ticks = metadata.uncached_fetch_ticks; + block->icache_line_count = metadata.icache_line_count; + block->compile_frame = recompile_frame; + block->compile_count = recompile_count + 1; + + // copy instructions/info + { + const std::pair* ip = instructions.data(); + Instruction* dsti = block->Instructions(); + InstructionInfo* dstii = block->InstructionsInfo(); + + for (u32 i = 0; i < size; i++, ip++, dsti++, dstii++) + { + dsti->bits = ip->first.bits; + *dstii = ip->second; + } + } + + s_block_lut[table][idx] = block; + + // if the block is being recompiled too often, leave it in the list, but don't compile it. 
+ const u32 frame_delta = frame_number - recompile_frame; + if (frame_delta >= RECOMPILE_FRAMES_FOR_INTERPRETER_FALLBACK) + { + block->compile_frame = frame_number; + block->compile_count = 1; + } + else if (block->compile_count >= RECOMPILE_COUNT_FOR_INTERPRETER_FALLBACK) + { + Log_DevFmt("{} recompiles in {} frames to block 0x{:08X}, not caching.", block->compile_count, frame_delta, + block->pc); + block->size = 0; + } + + // cached interpreter creates empty blocks when falling back + if (block->size == 0) + { + block->state = BlockState::FallbackToInterpreter; + block->protection = PageProtectionMode::Unprotected; + return block; + } + + // TODO: Only used by NewRec for now, don't waste time filling it. + if constexpr (false) + FillBlockRegInfo(block); + + // add it to the tracking list for its page + AddBlockToPageList(block); + + return block; +} + +bool CPU::CodeCache::IsBlockCodeCurrent(const Block* block) +{ + // blocks shouldn't be wrapping.. + const PhysicalMemoryAddress phys_addr = VirtualAddressToPhysical(block->pc); + DebugAssert((phys_addr + (sizeof(Instruction) * block->size)) <= Bus::g_ram_size); + + // can just do a straight memcmp.. + return (std::memcmp(Bus::g_ram + phys_addr, block->Instructions(), sizeof(Instruction) * block->size) == 0); +} + +bool CPU::CodeCache::RevalidateBlock(Block* block) +{ + DebugAssert(block->state != BlockState::Valid); + DebugAssert(AddressInRAM(block->pc)); + + if (block->state >= BlockState::NeedsRecompile) + return false; + + // Protection may have changed if we didn't execute before it got invalidated again. e.g. THPS2. + if (block->protection != GetProtectionModeForBlock(block)) + return false; + + if (!IsBlockCodeCurrent(block)) + { + // changed, needs recompiling + Log_DebugPrintf("Block at PC %08X has changed and needs recompiling", block->pc); + return false; + } + + block->state = BlockState::Valid; + AddBlockToPageList(block); + return true; +} + +void CPU::CodeCache::AddBlockToPageList(Block* block) +{ + DebugAssert(block->size > 0); + if (!AddressInRAM(block->pc) || block->protection != PageProtectionMode::WriteProtected) + return; + + const u32 page_idx = block->StartPageIndex(); + PageProtectionInfo& entry = s_page_protection[page_idx]; + Bus::SetRAMCodePage(page_idx); + + if (entry.last_block_in_page) + { + entry.last_block_in_page->next_block_in_page = block; + entry.last_block_in_page = block; + } + else + { + entry.first_block_in_page = block; + entry.last_block_in_page = block; + } +} + +void CPU::CodeCache::InvalidateBlocksWithPageIndex(u32 index) +{ + DebugAssert(index < Bus::RAM_8MB_CODE_PAGE_COUNT); + Bus::ClearRAMCodePage(index); + + BlockState new_block_state = BlockState::Invalidated; + PageProtectionInfo& ppi = s_page_protection[index]; + + const u32 frame_number = System::GetFrameNumber(); + const u32 frame_delta = frame_number - ppi.invalidate_frame; + ppi.invalidate_count++; + + if (frame_delta >= INVALIDATE_FRAMES_FOR_MANUAL_PROTECTION) + { + ppi.invalidate_count = 1; + ppi.invalidate_frame = frame_number; + } + else if (ppi.invalidate_count > INVALIDATE_COUNT_FOR_MANUAL_PROTECTION) + { + Log_DevFmt("{} invalidations in {} frames to page {} [0x{:08X} -> 0x{:08X}], switching to manual protection", + ppi.invalidate_count, frame_delta, index, (index * HOST_PAGE_SIZE), ((index + 1) * HOST_PAGE_SIZE)); + ppi.mode = PageProtectionMode::ManualCheck; + new_block_state = BlockState::NeedsRecompile; + } + + Block* block = ppi.first_block_in_page; + while (block) + { + InvalidateBlock(block, new_block_state); + block = 
std::exchange(block->next_block_in_page, nullptr); + } + + ppi.first_block_in_page = nullptr; + ppi.last_block_in_page = nullptr; +} + +CPU::CodeCache::PageProtectionMode CPU::CodeCache::GetProtectionModeForPC(u32 pc) +{ + if (!AddressInRAM(pc)) + return PageProtectionMode::Unprotected; + + const u32 page_idx = Bus::GetRAMCodePageIndex(pc); + return s_page_protection[page_idx].mode; +} + +CPU::CodeCache::PageProtectionMode CPU::CodeCache::GetProtectionModeForBlock(const Block* block) +{ + // if the block has a branch delay slot crossing a page, we must use manual protection. + // no other way about it. + if (block->HasFlag(BlockFlags::BranchDelaySpansPages)) + return PageProtectionMode::ManualCheck; + + return GetProtectionModeForPC(block->pc); +} + +void CPU::CodeCache::InvalidateBlock(Block* block, BlockState new_state) +{ +#ifdef ENABLE_RECOMPILER_SUPPORT + if (block->state == BlockState::Valid) + { + SetCodeLUT(block->pc, g_compile_or_revalidate_block); + BacklinkBlocks(block->pc, g_compile_or_revalidate_block); } #endif + + block->state = new_state; } -void ClearState() +void CPU::CodeCache::InvalidateAllRAMBlocks() { + // TODO: maybe combine the backlink into one big instruction flush cache? + + for (Block* block : s_blocks) + { + if (AddressInRAM(block->pc)) + InvalidateBlock(block, BlockState::Invalidated); + } + Bus::ClearRAMCodePageFlags(); - for (auto& it : m_ram_block_map) - it.clear(); +} - for (const auto& it : s_blocks) - delete it.second; +void CPU::CodeCache::ClearBlocks() +{ + for (u32 i = 0; i < Bus::RAM_8MB_CODE_PAGE_COUNT; i++) + { + PageProtectionInfo& ppi = s_page_protection[i]; + if (ppi.mode == PageProtectionMode::WriteProtected && ppi.first_block_in_page) + Bus::ClearRAMCodePage(i); + ppi = {}; + } + +#ifdef ENABLE_RECOMPILER_SUPPORT + s_fastmem_backpatch_info.clear(); + s_fastmem_faulting_pcs.clear(); + s_block_links.clear(); +#endif + + for (Block* block : s_blocks) + std::free(block); s_blocks.clear(); -#ifdef ENABLE_RECOMPILER - s_host_code_map.clear(); - s_code_buffer.Reset(); - ResetFastMap(); + + std::memset(s_lut_block_pointers.get(), 0, sizeof(Block*) * GetLUTSlotCount(false)); +} + +Common::PageFaultHandler::HandlerResult CPU::CodeCache::ExceptionHandler(void* exception_pc, void* fault_address, + bool is_write) +{ + // TODO: Catch general RAM writes, not just fastmem +#ifdef ENABLE_RECOMPILER_SUPPORT + return HandleFastmemException(exception_pc, fault_address, is_write); +#else + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; #endif } -void Shutdown() +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// MARK: - Cached Interpreter +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CPU::CodeCache::Block* CPU::CodeCache::CreateCachedInterpreterBlock(u32 pc) { - ClearState(); -#ifdef ENABLE_RECOMPILER - ShutdownFastmem(); - FreeFastMap(); - s_code_buffer.Destroy(); -#endif + BlockMetadata metadata = {}; + ReadBlockInstructions(pc, &s_block_instructions, &metadata); + return CreateBlock(pc, s_block_instructions, metadata); } template -[[noreturn]] static void ExecuteImpl() +[[noreturn]] void CPU::CodeCache::ExecuteCachedInterpreterImpl() { - CodeBlockKey next_block_key; +#define CHECK_DOWNCOUNT() \ + if (g_state.pending_ticks >= g_state.downcount) \ + break; for (;;) { TimingEvents::RunEvents(); - next_block_key = GetNextBlockKey(); while (g_state.pending_ticks < g_state.downcount) { - 
CodeBlock* block = LookupBlock(next_block_key, true); - if (!block) - { - InterpretUncachedBlock(); - next_block_key = GetNextBlockKey(); - continue; - } - - reexecute_block: - Assert(!(HasPendingInterrupt())); - -#if 0 - const u32 tick = TimingEvents::GetGlobalTickCounter() + CPU::GetPendingTicks(); - if (tick == 4188233674) - __debugbreak(); -#endif - #if 0 LogCurrentState(); #endif +#if 0 + if ((g_state.pending_ticks + TimingEvents::GetGlobalTickCounter()) == 3301006214) + __debugbreak(); +#endif + // Manually done because we don't want to compile blocks without a LUT. + const u32 pc = g_state.pc; + const u32 table = pc >> LUT_TABLE_SHIFT; + Block* block; + if (s_block_lut[table]) + { + const u32 idx = (pc & 0xFFFF) >> 2; + block = s_block_lut[table][idx]; + } + else + { + // Likely invalid code... + goto interpret_block; + } + + reexecute_block: + if (!block) + { + if ((block = CreateCachedInterpreterBlock(pc))->size == 0) [[unlikely]] + goto interpret_block; + } + else + { + if (block->state == BlockState::FallbackToInterpreter) [[unlikely]] + goto interpret_block; + + if ((block->state != BlockState::Valid && !RevalidateBlock(block)) || + (block->protection == PageProtectionMode::ManualCheck && !IsBlockCodeCurrent(block))) + { + if ((block = CreateCachedInterpreterBlock(pc))->size == 0) [[unlikely]] + goto interpret_block; + } + } + + // TODO: make DebugAssert + Assert(!(HasPendingInterrupt())); if (g_settings.cpu_recompiler_icache) CheckAndUpdateICacheTags(block->icache_line_count, block->uncached_fetch_ticks); - InterpretCachedBlock(*block); + InterpretCachedBlock(block); - if (g_state.pending_ticks >= g_state.downcount) - break; - else if (!USE_BLOCK_LINKING) + CHECK_DOWNCOUNT(); + + // Handle self-looping blocks + if (g_state.pc == block->pc) + goto reexecute_block; + else continue; - next_block_key = GetNextBlockKey(); - if (next_block_key.bits == block->key.bits) - { - // we can jump straight to it if there's no pending interrupts - // ensure it's not a self-modifying block - if (!block->invalidated || RevalidateBlock(block, true)) - goto reexecute_block; - } - else if (!block->invalidated) - { - // Try to find an already-linked block. - // TODO: Don't need to dereference the block, just store a pointer to the code. - for (const CodeBlock::LinkInfo& li : block->link_successors) - { - CodeBlock* linked_block = li.block; - if (linked_block->key.bits == next_block_key.bits) - { - if (linked_block->invalidated && !RevalidateBlock(linked_block, true)) - { - // CanExecuteBlock can result in a block flush, so stop iterating here. - break; - } - - // Execute the linked block - block = linked_block; - goto reexecute_block; - } - } - - // No acceptable blocks found in the successor list, try a new one. - CodeBlock* next_block = LookupBlock(next_block_key, false); - if (next_block) - { - // Link the previous block to this new block if we find a new block. - LinkBlock(block, next_block, nullptr, nullptr, 0); - block = next_block; - goto reexecute_block; - } - } + interpret_block: + InterpretUncachedBlock(); + CHECK_DOWNCOUNT(); + continue; } } - - // in case we switch to interpreter... 
- g_state.npc = g_state.pc; } -#ifdef ENABLE_RECOMPILER - -void CompileDispatcher() +[[noreturn]] void CPU::CodeCache::ExecuteCachedInterpreter() { - s_code_buffer.WriteProtect(false); - + if (g_settings.gpu_pgxp_enable) { - Recompiler::CodeGenerator cg(&s_code_buffer); - s_asm_dispatcher = cg.CompileDispatcher(); + if (g_settings.gpu_pgxp_cpu) + ExecuteCachedInterpreterImpl(); + else + ExecuteCachedInterpreterImpl(); } + else { - Recompiler::CodeGenerator cg(&s_code_buffer); - s_single_block_asm_dispatcher = cg.CompileSingleBlockDispatcher(); - } - - s_code_buffer.WriteProtect(true); -} - -FastMapTable* GetFastMapPointer() -{ - return s_fast_map; -} - -[[noreturn]] static void ExecuteRecompiler() -{ -#if 0 - for (;;) - { - if (HasPendingInterrupt()) - DispatchInterrupt(); - - TimingEvents::RunEvents(); - - while (g_state.pending_ticks < g_state.downcount) - { -#if 0 - LogCurrentState(); -#endif - - const u32 pc = g_state.pc; - s_single_block_asm_dispatcher(s_fast_map[pc >> 16][pc >> 2]); - } - } -#else - s_asm_dispatcher(); -#endif - UnreachableCode(); -} - -#endif - -[[noreturn]] void Execute() -{ - switch (g_settings.cpu_execution_mode) - { -#ifdef ENABLE_RECOMPILER - case CPUExecutionMode::Recompiler: - ExecuteRecompiler(); - break; -#endif - - default: - { - if (g_settings.gpu_pgxp_enable) - { - if (g_settings.gpu_pgxp_cpu) - ExecuteImpl(); - else - ExecuteImpl(); - } - else - { - ExecuteImpl(); - } - } - break; + ExecuteCachedInterpreterImpl(); } } -#if defined(ENABLE_RECOMPILER) - -JitCodeBuffer& GetCodeBuffer() -{ - return s_code_buffer; -} - -#endif - -void Reinitialize() -{ - ClearState(); - -#ifdef ENABLE_RECOMPILER - ShutdownFastmem(); -#endif - -#if defined(ENABLE_RECOMPILER) - s_code_buffer.Destroy(); - - if (g_settings.IsUsingRecompiler()) - { -#ifdef USE_STATIC_CODE_BUFFER - if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, - RECOMPILER_GUARD_SIZE)) -#else - if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) -#endif - { - Panic("Failed to initialize code space"); - } - } -#endif - -#ifdef ENABLE_RECOMPILER - if (g_settings.IsUsingRecompiler()) - { - if (g_settings.IsUsingFastmem() && !InitializeFastmem()) - Panic("Failed to initialize fastmem"); - - AllocateFastMap(); - CompileDispatcher(); - ResetFastMap(); - } -#endif -} - -void Flush() -{ - ClearState(); -#ifdef ENABLE_RECOMPILER - if (g_settings.IsUsingRecompiler()) - CompileDispatcher(); -#endif -} - -#ifndef _MSC_VER -void __debugbreak() -{ -} -#endif - -void LogCurrentState() +void CPU::CodeCache::LogCurrentState() { #if 0 if ((TimingEvents::GetGlobalTickCounter() + GetPendingTicks()) == 2546728915) @@ -561,148 +810,16 @@ void LogCurrentState() g_state.cop0_regs.sr.bits, static_cast(crc32(0, (const Bytef*)&g_state.gte_regs, sizeof(g_state.gte_regs)))); } -CodeBlockKey GetNextBlockKey() +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// MARK: - Block Compilation: Shared Code +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +bool CPU::CodeCache::ReadBlockInstructions(u32 start_pc, BlockInstructionList* instructions, BlockMetadata* metadata) { - CodeBlockKey key; - key.bits = 0; - key.SetPC(g_state.pc); - key.user_mode = InUserMode(); - return key; -} + // TODO: Jump to other block if it exists at this pc? 
-// assumes it has already been unlinked -static void FallbackExistingBlockToInterpreter(CodeBlock* block) -{ - // Replace with null so we don't try to compile it again. - s_blocks.emplace(block->key.bits, nullptr); - delete block; -} - -CodeBlock* LookupBlock(CodeBlockKey key, bool allow_flush) -{ - BlockMap::iterator iter = s_blocks.find(key.bits); - if (iter != s_blocks.end()) - { - // ensure it hasn't been invalidated - CodeBlock* existing_block = iter->second; - if (!existing_block || !existing_block->invalidated) - return existing_block; - - // if compilation fails or we're forced back to the interpreter, bail out - if (RevalidateBlock(existing_block, allow_flush)) - return existing_block; - else - return nullptr; - } - - CodeBlock* block = new CodeBlock(key); - block->recompile_frame_number = System::GetFrameNumber(); - - if (CompileBlock(block, allow_flush)) - { - // add it to the page map if it's in ram - AddBlockToPageMap(block); - -#ifdef ENABLE_RECOMPILER - SetFastMap(block->GetPC(), block->host_code); - AddBlockToHostCodeMap(block); -#endif - } - else - { - Log_ErrorPrintf("Failed to compile block at PC=0x%08X", key.GetPC()); - delete block; - block = nullptr; - } - - if (block || allow_flush) - s_blocks.emplace(key.bits, block); - - return block; -} - -bool RevalidateBlock(CodeBlock* block, bool allow_flush) -{ - for (const CodeBlockInstruction& cbi : block->instructions) - { - u32 new_code = 0; - SafeReadInstruction(cbi.pc, &new_code); - if (cbi.instruction.bits != new_code) - { - Log_DebugPrintf("Block 0x%08X changed at PC 0x%08X - %08X to %08X - recompiling.", block->GetPC(), cbi.pc, - cbi.instruction.bits, new_code); - goto recompile; - } - } - - // re-add it to the page map since it's still up-to-date - block->invalidated = false; - AddBlockToPageMap(block); -#ifdef ENABLE_RECOMPILER - SetFastMap(block->GetPC(), block->host_code); -#endif - return true; - -recompile: - // remove any references to the block from the lookup table. - // this is an edge case where compiling causes a flush-all due to no space, - // and we don't want to nuke the block we're compiling... - RemoveReferencesToBlock(block); - -#ifdef ENABLE_RECOMPILER - RemoveBlockFromHostCodeMap(block); -#endif - - const u32 frame_number = System::GetFrameNumber(); - const u32 frame_diff = frame_number - block->recompile_frame_number; - if (frame_diff <= RECOMPILE_FRAMES_TO_FALL_BACK_TO_INTERPRETER) - { - block->recompile_count++; - - if (block->recompile_count >= RECOMPILE_COUNT_TO_FALL_BACK_TO_INTERPRETER) - { - Log_PerfPrintf("Block 0x%08X has been recompiled %u times in %u frames, falling back to interpreter", - block->GetPC(), block->recompile_count, frame_diff); - - FallbackExistingBlockToInterpreter(block); - return false; - } - } - else - { - // It's been a while since this block was modified, so it's all good. - block->recompile_frame_number = frame_number; - block->recompile_count = 0; - } - - block->instructions.clear(); - - if (!CompileBlock(block, allow_flush)) - { - Log_PerfPrintf("Failed to recompile block 0x%08X, falling back to interpreter.", block->GetPC()); - FallbackExistingBlockToInterpreter(block); - return false; - } - - AddBlockToPageMap(block); - -#ifdef ENABLE_RECOMPILER - // re-add to page map again - SetFastMap(block->GetPC(), block->host_code); - AddBlockToHostCodeMap(block); -#endif - - // block is valid again - block->invalidated = false; - - // re-insert into the block map since we removed it earlier. 
- s_blocks.emplace(block->key.bits, block); - return true; -} - -bool CompileBlock(CodeBlock* block, bool allow_flush) -{ - u32 pc = block->GetPC(); + const PageProtectionMode protection = GetProtectionModeForPC(start_pc); + u32 pc = start_pc; bool is_branch_delay_slot = false; bool is_load_delay_slot = false; @@ -711,590 +828,759 @@ bool CompileBlock(CodeBlock* block, bool allow_flush) __debugbreak(); #endif - block->icache_line_count = 0; - block->uncached_fetch_ticks = 0; - block->contains_double_branches = false; - block->contains_loadstore_instructions = false; + instructions->clear(); + metadata->icache_line_count = 0; + metadata->uncached_fetch_ticks = 0; + metadata->flags = BlockFlags::None; u32 last_cache_line = ICACHE_LINES; + u32 last_page = (protection == PageProtectionMode::WriteProtected) ? Bus::GetRAMCodePageIndex(start_pc) : 0; for (;;) { - CodeBlockInstruction cbi = {}; - if (!SafeReadInstruction(pc, &cbi.instruction.bits) || !IsInvalidInstruction(cbi.instruction)) + if (protection == PageProtectionMode::WriteProtected) + { + const u32 this_page = Bus::GetRAMCodePageIndex(pc); + if (this_page != last_page) + { + // if we're just crossing the page and not in a branch delay slot, jump directly to the next block + if (!is_branch_delay_slot) + { + Log_DevFmt("Breaking block 0x{:08X} at 0x{:08X} due to page crossing", start_pc, pc); + metadata->flags |= BlockFlags::SpansPages; + break; + } + else + { + // otherwise, we need to use manual protection in case the delay slot changes. + // may as well keep going then, since we're doing manual check anyways. + Log_DevFmt("Block 0x{:08X} has branch delay slot crossing page at 0x{:08X}, forcing manual protection", + start_pc, pc); + metadata->flags |= BlockFlags::BranchDelaySpansPages; + } + } + } + + Instruction instruction; + if (!SafeReadInstruction(pc, &instruction.bits) || !IsInvalidInstruction(instruction)) break; - cbi.pc = pc; - cbi.is_branch_delay_slot = is_branch_delay_slot; - cbi.is_load_delay_slot = is_load_delay_slot; - cbi.is_branch_instruction = IsBranchInstruction(cbi.instruction); - cbi.is_direct_branch_instruction = IsDirectBranchInstruction(cbi.instruction); - cbi.is_unconditional_branch_instruction = IsUnconditionalBranchInstruction(cbi.instruction); - cbi.is_load_instruction = IsMemoryLoadInstruction(cbi.instruction); - cbi.is_store_instruction = IsMemoryStoreInstruction(cbi.instruction); - cbi.has_load_delay = InstructionHasLoadDelay(cbi.instruction); - cbi.can_trap = CanInstructionTrap(cbi.instruction, InUserMode()); - cbi.is_direct_branch_instruction = IsDirectBranchInstruction(cbi.instruction); + InstructionInfo info; + std::memset(&info, 0, sizeof(info)); + + info.pc = pc; + info.is_branch_delay_slot = is_branch_delay_slot; + info.is_load_delay_slot = is_load_delay_slot; + info.is_branch_instruction = IsBranchInstruction(instruction); + info.is_direct_branch_instruction = IsDirectBranchInstruction(instruction); + info.is_unconditional_branch_instruction = IsUnconditionalBranchInstruction(instruction); + info.is_load_instruction = IsMemoryLoadInstruction(instruction); + info.is_store_instruction = IsMemoryStoreInstruction(instruction); + info.has_load_delay = InstructionHasLoadDelay(instruction); + info.can_trap = CanInstructionTrap(instruction, false /*InUserMode()*/); + info.is_direct_branch_instruction = IsDirectBranchInstruction(instruction); if (g_settings.cpu_recompiler_icache) { const u32 icache_line = GetICacheLine(pc); if (icache_line != last_cache_line) { - block->icache_line_count++; + 
metadata->icache_line_count++; last_cache_line = icache_line; } } - block->uncached_fetch_ticks += GetInstructionReadTicks(pc); - block->contains_loadstore_instructions |= cbi.is_load_instruction; - block->contains_loadstore_instructions |= cbi.is_store_instruction; + metadata->uncached_fetch_ticks += GetInstructionReadTicks(pc); + if (info.is_load_instruction || info.is_store_instruction) + metadata->flags |= BlockFlags::ContainsLoadStoreInstructions; - pc += sizeof(cbi.instruction.bits); + pc += sizeof(Instruction); - if (is_branch_delay_slot && cbi.is_branch_instruction) + if (is_branch_delay_slot && info.is_branch_instruction) { - const CodeBlockInstruction& prev_cbi = block->instructions.back(); - if (!prev_cbi.is_unconditional_branch_instruction || !prev_cbi.is_direct_branch_instruction) + const BlockInstructionInfoPair& prev = instructions->back(); + if (!prev.second.is_unconditional_branch_instruction || !prev.second.is_direct_branch_instruction) { - Log_WarningPrintf("Conditional or indirect branch delay slot at %08X, skipping block", cbi.pc); + Log_WarningPrintf("Conditional or indirect branch delay slot at %08X, skipping block", info.pc); return false; } - if (!IsDirectBranchInstruction(cbi.instruction)) + if (!IsDirectBranchInstruction(instruction)) { - Log_WarningPrintf("Indirect branch in delay slot at %08X, skipping block", cbi.pc); + Log_WarningPrintf("Indirect branch in delay slot at %08X, skipping block", info.pc); return false; } // change the pc for the second branch's delay slot, it comes from the first branch - pc = GetDirectBranchTarget(prev_cbi.instruction, prev_cbi.pc); - Log_DevPrintf("Double branch at %08X, using delay slot from %08X -> %08X", cbi.pc, prev_cbi.pc, pc); + pc = GetDirectBranchTarget(prev.first, prev.second.pc); + Log_DevPrintf("Double branch at %08X, using delay slot from %08X -> %08X", info.pc, prev.second.pc, pc); } // instruction is decoded now - block->instructions.push_back(cbi); + instructions->emplace_back(instruction, info); // if we're in a branch delay slot, the block is now done // except if this is a branch in a branch delay slot, then we grab the one after that, and so on... - if (is_branch_delay_slot && !cbi.is_branch_instruction) + if (is_branch_delay_slot && !info.is_branch_instruction) break; // if this is a branch, we grab the next instruction (delay slot), and then exit - is_branch_delay_slot = cbi.is_branch_instruction; + is_branch_delay_slot = info.is_branch_instruction; // same for load delay - is_load_delay_slot = cbi.has_load_delay; + is_load_delay_slot = info.has_load_delay; // is this a non-branchy exit? (e.g. syscall) - if (IsExitBlockInstruction(cbi.instruction)) + if (IsExitBlockInstruction(instruction)) break; } - if (!block->instructions.empty()) + if (instructions->empty()) { - block->instructions.back().is_last_instruction = true; + Log_WarningFmt("Empty block compiled at 0x{:08X}", start_pc); + return false; + } + + instructions->back().second.is_last_instruction = true; #ifdef _DEBUG - SmallString disasm; - Log_DebugPrintf("Block at 0x%08X", block->GetPC()); - for (const CodeBlockInstruction& cbi : block->instructions) - { - CPU::DisassembleInstruction(&disasm, cbi.pc, cbi.instruction.bits); - Log_DebugPrintf("[%s %s 0x%08X] %08X %s", cbi.is_branch_delay_slot ? "BD" : " ", - cbi.is_load_delay_slot ? 
"LD" : " ", cbi.pc, cbi.instruction.bits, disasm.c_str()); - } -#endif - } - else + SmallString disasm; + Log_DebugPrintf("Block at 0x%08X", start_pc); + for (const auto& cbi : *instructions) { - Log_WarningPrintf("Empty block compiled at 0x%08X", block->key.GetPC()); - return false; - } - -#ifdef ENABLE_RECOMPILER - if (g_settings.IsUsingRecompiler()) - { - // Ensure we're not going to run out of space while compiling this block. - if (s_code_buffer.GetFreeCodeSpace() < - (block->instructions.size() * Recompiler::MAX_NEAR_HOST_BYTES_PER_INSTRUCTION) || - s_code_buffer.GetFreeFarCodeSpace() < - (block->instructions.size() * Recompiler::MAX_FAR_HOST_BYTES_PER_INSTRUCTION)) - { - if (allow_flush) - { - Log_WarningPrintf("Out of code space, flushing all blocks."); - Flush(); - } - else - { - Log_ErrorPrintf("Out of code space and cannot flush while compiling %08X.", block->GetPC()); - return false; - } - } - - s_code_buffer.WriteProtect(false); - Recompiler::CodeGenerator codegen(&s_code_buffer); - const bool compile_result = codegen.CompileBlock(block, &block->host_code, &block->host_code_size); - s_code_buffer.WriteProtect(true); - - if (!compile_result) - { - Log_ErrorPrintf("Failed to compile host code for block at 0x%08X", block->key.GetPC()); - return false; - } + CPU::DisassembleInstruction(&disasm, cbi.second.pc, cbi.first.bits); + Log_DebugPrintf("[%s %s 0x%08X] %08X %s", cbi.second.is_branch_delay_slot ? "BD" : " ", + cbi.second.is_load_delay_slot ? "LD" : " ", cbi.second.pc, cbi.first.bits, disasm.c_str()); } #endif return true; } -#ifdef ENABLE_RECOMPILER - -void FastCompileBlockFunction() +void CPU::CodeCache::CopyRegInfo(InstructionInfo* dst, const InstructionInfo* src) { - CodeBlock* block = LookupBlock(GetNextBlockKey(), true); - if (block) - { - s_single_block_asm_dispatcher(block->host_code); - return; - } + std::memcpy(dst->reg_flags, src->reg_flags, sizeof(dst->reg_flags)); + std::memcpy(dst->read_reg, src->read_reg, sizeof(dst->read_reg)); +} - if (g_settings.gpu_pgxp_enable) +void CPU::CodeCache::SetRegAccess(InstructionInfo* inst, Reg reg, bool write) +{ + if (reg == Reg::zero) + return; + + if (!write) { - if (g_settings.gpu_pgxp_cpu) - InterpretUncachedBlock(); - else - InterpretUncachedBlock(); + for (u32 i = 0; i < std::size(inst->read_reg); i++) + { + if (inst->read_reg[i] == Reg::zero) + { + inst->read_reg[i] = reg; + break; + } + } } else { - InterpretUncachedBlock(); - } -} - -void InvalidCodeFunction() -{ - Log_ErrorPrintf("Trying to execute invalid code at 0x%08X", g_state.pc); - if (g_settings.gpu_pgxp_enable) - { - if (g_settings.gpu_pgxp_cpu) - InterpretUncachedBlock(); - else - InterpretUncachedBlock(); - } - else - { - InterpretUncachedBlock(); - } -} - -#endif - -static void InvalidateBlock(CodeBlock* block, bool allow_frame_invalidation) -{ - // Invalidate forces the block to be checked again. 
- Log_DebugPrintf("Invalidating block at 0x%08X", block->GetPC()); - block->invalidated = true; - - if (block->can_link) - { - const u32 frame_number = System::GetFrameNumber(); - if (allow_frame_invalidation) +#if 0 + for (u32 i = 0; i < std::size(inst->write_reg); i++) { - const u32 frame_diff = frame_number - block->invalidate_frame_number; - if (frame_diff <= INVALIDATE_THRESHOLD_TO_DISABLE_LINKING) + if (inst->write_reg[i] == Reg::zero) { - Log_DevPrintf("Block 0x%08X has been invalidated in %u frames, disabling linking", block->GetPC(), frame_diff); - block->can_link = false; - } - else - { - // It's been a while since this block was modified, so it's all good. - block->invalidate_frame_number = frame_number; + inst->write_reg[i] = reg; + break; } } - else +#endif + } +} + +#define BackpropSetReads(reg) \ + do \ + { \ + if (!(inst->reg_flags[static_cast(reg)] & RI_USED)) \ + inst->reg_flags[static_cast(reg)] |= RI_LASTUSE; \ + prev->reg_flags[static_cast(reg)] |= RI_LIVE | RI_USED; \ + inst->reg_flags[static_cast(reg)] |= RI_USED; \ + SetRegAccess(inst, reg, false); \ + } while (0) + +#define BackpropSetWrites(reg) \ + do \ + { \ + prev->reg_flags[static_cast(reg)] &= ~(RI_LIVE | RI_USED); \ + if (!(inst->reg_flags[static_cast(reg)] & RI_USED)) \ + inst->reg_flags[static_cast(reg)] |= RI_LASTUSE; \ + inst->reg_flags[static_cast(reg)] |= RI_USED; \ + SetRegAccess(inst, reg, true); \ + } while (0) + +// TODO: memory loads should be delayed one instruction because of stupid load delays. +#define BackpropSetWritesDelayed(reg) BackpropSetWrites(reg) + +void CPU::CodeCache::FillBlockRegInfo(Block* block) +{ + const Instruction* iinst = block->Instructions() + (block->size - 1); + InstructionInfo* const start = block->InstructionsInfo(); + InstructionInfo* inst = start + (block->size - 1); + std::memset(inst->reg_flags, RI_LIVE, sizeof(inst->reg_flags)); + std::memset(inst->read_reg, 0, sizeof(inst->read_reg)); + // std::memset(inst->write_reg, 0, sizeof(inst->write_reg)); + + while (inst != start) + { + InstructionInfo* prev = inst - 1; + CopyRegInfo(prev, inst); + + const Reg rs = iinst->r.rs; + const Reg rt = iinst->r.rt; + + switch (iinst->op) { - // don't trigger frame number based invalidation for this block (e.g. memory save states) - block->invalidate_frame_number = frame_number - INVALIDATE_THRESHOLD_TO_DISABLE_LINKING - 1; - } - } - - UnlinkBlock(block); - -#ifdef ENABLE_RECOMPILER - SetFastMap(block->GetPC(), FastCompileBlockFunction); -#endif -} - -void InvalidateBlocksWithPageIndex(u32 page_index) -{ - DebugAssert(page_index < Bus::RAM_8MB_CODE_PAGE_COUNT); - auto& blocks = m_ram_block_map[page_index]; - for (CodeBlock* block : blocks) - InvalidateBlock(block, true); - - // Block will be re-added next execution. 
- blocks.clear(); - Bus::ClearRAMCodePage(page_index); -} - -void InvalidateAll() -{ - for (auto& it : s_blocks) - { - CodeBlock* block = it.second; - if (block && !block->invalidated) - InvalidateBlock(block, false); - } - - Bus::ClearRAMCodePageFlags(); - for (auto& it : m_ram_block_map) - it.clear(); -} - -void RemoveReferencesToBlock(CodeBlock* block) -{ - BlockMap::iterator iter = s_blocks.find(block->key.GetPC()); - Assert(iter != s_blocks.end() && iter->second == block); - -#ifdef ENABLE_RECOMPILER - SetFastMap(block->GetPC(), FastCompileBlockFunction); -#endif - - // if it's been invalidated it won't be in the page map - if (!block->invalidated) - RemoveBlockFromPageMap(block); - - UnlinkBlock(block); -#ifdef ENABLE_RECOMPILER - if (!block->invalidated) - RemoveBlockFromHostCodeMap(block); -#endif - - s_blocks.erase(iter); -} - -void AddBlockToPageMap(CodeBlock* block) -{ - if (!block->IsInRAM()) - return; - - const u32 start_page = block->GetStartPageIndex(); - const u32 end_page = block->GetEndPageIndex(); - for (u32 page = start_page; page <= end_page; page++) - { - m_ram_block_map[page].push_back(block); - Bus::SetRAMCodePage(page); - } -} - -void RemoveBlockFromPageMap(CodeBlock* block) -{ - if (!block->IsInRAM()) - return; - - const u32 start_page = block->GetStartPageIndex(); - const u32 end_page = block->GetEndPageIndex(); - for (u32 page = start_page; page <= end_page; page++) - { - auto& page_blocks = m_ram_block_map[page]; - auto page_block_iter = std::find(page_blocks.begin(), page_blocks.end(), block); - Assert(page_block_iter != page_blocks.end()); - page_blocks.erase(page_block_iter); - } -} - -void LinkBlock(CodeBlock* from, CodeBlock* to, void* host_pc, void* host_resolve_pc, u32 host_pc_size) -{ - Log_DebugPrintf("Linking block %p(%08x) to %p(%08x)", from, from->GetPC(), to, to->GetPC()); - - CodeBlock::LinkInfo li; - li.block = to; - li.host_pc = host_pc; - li.host_resolve_pc = host_resolve_pc; - li.host_pc_size = host_pc_size; - from->link_successors.push_back(li); - - li.block = from; - to->link_predecessors.push_back(li); - -#ifdef ENABLE_RECOMPILER - // apply in code - if (host_pc) - { - Log_ProfilePrintf("Backpatching %p(%08x) to jump to block %p (%08x)", host_pc, from->GetPC(), to, to->GetPC()); - s_code_buffer.WriteProtect(false); - Recompiler::CodeGenerator::BackpatchBranch(host_pc, host_pc_size, reinterpret_cast(to->host_code)); - s_code_buffer.WriteProtect(true); - } -#endif -} - -void UnlinkBlock(CodeBlock* block) -{ - if (block->link_predecessors.empty() && block->link_successors.empty()) - return; - -#ifdef ENABLE_RECOMPILER - if (g_settings.IsUsingRecompiler() && g_settings.cpu_recompiler_block_linking) - s_code_buffer.WriteProtect(false); -#endif - - for (CodeBlock::LinkInfo& li : block->link_predecessors) - { - auto iter = std::find_if(li.block->link_successors.begin(), li.block->link_successors.end(), - [block](const CodeBlock::LinkInfo& li) { return li.block == block; }); - Assert(iter != li.block->link_successors.end()); - -#ifdef ENABLE_RECOMPILER - // Restore blocks linked to this block back to the resolver - if (li.host_pc) - { - Log_ProfilePrintf("Backpatching %p(%08x) [predecessor] to jump to resolver", li.host_pc, li.block->GetPC()); - Recompiler::CodeGenerator::BackpatchBranch(li.host_pc, li.host_pc_size, li.host_resolve_pc); - } -#endif - - li.block->link_successors.erase(iter); - } - block->link_predecessors.clear(); - - for (CodeBlock::LinkInfo& li : block->link_successors) - { - auto iter = 
std::find_if(li.block->link_predecessors.begin(), li.block->link_predecessors.end(), - [block](const CodeBlock::LinkInfo& li) { return li.block == block; }); - Assert(iter != li.block->link_predecessors.end()); - -#ifdef ENABLE_RECOMPILER - // Restore blocks we're linking to back to the resolver, since the successor won't be linked to us to backpatch if - // it changes. - if (li.host_pc) - { - Log_ProfilePrintf("Backpatching %p(%08x) [successor] to jump to resolver", li.host_pc, li.block->GetPC()); - Recompiler::CodeGenerator::BackpatchBranch(li.host_pc, li.host_pc_size, li.host_resolve_pc); - } -#endif - - // Don't have to do anything special for successors - just let the successor know it's no longer linked. - li.block->link_predecessors.erase(iter); - } - block->link_successors.clear(); - -#ifdef ENABLE_RECOMPILER - if (g_settings.IsUsingRecompiler() && g_settings.cpu_recompiler_block_linking) - s_code_buffer.WriteProtect(true); -#endif -} - -#ifdef ENABLE_RECOMPILER - -void AddBlockToHostCodeMap(CodeBlock* block) -{ - if (!g_settings.IsUsingRecompiler()) - return; - - auto ir = s_host_code_map.emplace(block->host_code, block); - Assert(ir.second); -} - -void RemoveBlockFromHostCodeMap(CodeBlock* block) -{ - if (!g_settings.IsUsingRecompiler()) - return; - - HostCodeMap::iterator hc_iter = s_host_code_map.find(block->host_code); - Assert(hc_iter != s_host_code_map.end()); - s_host_code_map.erase(hc_iter); -} - -bool InitializeFastmem() -{ - const CPUFastmemMode mode = g_settings.cpu_fastmem_mode; - Assert(mode != CPUFastmemMode::Disabled); - -#ifdef ENABLE_MMAP_FASTMEM - const auto handler = (mode == CPUFastmemMode::MMap) ? MMapPageFaultHandler : LUTPageFaultHandler; -#else - const auto handler = LUTPageFaultHandler; - Assert(mode != CPUFastmemMode::MMap); -#endif - - if (!Common::PageFaultHandler::InstallHandler(&s_host_code_map, s_code_buffer.GetCodePointer(), - s_code_buffer.GetTotalSize(), handler)) - { - Log_ErrorPrintf("Failed to install page fault handler"); - return false; - } - - Bus::UpdateFastmemViews(mode); - CPU::UpdateMemoryPointers(); - return true; -} - -void ShutdownFastmem() -{ - Common::PageFaultHandler::RemoveHandler(&s_host_code_map); - Bus::UpdateFastmemViews(CPUFastmemMode::Disabled); - CPU::UpdateMemoryPointers(); -} - -#ifdef ENABLE_MMAP_FASTMEM - -Common::PageFaultHandler::HandlerResult MMapPageFaultHandler(void* exception_pc, void* fault_address, bool is_write) -{ - if (static_cast(fault_address) < static_cast(g_state.fastmem_base) || - (static_cast(fault_address) - static_cast(g_state.fastmem_base)) >= - static_cast(Bus::FASTMEM_ARENA_SIZE)) - { - return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; - } - - const PhysicalMemoryAddress fastmem_address = static_cast( - static_cast(static_cast(fault_address) - static_cast(g_state.fastmem_base))); - - Log_DevPrintf("Page fault handler invoked at PC=%p Address=%p %s, fastmem offset 0x%08X", exception_pc, fault_address, - is_write ? 
"(write)" : "(read)", fastmem_address); - - // use upper_bound to find the next block after the pc - HostCodeMap::iterator upper_iter = - s_host_code_map.upper_bound(reinterpret_cast(exception_pc)); - if (upper_iter == s_host_code_map.begin()) - return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; - - // then decrement it by one to (hopefully) get the block we want - upper_iter--; - - // find the loadstore info in the code block - CodeBlock* block = upper_iter->second; - for (auto bpi_iter = block->loadstore_backpatch_info.begin(); bpi_iter != block->loadstore_backpatch_info.end(); - ++bpi_iter) - { - Recompiler::LoadStoreBackpatchInfo& lbi = *bpi_iter; - if (lbi.host_pc == exception_pc) - { - if (is_write && !g_state.cop0_regs.sr.Isc && Bus::IsRAMAddress(fastmem_address)) + case InstructionOp::funct: { - // this is probably a code page, since we aren't going to fault due to requiring fastmem on RAM. - const u32 code_page_index = Bus::GetRAMCodePageIndex(fastmem_address); - if (Bus::IsRAMCodePage(code_page_index)) + const Reg rd = iinst->r.rd; + + switch (iinst->r.funct) { - if (++lbi.fault_count < CODE_WRITE_FAULT_THRESHOLD_FOR_SLOWMEM) - { - InvalidateBlocksWithPageIndex(code_page_index); - return Common::PageFaultHandler::HandlerResult::ContinueExecution; - } - else - { - Log_DevPrintf("Backpatching code write at %p (%08X) address %p (%08X) to slowmem after threshold", - exception_pc, lbi.guest_pc, fault_address, fastmem_address); - } + case InstructionFunct::sll: + case InstructionFunct::srl: + case InstructionFunct::sra: + BackpropSetWrites(rd); + BackpropSetReads(rt); + break; + + case InstructionFunct::sllv: + case InstructionFunct::srlv: + case InstructionFunct::srav: + case InstructionFunct::add: + case InstructionFunct::addu: + case InstructionFunct::sub: + case InstructionFunct::subu: + case InstructionFunct::and_: + case InstructionFunct::or_: + case InstructionFunct::xor_: + case InstructionFunct::nor: + case InstructionFunct::slt: + case InstructionFunct::sltu: + BackpropSetWrites(rd); + BackpropSetReads(rt); + BackpropSetReads(rs); + break; + + case InstructionFunct::jr: + BackpropSetReads(rs); + break; + + case InstructionFunct::jalr: + BackpropSetReads(rs); + BackpropSetWrites(rd); + break; + + case InstructionFunct::mfhi: + BackpropSetWrites(rd); + BackpropSetReads(Reg::hi); + break; + + case InstructionFunct::mflo: + BackpropSetWrites(rd); + BackpropSetReads(Reg::lo); + break; + + case InstructionFunct::mthi: + BackpropSetWrites(Reg::hi); + BackpropSetReads(rs); + break; + + case InstructionFunct::mtlo: + BackpropSetWrites(Reg::lo); + BackpropSetReads(rs); + break; + + case InstructionFunct::mult: + case InstructionFunct::multu: + case InstructionFunct::div: + case InstructionFunct::divu: + BackpropSetWrites(Reg::hi); + BackpropSetWrites(Reg::lo); + BackpropSetReads(rs); + BackpropSetReads(rt); + break; + + case InstructionFunct::syscall: + case InstructionFunct::break_: + break; + + default: + Log_ErrorPrintf("Unknown funct %u", static_cast(iinst->r.funct.GetValue())); + break; } } + break; - // found it, do fixup - s_code_buffer.WriteProtect(false); - const bool backpatch_result = Recompiler::CodeGenerator::BackpatchLoadStore(lbi); - s_code_buffer.WriteProtect(true); - if (backpatch_result) + case InstructionOp::b: { - // remove the backpatch entry since we won't be coming back to this one - block->loadstore_backpatch_info.erase(bpi_iter); - return Common::PageFaultHandler::HandlerResult::ContinueExecution; + if ((static_cast(iinst->i.rt.GetValue()) & 
u8(0x1E)) == u8(0x10)) + BackpropSetWrites(Reg::ra); + BackpropSetReads(rs); } - else - { - Log_ErrorPrintf("Failed to backpatch %p in block 0x%08X", exception_pc, block->GetPC()); - return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; - } - } - } + break; - // we didn't find the pc in our list.. - Log_ErrorPrintf("Loadstore PC not found for %p in block 0x%08X", exception_pc, block->GetPC()); - return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + case InstructionOp::j: + break; + + case InstructionOp::jal: + BackpropSetWrites(Reg::ra); + break; + + case InstructionOp::beq: + case InstructionOp::bne: + BackpropSetReads(rs); + BackpropSetReads(rt); + break; + + case InstructionOp::blez: + case InstructionOp::bgtz: + BackpropSetReads(rs); + break; + + case InstructionOp::addi: + case InstructionOp::addiu: + case InstructionOp::slti: + case InstructionOp::sltiu: + case InstructionOp::andi: + case InstructionOp::ori: + case InstructionOp::xori: + BackpropSetWrites(rt); + BackpropSetReads(rs); + break; + + case InstructionOp::lui: + BackpropSetWrites(rt); + break; + + case InstructionOp::lb: + case InstructionOp::lh: + case InstructionOp::lw: + case InstructionOp::lbu: + case InstructionOp::lhu: + BackpropSetWritesDelayed(rt); + BackpropSetReads(rs); + break; + + case InstructionOp::lwl: + case InstructionOp::lwr: + BackpropSetWritesDelayed(rt); + BackpropSetReads(rs); + BackpropSetReads(rt); + break; + + case InstructionOp::sb: + case InstructionOp::sh: + case InstructionOp::swl: + case InstructionOp::sw: + case InstructionOp::swr: + BackpropSetReads(rt); + BackpropSetReads(rs); + break; + + case InstructionOp::cop0: + case InstructionOp::cop2: + { + if (iinst->cop.IsCommonInstruction()) + { + switch (iinst->cop.CommonOp()) + { + case CopCommonInstruction::mfcn: + case CopCommonInstruction::cfcn: + BackpropSetWritesDelayed(rt); + break; + + case CopCommonInstruction::mtcn: + case CopCommonInstruction::ctcn: + BackpropSetReads(rt); + break; + } + } + break; + + case InstructionOp::lwc2: + case InstructionOp::swc2: + BackpropSetReads(rs); + BackpropSetReads(rt); + break; + + default: + Log_ErrorPrintf("Unknown op %u", static_cast(iinst->r.funct.GetValue())); + break; + } + } // end switch + + inst--; + iinst--; + } // end while } -#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// MARK: - Recompiler Glue +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -Common::PageFaultHandler::HandlerResult LUTPageFaultHandler(void* exception_pc, void* fault_address, bool is_write) +#ifdef ENABLE_RECOMPILER_SUPPORT + +void CPU::CodeCache::CompileOrRevalidateBlock(u32 start_pc) { - // use upper_bound to find the next block after the pc - HostCodeMap::iterator upper_iter = - s_host_code_map.upper_bound(reinterpret_cast(exception_pc)); - if (upper_iter == s_host_code_map.begin()) - return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + // TODO: this doesn't currently handle when the cache overflows... 
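
(Illustrative aside, not code from this commit.) The test in the InstructionOp::b case above relies on how the R3000A packs the BcondZ variants into the rt field: bit 0 picks bgez over bltz, and the "and link" forms (bltzal/bgezal, rt = 0x10/0x11) are the ones treated as writing $ra, which is exactly what masking with 0x1E and comparing against 0x10 detects. A small sketch with hypothetical helper names:

// rt = 0x00 -> bltz    rt = 0x01 -> bgez
// rt = 0x10 -> bltzal  rt = 0x11 -> bgezal (these also write $ra)
#include <cstdint>

constexpr bool BcondZIsBGEZ(uint8_t rt) { return (rt & 0x01) != 0; }
constexpr bool BcondZWritesRA(uint8_t rt) { return (rt & 0x1E) == 0x10; } // ignore the condition bit

static_assert(BcondZWritesRA(0x10) && BcondZWritesRA(0x11));
static_assert(!BcondZWritesRA(0x00) && !BcondZWritesRA(0x01));
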
+ DebugAssert(IsUsingAnyRecompiler()); - // then decrement it by one to (hopefully) get the block we want - upper_iter--; - - // find the loadstore info in the code block - CodeBlock* block = upper_iter->second; - for (auto bpi_iter = block->loadstore_backpatch_info.begin(); bpi_iter != block->loadstore_backpatch_info.end(); - ++bpi_iter) + Block* block = LookupBlock(start_pc); + if (block) { - Recompiler::LoadStoreBackpatchInfo& lbi = *bpi_iter; - if (lbi.host_pc == exception_pc) + // we should only be here if the block got invalidated + DebugAssert(block->state != BlockState::Valid); + if (RevalidateBlock(block)) { - // found it, do fixup - s_code_buffer.WriteProtect(false); - const bool backpatch_result = Recompiler::CodeGenerator::BackpatchLoadStore(lbi); - s_code_buffer.WriteProtect(true); - if (backpatch_result) - { - // remove the backpatch entry since we won't be coming back to this one - block->loadstore_backpatch_info.erase(bpi_iter); - return Common::PageFaultHandler::HandlerResult::ContinueExecution; - } - else - { - Log_ErrorPrintf("Failed to backpatch %p in block 0x%08X", exception_pc, block->GetPC()); - return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; - } + DebugAssert(block->host_code); + SetCodeLUT(start_pc, block->host_code); + BacklinkBlocks(start_pc, block->host_code); + return; } + + // remove outward links from this block, since we're recompiling it + UnlinkBlockExits(block); } - // we didn't find the pc in our list.. - Log_ErrorPrintf("Loadstore PC not found for %p in block 0x%08X", exception_pc, block->GetPC()); - return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + BlockMetadata metadata = {}; + if (!ReadBlockInstructions(start_pc, &s_block_instructions, &metadata)) + { + Log_ErrorFmt("Failed to read block at 0x{:08X}, falling back to uncached interpreter", start_pc); + SetCodeLUT(start_pc, g_interpret_block); + BacklinkBlocks(start_pc, g_interpret_block); + return; + } + + // Ensure we're not going to run out of space while compiling this block. + // We could definitely do better here... TODO: far code is no longer needed for newrec + const u32 block_size = static_cast(s_block_instructions.size()); + if (s_code_buffer.GetFreeCodeSpace() < (block_size * Recompiler::MAX_NEAR_HOST_BYTES_PER_INSTRUCTION) || + s_code_buffer.GetFreeFarCodeSpace() < (block_size * Recompiler::MAX_FAR_HOST_BYTES_PER_INSTRUCTION)) + { + Log_ErrorFmt("Out of code space while compiling {:08X}. 
Resetting code cache.", start_pc); + CodeCache::Reset(); + } + + if ((block = CreateBlock(start_pc, s_block_instructions, metadata)) == nullptr || block->size == 0 || + !CompileBlock(block)) + { + Log_ErrorFmt("Failed to compile block at 0x{:08X}, falling back to uncached interpreter", start_pc); + SetCodeLUT(start_pc, g_interpret_block); + BacklinkBlocks(start_pc, g_interpret_block); + return; + } + + SetCodeLUT(start_pc, block->host_code); + BacklinkBlocks(start_pc, block->host_code); } -#endif // ENABLE_RECOMPILER - -} // namespace CPU::CodeCache - -#ifdef ENABLE_RECOMPILER - -void CPU::Recompiler::Thunks::ResolveBranch(CodeBlock* block, void* host_pc, void* host_resolve_pc, u32 host_pc_size) +void CPU::CodeCache::DiscardAndRecompileBlock(u32 start_pc) { - using namespace CPU::CodeCache; + Log_DevPrintf("Discard block %08X with manual protection", start_pc); + Block* block = LookupBlock(start_pc); + DebugAssert(block && block->state == BlockState::Valid); + InvalidateBlock(block, BlockState::NeedsRecompile); + CompileOrRevalidateBlock(start_pc); +} - CodeBlockKey key = GetNextBlockKey(); - CodeBlock* successor_block = LookupBlock(key, false); - if (!successor_block || (successor_block->invalidated && !RevalidateBlock(successor_block, false)) || - !block->can_link || !successor_block->can_link) +const void* CPU::CodeCache::CreateBlockLink(Block* block, void* code, u32 newpc) +{ + // self-linking should be handled by the caller + DebugAssert(newpc != block->pc); + + const void* dst = g_dispatcher; + if (g_settings.cpu_recompiler_block_linking) { - // just turn it into a return to the dispatcher instead. - s_code_buffer.WriteProtect(false); - CodeGenerator::BackpatchReturn(host_pc, host_pc_size); - s_code_buffer.WriteProtect(true); + const Block* next_block = LookupBlock(newpc); + if (next_block) + { + dst = (next_block->state == BlockState::Valid) ? + next_block->host_code : + ((next_block->state == BlockState::FallbackToInterpreter) ? g_interpret_block : + g_compile_or_revalidate_block); + DebugAssert(dst); + } + else + { + dst = g_compile_or_revalidate_block; + } + + BlockLinkMap::iterator iter = s_block_links.emplace(newpc, code); + DebugAssert(block->num_exit_links < MAX_BLOCK_EXIT_LINKS); + block->exit_links[block->num_exit_links++] = iter; + } + + Log_DebugPrintf("Linking %p with dst pc %08X to %p%s", code, newpc, dst, + (dst == g_compile_or_revalidate_block) ? "[compiler]" : ""); + return dst; +} + +void CPU::CodeCache::BacklinkBlocks(u32 pc, const void* dst) +{ + if (!g_settings.cpu_recompiler_block_linking) + return; + + const auto link_range = s_block_links.equal_range(pc); + for (auto it = link_range.first; it != link_range.second; ++it) + { + Log_DebugPrintf("Backlinking %p with dst pc %08X to %p%s", it->second, pc, dst, + (dst == g_compile_or_revalidate_block) ? "[compiler]" : ""); + EmitJump(it->second, dst, true); + } +} + +void CPU::CodeCache::UnlinkBlockExits(Block* block) +{ + const u32 num_exit_links = block->num_exit_links; + for (u32 i = 0; i < num_exit_links; i++) + s_block_links.erase(block->exit_links[i]); + block->num_exit_links = 0; +} + +JitCodeBuffer& CPU::CodeCache::GetCodeBuffer() +{ + return s_code_buffer; +} + +const void* CPU::CodeCache::GetInterpretUncachedBlockFunction() +{ + if (g_settings.gpu_pgxp_enable) + { + if (g_settings.gpu_pgxp_cpu) + return reinterpret_cast(InterpretUncachedBlock); + else + return reinterpret_cast(InterpretUncachedBlock); } else { - // link blocks! 
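
(Illustrative aside, not code from this commit.) CreateBlockLink and BacklinkBlocks above keep the linking state in a multimap keyed by the target guest PC: every outgoing jump site is recorded against the PC it wants to reach, and whenever a block for that PC becomes available (or must fall back to a stub), every recorded jump site is repatched. A minimal sketch of that bookkeeping pattern, with assumed names standing in for EmitJump and the real link map:

#include <cstdint>
#include <unordered_map>

// Patches a previously-emitted jump at jump_site so it branches to target.
using PatchJumpFn = void (*)(void* jump_site, const void* target);

struct LinkTable
{
  std::unordered_multimap<uint32_t, void*> links; // target guest PC -> host jump site

  void Record(uint32_t target_pc, void* jump_site) { links.emplace(target_pc, jump_site); }

  // Redirect every jump that is waiting on target_pc to new host code (or back to a stub).
  void Backlink(uint32_t target_pc, const void* host_code, PatchJumpFn patch)
  {
    const auto range = links.equal_range(target_pc);
    for (auto it = range.first; it != range.second; ++it)
      patch(it->second, host_code);
  }
};
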
- LinkBlock(block, successor_block, host_pc, host_resolve_pc, host_pc_size); + return reinterpret_cast(InterpretUncachedBlock); } } +void CPU::CodeCache::ClearASMFunctions() +{ + g_enter_recompiler = nullptr; + g_compile_or_revalidate_block = nullptr; + g_check_events_and_dispatch = nullptr; + g_run_events_and_dispatch = nullptr; + g_dispatcher = nullptr; + g_interpret_block = nullptr; + g_discard_and_recompile_block = nullptr; + +#ifdef _DEBUG + s_total_instructions_compiled = 0; + s_total_host_instructions_emitted = 0; +#endif +} + +void CPU::CodeCache::CompileASMFunctions() +{ + s_code_buffer.WriteProtect(false); + + const u32 asm_size = EmitASMFunctions(s_code_buffer.GetFreeCodePointer(), s_code_buffer.GetFreeCodeSpace()); + +#ifdef ENABLE_RECOMPILER_PROFILING + MIPSPerfScope.Register(s_code_buffer.GetFreeCodePointer(), asm_size, "ASMFunctions"); +#endif + + s_code_buffer.CommitCode(asm_size); + s_code_buffer.WriteProtect(true); +} + +bool CPU::CodeCache::CompileBlock(Block* block) +{ + s_code_buffer.WriteProtect(false); + + const void* host_code = nullptr; + u32 host_code_size = 0; + u32 host_far_code_size = 0; + +#ifdef ENABLE_RECOMPILER + if (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler) + { + Recompiler::CodeGenerator codegen(&s_code_buffer); + host_code = codegen.CompileBlock(block, &host_code_size, &host_far_code_size); + } +#endif + + s_code_buffer.WriteProtect(true); + + block->host_code = host_code; + + if (!host_code) + { + Log_ErrorFmt("Failed to compile host code for block at 0x{:08X}", block->pc); + block->state = BlockState::FallbackToInterpreter; + return false; + } + +#ifdef _DEBUG + const u32 host_instructions = GetHostInstructionCount(host_code, host_code_size); + s_total_instructions_compiled += block->size; + s_total_host_instructions_emitted += host_instructions; + + Log_ProfileFmt("0x{:08X}: {}/{}b for {}b ({}i), blowup: {:.2f}x, cache: {:.2f}%/{:.2f}%, ipi: {:.2f}/{:.2f}", + block->pc, host_code_size, host_far_code_size, block->size * 4, block->size, + static_cast(host_code_size) / static_cast(block->size * 4), s_code_buffer.GetUsedPct(), + s_code_buffer.GetFarUsedPct(), static_cast(host_instructions) / static_cast(block->size), + static_cast(s_total_host_instructions_emitted) / + static_cast(s_total_instructions_compiled)); +#else + Log_ProfileFmt("0x{:08X}: {}/{}b for {}b ({} inst), blowup: {:.2f}x, cache: {:.2f}%/{:.2f}%", block->pc, + host_code_size, host_far_code_size, block->size * 4, block->size, + static_cast(host_code_size) / static_cast(block->size * 4), s_code_buffer.GetUsedPct(), + s_code_buffer.GetFarUsedPct()); +#endif + +#if 0 + Log_DebugPrint("***HOST CODE**"); + DisassembleAndLogHostCode(host_code, host_code_size); +#endif + +#ifdef ENABLE_RECOMPILER_PROFILING + MIPSPerfScope.RegisterPC(host_code, host_code_size, block->pc); +#endif + + return true; +} + +void CPU::CodeCache::AddLoadStoreInfo(void* code_address, u32 code_size, u32 guest_pc, const void* thunk_address) +{ + DebugAssert(code_size < std::numeric_limits::max()); + + auto iter = s_fastmem_backpatch_info.find(code_address); + if (iter != s_fastmem_backpatch_info.end()) + s_fastmem_backpatch_info.erase(iter); + + LoadstoreBackpatchInfo info; + info.thunk_address = thunk_address; + info.guest_pc = guest_pc; + info.code_size = static_cast(code_size); + s_fastmem_backpatch_info.emplace(code_address, info); +} + +void CPU::CodeCache::AddLoadStoreInfo(void* code_address, u32 code_size, u32 guest_pc, TickCount cycles, + u32 gpr_bitmask, u8 address_register, u8 data_register, 
MemoryAccessSize size, + bool is_signed, bool is_load) +{ + DebugAssert(code_size < std::numeric_limits::max()); + DebugAssert(cycles >= 0 && cycles < std::numeric_limits::max()); + + auto iter = s_fastmem_backpatch_info.find(code_address); + if (iter != s_fastmem_backpatch_info.end()) + s_fastmem_backpatch_info.erase(iter); + + LoadstoreBackpatchInfo info; + info.thunk_address = nullptr; + info.guest_pc = guest_pc; + info.gpr_bitmask = gpr_bitmask; + info.cycles = static_cast(cycles); + info.address_register = address_register; + info.data_register = data_register; + info.size = static_cast(size); + info.is_signed = is_signed; + info.is_load = is_load; + info.code_size = static_cast(code_size); + s_fastmem_backpatch_info.emplace(code_address, info); +} + +Common::PageFaultHandler::HandlerResult CPU::CodeCache::HandleFastmemException(void* exception_pc, void* fault_address, + bool is_write) +{ + // TODO: Catch general RAM writes, not just fastmem + PhysicalMemoryAddress guest_address; + +#ifdef ENABLE_MMAP_FASTMEM + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::MMap) + { + if (static_cast(fault_address) < static_cast(g_state.fastmem_base) || + (static_cast(fault_address) - static_cast(g_state.fastmem_base)) >= + static_cast(Bus::FASTMEM_ARENA_SIZE)) + { + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + } + + guest_address = static_cast( + static_cast(static_cast(fault_address) - static_cast(g_state.fastmem_base))); + } + else +#endif + { + // LUT fastmem - we can't compute the address. + guest_address = std::numeric_limits::max(); + } + + Log_DevFmt("Page fault handler invoked at PC={} Address={} {}, fastmem offset {:08X}", exception_pc, fault_address, + is_write ? "(write)" : "(read)", guest_address); + + auto iter = s_fastmem_backpatch_info.find(exception_pc); + if (iter == s_fastmem_backpatch_info.end()) + { + Log_ErrorFmt("No backpatch info found for {}", exception_pc); + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + } + + // if we're writing to ram, let it go through a few times, and use manual block protection to sort it out + // TODO: path for manual protection to return back to read-only pages + LoadstoreBackpatchInfo& info = iter->second; + if (is_write && !g_state.cop0_regs.sr.Isc && AddressInRAM(guest_address)) + { + Log_DevFmt("Ignoring fault due to RAM write @ 0x{:08X}", guest_address); + InvalidateBlocksWithPageIndex(Bus::GetRAMCodePageIndex(guest_address)); + return Common::PageFaultHandler::HandlerResult::ContinueExecution; + } + + Log_DevFmt("Backpatching {} at {}[{}] (pc {:08X} addr {:08X}): Bitmask {:08X} Addr {} Data {} Size {} Signed {:02X}", + info.is_load ? 
"load" : "store", exception_pc, info.code_size, info.guest_pc, guest_address, + info.gpr_bitmask, static_cast(info.address_register), static_cast(info.data_register), + info.AccessSizeInBytes(), static_cast(info.is_signed)); + + BackpatchLoadStore(exception_pc, info); + + // TODO: queue block for recompilation later + + // and store the pc in the faulting list, so that we don't emit another fastmem loadstore + s_fastmem_faulting_pcs.insert(info.guest_pc); + s_fastmem_backpatch_info.erase(iter); + return Common::PageFaultHandler::HandlerResult::ContinueExecution; +} + +void CPU::CodeCache::BackpatchLoadStore(void* host_pc, const LoadstoreBackpatchInfo& info) +{ + s_code_buffer.WriteProtect(false); + +#ifdef ENABLE_RECOMPILER + if (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler) + Recompiler::CodeGenerator::BackpatchLoadStore(host_pc, info); +#endif + + s_code_buffer.WriteProtect(true); +} + +#ifdef ENABLE_RECOMPILER + void CPU::Recompiler::Thunks::LogPC(u32 pc) { -#if 1 +#if 0 + const u32 cyc = TimingEvents::GetGlobalTickCounter() + GetPendingTicks(); + s_last_cyc = cyc; + if (s_last_cyc == 3302138733) + __debugbreak(); +#endif +#if 0 CPU::CodeCache::LogCurrentState(); #endif #if 0 - if (TimingEvents::GetGlobalTickCounter() + GetPendingTicks() == 382856482) + if (TimingEvents::GetGlobalTickCounter() + GetPendingTicks() == 181991709) __debugbreak(); #endif } #endif // ENABLE_RECOMPILER + +#endif // ENABLE_RECOMPILER_SUPPORT diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h index 6411253a0..126243afa 100644 --- a/src/core/cpu_code_cache.h +++ b/src/core/cpu_code_cache.h @@ -1,160 +1,42 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #pragma once + #include "bus.h" -#include "common/bitfield.h" #include "cpu_types.h" -#include "util/jit_code_buffer.h" -#include "util/page_fault_handler.h" -#include -#include -#include -#include -#include -#ifdef ENABLE_RECOMPILER -#include "cpu_recompiler_types.h" -#endif +namespace CPU::CodeCache { -namespace CPU { +/// Returns true if any recompiler is in use. +bool IsUsingAnyRecompiler(); -union CodeBlockKey -{ - u32 bits; +/// Returns true if any recompiler and fastmem is in use. +bool IsUsingFastmem(); - BitField user_mode; - BitField aligned_pc; +/// Allocates resources, call once at startup. 
+void ProcessStartup(); - ALWAYS_INLINE u32 GetPC() const { return aligned_pc << 2; } - ALWAYS_INLINE void SetPC(u32 pc) { aligned_pc = pc >> 2; } - - ALWAYS_INLINE u32 GetPCPhysicalAddress() const { return (aligned_pc << 2) & PHYSICAL_MEMORY_ADDRESS_MASK; } - - ALWAYS_INLINE CodeBlockKey() = default; - - ALWAYS_INLINE CodeBlockKey(const CodeBlockKey& rhs) : bits(rhs.bits) {} - - ALWAYS_INLINE CodeBlockKey& operator=(const CodeBlockKey& rhs) - { - bits = rhs.bits; - return *this; - } - - ALWAYS_INLINE bool operator==(const CodeBlockKey& rhs) const { return bits == rhs.bits; } - ALWAYS_INLINE bool operator!=(const CodeBlockKey& rhs) const { return bits != rhs.bits; } - ALWAYS_INLINE bool operator<(const CodeBlockKey& rhs) const { return bits < rhs.bits; } -}; - -struct CodeBlockInstruction -{ - Instruction instruction; - u32 pc; - - bool is_branch_instruction : 1; - bool is_direct_branch_instruction : 1; - bool is_unconditional_branch_instruction : 1; - bool is_branch_delay_slot : 1; - bool is_load_instruction : 1; - bool is_store_instruction : 1; - bool is_load_delay_slot : 1; - bool is_last_instruction : 1; - bool has_load_delay : 1; - bool can_trap : 1; -}; - -struct CodeBlock -{ - using HostCodePointer = void (*)(); - - struct LinkInfo - { - CodeBlock* block; - void* host_pc; - void* host_resolve_pc; - u32 host_pc_size; - }; - - CodeBlock(const CodeBlockKey key_) : key(key_) {} - - CodeBlockKey key; - u32 host_code_size = 0; - HostCodePointer host_code = nullptr; - - std::vector instructions; - std::vector link_predecessors; - std::vector link_successors; - - TickCount uncached_fetch_ticks = 0; - u32 icache_line_count = 0; - -#ifdef ENABLE_RECOMPILER - std::vector loadstore_backpatch_info; -#endif - - bool contains_loadstore_instructions = false; - bool contains_double_branches = false; - bool invalidated = false; - bool can_link = true; - - u32 recompile_frame_number = 0; - u32 recompile_count = 0; - u32 invalidate_frame_number = 0; - - u32 GetPC() const { return key.GetPC(); } - u32 GetSizeInBytes() const { return static_cast(instructions.size()) * sizeof(Instruction); } - u32 GetStartPageIndex() const { return (key.GetPCPhysicalAddress() / HOST_PAGE_SIZE); } - u32 GetEndPageIndex() const { return ((key.GetPCPhysicalAddress() + GetSizeInBytes()) / HOST_PAGE_SIZE); } - bool IsInRAM() const - { - // TODO: Constant - return key.GetPCPhysicalAddress() < 0x200000; - } -}; - -namespace CodeCache { - -enum : u32 -{ - FAST_MAP_TABLE_COUNT = 0x10000, - FAST_MAP_TABLE_SIZE = 0x10000 / 4, // 16384 - FAST_MAP_TABLE_SHIFT = 16, -}; - -using FastMapTable = CodeBlock::HostCodePointer*; +/// Frees resources, call once at shutdown. +void ProcessShutdown(); +/// Initializes resources for the system. void Initialize(); + +/// Frees resources used by the system. void Shutdown(); + +/// Runs the system. [[noreturn]] void Execute(); -#ifdef ENABLE_RECOMPILER -using DispatcherFunction = void (*)(); -using SingleBlockDispatcherFunction = void (*)(const CodeBlock::HostCodePointer); - -FastMapTable* GetFastMapPointer(); -#endif - -#if defined(ENABLE_RECOMPILER) -JitCodeBuffer& GetCodeBuffer(); -#endif - /// Flushes the code cache, forcing all blocks to be recompiled. -void Flush(); - -/// Changes whether the recompiler is enabled. -void Reinitialize(); +void Reset(); /// Invalidates all blocks which are in the range of the specified code page. void InvalidateBlocksWithPageIndex(u32 page_index); /// Invalidates all blocks in the cache. 
-void InvalidateAll(); - -template -void InterpretCachedBlock(const CodeBlock& block); - -template -void InterpretUncachedBlock(); +void InvalidateAllRAMBlocks(); /// Invalidates any code pages which overlap the specified range. ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_count) @@ -168,6 +50,4 @@ ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_c } } -}; // namespace CodeCache - -} // namespace CPU +} // namespace CPU::CodeCache diff --git a/src/core/cpu_code_cache_private.h b/src/core/cpu_code_cache_private.h new file mode 100644 index 000000000..f1392e0d8 --- /dev/null +++ b/src/core/cpu_code_cache_private.h @@ -0,0 +1,279 @@ +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once + +#include "bus.h" +#include "common/bitfield.h" +#include "common/perf_scope.h" +#include "cpu_code_cache.h" +#include "cpu_core_private.h" +#include "cpu_types.h" + +#include "util/jit_code_buffer.h" +#include "util/page_fault_handler.h" + +#include +#include +#include +#include +#include + +#ifdef ENABLE_RECOMPILER +// #include "cpu_recompiler_types.h" +#endif + +namespace CPU::CodeCache { + +enum : u32 +{ + LUT_TABLE_COUNT = 0x10000, + LUT_TABLE_SIZE = 0x10000 / sizeof(u32), // 16384, one for each PC + LUT_TABLE_SHIFT = 16, + + MAX_BLOCK_EXIT_LINKS = 2, +}; + +using CodeLUT = const void**; +using CodeLUTArray = std::array; +using BlockLinkMap = std::unordered_multimap; // TODO: try ordered? + +enum RegInfoFlags : u8 +{ + RI_LIVE = (1 << 0), + RI_USED = (1 << 1), + RI_LASTUSE = (1 << 2), +}; + +struct InstructionInfo +{ + u32 pc; // TODO: Remove this, old recs still depend on it. + + bool is_branch_instruction : 1; + bool is_direct_branch_instruction : 1; + bool is_unconditional_branch_instruction : 1; + bool is_branch_delay_slot : 1; + bool is_load_instruction : 1; + bool is_store_instruction : 1; + bool is_load_delay_slot : 1; + bool is_last_instruction : 1; + bool has_load_delay : 1; + bool can_trap : 1; + + u8 reg_flags[static_cast(Reg::count)]; + // Reg write_reg[3]; + Reg read_reg[3]; + + // If unset, values which are not live will not be written back to memory. + // Tends to break stuff at the moment. + static constexpr bool WRITE_DEAD_VALUES = true; + + /// Returns true if the register is used later in the block, and this isn't the last instruction to use it. + /// In other words, the register is worth keeping in a host register/caching it. + inline bool UsedTest(Reg reg) const { return (reg_flags[static_cast(reg)] & (RI_USED | RI_LASTUSE)) == RI_USED; } + + /// Returns true if the value should be computed/written back. + /// Basically, this means it's either used before it's overwritten, or not overwritten by the end of the block. + inline bool LiveTest(Reg reg) const + { + return WRITE_DEAD_VALUES || ((reg_flags[static_cast(reg)] & RI_LIVE) != 0); + } + + /// Returns true if the register can be renamed into another. + inline bool RenameTest(Reg reg) const { return (reg == Reg::zero || !UsedTest(reg) || !LiveTest(reg)); } + + /// Returns true if this instruction reads this register. 
+ inline bool ReadsReg(Reg reg) const { return (read_reg[0] == reg || read_reg[1] == reg || read_reg[2] == reg); } +}; + +enum class BlockState : u8 +{ + Valid, + Invalidated, + NeedsRecompile, + FallbackToInterpreter +}; + +enum class BlockFlags : u8 +{ + None = 0, + ContainsLoadStoreInstructions = (1 << 0), + SpansPages = (1 << 1), + BranchDelaySpansPages = (1 << 2), +}; +IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(BlockFlags); + +enum class PageProtectionMode : u8 +{ + WriteProtected, + ManualCheck, + Unprotected, +}; + +struct BlockMetadata +{ + TickCount uncached_fetch_ticks; + u32 icache_line_count; + BlockFlags flags; +}; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4324) // C4324: 'CPU::CodeCache::Block': structure was padded due to alignment specifier) +#endif + +struct alignas(16) Block +{ + u32 pc; + u32 size; // in guest instructions + const void* host_code; + + // links to previous/next block within page + Block* next_block_in_page; + + BlockLinkMap::iterator exit_links[MAX_BLOCK_EXIT_LINKS]; + u8 num_exit_links; + + // TODO: Move up so it's part of the same cache line + BlockState state; + BlockFlags flags; + PageProtectionMode protection; + + TickCount uncached_fetch_ticks; + u32 icache_line_count; + + u32 compile_frame; + u8 compile_count; + + // followed by Instruction * size, InstructionRegInfo * size + ALWAYS_INLINE const Instruction* Instructions() const { return reinterpret_cast(this + 1); } + ALWAYS_INLINE Instruction* Instructions() { return reinterpret_cast(this + 1); } + + ALWAYS_INLINE const InstructionInfo* InstructionsInfo() const + { + return reinterpret_cast(Instructions() + size); + } + ALWAYS_INLINE InstructionInfo* InstructionsInfo() + { + return reinterpret_cast(Instructions() + size); + } + + // returns true if the block has a given flag + ALWAYS_INLINE bool HasFlag(BlockFlags flag) const { return ((flags & flag) != BlockFlags::None); } + + // returns the page index for the start of the block + ALWAYS_INLINE u32 StartPageIndex() const { return Bus::GetRAMCodePageIndex(pc); } + + // returns the page index for the last instruction in the block (inclusive) + ALWAYS_INLINE u32 EndPageIndex() const { return Bus::GetRAMCodePageIndex(pc + ((size - 1) * sizeof(Instruction))); } + + // returns true if the block spans multiple pages + ALWAYS_INLINE bool SpansPages() const { return StartPageIndex() != EndPageIndex(); } +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +using BlockLUTArray = std::array; + +struct LoadstoreBackpatchInfo +{ + union + { + struct + { + u32 gpr_bitmask; + u16 cycles; + u16 address_register : 5; + u16 data_register : 5; + u16 size : 2; + u16 is_signed : 1; + u16 is_load : 1; + }; + + const void* thunk_address; // only needed for oldrec + }; + + u32 guest_pc; + u8 code_size; + + MemoryAccessSize AccessSize() const { return static_cast(size); } + u32 AccessSizeInBytes() const { return 1u << size; } +}; +static_assert(sizeof(LoadstoreBackpatchInfo) == 16); + +static inline bool AddressInRAM(VirtualMemoryAddress pc) +{ + return VirtualAddressToPhysical(pc) < Bus::g_ram_size; +} + +struct PageProtectionInfo +{ + Block* first_block_in_page; + Block* last_block_in_page; + + PageProtectionMode mode; + u16 invalidate_count; + u32 invalidate_frame; +}; +static_assert(sizeof(PageProtectionInfo) == (sizeof(Block*) * 2 + 8)); + +template +void InterpretCachedBlock(const Block* block); + +template +void InterpretUncachedBlock(); + +void LogCurrentState(); + +#if defined(ENABLE_RECOMPILER) +#define ENABLE_RECOMPILER_SUPPORT 1 + 
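
(Illustrative aside, not code from this commit.) The Block struct above is a variable-length object: Instructions() and InstructionsInfo() simply index past the header, so a block has to live in one contiguous chunk of sizeof(Block) + size * (sizeof(Instruction) + sizeof(InstructionInfo)) bytes. A rough sketch of such an allocation; the patch's actual CreateBlock is not shown in this hunk, and the function name below is an assumption:

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <new>

// Assumes std::malloc returns storage aligned to alignof(Block) (16), which holds
// for typical 64-bit allocators; a pool allocator would follow the same layout.
Block* AllocateBlockSketch(uint32_t pc, const Instruction* insts, const InstructionInfo* infos, uint32_t size)
{
  const size_t bytes = sizeof(Block) + size * (sizeof(Instruction) + sizeof(InstructionInfo));
  void* storage = std::malloc(bytes);
  if (!storage)
    return nullptr;

  Block* block = new (storage) Block(); // value-initialize the header
  block->pc = pc;
  block->size = size;
  std::memcpy(block->Instructions(), insts, size * sizeof(Instruction));
  std::memcpy(block->InstructionsInfo(), infos, size * sizeof(InstructionInfo));
  return block;
}
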
+#if defined(_DEBUG) || false +// Enable disassembly of host assembly code. +#define ENABLE_HOST_DISASSEMBLY 1 +#endif + +#if false +// Enable profiling of JIT blocks. +#define ENABLE_RECOMPILER_PROFILING 1 +#endif + +JitCodeBuffer& GetCodeBuffer(); +const void* GetInterpretUncachedBlockFunction(); + +void CompileOrRevalidateBlock(u32 start_pc); +void DiscardAndRecompileBlock(u32 start_pc); +const void* CreateBlockLink(Block* from_block, void* code, u32 newpc); + +void AddLoadStoreInfo(void* code_address, u32 code_size, u32 guest_pc, const void* thunk_address); +void AddLoadStoreInfo(void* code_address, u32 code_size, u32 guest_pc, TickCount cycles, u32 gpr_bitmask, + u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed, bool is_load); + +u32 EmitASMFunctions(void* code, u32 code_size); +u32 EmitJump(void* code, const void* dst, bool flush_icache); + +void DisassembleAndLogHostCode(const void* start, u32 size); +u32 GetHostInstructionCount(const void* start, u32 size); + +extern CodeLUTArray g_code_lut; + +extern NORETURN_FUNCTION_POINTER void (*g_enter_recompiler)(); +extern const void* g_compile_or_revalidate_block; +extern const void* g_check_events_and_dispatch; +extern const void* g_run_events_and_dispatch; +extern const void* g_dispatcher; +extern const void* g_block_dispatcher; +extern const void* g_interpret_block; +extern const void* g_discard_and_recompile_block; + +#ifdef ENABLE_RECOMPILER_PROFILING + +extern PerfScope MIPSPerfScope; + +#endif // ENABLE_RECOMPILER_PROFILING + +#endif // ENABLE_RECOMPILER + +} // namespace CPU::CodeCache diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 51ca26ae0..0ef3f48f2 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -7,6 +7,7 @@ #include "common/fastjmp.h" #include "common/file_system.h" #include "common/log.h" +#include "cpu_code_cache_private.h" #include "cpu_core_private.h" #include "cpu_disasm.h" #include "cpu_recompiler_thunks.h" @@ -2262,20 +2263,24 @@ void CPU::SingleStep() } template -void CPU::CodeCache::InterpretCachedBlock(const CodeBlock& block) +void CPU::CodeCache::InterpretCachedBlock(const Block* block) { // set up the state so we've already fetched the instruction - DebugAssert(g_state.pc == block.GetPC()); - g_state.npc = block.GetPC() + 4; + DebugAssert(g_state.pc == block->pc); + g_state.npc = block->pc + 4; - for (const CodeBlockInstruction& cbi : block.instructions) + const Instruction* instruction = block->Instructions(); + const Instruction* end_instruction = instruction + block->size; + const CodeCache::InstructionInfo* info = block->InstructionsInfo(); + + do { g_state.pending_ticks++; // now executing the instruction we previously fetched - g_state.current_instruction.bits = cbi.instruction.bits; - g_state.current_instruction_pc = cbi.pc; - g_state.current_instruction_in_branch_delay_slot = cbi.is_branch_delay_slot; + g_state.current_instruction.bits = instruction->bits; + g_state.current_instruction_pc = info->pc; + g_state.current_instruction_in_branch_delay_slot = info->is_branch_delay_slot; // TODO: let int set it instead g_state.current_instruction_was_branch_taken = g_state.branch_was_taken; g_state.branch_was_taken = false; g_state.exception_raised = false; @@ -2292,15 +2297,18 @@ void CPU::CodeCache::InterpretCachedBlock(const CodeBlock& block) if (g_state.exception_raised) break; - } + + instruction++; + info++; + } while (instruction != end_instruction); // cleanup so the interpreter can kick in if needed 
g_state.next_instruction_is_branch_delay_slot = false; } -template void CPU::CodeCache::InterpretCachedBlock(const CodeBlock& block); -template void CPU::CodeCache::InterpretCachedBlock(const CodeBlock& block); -template void CPU::CodeCache::InterpretCachedBlock(const CodeBlock& block); +template void CPU::CodeCache::InterpretCachedBlock(const Block* block); +template void CPU::CodeCache::InterpretCachedBlock(const Block* block); +template void CPU::CodeCache::InterpretCachedBlock(const Block* block); template void CPU::CodeCache::InterpretUncachedBlock() @@ -2989,6 +2997,8 @@ static void MemoryBreakpoint(MemoryAccessType type, MemoryAccessSize size, Virtu static constexpr const char* types[2] = { "read", "write" }; const u32 cycle = TimingEvents::GetGlobalTickCounter() + CPU::g_state.pending_ticks; + if (cycle == 3301006373) + __debugbreak(); #if 0 static std::FILE* fp = nullptr; diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index ae3c08e1b..5c1a47658 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "cpu_recompiler_code_generator.h" @@ -17,74 +17,83 @@ Log_SetChannel(CPU::Recompiler); namespace CPU::Recompiler { -bool CodeGenerator::CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size) +const void* CodeGenerator::CompileBlock(CodeCache::Block* block, u32* out_host_code_size, u32* out_host_far_code_size) { // TODO: Align code buffer. m_block = block; - m_block_start = block->instructions.data(); - m_block_end = block->instructions.data() + block->instructions.size(); + m_block_start = {block->Instructions(), block->InstructionsInfo()}; + m_block_end = {block->Instructions() + block->size, block->InstructionsInfo() + block->size}; - m_pc = block->GetPC(); + m_pc = block->pc; m_pc_valid = true; - m_fastmem_load_base_in_register = false; - m_fastmem_store_base_in_register = false; - EmitBeginBlock(true); BlockPrologue(); m_current_instruction = m_block_start; - while (m_current_instruction != m_block_end) + while (m_current_instruction.instruction != m_block_end.instruction) { - if (!CompileInstruction(*m_current_instruction)) + if (!CompileInstruction(*m_current_instruction.instruction, *m_current_instruction.info)) { - m_current_instruction = nullptr; - m_block_end = nullptr; - m_block_start = nullptr; + m_current_instruction = {}; + m_block_end = {}; + m_block_start = {}; m_block = nullptr; - return false; + return nullptr; } - m_current_instruction++; + m_current_instruction.instruction++; + m_current_instruction.info++; } if (!m_block_linked) { BlockEpilogue(); - EmitEndBlock(true, true); + + if (block->HasFlag(CodeCache::BlockFlags::SpansPages)) + { + // jump directly to the next block + const Value pc = CalculatePC(); + WriteNewPC(pc, true); + const void* host_target = + CPU::CodeCache::CreateBlockLink(m_block, GetCurrentCodePointer(), static_cast(pc.constant_value)); + EmitBranch(host_target); + EmitEndBlock(true, nullptr); + } + else + { + EmitEndBlock(true, CodeCache::g_check_events_and_dispatch); + } } - FinalizeBlock(out_host_code, out_host_code_size); - Log_ProfilePrintf("JIT block 0x%08X: %zu instructions (%u bytes), %u host bytes", block->GetPC(), - block->instructions.size(), block->GetSizeInBytes(), *out_host_code_size); - + const 
void* code = FinalizeBlock(out_host_code_size, out_host_far_code_size); DebugAssert(m_register_cache.GetUsedHostRegisters() == 0); - m_current_instruction = nullptr; - m_block_end = nullptr; - m_block_start = nullptr; + m_current_instruction = {}; + m_block_end = {}; + m_block_start = {}; m_block = nullptr; - return true; + return code; } -bool CodeGenerator::CompileInstruction(const CodeBlockInstruction& cbi) +bool CodeGenerator::CompileInstruction(Instruction instruction, const CodeCache::InstructionInfo& info) { - if (IsNopInstruction(cbi.instruction)) + if (IsNopInstruction(instruction)) { - InstructionPrologue(cbi, 1); - InstructionEpilogue(cbi); + InstructionPrologue(instruction, info, 1); + InstructionEpilogue(instruction, info); return true; } bool result; - switch (cbi.instruction.op) + switch (instruction.op) { #if 1 case InstructionOp::ori: case InstructionOp::andi: case InstructionOp::xori: - result = Compile_Bitwise(cbi); + result = Compile_Bitwise(instruction, info); break; case InstructionOp::lb: @@ -92,23 +101,23 @@ bool CodeGenerator::CompileInstruction(const CodeBlockInstruction& cbi) case InstructionOp::lh: case InstructionOp::lhu: case InstructionOp::lw: - result = Compile_Load(cbi); + result = Compile_Load(instruction, info); break; case InstructionOp::lwl: case InstructionOp::lwr: - result = Compile_LoadLeftRight(cbi); + result = Compile_LoadLeftRight(instruction, info); break; case InstructionOp::swl: case InstructionOp::swr: - result = Compile_StoreLeftRight(cbi); + result = Compile_StoreLeftRight(instruction, info); break; case InstructionOp::sb: case InstructionOp::sh: case InstructionOp::sw: - result = Compile_Store(cbi); + result = Compile_Store(instruction, info); break; case InstructionOp::j: @@ -118,42 +127,42 @@ bool CodeGenerator::CompileInstruction(const CodeBlockInstruction& cbi) case InstructionOp::bne: case InstructionOp::bgtz: case InstructionOp::blez: - result = Compile_Branch(cbi); + result = Compile_Branch(instruction, info); break; case InstructionOp::addi: case InstructionOp::addiu: - result = Compile_Add(cbi); + result = Compile_Add(instruction, info); break; case InstructionOp::slti: case InstructionOp::sltiu: - result = Compile_SetLess(cbi); + result = Compile_SetLess(instruction, info); break; case InstructionOp::lui: - result = Compile_lui(cbi); + result = Compile_lui(instruction, info); break; case InstructionOp::cop0: - result = Compile_cop0(cbi); + result = Compile_cop0(instruction, info); break; case InstructionOp::cop2: case InstructionOp::lwc2: case InstructionOp::swc2: - result = Compile_cop2(cbi); + result = Compile_cop2(instruction, info); break; case InstructionOp::funct: { - switch (cbi.instruction.r.funct) + switch (instruction.r.funct) { case InstructionFunct::and_: case InstructionFunct::or_: case InstructionFunct::xor_: case InstructionFunct::nor: - result = Compile_Bitwise(cbi); + result = Compile_Bitwise(instruction, info); break; case InstructionFunct::sll: @@ -162,53 +171,53 @@ bool CodeGenerator::CompileInstruction(const CodeBlockInstruction& cbi) case InstructionFunct::sllv: case InstructionFunct::srlv: case InstructionFunct::srav: - result = Compile_Shift(cbi); + result = Compile_Shift(instruction, info); break; case InstructionFunct::mfhi: case InstructionFunct::mflo: case InstructionFunct::mthi: case InstructionFunct::mtlo: - result = Compile_MoveHiLo(cbi); + result = Compile_MoveHiLo(instruction, info); break; case InstructionFunct::add: case InstructionFunct::addu: - result = Compile_Add(cbi); + result = 
Compile_Add(instruction, info); break; case InstructionFunct::sub: case InstructionFunct::subu: - result = Compile_Subtract(cbi); + result = Compile_Subtract(instruction, info); break; case InstructionFunct::mult: case InstructionFunct::multu: - result = Compile_Multiply(cbi); + result = Compile_Multiply(instruction, info); break; case InstructionFunct::div: - result = Compile_SignedDivide(cbi); + result = Compile_SignedDivide(instruction, info); break; case InstructionFunct::divu: - result = Compile_Divide(cbi); + result = Compile_Divide(instruction, info); break; case InstructionFunct::slt: case InstructionFunct::sltu: - result = Compile_SetLess(cbi); + result = Compile_SetLess(instruction, info); break; case InstructionFunct::jr: case InstructionFunct::jalr: case InstructionFunct::syscall: case InstructionFunct::break_: - result = Compile_Branch(cbi); + result = Compile_Branch(instruction, info); break; default: - result = Compile_Fallback(cbi); + result = Compile_Fallback(instruction, info); break; } } @@ -216,7 +225,7 @@ bool CodeGenerator::CompileInstruction(const CodeBlockInstruction& cbi) #endif default: - result = Compile_Fallback(cbi); + result = Compile_Fallback(instruction, info); break; } @@ -917,11 +926,11 @@ Value CodeGenerator::NotValue(const Value& val) return res; } -void CodeGenerator::GenerateExceptionExit(const CodeBlockInstruction& cbi, Exception excode, - Condition condition /* = Condition::Always */) +void CodeGenerator::GenerateExceptionExit(Instruction instruction, const CodeCache::InstructionInfo& info, + Exception excode, Condition condition /* = Condition::Always */) { const Value CAUSE_bits = Value::FromConstantU32( - Cop0Registers::CAUSE::MakeValueForException(excode, cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n)); + Cop0Registers::CAUSE::MakeValueForException(excode, info.is_branch_delay_slot, false, instruction.cop.cop_n)); if (condition == Condition::Always) { @@ -932,7 +941,7 @@ void CodeGenerator::GenerateExceptionExit(const CodeBlockInstruction& cbi, Excep if (excode == Exception::BP) { EmitFunctionCall(nullptr, static_cast(&CPU::RaiseBreakException), CAUSE_bits, - GetCurrentInstructionPC(), Value::FromConstantU32(cbi.instruction.bits)); + GetCurrentInstructionPC(), Value::FromConstantU32(instruction.bits)); } else { @@ -965,6 +974,14 @@ void CodeGenerator::BlockPrologue() { InitSpeculativeRegs(); + if (m_block->protection == CodeCache::PageProtectionMode::ManualCheck) + { + Log_DebugPrintf("Generate manual protection for PC %08X", m_block->pc); + const u8* ram_ptr = Bus::g_ram + VirtualAddressToPhysical(m_block->pc); + const u8* shadow_ptr = reinterpret_cast(m_block->Instructions()); + EmitBlockProtectCheck(ram_ptr, shadow_ptr, m_block->size * sizeof(Instruction)); + } + EmitStoreCPUStructField(offsetof(State, exception_raised), Value::FromConstantU8(0)); if (g_settings.bios_tty_logging) @@ -1004,8 +1021,8 @@ void CodeGenerator::BlockEpilogue() AddPendingCycles(true); } -void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles, - bool force_sync /* = false */) +void CodeGenerator::InstructionPrologue(Instruction instruction, const CodeCache::InstructionInfo& info, + TickCount cycles, bool force_sync /* = false */) { #if defined(_DEBUG) && defined(CPU_ARCH_X64) m_emit->nop(); @@ -1031,7 +1048,7 @@ void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCou m_current_instruction_was_branch_taken_dirty = false; } - if (m_current_instruction_in_branch_delay_slot_dirty && 
!cbi.is_branch_delay_slot) + if (m_current_instruction_in_branch_delay_slot_dirty && !info.is_branch_delay_slot) { EmitStoreCPUStructField(offsetof(State, current_instruction_in_branch_delay_slot), Value::FromConstantU8(0)); m_current_instruction_in_branch_delay_slot_dirty = false; @@ -1044,7 +1061,7 @@ void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCou return; } - if (cbi.is_branch_delay_slot && g_settings.cpu_recompiler_memory_exceptions) + if (info.is_branch_delay_slot && g_settings.cpu_recompiler_memory_exceptions) { // m_current_instruction_in_branch_delay_slot = true EmitStoreCPUStructField(offsetof(State, current_instruction_in_branch_delay_slot), Value::FromConstantU8(1)); @@ -1055,7 +1072,7 @@ void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCou AddPendingCycles(true); } -void CodeGenerator::InstructionEpilogue(const CodeBlockInstruction& cbi) +void CodeGenerator::InstructionEpilogue(Instruction instruction, const CodeCache::InstructionInfo& info) { m_register_cache.UpdateLoadDelay(); @@ -1080,8 +1097,9 @@ void CodeGenerator::InstructionEpilogue(const CodeBlockInstruction& cbi) void CodeGenerator::TruncateBlockAtCurrentInstruction() { - Log_DevPrintf("Truncating block %08X at %08X", m_block->GetPC(), m_current_instruction->pc); - m_block_end = m_current_instruction + 1; + Log_DevPrintf("Truncating block %08X at %08X", m_block->pc, m_current_instruction.info->pc); + m_block_end.instruction = m_current_instruction.instruction + 1; + m_block_end.info = m_current_instruction.info + 1; WriteNewPC(CalculatePC(), true); } @@ -1156,7 +1174,7 @@ Value CodeGenerator::CalculatePC(u32 offset /* = 0 */) Value CodeGenerator::GetCurrentInstructionPC(u32 offset /* = 0 */) { - return Value::FromConstantU32(m_current_instruction->pc); + return Value::FromConstantU32(m_current_instruction.info->pc); } void CodeGenerator::WriteNewPC(const Value& value, bool commit) @@ -1171,9 +1189,9 @@ void CodeGenerator::WriteNewPC(const Value& value, bool commit) } } -bool CodeGenerator::Compile_Fallback(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Fallback(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1, true); + InstructionPrologue(instruction, info, 1, true); // flush and invalidate all guest registers, since the fallback could change any of them m_register_cache.FlushAllGuestRegisters(true, true); @@ -1183,11 +1201,11 @@ bool CodeGenerator::Compile_Fallback(const CodeBlockInstruction& cbi) m_register_cache.WriteLoadDelayToCPU(true); } - EmitStoreCPUStructField(offsetof(State, current_instruction_pc), Value::FromConstantU32(cbi.pc)); - EmitStoreCPUStructField(offsetof(State, current_instruction.bits), Value::FromConstantU32(cbi.instruction.bits)); + EmitStoreCPUStructField(offsetof(State, current_instruction_pc), Value::FromConstantU32(info.pc)); + EmitStoreCPUStructField(offsetof(State, current_instruction.bits), Value::FromConstantU32(instruction.bits)); // emit the function call - if (CanInstructionTrap(cbi.instruction, m_block->key.user_mode)) + if (CanInstructionTrap(instruction, false /*m_block->key.user_mode*/)) { // TODO: Use carry flag or something here too Value return_value = m_register_cache.AllocateScratch(RegSize_8); @@ -1201,19 +1219,18 @@ bool CodeGenerator::Compile_Fallback(const CodeBlockInstruction& cbi) g_settings.gpu_pgxp_enable ? 
&Thunks::InterpretInstructionPGXP : &Thunks::InterpretInstruction); } - m_current_instruction_in_branch_delay_slot_dirty = cbi.is_branch_instruction; - m_branch_was_taken_dirty = cbi.is_branch_instruction; - m_next_load_delay_dirty = cbi.has_load_delay; + m_current_instruction_in_branch_delay_slot_dirty = info.is_branch_instruction; + m_branch_was_taken_dirty = info.is_branch_instruction; + m_next_load_delay_dirty = info.has_load_delay; InvalidateSpeculativeValues(); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_Bitwise(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Bitwise(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - const InstructionOp op = cbi.instruction.op; Value lhs; Value rhs; Reg dest; @@ -1221,33 +1238,33 @@ bool CodeGenerator::Compile_Bitwise(const CodeBlockInstruction& cbi) SpeculativeValue spec_lhs, spec_rhs; SpeculativeValue spec_value; - if (op != InstructionOp::funct) + if (instruction.op != InstructionOp::funct) { // rt <- rs op zext(imm) - lhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs); - rhs = Value::FromConstantU32(cbi.instruction.i.imm_zext32()); - dest = cbi.instruction.i.rt; + lhs = m_register_cache.ReadGuestRegister(instruction.i.rs); + rhs = Value::FromConstantU32(instruction.i.imm_zext32()); + dest = instruction.i.rt; - spec_lhs = SpeculativeReadReg(cbi.instruction.i.rs); - spec_rhs = cbi.instruction.i.imm_zext32(); + spec_lhs = SpeculativeReadReg(instruction.i.rs); + spec_rhs = instruction.i.imm_zext32(); } else { - lhs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); - rhs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); - dest = cbi.instruction.r.rd; + lhs = m_register_cache.ReadGuestRegister(instruction.r.rs); + rhs = m_register_cache.ReadGuestRegister(instruction.r.rt); + dest = instruction.r.rd; - spec_lhs = SpeculativeReadReg(cbi.instruction.r.rs); - spec_rhs = SpeculativeReadReg(cbi.instruction.r.rt); + spec_lhs = SpeculativeReadReg(instruction.r.rs); + spec_rhs = SpeculativeReadReg(instruction.r.rt); } Value result; - switch (cbi.instruction.op) + switch (instruction.op) { case InstructionOp::ori: { if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_ORI, Value::FromConstantU32(cbi.instruction.bits), lhs); + EmitFunctionCall(nullptr, &PGXP::CPU_ORI, Value::FromConstantU32(instruction.bits), lhs); result = OrValues(lhs, rhs); if (spec_lhs && spec_rhs) @@ -1258,7 +1275,7 @@ bool CodeGenerator::Compile_Bitwise(const CodeBlockInstruction& cbi) case InstructionOp::andi: { if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_ANDI, Value::FromConstantU32(cbi.instruction.bits), lhs); + EmitFunctionCall(nullptr, &PGXP::CPU_ANDI, Value::FromConstantU32(instruction.bits), lhs); result = AndValues(lhs, rhs); if (spec_lhs && spec_rhs) @@ -1269,7 +1286,7 @@ bool CodeGenerator::Compile_Bitwise(const CodeBlockInstruction& cbi) case InstructionOp::xori: { if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_XORI, Value::FromConstantU32(cbi.instruction.bits), lhs); + EmitFunctionCall(nullptr, &PGXP::CPU_XORI, Value::FromConstantU32(instruction.bits), lhs); result = XorValues(lhs, rhs); if (spec_lhs && spec_rhs) @@ -1279,12 +1296,12 @@ bool CodeGenerator::Compile_Bitwise(const CodeBlockInstruction& cbi) case InstructionOp::funct: { - switch (cbi.instruction.r.funct) + switch 
(instruction.r.funct) { case InstructionFunct::or_: { if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_OR_, Value::FromConstantU32(cbi.instruction.bits), lhs, rhs); + EmitFunctionCall(nullptr, &PGXP::CPU_OR_, Value::FromConstantU32(instruction.bits), lhs, rhs); result = OrValues(lhs, rhs); if (spec_lhs && spec_rhs) @@ -1295,7 +1312,7 @@ bool CodeGenerator::Compile_Bitwise(const CodeBlockInstruction& cbi) case InstructionFunct::and_: { if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_AND_, Value::FromConstantU32(cbi.instruction.bits), lhs, rhs); + EmitFunctionCall(nullptr, &PGXP::CPU_AND_, Value::FromConstantU32(instruction.bits), lhs, rhs); result = AndValues(lhs, rhs); if (spec_lhs && spec_rhs) @@ -1306,7 +1323,7 @@ bool CodeGenerator::Compile_Bitwise(const CodeBlockInstruction& cbi) case InstructionFunct::xor_: { if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_XOR_, Value::FromConstantU32(cbi.instruction.bits), lhs, rhs); + EmitFunctionCall(nullptr, &PGXP::CPU_XOR_, Value::FromConstantU32(instruction.bits), lhs, rhs); result = XorValues(lhs, rhs); if (spec_lhs && spec_rhs) @@ -1317,7 +1334,7 @@ bool CodeGenerator::Compile_Bitwise(const CodeBlockInstruction& cbi) case InstructionFunct::nor: { if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_NOR, Value::FromConstantU32(cbi.instruction.bits), lhs, rhs); + EmitFunctionCall(nullptr, &PGXP::CPU_NOR, Value::FromConstantU32(instruction.bits), lhs, rhs); result = NotValue(OrValues(lhs, rhs)); if (spec_lhs && spec_rhs) @@ -1340,45 +1357,45 @@ bool CodeGenerator::Compile_Bitwise(const CodeBlockInstruction& cbi) m_register_cache.WriteGuestRegister(dest, std::move(result)); SpeculativeWriteReg(dest, spec_value); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_Shift(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Shift(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - const InstructionFunct funct = cbi.instruction.r.funct; - Value rt = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); - SpeculativeValue rt_spec = SpeculativeReadReg(cbi.instruction.r.rt); + const InstructionFunct funct = instruction.r.funct; + Value rt = m_register_cache.ReadGuestRegister(instruction.r.rt); + SpeculativeValue rt_spec = SpeculativeReadReg(instruction.r.rt); Value shamt; SpeculativeValue shamt_spec; if (funct == InstructionFunct::sll || funct == InstructionFunct::srl || funct == InstructionFunct::sra) { // rd <- rt op shamt - shamt = Value::FromConstantU32(cbi.instruction.r.shamt); - shamt_spec = cbi.instruction.r.shamt; + shamt = Value::FromConstantU32(instruction.r.shamt); + shamt_spec = instruction.r.shamt; } else { // rd <- rt op (rs & 0x1F) - shamt = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); - shamt_spec = SpeculativeReadReg(cbi.instruction.r.rs); + shamt = m_register_cache.ReadGuestRegister(instruction.r.rs); + shamt_spec = SpeculativeReadReg(instruction.r.rs); } Value result; SpeculativeValue result_spec; - switch (cbi.instruction.r.funct) + switch (instruction.r.funct) { case InstructionFunct::sll: case InstructionFunct::sllv: { if (g_settings.UsingPGXPCPUMode()) { - if (cbi.instruction.r.funct == InstructionFunct::sll) - EmitFunctionCall(nullptr, &PGXP::CPU_SLL, Value::FromConstantU32(cbi.instruction.bits), rt); - else // if (cbi.instruction.r.funct == 
InstructionFunct::sllv) - EmitFunctionCall(nullptr, &PGXP::CPU_SLLV, Value::FromConstantU32(cbi.instruction.bits), rt, shamt); + if (instruction.r.funct == InstructionFunct::sll) + EmitFunctionCall(nullptr, &PGXP::CPU_SLL, Value::FromConstantU32(instruction.bits), rt); + else // if (instruction.r.funct == InstructionFunct::sllv) + EmitFunctionCall(nullptr, &PGXP::CPU_SLLV, Value::FromConstantU32(instruction.bits), rt, shamt); } result = ShlValues(rt, shamt, false); @@ -1392,10 +1409,10 @@ bool CodeGenerator::Compile_Shift(const CodeBlockInstruction& cbi) { if (g_settings.UsingPGXPCPUMode()) { - if (cbi.instruction.r.funct == InstructionFunct::srl) - EmitFunctionCall(nullptr, &PGXP::CPU_SRL, Value::FromConstantU32(cbi.instruction.bits), rt); - else // if (cbi.instruction.r.funct == InstructionFunct::srlv) - EmitFunctionCall(nullptr, &PGXP::CPU_SRLV, Value::FromConstantU32(cbi.instruction.bits), rt, shamt); + if (instruction.r.funct == InstructionFunct::srl) + EmitFunctionCall(nullptr, &PGXP::CPU_SRL, Value::FromConstantU32(instruction.bits), rt); + else // if (instruction.r.funct == InstructionFunct::srlv) + EmitFunctionCall(nullptr, &PGXP::CPU_SRLV, Value::FromConstantU32(instruction.bits), rt, shamt); } result = ShrValues(rt, shamt, false); @@ -1409,10 +1426,10 @@ bool CodeGenerator::Compile_Shift(const CodeBlockInstruction& cbi) { if (g_settings.UsingPGXPCPUMode()) { - if (cbi.instruction.r.funct == InstructionFunct::sra) - EmitFunctionCall(nullptr, &PGXP::CPU_SRA, Value::FromConstantU32(cbi.instruction.bits), rt); - else // if (cbi.instruction.r.funct == InstructionFunct::srav) - EmitFunctionCall(nullptr, &PGXP::CPU_SRAV, Value::FromConstantU32(cbi.instruction.bits), rt, shamt); + if (instruction.r.funct == InstructionFunct::sra) + EmitFunctionCall(nullptr, &PGXP::CPU_SRA, Value::FromConstantU32(instruction.bits), rt); + else // if (instruction.r.funct == InstructionFunct::srav) + EmitFunctionCall(nullptr, &PGXP::CPU_SRAV, Value::FromConstantU32(instruction.bits), rt, shamt); } result = SarValues(rt, shamt, false); @@ -1426,37 +1443,37 @@ bool CodeGenerator::Compile_Shift(const CodeBlockInstruction& cbi) break; } - m_register_cache.WriteGuestRegister(cbi.instruction.r.rd, std::move(result)); - SpeculativeWriteReg(cbi.instruction.r.rd, result_spec); + m_register_cache.WriteGuestRegister(instruction.r.rd, std::move(result)); + SpeculativeWriteReg(instruction.r.rd, result_spec); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_Load(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Load(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); // rt <- mem[rs + sext(imm)] - Value base = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs); - Value offset = Value::FromConstantU32(cbi.instruction.i.imm_sext32()); + Value base = m_register_cache.ReadGuestRegister(instruction.i.rs); + Value offset = Value::FromConstantU32(instruction.i.imm_sext32()); Value address = AddValues(base, offset, false); - SpeculativeValue address_spec = SpeculativeReadReg(cbi.instruction.i.rs); + SpeculativeValue address_spec = SpeculativeReadReg(instruction.i.rs); SpeculativeValue value_spec; if (address_spec) - address_spec = *address_spec + cbi.instruction.i.imm_sext32(); + address_spec = *address_spec + instruction.i.imm_sext32(); Value result; - switch (cbi.instruction.op) + switch (instruction.op) { case InstructionOp::lb: case InstructionOp::lbu: 
{ - result = EmitLoadGuestMemory(cbi, address, address_spec, RegSize_8); - ConvertValueSizeInPlace(&result, RegSize_32, (cbi.instruction.op == InstructionOp::lb)); + result = EmitLoadGuestMemory(instruction, info, address, address_spec, RegSize_8); + ConvertValueSizeInPlace(&result, RegSize_32, (instruction.op == InstructionOp::lb)); if (g_settings.gpu_pgxp_enable) - EmitFunctionCall(nullptr, PGXP::CPU_LBx, Value::FromConstantU32(cbi.instruction.bits), address, result); + EmitFunctionCall(nullptr, PGXP::CPU_LBx, Value::FromConstantU32(instruction.bits), address, result); if (address_spec) { @@ -1470,13 +1487,13 @@ bool CodeGenerator::Compile_Load(const CodeBlockInstruction& cbi) case InstructionOp::lh: case InstructionOp::lhu: { - result = EmitLoadGuestMemory(cbi, address, address_spec, RegSize_16); - ConvertValueSizeInPlace(&result, RegSize_32, (cbi.instruction.op == InstructionOp::lh)); + result = EmitLoadGuestMemory(instruction, info, address, address_spec, RegSize_16); + ConvertValueSizeInPlace(&result, RegSize_32, (instruction.op == InstructionOp::lh)); if (g_settings.gpu_pgxp_enable) { - EmitFunctionCall(nullptr, (cbi.instruction.op == InstructionOp::lhu) ? &PGXP::CPU_LHU : PGXP::CPU_LH, - Value::FromConstantU32(cbi.instruction.bits), address, result); + EmitFunctionCall(nullptr, (instruction.op == InstructionOp::lhu) ? &PGXP::CPU_LHU : PGXP::CPU_LH, + Value::FromConstantU32(instruction.bits), address, result); } if (address_spec) @@ -1490,9 +1507,9 @@ bool CodeGenerator::Compile_Load(const CodeBlockInstruction& cbi) case InstructionOp::lw: { - result = EmitLoadGuestMemory(cbi, address, address_spec, RegSize_32); + result = EmitLoadGuestMemory(instruction, info, address, address_spec, RegSize_32); if (g_settings.gpu_pgxp_enable) - EmitFunctionCall(nullptr, PGXP::CPU_LW, Value::FromConstantU32(cbi.instruction.bits), address, result); + EmitFunctionCall(nullptr, PGXP::CPU_LW, Value::FromConstantU32(instruction.bits), address, result); if (address_spec) value_spec = SpeculativeReadMemory(*address_spec); @@ -1504,36 +1521,36 @@ bool CodeGenerator::Compile_Load(const CodeBlockInstruction& cbi) break; } - m_register_cache.WriteGuestRegisterDelayed(cbi.instruction.i.rt, std::move(result)); - SpeculativeWriteReg(cbi.instruction.i.rt, value_spec); + m_register_cache.WriteGuestRegisterDelayed(instruction.i.rt, std::move(result)); + SpeculativeWriteReg(instruction.i.rt, value_spec); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_Store(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Store(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); // mem[rs + sext(imm)] <- rt - Value base = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs); - Value offset = Value::FromConstantU32(cbi.instruction.i.imm_sext32()); + Value base = m_register_cache.ReadGuestRegister(instruction.i.rs); + Value offset = Value::FromConstantU32(instruction.i.imm_sext32()); Value address = AddValues(base, offset, false); - Value value = m_register_cache.ReadGuestRegister(cbi.instruction.i.rt); + Value value = m_register_cache.ReadGuestRegister(instruction.i.rt); - SpeculativeValue address_spec = SpeculativeReadReg(cbi.instruction.i.rs); - SpeculativeValue value_spec = SpeculativeReadReg(cbi.instruction.i.rt); + SpeculativeValue address_spec = SpeculativeReadReg(instruction.i.rs); + SpeculativeValue value_spec = SpeculativeReadReg(instruction.i.rt); if 
(address_spec) - address_spec = *address_spec + cbi.instruction.i.imm_sext32(); + address_spec = *address_spec + instruction.i.imm_sext32(); - switch (cbi.instruction.op) + switch (instruction.op) { case InstructionOp::sb: { if (g_settings.gpu_pgxp_enable) - EmitFunctionCall(nullptr, PGXP::CPU_SB, Value::FromConstantU32(cbi.instruction.bits), address, value); + EmitFunctionCall(nullptr, PGXP::CPU_SB, Value::FromConstantU32(instruction.bits), address, value); - EmitStoreGuestMemory(cbi, address, address_spec, RegSize_8, value); + EmitStoreGuestMemory(instruction, info, address, address_spec, RegSize_8, value); if (address_spec) { @@ -1559,9 +1576,9 @@ bool CodeGenerator::Compile_Store(const CodeBlockInstruction& cbi) case InstructionOp::sh: { if (g_settings.gpu_pgxp_enable) - EmitFunctionCall(nullptr, PGXP::CPU_SH, Value::FromConstantU32(cbi.instruction.bits), address, value); + EmitFunctionCall(nullptr, PGXP::CPU_SH, Value::FromConstantU32(instruction.bits), address, value); - EmitStoreGuestMemory(cbi, address, address_spec, RegSize_16, value); + EmitStoreGuestMemory(instruction, info, address, address_spec, RegSize_16, value); if (address_spec) { @@ -1587,9 +1604,9 @@ bool CodeGenerator::Compile_Store(const CodeBlockInstruction& cbi) case InstructionOp::sw: { if (g_settings.gpu_pgxp_enable) - EmitFunctionCall(nullptr, PGXP::CPU_SW, Value::FromConstantU32(cbi.instruction.bits), address, value); + EmitFunctionCall(nullptr, PGXP::CPU_SW, Value::FromConstantU32(instruction.bits), address, value); - EmitStoreGuestMemory(cbi, address, address_spec, RegSize_32, value); + EmitStoreGuestMemory(instruction, info, address, address_spec, RegSize_32, value); if (address_spec) SpeculativeWriteMemory(*address_spec, value_spec); @@ -1601,7 +1618,7 @@ bool CodeGenerator::Compile_Store(const CodeBlockInstruction& cbi) break; } - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); if (address_spec) { @@ -1609,13 +1626,13 @@ bool CodeGenerator::Compile_Store(const CodeBlockInstruction& cbi) if (seg == Segment::KUSEG || seg == Segment::KSEG0 || seg == Segment::KSEG1) { const PhysicalMemoryAddress phys_addr = VirtualAddressToPhysical(*address_spec); - const PhysicalMemoryAddress block_start = VirtualAddressToPhysical(m_block->GetPC()); - const PhysicalMemoryAddress block_end = VirtualAddressToPhysical( - m_block->GetPC() + static_cast(m_block->instructions.size()) * sizeof(Instruction)); + const PhysicalMemoryAddress block_start = VirtualAddressToPhysical(m_block->pc); + const PhysicalMemoryAddress block_end = + VirtualAddressToPhysical(m_block->pc + (m_block->size * sizeof(Instruction))); if (phys_addr >= block_start && phys_addr < block_end) { Log_WarningPrintf("Instruction %08X speculatively writes to %08X inside block %08X-%08X. 
Truncating block.", - cbi.pc, phys_addr, block_start, block_end); + info.pc, phys_addr, block_start, block_end); TruncateBlockAtCurrentInstruction(); } } @@ -1624,25 +1641,25 @@ bool CodeGenerator::Compile_Store(const CodeBlockInstruction& cbi) return true; } -bool CodeGenerator::Compile_LoadLeftRight(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_LoadLeftRight(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - Value base = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs); - Value offset = Value::FromConstantU32(cbi.instruction.i.imm_sext32()); + Value base = m_register_cache.ReadGuestRegister(instruction.i.rs); + Value offset = Value::FromConstantU32(instruction.i.imm_sext32()); Value address = AddValues(base, offset, false); base.ReleaseAndClear(); - SpeculativeValue address_spec = SpeculativeReadReg(cbi.instruction.i.rs); + SpeculativeValue address_spec = SpeculativeReadReg(instruction.i.rs); if (address_spec) - address_spec = *address_spec + cbi.instruction.i.imm_sext32(); + address_spec = *address_spec + instruction.i.imm_sext32(); Value shift = ShlValues(AndValues(address, Value::FromConstantU32(3)), Value::FromConstantU32(3)); // * 8 address = AndValues(address, Value::FromConstantU32(~u32(3))); // hack to bypass load delays Value value; - if (cbi.instruction.i.rt == m_register_cache.GetLoadDelayRegister()) + if (instruction.i.rt == m_register_cache.GetLoadDelayRegister()) { const Value& ld_value = m_register_cache.GetLoadDelayValue(); if (ld_value.IsInHostRegister()) @@ -1656,24 +1673,24 @@ bool CodeGenerator::Compile_LoadLeftRight(const CodeBlockInstruction& cbi) // we don't actually care if it's our target reg or not, if it's not, it won't affect anything if (m_load_delay_dirty) { - Log_DevPrintf("Flushing interpreter load delay for lwl/lwr instruction at 0x%08X", cbi.pc); + Log_DevPrintf("Flushing interpreter load delay for lwl/lwr instruction at 0x%08X", info.pc); EmitFlushInterpreterLoadDelay(); - m_register_cache.InvalidateGuestRegister(cbi.instruction.r.rt); + m_register_cache.InvalidateGuestRegister(instruction.r.rt); m_load_delay_dirty = false; } - value = m_register_cache.ReadGuestRegister(cbi.instruction.i.rt, true, true); + value = m_register_cache.ReadGuestRegister(instruction.i.rt, true, true); } Value mem; - if (cbi.instruction.op == InstructionOp::lwl) + if (instruction.op == InstructionOp::lwl) { Value lhs = ShrValues(Value::FromConstantU32(0x00FFFFFF), shift); AndValueInPlace(lhs, value); shift = SubValues(Value::FromConstantU32(24), shift, false); value.ReleaseAndClear(); - mem = EmitLoadGuestMemory(cbi, address, address_spec, RegSize_32); + mem = EmitLoadGuestMemory(instruction, info, address, address_spec, RegSize_32); EmitShl(mem.GetHostRegister(), mem.GetHostRegister(), RegSize_32, shift); EmitOr(mem.GetHostRegister(), mem.GetHostRegister(), lhs); } @@ -1683,7 +1700,7 @@ bool CodeGenerator::Compile_LoadLeftRight(const CodeBlockInstruction& cbi) AndValueInPlace(lhs, value); value.ReleaseAndClear(); - mem = EmitLoadGuestMemory(cbi, address, address_spec, RegSize_32); + mem = EmitLoadGuestMemory(instruction, info, address, address_spec, RegSize_32); EmitShr(mem.GetHostRegister(), mem.GetHostRegister(), RegSize_32, shift); EmitOr(mem.GetHostRegister(), mem.GetHostRegister(), lhs); } @@ -1691,31 +1708,31 @@ bool CodeGenerator::Compile_LoadLeftRight(const CodeBlockInstruction& cbi) shift.ReleaseAndClear(); if (g_settings.gpu_pgxp_enable) - 
EmitFunctionCall(nullptr, PGXP::CPU_LW, Value::FromConstantU32(cbi.instruction.bits), address, mem); + EmitFunctionCall(nullptr, PGXP::CPU_LW, Value::FromConstantU32(instruction.bits), address, mem); - m_register_cache.WriteGuestRegisterDelayed(cbi.instruction.i.rt, std::move(mem)); + m_register_cache.WriteGuestRegisterDelayed(instruction.i.rt, std::move(mem)); // TODO: Speculative values - SpeculativeWriteReg(cbi.instruction.r.rt, std::nullopt); + SpeculativeWriteReg(instruction.r.rt, std::nullopt); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_StoreLeftRight(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_StoreLeftRight(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - Value base = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs); - Value offset = Value::FromConstantU32(cbi.instruction.i.imm_sext32()); + Value base = m_register_cache.ReadGuestRegister(instruction.i.rs); + Value offset = Value::FromConstantU32(instruction.i.imm_sext32()); Value address = AddValues(base, offset, false); base.ReleaseAndClear(); // TODO: Speculative values - SpeculativeValue address_spec = SpeculativeReadReg(cbi.instruction.i.rs); + SpeculativeValue address_spec = SpeculativeReadReg(instruction.i.rs); if (address_spec) { - address_spec = *address_spec + cbi.instruction.i.imm_sext32(); + address_spec = *address_spec + instruction.i.imm_sext32(); SpeculativeWriteMemory(*address_spec & ~3u, std::nullopt); } @@ -1723,14 +1740,14 @@ bool CodeGenerator::Compile_StoreLeftRight(const CodeBlockInstruction& cbi) address = AndValues(address, Value::FromConstantU32(~u32(3))); Value mem; - if (cbi.instruction.op == InstructionOp::swl) + if (instruction.op == InstructionOp::swl) { Value mask = ShlValues(Value::FromConstantU32(0xFFFFFF00), shift); - mem = EmitLoadGuestMemory(cbi, address, address_spec, RegSize_32); + mem = EmitLoadGuestMemory(instruction, info, address, address_spec, RegSize_32); EmitAnd(mem.GetHostRegister(), mem.GetHostRegister(), mask); mask.ReleaseAndClear(); - Value reg = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); + Value reg = m_register_cache.ReadGuestRegister(instruction.r.rt); Value lhs = ShrValues(reg, SubValues(Value::FromConstantU32(24), shift, false)); reg.ReleaseAndClear(); @@ -1739,11 +1756,11 @@ bool CodeGenerator::Compile_StoreLeftRight(const CodeBlockInstruction& cbi) else { Value mask = ShrValues(Value::FromConstantU32(0x00FFFFFF), SubValues(Value::FromConstantU32(24), shift, false)); - mem = EmitLoadGuestMemory(cbi, address, address_spec, RegSize_32); + mem = EmitLoadGuestMemory(instruction, info, address, address_spec, RegSize_32); AndValueInPlace(mem, mask); mask.ReleaseAndClear(); - Value reg = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); + Value reg = m_register_cache.ReadGuestRegister(instruction.r.rt); Value lhs = ShlValues(reg, shift); reg.ReleaseAndClear(); @@ -1752,36 +1769,36 @@ bool CodeGenerator::Compile_StoreLeftRight(const CodeBlockInstruction& cbi) shift.ReleaseAndClear(); - EmitStoreGuestMemory(cbi, address, address_spec, RegSize_32, mem); + EmitStoreGuestMemory(instruction, info, address, address_spec, RegSize_32, mem); if (g_settings.gpu_pgxp_enable) - EmitFunctionCall(nullptr, PGXP::CPU_SW, Value::FromConstantU32(cbi.instruction.bits), address, mem); + EmitFunctionCall(nullptr, PGXP::CPU_SW, Value::FromConstantU32(instruction.bits), address, mem); - 
InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_MoveHiLo(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_MoveHiLo(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - switch (cbi.instruction.r.funct) + switch (instruction.r.funct) { case InstructionFunct::mfhi: { Value hi = m_register_cache.ReadGuestRegister(Reg::hi); if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_MFHI, Value::FromConstantU32(cbi.instruction.bits), hi); + EmitFunctionCall(nullptr, &PGXP::CPU_MFHI, Value::FromConstantU32(instruction.bits), hi); - m_register_cache.WriteGuestRegister(cbi.instruction.r.rd, std::move(hi)); - SpeculativeWriteReg(cbi.instruction.r.rd, std::nullopt); + m_register_cache.WriteGuestRegister(instruction.r.rd, std::move(hi)); + SpeculativeWriteReg(instruction.r.rd, std::nullopt); } break; case InstructionFunct::mthi: { - Value rs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); + Value rs = m_register_cache.ReadGuestRegister(instruction.r.rs); if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_MTHI, Value::FromConstantU32(cbi.instruction.bits), rs); + EmitFunctionCall(nullptr, &PGXP::CPU_MTHI, Value::FromConstantU32(instruction.bits), rs); m_register_cache.WriteGuestRegister(Reg::hi, std::move(rs)); } @@ -1791,18 +1808,18 @@ bool CodeGenerator::Compile_MoveHiLo(const CodeBlockInstruction& cbi) { Value lo = m_register_cache.ReadGuestRegister(Reg::lo); if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_MFLO, Value::FromConstantU32(cbi.instruction.bits), lo); + EmitFunctionCall(nullptr, &PGXP::CPU_MFLO, Value::FromConstantU32(instruction.bits), lo); - m_register_cache.WriteGuestRegister(cbi.instruction.r.rd, std::move(lo)); - SpeculativeWriteReg(cbi.instruction.r.rd, std::nullopt); + m_register_cache.WriteGuestRegister(instruction.r.rd, std::move(lo)); + SpeculativeWriteReg(instruction.r.rd, std::nullopt); } break; case InstructionFunct::mtlo: { - Value rs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); + Value rs = m_register_cache.ReadGuestRegister(instruction.r.rs); if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_MTLO, Value::FromConstantU32(cbi.instruction.bits), rs); + EmitFunctionCall(nullptr, &PGXP::CPU_MTLO, Value::FromConstantU32(instruction.bits), rs); m_register_cache.WriteGuestRegister(Reg::lo, std::move(rs)); } @@ -1813,48 +1830,47 @@ bool CodeGenerator::Compile_MoveHiLo(const CodeBlockInstruction& cbi) break; } - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_Add(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Add(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - const bool check_overflow = - (cbi.instruction.op == InstructionOp::addi || - (cbi.instruction.op == InstructionOp::funct && cbi.instruction.r.funct == InstructionFunct::add)); + const bool check_overflow = (instruction.op == InstructionOp::addi || (instruction.op == InstructionOp::funct && + instruction.r.funct == InstructionFunct::add)); Value lhs, rhs; Reg lhs_src; SpeculativeValue lhs_spec, rhs_spec; Reg dest; - switch (cbi.instruction.op) + switch (instruction.op) { case InstructionOp::addi: case InstructionOp::addiu: { // rt <- rs + sext(imm) - dest = 
cbi.instruction.i.rt; - lhs_src = cbi.instruction.i.rs; - lhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs); - rhs = Value::FromConstantU32(cbi.instruction.i.imm_sext32()); + dest = instruction.i.rt; + lhs_src = instruction.i.rs; + lhs = m_register_cache.ReadGuestRegister(instruction.i.rs); + rhs = Value::FromConstantU32(instruction.i.imm_sext32()); - lhs_spec = SpeculativeReadReg(cbi.instruction.i.rs); - rhs_spec = cbi.instruction.i.imm_sext32(); + lhs_spec = SpeculativeReadReg(instruction.i.rs); + rhs_spec = instruction.i.imm_sext32(); } break; case InstructionOp::funct: { - Assert(cbi.instruction.r.funct == InstructionFunct::add || cbi.instruction.r.funct == InstructionFunct::addu); - dest = cbi.instruction.r.rd; - lhs_src = cbi.instruction.r.rs; - lhs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); - rhs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); - lhs_spec = SpeculativeReadReg(cbi.instruction.r.rs); - rhs_spec = SpeculativeReadReg(cbi.instruction.r.rt); + Assert(instruction.r.funct == InstructionFunct::add || instruction.r.funct == InstructionFunct::addu); + dest = instruction.r.rd; + lhs_src = instruction.r.rs; + lhs = m_register_cache.ReadGuestRegister(instruction.r.rs); + rhs = m_register_cache.ReadGuestRegister(instruction.r.rt); + lhs_spec = SpeculativeReadReg(instruction.r.rs); + rhs_spec = SpeculativeReadReg(instruction.r.rt); } break; @@ -1871,15 +1887,15 @@ bool CodeGenerator::Compile_Add(const CodeBlockInstruction& cbi) } else if (g_settings.UsingPGXPCPUMode()) { - if (cbi.instruction.op != InstructionOp::funct) - EmitFunctionCall(nullptr, &PGXP::CPU_ADDI, Value::FromConstantU32(cbi.instruction.bits), lhs); + if (instruction.op != InstructionOp::funct) + EmitFunctionCall(nullptr, &PGXP::CPU_ADDI, Value::FromConstantU32(instruction.bits), lhs); else - EmitFunctionCall(nullptr, &PGXP::CPU_ADD, Value::FromConstantU32(cbi.instruction.bits), lhs, rhs); + EmitFunctionCall(nullptr, &PGXP::CPU_ADD, Value::FromConstantU32(instruction.bits), lhs, rhs); } Value result = AddValues(lhs, rhs, check_overflow); if (check_overflow) - GenerateExceptionExit(cbi, Exception::Ov, Condition::Overflow); + GenerateExceptionExit(instruction, info, Exception::Ov, Condition::Overflow); m_register_cache.WriteGuestRegister(dest, std::move(result)); @@ -1888,52 +1904,52 @@ bool CodeGenerator::Compile_Add(const CodeBlockInstruction& cbi) value_spec = *lhs_spec + *rhs_spec; SpeculativeWriteReg(dest, value_spec); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_Subtract(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Subtract(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - Assert(cbi.instruction.op == InstructionOp::funct); - const bool check_overflow = (cbi.instruction.r.funct == InstructionFunct::sub); + Assert(instruction.op == InstructionOp::funct); + const bool check_overflow = (instruction.r.funct == InstructionFunct::sub); - Value lhs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); - Value rhs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); + Value lhs = m_register_cache.ReadGuestRegister(instruction.r.rs); + Value rhs = m_register_cache.ReadGuestRegister(instruction.r.rt); - SpeculativeValue lhs_spec = SpeculativeReadReg(cbi.instruction.r.rs); - SpeculativeValue rhs_spec = SpeculativeReadReg(cbi.instruction.r.rt); + SpeculativeValue lhs_spec = 
SpeculativeReadReg(instruction.r.rs); + SpeculativeValue rhs_spec = SpeculativeReadReg(instruction.r.rt); if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_SUB, Value::FromConstantU32(cbi.instruction.bits), lhs, rhs); + EmitFunctionCall(nullptr, &PGXP::CPU_SUB, Value::FromConstantU32(instruction.bits), lhs, rhs); Value result = SubValues(lhs, rhs, check_overflow); if (check_overflow) - GenerateExceptionExit(cbi, Exception::Ov, Condition::Overflow); + GenerateExceptionExit(instruction, info, Exception::Ov, Condition::Overflow); - m_register_cache.WriteGuestRegister(cbi.instruction.r.rd, std::move(result)); + m_register_cache.WriteGuestRegister(instruction.r.rd, std::move(result)); SpeculativeValue value_spec; if (lhs_spec && rhs_spec) value_spec = *lhs_spec - *rhs_spec; - SpeculativeWriteReg(cbi.instruction.r.rd, value_spec); + SpeculativeWriteReg(instruction.r.rd, value_spec); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_Multiply(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Multiply(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - const bool signed_multiply = (cbi.instruction.r.funct == InstructionFunct::mult); - Value rs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); - Value rt = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); + const bool signed_multiply = (instruction.r.funct == InstructionFunct::mult); + Value rs = m_register_cache.ReadGuestRegister(instruction.r.rs); + Value rt = m_register_cache.ReadGuestRegister(instruction.r.rt); if (g_settings.UsingPGXPCPUMode()) { EmitFunctionCall(nullptr, signed_multiply ? &PGXP::CPU_MULT : &PGXP::CPU_MULTU, - Value::FromConstantU32(cbi.instruction.bits), rs, rt); + Value::FromConstantU32(instruction.bits), rs, rt); } std::pair result = MulValues(rs, rt, signed_multiply); @@ -1942,7 +1958,7 @@ bool CodeGenerator::Compile_Multiply(const CodeBlockInstruction& cbi) m_register_cache.WriteGuestRegister(Reg::hi, std::move(result.first)); m_register_cache.WriteGuestRegister(Reg::lo, std::move(result.second)); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } @@ -1989,15 +2005,15 @@ static std::tuple MIPSDivide(s32 num, s32 denom) return std::tie(lo, hi); } -bool CodeGenerator::Compile_Divide(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Divide(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - Value num = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); - Value denom = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); + Value num = m_register_cache.ReadGuestRegister(instruction.r.rs); + Value denom = m_register_cache.ReadGuestRegister(instruction.r.rt); if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_DIV, Value::FromConstantU32(cbi.instruction.bits), num, denom); + EmitFunctionCall(nullptr, &PGXP::CPU_DIV, Value::FromConstantU32(instruction.bits), num, denom); if (num.IsConstant() && denom.IsConstant()) { @@ -2046,19 +2062,19 @@ bool CodeGenerator::Compile_Divide(const CodeBlockInstruction& cbi) m_register_cache.WriteGuestRegister(Reg::hi, std::move(hi)); } - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_SignedDivide(const CodeBlockInstruction& cbi) +bool 
CodeGenerator::Compile_SignedDivide(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - Value num = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); - Value denom = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); + Value num = m_register_cache.ReadGuestRegister(instruction.r.rs); + Value denom = m_register_cache.ReadGuestRegister(instruction.r.rt); if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_DIV, Value::FromConstantU32(cbi.instruction.bits), num, denom); + EmitFunctionCall(nullptr, &PGXP::CPU_DIV, Value::FromConstantU32(instruction.bits), num, denom); if (num.IsConstant() && denom.IsConstant()) { @@ -2132,58 +2148,58 @@ bool CodeGenerator::Compile_SignedDivide(const CodeBlockInstruction& cbi) m_register_cache.WriteGuestRegister(Reg::hi, std::move(hi)); } - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_SetLess(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_SetLess(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); const bool signed_comparison = - (cbi.instruction.op == InstructionOp::slti || - (cbi.instruction.op == InstructionOp::funct && cbi.instruction.r.funct == InstructionFunct::slt)); + (instruction.op == InstructionOp::slti || + (instruction.op == InstructionOp::funct && instruction.r.funct == InstructionFunct::slt)); Reg dest; Value lhs, rhs; SpeculativeValue lhs_spec, rhs_spec; - if (cbi.instruction.op == InstructionOp::slti || cbi.instruction.op == InstructionOp::sltiu) + if (instruction.op == InstructionOp::slti || instruction.op == InstructionOp::sltiu) { // rt <- rs < {z,s}ext(imm) - dest = cbi.instruction.i.rt; - lhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs, true, true); - rhs = Value::FromConstantU32(cbi.instruction.i.imm_sext32()); - lhs_spec = SpeculativeReadReg(cbi.instruction.i.rs); - rhs_spec = cbi.instruction.i.imm_sext32(); + dest = instruction.i.rt; + lhs = m_register_cache.ReadGuestRegister(instruction.i.rs, true, true); + rhs = Value::FromConstantU32(instruction.i.imm_sext32()); + lhs_spec = SpeculativeReadReg(instruction.i.rs); + rhs_spec = instruction.i.imm_sext32(); // flush the old value which might free up a register - if (dest != cbi.instruction.r.rs) + if (dest != instruction.r.rs) m_register_cache.InvalidateGuestRegister(dest); } else { // rd <- rs < rt - dest = cbi.instruction.r.rd; - lhs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs, true, true); - rhs = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); - lhs_spec = SpeculativeReadReg(cbi.instruction.r.rs); - rhs_spec = SpeculativeReadReg(cbi.instruction.r.rt); + dest = instruction.r.rd; + lhs = m_register_cache.ReadGuestRegister(instruction.r.rs, true, true); + rhs = m_register_cache.ReadGuestRegister(instruction.r.rt); + lhs_spec = SpeculativeReadReg(instruction.r.rs); + rhs_spec = SpeculativeReadReg(instruction.r.rt); // flush the old value which might free up a register - if (dest != cbi.instruction.i.rs && dest != cbi.instruction.r.rt) + if (dest != instruction.i.rs && dest != instruction.r.rt) m_register_cache.InvalidateGuestRegister(dest); } if (g_settings.UsingPGXPCPUMode()) { - if (cbi.instruction.op == InstructionOp::slti) - EmitFunctionCall(nullptr, &PGXP::CPU_SLTI, Value::FromConstantU32(cbi.instruction.bits), lhs); - else if (cbi.instruction.op == 
InstructionOp::sltiu) - EmitFunctionCall(nullptr, &PGXP::CPU_SLTIU, Value::FromConstantU32(cbi.instruction.bits), lhs); - else if (cbi.instruction.r.funct == InstructionFunct::slt) - EmitFunctionCall(nullptr, &PGXP::CPU_SLT, Value::FromConstantU32(cbi.instruction.bits), lhs, rhs); - else // if (cbi.instruction.r.funct == InstructionFunct::sltu) - EmitFunctionCall(nullptr, &PGXP::CPU_SLTU, Value::FromConstantU32(cbi.instruction.bits), lhs, rhs); + if (instruction.op == InstructionOp::slti) + EmitFunctionCall(nullptr, &PGXP::CPU_SLTI, Value::FromConstantU32(instruction.bits), lhs); + else if (instruction.op == InstructionOp::sltiu) + EmitFunctionCall(nullptr, &PGXP::CPU_SLTIU, Value::FromConstantU32(instruction.bits), lhs); + else if (instruction.r.funct == InstructionFunct::slt) + EmitFunctionCall(nullptr, &PGXP::CPU_SLT, Value::FromConstantU32(instruction.bits), lhs, rhs); + else // if (instruction.r.funct == InstructionFunct::sltu) + EmitFunctionCall(nullptr, &PGXP::CPU_SLTU, Value::FromConstantU32(instruction.bits), lhs, rhs); } Value result = m_register_cache.AllocateScratch(RegSize_32); @@ -2198,19 +2214,19 @@ bool CodeGenerator::Compile_SetLess(const CodeBlockInstruction& cbi) value_spec = BoolToUInt32(signed_comparison ? (static_cast(*lhs_spec) < static_cast(*rhs_spec)) : (*lhs_spec < *rhs_spec)); } - SpeculativeWriteReg(cbi.instruction.r.rd, value_spec); + SpeculativeWriteReg(instruction.r.rd, value_spec); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_Branch(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - auto DoBranch = [this, &cbi](Condition condition, const Value& lhs, const Value& rhs, Reg lr_reg, - Value&& branch_target) { - const bool can_link_block = cbi.is_direct_branch_instruction && g_settings.cpu_recompiler_block_linking; + auto DoBranch = [this, &instruction, &info](Condition condition, const Value& lhs, const Value& rhs, Reg lr_reg, + Value&& branch_target) { + const bool can_link_block = info.is_direct_branch_instruction && g_settings.cpu_recompiler_block_linking; // ensure the lr register is flushed, since we want it's correct value after the branch // we don't want to invalidate it yet because of "jalr r0, r0", branch_target could be the lr_reg. 
@@ -2218,12 +2234,13 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) m_register_cache.FlushGuestRegister(lr_reg, false, true); // compute return address, which is also set as the new pc when the branch isn't taken - Value next_pc = CalculatePC(4); - DebugAssert(next_pc.IsConstant()); + Value constant_next_pc = CalculatePC(4); + Value next_pc = constant_next_pc; + DebugAssert(constant_next_pc.IsConstant()); if (condition != Condition::Always) { next_pc = m_register_cache.AllocateScratch(RegSize_32); - EmitCopyValue(next_pc.GetHostRegister(), CalculatePC(4)); + EmitCopyValue(next_pc.GetHostRegister(), constant_next_pc); } Value take_branch; @@ -2332,10 +2349,11 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) { // if it's an in-block branch, compile the delay slot now // TODO: Make this more optimal by moving the condition down if it's a nop - Assert((m_current_instruction + 1) != m_block_end); - InstructionEpilogue(cbi); - m_current_instruction++; - if (!CompileInstruction(*m_current_instruction)) + Assert((m_current_instruction.instruction + 1) != m_block_end.instruction); + InstructionEpilogue(instruction, info); + m_current_instruction.instruction++; + m_current_instruction.info++; + if (!CompileInstruction(*m_current_instruction.instruction, *m_current_instruction.info)) return false; // flush all regs since we're at the end of the block now @@ -2361,20 +2379,20 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) &return_to_dispatcher); // we're committed at this point :D - EmitEndBlock(true, false); + EmitEndBlock(true, nullptr); - const void* jump_pointer = GetCurrentCodePointer(); - const void* resolve_pointer = GetCurrentFarCodePointer(); - EmitBranch(resolve_pointer); - const u32 jump_size = static_cast(static_cast(GetCurrentCodePointer()) - - static_cast(jump_pointer)); - SwitchToFarCode(); - - EmitBeginBlock(true); - EmitFunctionCall(nullptr, &CPU::Recompiler::Thunks::ResolveBranch, Value::FromConstantPtr(m_block), - Value::FromConstantPtr(jump_pointer), Value::FromConstantPtr(resolve_pointer), - Value::FromConstantU32(jump_size)); - EmitEndBlock(true, true); + DebugAssert(branch_target.IsConstant()); + if (static_cast(branch_target.constant_value) == m_block->pc) + { + // self-link + EmitBranch(GetStartNearCodePointer()); + } + else + { + const void* host_target = CPU::CodeCache::CreateBlockLink(m_block, GetCurrentCodePointer(), + static_cast(branch_target.constant_value)); + EmitBranch(host_target); + } } m_register_cache.PopState(); @@ -2396,26 +2414,26 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) EmitConditionalBranch(Condition::GreaterEqual, false, pending_ticks.GetHostRegister(), downcount, &return_to_dispatcher); - EmitEndBlock(true, false); + EmitEndBlock(true, nullptr); - const void* jump_pointer = GetCurrentCodePointer(); - const void* resolve_pointer = GetCurrentFarCodePointer(); - EmitBranch(GetCurrentFarCodePointer()); - const u32 jump_size = - static_cast(static_cast(GetCurrentCodePointer()) - static_cast(jump_pointer)); - SwitchToFarCode(); - - EmitBeginBlock(true); - EmitFunctionCall(nullptr, &CPU::Recompiler::Thunks::ResolveBranch, Value::FromConstantPtr(m_block), - Value::FromConstantPtr(jump_pointer), Value::FromConstantPtr(resolve_pointer), - Value::FromConstantU32(jump_size)); - EmitEndBlock(true, true); + const Value& jump_target = (condition != Condition::Always) ? 
constant_next_pc : branch_target; + DebugAssert(jump_target.IsConstant()); + if (static_cast(jump_target.constant_value) == m_block->pc) + { + // self-link + EmitBranch(GetStartNearCodePointer()); + } + else + { + const void* host_target = CPU::CodeCache::CreateBlockLink(m_block, GetCurrentCodePointer(), + static_cast(jump_target.constant_value)); + EmitBranch(host_target); + } m_register_cache.PopState(); - SwitchToNearCode(); EmitBindLabel(&return_to_dispatcher); - EmitEndBlock(true, true); + EmitEndBlock(true, CodeCache::g_run_events_and_dispatch); } else { @@ -2435,7 +2453,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) WriteNewPC(branch_target, true); } - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); } return true; @@ -2443,36 +2461,35 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) // Compute the branch target. // This depends on the form of the instruction. - switch (cbi.instruction.op) + switch (instruction.op) { case InstructionOp::j: case InstructionOp::jal: { // npc = (pc & 0xF0000000) | (target << 2) Value branch_target = OrValues(AndValues(CalculatePC(), Value::FromConstantU32(0xF0000000)), - Value::FromConstantU32(cbi.instruction.j.target << 2)); + Value::FromConstantU32(instruction.j.target << 2)); return DoBranch(Condition::Always, Value(), Value(), - (cbi.instruction.op == InstructionOp::jal) ? Reg::ra : Reg::count, std::move(branch_target)); + (instruction.op == InstructionOp::jal) ? Reg::ra : Reg::count, std::move(branch_target)); } case InstructionOp::funct: { - if (cbi.instruction.r.funct == InstructionFunct::jr || cbi.instruction.r.funct == InstructionFunct::jalr) + if (instruction.r.funct == InstructionFunct::jr || instruction.r.funct == InstructionFunct::jalr) { // npc = rs, link to rt - Value branch_target = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); + Value branch_target = m_register_cache.ReadGuestRegister(instruction.r.rs); return DoBranch(Condition::Always, Value(), Value(), - (cbi.instruction.r.funct == InstructionFunct::jalr) ? cbi.instruction.r.rd : Reg::count, + (instruction.r.funct == InstructionFunct::jalr) ? instruction.r.rd : Reg::count, std::move(branch_target)); } - else if (cbi.instruction.r.funct == InstructionFunct::syscall || - cbi.instruction.r.funct == InstructionFunct::break_) + else if (instruction.r.funct == InstructionFunct::syscall || instruction.r.funct == InstructionFunct::break_) { const Exception excode = - (cbi.instruction.r.funct == InstructionFunct::syscall) ? Exception::Syscall : Exception::BP; - GenerateExceptionExit(cbi, excode); - InstructionEpilogue(cbi); + (instruction.r.funct == InstructionFunct::syscall) ? 
Exception::Syscall : Exception::BP; + GenerateExceptionExit(instruction, info, excode); + InstructionEpilogue(instruction, info); return true; } else @@ -2485,20 +2502,19 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) case InstructionOp::bne: { // npc = pc + (sext(imm) << 2) - Value branch_target = CalculatePC(cbi.instruction.i.imm_sext32() << 2); + Value branch_target = CalculatePC(instruction.i.imm_sext32() << 2); // beq zero, zero, addr -> unconditional branch - if (cbi.instruction.op == InstructionOp::beq && cbi.instruction.i.rs == Reg::zero && - cbi.instruction.i.rt == Reg::zero) + if (instruction.op == InstructionOp::beq && instruction.i.rs == Reg::zero && instruction.i.rt == Reg::zero) { return DoBranch(Condition::Always, Value(), Value(), Reg::count, std::move(branch_target)); } else { // branch <- rs op rt - Value lhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs, true, true); - Value rhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rt); - const Condition condition = (cbi.instruction.op == InstructionOp::beq) ? Condition::Equal : Condition::NotEqual; + Value lhs = m_register_cache.ReadGuestRegister(instruction.i.rs, true, true); + Value rhs = m_register_cache.ReadGuestRegister(instruction.i.rt); + const Condition condition = (instruction.op == InstructionOp::beq) ? Condition::Equal : Condition::NotEqual; return DoBranch(condition, lhs, rhs, Reg::count, std::move(branch_target)); } } @@ -2507,24 +2523,23 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) case InstructionOp::blez: { // npc = pc + (sext(imm) << 2) - Value branch_target = CalculatePC(cbi.instruction.i.imm_sext32() << 2); + Value branch_target = CalculatePC(instruction.i.imm_sext32() << 2); // branch <- rs op 0 - Value lhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs, true, true); + Value lhs = m_register_cache.ReadGuestRegister(instruction.i.rs, true, true); - const Condition condition = - (cbi.instruction.op == InstructionOp::bgtz) ? Condition::Greater : Condition::LessEqual; + const Condition condition = (instruction.op == InstructionOp::bgtz) ? Condition::Greater : Condition::LessEqual; return DoBranch(condition, lhs, Value::FromConstantU32(0), Reg::count, std::move(branch_target)); } case InstructionOp::b: { // npc = pc + (sext(imm) << 2) - Value branch_target = CalculatePC(cbi.instruction.i.imm_sext32() << 2); + Value branch_target = CalculatePC(instruction.i.imm_sext32() << 2); - const u8 rt = static_cast(cbi.instruction.i.rt.GetValue()); + const u8 rt = static_cast(instruction.i.rt.GetValue()); const bool bgez = ConvertToBoolUnchecked(rt & u8(1)); - const Condition condition = (bgez && cbi.instruction.r.rs == Reg::zero) ? + const Condition condition = (bgez && instruction.r.rs == Reg::zero) ? Condition::Always : (bgez ? Condition::PositiveOrZero : Condition::Negative); const bool link = (rt & u8(0x1E)) == u8(0x10); @@ -2532,7 +2547,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) // Read has to happen before the link as the compare can use ra. Value lhs; if (condition != Condition::Always) - lhs = m_register_cache.ReadGuestRegisterToScratch(cbi.instruction.i.rs); + lhs = m_register_cache.ReadGuestRegisterToScratch(instruction.i.rs); // The return address is always written if link is set, regardless of whether the branch is taken. 
if (link) @@ -2549,27 +2564,27 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) } } -bool CodeGenerator::Compile_lui(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_lui(Instruction instruction, const CodeCache::InstructionInfo& info) { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_LUI, Value::FromConstantU32(cbi.instruction.bits)); + EmitFunctionCall(nullptr, &PGXP::CPU_LUI, Value::FromConstantU32(instruction.bits)); // rt <- (imm << 16) - const u32 value = cbi.instruction.i.imm_zext32() << 16; - m_register_cache.WriteGuestRegister(cbi.instruction.i.rt, Value::FromConstantU32(value)); - SpeculativeWriteReg(cbi.instruction.i.rt, value); + const u32 value = instruction.i.imm_zext32() << 16; + m_register_cache.WriteGuestRegister(instruction.i.rt, Value::FromConstantU32(value)); + SpeculativeWriteReg(instruction.i.rt, value); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } -bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_cop0(Instruction instruction, const CodeCache::InstructionInfo& info) { - if (cbi.instruction.cop.IsCommonInstruction()) + if (instruction.cop.IsCommonInstruction()) { - switch (cbi.instruction.cop.CommonOp()) + switch (instruction.cop.CommonOp()) { case CopCommonInstruction::mfcn: case CopCommonInstruction::mtcn: @@ -2577,7 +2592,7 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) u32 offset; u32 write_mask = UINT32_C(0xFFFFFFFF); - const Cop0Reg reg = static_cast(cbi.instruction.r.rd.GetValue()); + const Cop0Reg reg = static_cast(instruction.r.rd.GetValue()); switch (reg) { case Cop0Reg::BPC: @@ -2632,33 +2647,33 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) break; default: - return Compile_Fallback(cbi); + return Compile_Fallback(instruction, info); } - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - if (cbi.instruction.cop.CommonOp() == CopCommonInstruction::mfcn) + if (instruction.cop.CommonOp() == CopCommonInstruction::mfcn) { // coprocessor loads are load-delayed Value value = m_register_cache.AllocateScratch(RegSize_32); EmitLoadCPUStructField(value.host_reg, value.size, offset); if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_MFC0, Value::FromConstantU32(cbi.instruction.bits), value); + EmitFunctionCall(nullptr, &PGXP::CPU_MFC0, Value::FromConstantU32(instruction.bits), value); - m_register_cache.WriteGuestRegisterDelayed(cbi.instruction.r.rt, std::move(value)); + m_register_cache.WriteGuestRegisterDelayed(instruction.r.rt, std::move(value)); if (reg == Cop0Reg::SR) - SpeculativeWriteReg(cbi.instruction.r.rt, m_speculative_constants.cop0_sr); + SpeculativeWriteReg(instruction.r.rt, m_speculative_constants.cop0_sr); else - SpeculativeWriteReg(cbi.instruction.r.rt, std::nullopt); + SpeculativeWriteReg(instruction.r.rt, std::nullopt); } else { // some registers are not writable, so ignore those if (write_mask != 0) { - Value value = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); + Value value = m_register_cache.ReadGuestRegister(instruction.r.rt); if (write_mask != UINT32_C(0xFFFFFFFF)) { // need to adjust the mask @@ -2672,7 +2687,7 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) if (g_settings.UsingPGXPCPUMode()) { - EmitFunctionCall(nullptr, &PGXP::CPU_MTC0, Value::FromConstantU32(cbi.instruction.bits), masked_value, + 
EmitFunctionCall(nullptr, &PGXP::CPU_MTC0, Value::FromConstantU32(instruction.bits), masked_value, value); } value = std::move(masked_value); @@ -2680,11 +2695,11 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) else { if (g_settings.UsingPGXPCPUMode()) - EmitFunctionCall(nullptr, &PGXP::CPU_MTC0, Value::FromConstantU32(cbi.instruction.bits), value, value); + EmitFunctionCall(nullptr, &PGXP::CPU_MTC0, Value::FromConstantU32(instruction.bits), value, value); } if (reg == Cop0Reg::SR) - m_speculative_constants.cop0_sr = SpeculativeReadReg(cbi.instruction.r.rt); + m_speculative_constants.cop0_sr = SpeculativeReadReg(instruction.r.rt); // changing SR[Isc] needs to update fastmem views if (reg == Cop0Reg::SR) @@ -2708,7 +2723,7 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) } } - if (cbi.instruction.cop.CommonOp() == CopCommonInstruction::mtcn) + if (instruction.cop.CommonOp() == CopCommonInstruction::mtcn) { if (reg == Cop0Reg::CAUSE || reg == Cop0Reg::SR) { @@ -2766,22 +2781,22 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) } } - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } // only mfc/mtc for cop0 default: - return Compile_Fallback(cbi); + return Compile_Fallback(instruction, info); } } else { - switch (cbi.instruction.cop.Cop0Op()) + switch (instruction.cop.Cop0Op()) { case Cop0Instruction::rfe: { - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); // shift mode bits right two, preserving upper bits static constexpr u32 mode_bits_mask = UINT32_C(0b1111); @@ -2809,12 +2824,12 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) EmitBindLabel(&no_interrupt); m_register_cache.UninhibitAllocation(); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } default: - return Compile_Fallback(cbi); + return Compile_Fallback(instruction, info); } } } @@ -2931,110 +2946,110 @@ void CodeGenerator::DoGTERegisterWrite(u32 index, const Value& value) } } -bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi) +bool CodeGenerator::Compile_cop2(Instruction instruction, const CodeCache::InstructionInfo& info) { - if (cbi.instruction.op == InstructionOp::lwc2 || cbi.instruction.op == InstructionOp::swc2) + if (instruction.op == InstructionOp::lwc2 || instruction.op == InstructionOp::swc2) { StallUntilGTEComplete(); - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - const u32 reg = static_cast(cbi.instruction.i.rt.GetValue()); - Value address = AddValues(m_register_cache.ReadGuestRegister(cbi.instruction.i.rs), - Value::FromConstantU32(cbi.instruction.i.imm_sext32()), false); - SpeculativeValue spec_address = SpeculativeReadReg(cbi.instruction.i.rs); + const u32 reg = static_cast(instruction.i.rt.GetValue()); + Value address = AddValues(m_register_cache.ReadGuestRegister(instruction.i.rs), + Value::FromConstantU32(instruction.i.imm_sext32()), false); + SpeculativeValue spec_address = SpeculativeReadReg(instruction.i.rs); if (spec_address) - spec_address = *spec_address + cbi.instruction.i.imm_sext32(); + spec_address = *spec_address + instruction.i.imm_sext32(); - if (cbi.instruction.op == InstructionOp::lwc2) + if (instruction.op == InstructionOp::lwc2) { - Value value = EmitLoadGuestMemory(cbi, address, spec_address, RegSize_32); + Value value = EmitLoadGuestMemory(instruction, info, address, spec_address, RegSize_32); DoGTERegisterWrite(reg, value); if (g_settings.gpu_pgxp_enable) - 
EmitFunctionCall(nullptr, PGXP::CPU_LWC2, Value::FromConstantU32(cbi.instruction.bits), address, value); + EmitFunctionCall(nullptr, PGXP::CPU_LWC2, Value::FromConstantU32(instruction.bits), address, value); } else { Value value = DoGTERegisterRead(reg); - EmitStoreGuestMemory(cbi, address, spec_address, RegSize_32, value); + EmitStoreGuestMemory(instruction, info, address, spec_address, RegSize_32, value); if (g_settings.gpu_pgxp_enable) - EmitFunctionCall(nullptr, PGXP::CPU_SWC2, Value::FromConstantU32(cbi.instruction.bits), address, value); + EmitFunctionCall(nullptr, PGXP::CPU_SWC2, Value::FromConstantU32(instruction.bits), address, value); - SpeculativeValue spec_base = SpeculativeReadReg(cbi.instruction.i.rs); + SpeculativeValue spec_base = SpeculativeReadReg(instruction.i.rs); if (spec_base) SpeculativeWriteMemory(*spec_address, std::nullopt); } - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } - Assert(cbi.instruction.op == InstructionOp::cop2); + Assert(instruction.op == InstructionOp::cop2); - if (cbi.instruction.cop.IsCommonInstruction()) + if (instruction.cop.IsCommonInstruction()) { - switch (cbi.instruction.cop.CommonOp()) + switch (instruction.cop.CommonOp()) { case CopCommonInstruction::mfcn: case CopCommonInstruction::cfcn: { - const u32 reg = static_cast(cbi.instruction.r.rd.GetValue()) + - ((cbi.instruction.cop.CommonOp() == CopCommonInstruction::cfcn) ? 32 : 0); + const u32 reg = static_cast(instruction.r.rd.GetValue()) + + ((instruction.cop.CommonOp() == CopCommonInstruction::cfcn) ? 32 : 0); StallUntilGTEComplete(); - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); Value value = DoGTERegisterRead(reg); // PGXP done first here before ownership is transferred. if (g_settings.gpu_pgxp_enable) - EmitFunctionCall(nullptr, PGXP::CPU_MFC2, Value::FromConstantU32(cbi.instruction.bits), value); + EmitFunctionCall(nullptr, PGXP::CPU_MFC2, Value::FromConstantU32(instruction.bits), value); - m_register_cache.WriteGuestRegisterDelayed(cbi.instruction.r.rt, std::move(value)); - SpeculativeWriteReg(cbi.instruction.r.rt, std::nullopt); + m_register_cache.WriteGuestRegisterDelayed(instruction.r.rt, std::move(value)); + SpeculativeWriteReg(instruction.r.rt, std::nullopt); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } case CopCommonInstruction::mtcn: case CopCommonInstruction::ctcn: { - const u32 reg = static_cast(cbi.instruction.r.rd.GetValue()) + - ((cbi.instruction.cop.CommonOp() == CopCommonInstruction::ctcn) ? 32 : 0); + const u32 reg = static_cast(instruction.r.rd.GetValue()) + + ((instruction.cop.CommonOp() == CopCommonInstruction::ctcn) ? 
32 : 0); StallUntilGTEComplete(); - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - Value value = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); + Value value = m_register_cache.ReadGuestRegister(instruction.r.rt); DoGTERegisterWrite(reg, value); if (g_settings.gpu_pgxp_enable) - EmitFunctionCall(nullptr, PGXP::CPU_MTC2, Value::FromConstantU32(cbi.instruction.bits), value); + EmitFunctionCall(nullptr, PGXP::CPU_MTC2, Value::FromConstantU32(instruction.bits), value); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } default: - return Compile_Fallback(cbi); + return Compile_Fallback(instruction, info); } } else { TickCount func_ticks; - GTE::InstructionImpl func = GTE::GetInstructionImpl(cbi.instruction.bits, &func_ticks); + GTE::InstructionImpl func = GTE::GetInstructionImpl(instruction.bits, &func_ticks); // forward everything to the GTE. StallUntilGTEComplete(); - InstructionPrologue(cbi, 1); + InstructionPrologue(instruction, info, 1); - Value instruction_bits = Value::FromConstantU32(cbi.instruction.bits & GTE::Instruction::REQUIRED_BITS_MASK); + Value instruction_bits = Value::FromConstantU32(instruction.bits & GTE::Instruction::REQUIRED_BITS_MASK); EmitFunctionCall(nullptr, func, instruction_bits); AddGTETicks(func_ticks); - InstructionEpilogue(cbi); + InstructionEpilogue(instruction, info); return true; } } diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 68126ea2e..096c64dde 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -9,7 +9,7 @@ #include "util/jit_code_buffer.h" -#include "cpu_code_cache.h" +#include "cpu_code_cache_private.h" #include "cpu_recompiler_register_cache.h" #include "cpu_recompiler_thunks.h" #include "cpu_recompiler_types.h" @@ -17,34 +17,56 @@ namespace CPU::Recompiler { +enum class Condition : u8 +{ + Always, + NotEqual, + Equal, + Overflow, + Greater, + GreaterEqual, + LessEqual, + Less, + Negative, + PositiveOrZero, + Above, // unsigned variant of Greater + AboveEqual, // unsigned variant of GreaterEqual + Below, // unsigned variant of Less + BelowEqual, // unsigned variant of LessEqual + + NotZero, + Zero +}; + class CodeGenerator { public: using SpeculativeValue = std::optional; + struct CodeBlockInstruction + { + const Instruction* instruction; + const CodeCache::InstructionInfo* info; + }; + CodeGenerator(JitCodeBuffer* code_buffer); ~CodeGenerator(); static const char* GetHostRegName(HostReg reg, RegSize size = HostPointerSize); static void AlignCodeBuffer(JitCodeBuffer* code_buffer); - static bool BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi); - static void BackpatchBranch(void* pc, u32 pc_size, void* target); - static void BackpatchReturn(void* pc, u32 pc_size); + static void BackpatchLoadStore(void* host_pc, const CodeCache::LoadstoreBackpatchInfo& lbi); - bool CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); - - CodeCache::DispatcherFunction CompileDispatcher(); - CodeCache::SingleBlockDispatcherFunction CompileSingleBlockDispatcher(); + const void* CompileBlock(CodeCache::Block* block, u32* out_host_code_size, u32* out_host_far_code_size); ////////////////////////////////////////////////////////////////////////// // Code Generation ////////////////////////////////////////////////////////////////////////// void EmitBeginBlock(bool allocate_registers = true); - void EmitEndBlock(bool free_registers = true, bool 
emit_return = true); + void EmitEndBlock(bool free_registers, const void* jump_to); void EmitExceptionExit(); void EmitExceptionExitOnBool(const Value& value); - void FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); + const void* FinalizeBlock(u32* out_host_code_size, u32* out_host_far_code_size); void EmitSignExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size); void EmitZeroExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size); @@ -77,6 +99,7 @@ public: void EmitMoveNextInterpreterLoadDelay(); void EmitCancelInterpreterLoadDelayForReg(Reg reg); void EmitICacheCheckAndUpdate(); + void EmitBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size); void EmitStallUntilGTEComplete(); void EmitLoadCPUStructField(HostReg host_reg, RegSize size, u32 offset); void EmitStoreCPUStructField(u32 offset, const Value& value); @@ -88,18 +111,19 @@ public: // Automatically generates an exception handler. Value GetFastmemLoadBase(); Value GetFastmemStoreBase(); - Value EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, const SpeculativeValue& address_spec, - RegSize size); + Value EmitLoadGuestMemory(Instruction instruction, const CodeCache::InstructionInfo& info, const Value& address, + const SpeculativeValue& address_spec, RegSize size); void EmitLoadGuestRAMFastmem(const Value& address, RegSize size, Value& result); - void EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result); - void EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result, - bool in_far_code); - void EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const Value& address, const SpeculativeValue& address_spec, - RegSize size, const Value& value); - void EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - const Value& value); - void EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - const Value& value, bool in_far_code); + void EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info, const Value& address, + RegSize size, Value& result); + void EmitLoadGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info, const Value& address, + RegSize size, Value& result, bool in_far_code); + void EmitStoreGuestMemory(Instruction instruction, const CodeCache::InstructionInfo& info, const Value& address, + const SpeculativeValue& address_spec, RegSize size, const Value& value); + void EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, const Value& value); + void EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, const Value& value, bool in_far_code); void EmitUpdateFastmemBase(); // Unconditional branch to pointer. May allocate a scratch register. @@ -179,7 +203,7 @@ public: Value NotValue(const Value& val); // Raising exception if condition is true. 
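  // Note on the parameter change below, repeated throughout this header: the old
  // CodeBlockInstruction bundle is split into the raw Instruction (passed by value) plus a
  // CodeCache::InstructionInfo reference carrying the metadata such as the pc and the
  // branch-delay-slot flag. A rough sketch of how call sites adapt, assuming a former
  // `cbi` parameter:
  //   cbi.instruction.bits      ->  instruction.bits
  //   cbi.pc                    ->  info.pc
  //   cbi.is_branch_delay_slot  ->  info.is_branch_delay_slot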
- void GenerateExceptionExit(const CodeBlockInstruction& cbi, Exception excode, + void GenerateExceptionExit(Instruction instruction, const CodeCache::InstructionInfo& info, Exception excode, Condition condition = Condition::Always); private: @@ -194,6 +218,7 @@ private: void SwitchToFarCode(); void SwitchToNearCode(); + void* GetStartNearCodePointer() const; void* GetCurrentCodePointer() const; void* GetCurrentNearCodePointer() const; void* GetCurrentFarCodePointer() const; @@ -204,8 +229,9 @@ private: // branch target, memory address, etc void BlockPrologue(); void BlockEpilogue(); - void InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles, bool force_sync = false); - void InstructionEpilogue(const CodeBlockInstruction& cbi); + void InstructionPrologue(Instruction instruction, const CodeCache::InstructionInfo& info, TickCount cycles, + bool force_sync = false); + void InstructionEpilogue(Instruction instruction, const CodeCache::InstructionInfo& info); void TruncateBlockAtCurrentInstruction(); void AddPendingCycles(bool commit); void AddGTETicks(TickCount ticks); @@ -221,32 +247,33 @@ private: ////////////////////////////////////////////////////////////////////////// // Instruction Code Generators ////////////////////////////////////////////////////////////////////////// - bool CompileInstruction(const CodeBlockInstruction& cbi); - bool Compile_Fallback(const CodeBlockInstruction& cbi); - bool Compile_Nop(const CodeBlockInstruction& cbi); - bool Compile_Bitwise(const CodeBlockInstruction& cbi); - bool Compile_Shift(const CodeBlockInstruction& cbi); - bool Compile_Load(const CodeBlockInstruction& cbi); - bool Compile_Store(const CodeBlockInstruction& cbi); - bool Compile_LoadLeftRight(const CodeBlockInstruction& cbi); - bool Compile_StoreLeftRight(const CodeBlockInstruction& cbi); - bool Compile_MoveHiLo(const CodeBlockInstruction& cbi); - bool Compile_Add(const CodeBlockInstruction& cbi); - bool Compile_Subtract(const CodeBlockInstruction& cbi); - bool Compile_Multiply(const CodeBlockInstruction& cbi); - bool Compile_Divide(const CodeBlockInstruction& cbi); - bool Compile_SignedDivide(const CodeBlockInstruction& cbi); - bool Compile_SetLess(const CodeBlockInstruction& cbi); - bool Compile_Branch(const CodeBlockInstruction& cbi); - bool Compile_lui(const CodeBlockInstruction& cbi); - bool Compile_cop0(const CodeBlockInstruction& cbi); - bool Compile_cop2(const CodeBlockInstruction& cbi); + bool CompileInstruction(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Fallback(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Nop(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Bitwise(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Shift(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Load(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Store(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_LoadLeftRight(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_StoreLeftRight(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_MoveHiLo(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Add(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Subtract(Instruction instruction, const CodeCache::InstructionInfo& info); + bool 
Compile_Multiply(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Divide(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_SignedDivide(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_SetLess(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_Branch(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_lui(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_cop0(Instruction instruction, const CodeCache::InstructionInfo& info); + bool Compile_cop2(Instruction instruction, const CodeCache::InstructionInfo& info); JitCodeBuffer* m_code_buffer; - CodeBlock* m_block = nullptr; - const CodeBlockInstruction* m_block_start = nullptr; - const CodeBlockInstruction* m_block_end = nullptr; - const CodeBlockInstruction* m_current_instruction = nullptr; + + CodeCache::Block* m_block = nullptr; + CodeBlockInstruction m_block_start = {}; + CodeBlockInstruction m_block_end = {}; + CodeBlockInstruction m_current_instruction = {}; RegisterCache m_register_cache; CodeEmitter m_near_emitter; CodeEmitter m_far_emitter; @@ -267,9 +294,6 @@ private: bool m_next_load_delay_dirty = false; bool m_gte_busy_cycles_dirty = false; - bool m_fastmem_load_base_in_register = false; - bool m_fastmem_store_base_in_register = false; - ////////////////////////////////////////////////////////////////////////// // Speculative Constants ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp index 04921969b..e1fb940ab 100644 --- a/src/core/cpu_recompiler_code_generator_aarch32.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp @@ -1,9 +1,11 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "common/align.h" #include "common/assert.h" #include "common/log.h" + +#include "cpu_code_cache_private.h" #include "cpu_core.h" #include "cpu_core_private.h" #include "cpu_recompiler_code_generator.h" @@ -12,38 +14,230 @@ #include "timing_event.h" Log_SetChannel(CPU::Recompiler); -// #include "vixl/aarch32/disasm-aarch32.h" -// #include +#ifdef ENABLE_HOST_DISASSEMBLY +#include "vixl/aarch32/disasm-aarch32.h" +#include +#endif namespace a32 = vixl::aarch32; namespace CPU::Recompiler { - -constexpr HostReg RCPUPTR = 4; -constexpr HostReg RRETURN = 0; -constexpr HostReg RARG1 = 0; -constexpr HostReg RARG2 = 1; -constexpr HostReg RARG3 = 2; -constexpr HostReg RARG4 = 3; -constexpr HostReg RSCRATCH = 12; -constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 32; constexpr u32 FUNCTION_CALLEE_SAVED_SPACE_RESERVE = 80; // 8 registers constexpr u32 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers -> 224 bytes -constexpr u32 FUNCTION_STACK_SIZE = - FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE + FUNCTION_CALL_SHADOW_SPACE; +constexpr u32 FUNCTION_STACK_SIZE = FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE; +} // namespace CPU::Recompiler -static s32 GetPCDisplacement(const void* current, const void* target) +s32 CPU::Recompiler::armGetPCDisplacement(const void* current, const void* target) { Assert(Common::IsAlignedPow2(reinterpret_cast(current), 4)); Assert(Common::IsAlignedPow2(reinterpret_cast(target), 4)); return 
static_cast((reinterpret_cast(target) - reinterpret_cast(current))); } -static bool IsPCDisplacementInImmediateRange(s32 displacement) +bool CPU::Recompiler::armIsPCDisplacementInImmediateRange(s32 displacement) { return (displacement >= -33554432 && displacement <= 33554428); } +void CPU::Recompiler::armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm) +{ + if (vixl::IsUintN(16, imm)) + { + armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff); + return; + } + + armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff); + armAsm->movt(vixl::aarch32::al, rd, imm >> 16); +} + +void CPU::Recompiler::armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, + const void* addr) +{ + armEmitMov(armAsm, reg, static_cast(reinterpret_cast(addr))); +} + +void CPU::Recompiler::armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline) +{ + // TODO: pooling + + const s32 displacement = armGetPCDisplacement(armAsm->GetCursorAddress(), ptr); + if (!armIsPCDisplacementInImmediateRange(displacement)) + { + armMoveAddressToReg(armAsm, RSCRATCH, ptr); + armAsm->bx(RSCRATCH); + } + else + { + a32::Label label(displacement + armAsm->GetCursorOffset()); + armAsm->b(&label); + } +} + +void CPU::Recompiler::armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline) +{ + // TODO: pooling + + const s32 displacement = armGetPCDisplacement(armAsm->GetCursorAddress(), ptr); + if (!armIsPCDisplacementInImmediateRange(displacement)) + { + armMoveAddressToReg(armAsm, RSCRATCH, ptr); + armAsm->blx(RSCRATCH); + } + else + { + a32::Label label(displacement + armAsm->GetCursorOffset()); + armAsm->bl(&label); + } +} + +void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size) +{ +#ifdef ENABLE_HOST_DISASSEMBLY + a32::PrintDisassembler dis(std::cout, 0); + dis.SetCodeAddress(reinterpret_cast(start)); + dis.DisassembleA32Buffer(static_cast(start), size); +#else + Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY."); +#endif +} + +u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size) +{ + return size / a32::kA32InstructionSizeInBytes; +} + +u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache) +{ + using namespace vixl::aarch32; + using namespace CPU::Recompiler; + + const s32 disp = armGetPCDisplacement(code, dst); + DebugAssert(armIsPCDisplacementInImmediateRange(disp)); + + // A32 jumps are silly. 
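  // What follows wraps the byte displacement -- already asserted above to fit the +/-32 MiB
  // A32 branch range -- in a Label constructed at that offset and emits a single 4-byte b to
  // it. An illustrative call site, with hypothetical names for a block exit being linked to
  // another block's code:
  //   CPU::CodeCache::EmitJump(exit_point, target_block_host_code, /*flush_icache=*/true);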
+ { + vixl::aarch32::Assembler emit(static_cast(code), kA32InstructionSizeInBytes, a32::A32); + a32::Label label(disp); + emit.b(&label); + } + + if (flush_icache) + JitCodeBuffer::FlushInstructionCache(code, kA32InstructionSizeInBytes); + + return kA32InstructionSizeInBytes; +} + +u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size) +{ + using namespace vixl::aarch32; + using namespace CPU::Recompiler; + +#define PTR(x) a32::MemOperand(RSTATE, (s32)(((u8*)(x)) - ((u8*)&g_state))) + + Assembler actual_asm(static_cast(code), code_size); + Assembler* armAsm = &actual_asm; + +#ifdef VIXL_DEBUG + vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace); +#endif + + Label dispatch; + + g_enter_recompiler = armAsm->GetCursorAddress(); + { + // reserve some space for saving caller-saved registers + armAsm->sub(sp, sp, FUNCTION_STACK_SIZE); + + // Need the CPU state for basically everything :-) + armMoveAddressToReg(armAsm, RSTATE, &g_state); + } + + // check events then for frame done + g_check_events_and_dispatch = armAsm->GetCursorAddress(); + { + Label skip_event_check; + armAsm->ldr(RARG1, PTR(&g_state.pending_ticks)); + armAsm->ldr(RARG2, PTR(&g_state.downcount)); + armAsm->cmp(RARG1, RARG2); + armAsm->b(lt, &skip_event_check); + + g_run_events_and_dispatch = armAsm->GetCursorAddress(); + armEmitCall(armAsm, reinterpret_cast(&TimingEvents::RunEvents), true); + + armAsm->bind(&skip_event_check); + } + + // TODO: align? + g_dispatcher = armAsm->GetCursorAddress(); + { + armAsm->bind(&dispatch); + + // x9 <- s_fast_map[pc >> 16] + armAsm->ldr(RARG1, PTR(&g_state.pc)); + armMoveAddressToReg(armAsm, RARG3, g_code_lut.data()); + armAsm->lsr(RARG2, RARG1, 16); + armAsm->ldr(RARG2, MemOperand(RARG3, RARG2, LSL, 2)); + + // blr(x9[pc * 2]) (fast_map[pc >> 2]) + armAsm->ldr(RARG1, MemOperand(RARG2, RARG1)); + armAsm->blx(RARG1); + } + + g_compile_or_revalidate_block = armAsm->GetCursorAddress(); + { + armAsm->ldr(RARG1, PTR(&g_state.pc)); + armEmitCall(armAsm, reinterpret_cast(&CompileOrRevalidateBlock), true); + armAsm->b(&dispatch); + } + + g_discard_and_recompile_block = armAsm->GetCursorAddress(); + { + armAsm->ldr(RARG1, PTR(&g_state.pc)); + armEmitCall(armAsm, reinterpret_cast(&DiscardAndRecompileBlock), true); + armAsm->b(&dispatch); + } + + g_interpret_block = armAsm->GetCursorAddress(); + { + armEmitCall(armAsm, reinterpret_cast(GetInterpretUncachedBlockFunction()), true); + armAsm->b(&dispatch); + } + + armAsm->FinalizeCode(); + +#if 0 + // TODO: align? + s_trampoline_targets.clear(); + s_trampoline_start_ptr = static_cast(code) + armAsm->GetCursorOffset(); + s_trampoline_used = 0; +#endif + +#undef PTR + return static_cast(armAsm->GetCursorOffset()) /* + TRAMPOLINE_AREA_SIZE*/; +} + +// Macros aren't used with old-rec. 
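// (The RRET/RARG*/RSCRATCH/RMEMBASE/RSTATE aliases are what the shared CodeCache::EmitJump /
// EmitASMFunctions emitters above use; undefining them here keeps them from colliding with the
// old recompiler's own constexpr HostReg definitions that follow, e.g. RCPUPTR = r4 and the
// newly reserved RMEMBASEPTR = r5.)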
+#undef RRET +#undef RARG1 +#undef RARG2 +#undef RARG3 +#undef RARG4 +#undef RSCRATCH +#undef RMEMBASE +#undef RSTATE + +namespace CPU::Recompiler { + +constexpr HostReg RCPUPTR = 4; +constexpr HostReg RMEMBASEPTR = 5; +constexpr HostReg RRETURN = 0; +constexpr HostReg RARG1 = 0; +constexpr HostReg RARG2 = 1; +constexpr HostReg RARG3 = 2; +constexpr HostReg RARG4 = 3; +constexpr HostReg RSCRATCH = 12; + static const a32::Register GetHostReg8(HostReg reg) { return a32::Register(reg); @@ -82,6 +276,11 @@ static const a32::Register GetCPUPtrReg() return GetHostReg32(RCPUPTR); } +static const a32::Register GetFastmemBasePtrReg() +{ + return GetHostReg32(RMEMBASEPTR); +} + CodeGenerator::CodeGenerator(JitCodeBuffer* code_buffer) : m_code_buffer(code_buffer), m_register_cache(*this), m_near_emitter(static_cast(code_buffer->GetFreeCodePointer()), code_buffer->GetFreeCodeSpace(), @@ -136,6 +335,11 @@ void CodeGenerator::SwitchToNearCode() m_emit = &m_near_emitter; } +void* CodeGenerator::GetStartNearCodePointer() const +{ + return static_cast(m_code_buffer->GetFreeCodePointer()); +} + void* CodeGenerator::GetCurrentNearCodePointer() const { return static_cast(m_code_buffer->GetFreeCodePointer()) + m_near_emitter.GetCursorOffset(); @@ -168,8 +372,6 @@ Value CodeGenerator::GetValueInHostOrScratchRegister(const Value& value, bool al void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) { - m_emit->sub(a32::sp, a32::sp, FUNCTION_STACK_SIZE); - if (allocate_registers) { // Save the link register, since we'll be calling functions. @@ -183,22 +385,31 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) // m_emit->Mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); DebugAssert(cpu_reg_allocated); UNREFERENCED_VARIABLE(cpu_reg_allocated); + + // If there's loadstore instructions, preload the fastmem base. 
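      // Keeping the base in a register for the whole block replaces the old per-access
      // GetFastmemLoadBase()/GetFastmemStoreBase() reloads; each fastmem access then only
      // needs the LUT walk emitted in EmitLoadGuestMemoryFastmem below, roughly:
      //   lsr temp,  address, #FASTMEM_LUT_PAGE_SHIFT
      //   ldr temp,  [membase, temp, lsl #2]    ; per-page pointer from the LUT
      //   ldr value, [temp, address]            ; the guest access itself
      // (register names here are illustrative only).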
+ if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) + { + const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); + Assert(fastmem_reg_allocated); + m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base))); + } } } -void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_return /* = true */) +void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, const void* jump_to) { if (free_registers) { + if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) + m_register_cache.FreeHostReg(RMEMBASEPTR); + m_register_cache.FreeHostReg(RCPUPTR); m_register_cache.FreeHostReg(14); m_register_cache.PopCalleeSavedRegisters(true); } - m_emit->add(a32::sp, a32::sp, FUNCTION_STACK_SIZE); - - if (emit_return) - m_emit->bx(a32::lr); + if (jump_to) + armEmitJmp(m_emit, jump_to, true); } void CodeGenerator::EmitExceptionExit() @@ -212,8 +423,7 @@ void CodeGenerator::EmitExceptionExit() m_register_cache.PopCalleeSavedRegisters(false); - m_emit->add(a32::sp, a32::sp, FUNCTION_STACK_SIZE); - m_emit->bx(a32::lr); + armEmitJmp(m_emit, CodeCache::g_check_events_and_dispatch, true); } void CodeGenerator::EmitExceptionExitOnBool(const Value& value) @@ -236,13 +446,14 @@ void CodeGenerator::EmitExceptionExitOnBool(const Value& value) m_register_cache.PopState(); } -void CodeGenerator::FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size) +const void* CodeGenerator::FinalizeBlock(u32* out_host_code_size, u32* out_host_far_code_size) { m_near_emitter.FinalizeCode(); m_far_emitter.FinalizeCode(); - *out_host_code = reinterpret_cast(m_code_buffer->GetFreeCodePointer()); + const void* code = m_code_buffer->GetFreeCodePointer(); *out_host_code_size = static_cast(m_near_emitter.GetSizeOfCodeGenerated()); + *out_host_far_code_size = static_cast(m_far_emitter.GetSizeOfCodeGenerated()); m_code_buffer->CommitCode(static_cast(m_near_emitter.GetSizeOfCodeGenerated())); m_code_buffer->CommitFarCode(static_cast(m_far_emitter.GetSizeOfCodeGenerated())); @@ -252,11 +463,7 @@ void CodeGenerator::FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32 m_far_emitter = CodeEmitter(static_cast(m_code_buffer->GetFreeFarCodePointer()), m_code_buffer->GetFreeFarCodeSpace(), a32::A32); -#if 0 - a32::PrintDisassembler dis(std::cout, 0); - dis.SetCodeAddress(reinterpret_cast(*out_host_code)); - dis.DisassembleA32Buffer(reinterpret_cast(*out_host_code), *out_host_code_size); -#endif + return code; } void CodeGenerator::EmitSignExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size) @@ -847,8 +1054,6 @@ void CodeGenerator::EmitSetConditionResult(HostReg to_reg, RegSize to_size, Cond u32 CodeGenerator::PrepareStackForCall() { - m_fastmem_load_base_in_register = false; - m_fastmem_store_base_in_register = false; m_register_cache.PushCallerSavedRegisters(); return 0; } @@ -860,17 +1065,7 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size) void CodeGenerator::EmitCall(const void* ptr) { - const s32 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); - if (!IsPCDisplacementInImmediateRange(displacement)) - { - m_emit->Mov(GetHostReg32(RSCRATCH), reinterpret_cast(ptr)); - m_emit->blx(GetHostReg32(RSCRATCH)); - } - else - { - a32::Label label(displacement + m_emit->GetCursorOffset()); - m_emit->bl(&label); - } + armEmitCall(m_emit, ptr, false); } void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) @@ -1005,7 
+1200,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co void CodeGenerator::EmitPushHostReg(HostReg reg, u32 position) { - const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 4)); + const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - (position * 4)); m_emit->str(GetHostReg32(reg), addr); } @@ -1018,7 +1213,7 @@ void CodeGenerator::EmitPushHostRegPair(HostReg reg, HostReg reg2, u32 position) void CodeGenerator::EmitPopHostReg(HostReg reg, u32 position) { - const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 4)); + const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - (position * 4)); m_emit->ldr(GetHostReg32(reg), addr); } @@ -1153,51 +1348,13 @@ void CodeGenerator::EmitAddCPUStructField(u32 offset, const Value& value) } } -Value CodeGenerator::GetFastmemLoadBase() -{ - Value val = Value::FromHostReg(&m_register_cache, RARG4, RegSize_32); - if (!m_fastmem_load_base_in_register) - { - m_emit->ldr(GetHostReg32(val), a32::MemOperand(GetCPUPtrReg(), offsetof(CPU::State, fastmem_base))); - m_fastmem_load_base_in_register = true; - } - - return val; -} - -Value CodeGenerator::GetFastmemStoreBase() -{ - Value val = Value::FromHostReg(&m_register_cache, RARG3, RegSize_32); - if (!m_fastmem_store_base_in_register) - { - m_emit->ldr(GetHostReg32(val), a32::MemOperand(GetCPUPtrReg(), offsetof(CPU::State, fastmem_base))); - m_emit->add(GetHostReg32(val), GetHostReg32(val), sizeof(u32*) * Bus::FASTMEM_LUT_NUM_PAGES); - m_fastmem_store_base_in_register = true; - } - - return val; -} - void CodeGenerator::EmitUpdateFastmemBase() { - if (m_fastmem_load_base_in_register) - { - Value val = Value::FromHostReg(&m_register_cache, RARG4, RegSize_32); - m_emit->ldr(GetHostReg32(val), a32::MemOperand(GetCPUPtrReg(), offsetof(CPU::State, fastmem_base))); - } - - if (m_fastmem_store_base_in_register) - { - Value val = Value::FromHostReg(&m_register_cache, RARG3, RegSize_32); - m_emit->ldr(GetHostReg32(val), a32::MemOperand(GetCPUPtrReg(), offsetof(CPU::State, fastmem_base))); - m_emit->add(GetHostReg32(val), GetHostReg32(val), sizeof(u32*) * Bus::FASTMEM_LUT_NUM_PAGES); - } + m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base))); } void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size, Value& result) { - Value fastmem_base = GetFastmemLoadBase(); - HostReg address_reg; if (address.IsConstant()) { @@ -1212,7 +1369,7 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size, m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_SHIFT); m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_MASK); m_emit->ldr(GetHostReg32(RARG1), - a32::MemOperand(GetHostReg32(fastmem_base), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load + a32::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load switch (size) { @@ -1234,18 +1391,9 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size, } } -void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - Value& result) +void CodeGenerator::EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, Value& result) { - // fastmem - LoadStoreBackpatchInfo bpi; - bpi.address_host_reg = HostReg_Invalid; - 
bpi.value_host_reg = result.host_reg; - bpi.guest_pc = m_current_instruction->pc; - bpi.fault_count = 0; - - Value fastmem_base = GetFastmemLoadBase(); - HostReg address_reg; if (address.IsConstant()) { @@ -1258,25 +1406,25 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, } m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_SHIFT); - m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_MASK); m_emit->ldr(GetHostReg32(RARG1), - a32::MemOperand(GetHostReg32(fastmem_base), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load + a32::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load m_register_cache.InhibitAllocation(); - bpi.host_pc = GetCurrentNearCodePointer(); + + void* host_pc = GetCurrentNearCodePointer(); switch (size) { case RegSize_8: - m_emit->ldrb(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2))); + m_emit->ldrb(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg))); break; case RegSize_16: - m_emit->ldrh(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2))); + m_emit->ldrh(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg))); break; case RegSize_32: - m_emit->ldr(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2))); + m_emit->ldr(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg))); break; default: @@ -1284,13 +1432,11 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, break; } - bpi.host_code_size = static_cast( - static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); - - const bool old_store_fastmem_base = m_fastmem_store_base_in_register; + const u32 host_code_size = + static_cast(static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(host_pc))); // generate slowmem fallback - bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + const void* host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); // we add the ticks *after* the add here, since we counted incorrectly, then correct for it below @@ -1298,27 +1444,22 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); m_delayed_cycles_add += Bus::RAM_READ_TICKS; - EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + EmitLoadGuestMemorySlowmem(instruction, info, address, size, result, true); EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); - // restore fastmem base state for the next instruction - if (old_store_fastmem_base) - fastmem_base = GetFastmemStoreBase(); - fastmem_base = GetFastmemLoadBase(); - // return to the block code EmitBranch(GetCurrentNearCodePointer(), false); SwitchToNearCode(); m_register_cache.UninhibitAllocation(); - m_block->loadstore_backpatch_info.push_back(bpi); + CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc); } -void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - Value& result, bool in_far_code) +void CodeGenerator::EmitLoadGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, 
Value& result, bool in_far_code) { if (g_settings.cpu_recompiler_memory_exceptions) { @@ -1359,7 +1500,7 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2); EmitOr(result.host_reg, result.host_reg, Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException( - static_cast(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n))); + static_cast(0), info.is_branch_delay_slot, false, instruction.cop.cop_n))); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); @@ -1392,16 +1533,9 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, } } -void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - const Value& value) +void CodeGenerator::EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, const Value& value) { - LoadStoreBackpatchInfo bpi; - bpi.address_host_reg = HostReg_Invalid; - bpi.value_host_reg = value.host_reg; - bpi.guest_pc = m_current_instruction->pc; - bpi.fault_count = 0; - - Value fastmem_base = GetFastmemStoreBase(); Value actual_value = GetValueInHostRegister(value); HostReg address_reg; @@ -1418,25 +1552,27 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, // TODO: if this gets backpatched, these instructions are wasted m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_SHIFT); - m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_MASK); m_emit->ldr(GetHostReg32(RARG1), - a32::MemOperand(GetHostReg32(fastmem_base), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load + a32::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load m_register_cache.InhibitAllocation(); - bpi.host_pc = GetCurrentNearCodePointer(); + + void* host_pc = GetCurrentNearCodePointer(); switch (size) { case RegSize_8: - m_emit->strb(GetHostReg32(actual_value.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2))); + m_emit->strb(GetHostReg32(actual_value.host_reg), + a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg))); break; case RegSize_16: - m_emit->strh(GetHostReg32(actual_value.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2))); + m_emit->strh(GetHostReg32(actual_value.host_reg), + a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg))); break; case RegSize_32: - m_emit->str(GetHostReg32(actual_value.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2))); + m_emit->str(GetHostReg32(actual_value.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg))); break; default: @@ -1444,39 +1580,33 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, break; } - bpi.host_code_size = static_cast( - static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); - - const bool old_load_fastmem_base = m_fastmem_load_base_in_register; + const u32 host_code_size = + static_cast(static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(host_pc))); // generate slowmem fallback - bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + void* host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); DebugAssert(m_delayed_cycles_add > 0); EmitAddCPUStructField(offsetof(State, pending_ticks), 
Value::FromConstantU32(static_cast(m_delayed_cycles_add))); - EmitStoreGuestMemorySlowmem(cbi, address, size, actual_value, true); + EmitStoreGuestMemorySlowmem(instruction, info, address, size, actual_value, true); EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); - // restore fastmem base state for the next instruction - if (old_load_fastmem_base) - fastmem_base = GetFastmemLoadBase(); - fastmem_base = GetFastmemStoreBase(); - // return to the block code EmitBranch(GetCurrentNearCodePointer(), false); SwitchToNearCode(); m_register_cache.UninhibitAllocation(); - m_block->loadstore_backpatch_info.push_back(bpi); + CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc); } -void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - const Value& value, bool in_far_code) +void CodeGenerator::EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, const Value& value, + bool in_far_code) { Value value_in_hr = GetValueInHostRegister(value); @@ -1520,7 +1650,7 @@ void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2); EmitOr(result.host_reg, result.host_reg, Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException( - static_cast(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n))); + static_cast(0), info.is_branch_delay_slot, false, instruction.cop.cop_n))); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); if (!in_far_code) @@ -1552,18 +1682,18 @@ void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, } } -bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) +void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::LoadstoreBackpatchInfo& lbi) { - Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem at %p", lbi.host_pc, lbi.guest_pc, lbi.host_slowmem_pc); + Log_DevFmt("Backpatching {} (guest PC 0x{:08X}) to slowmem at {}", host_pc, lbi.guest_pc, lbi.thunk_address); // turn it into a jump to the slowmem handler - vixl::aarch32::MacroAssembler emit(static_cast(lbi.host_pc), lbi.host_code_size, a32::A32); + vixl::aarch32::MacroAssembler emit(static_cast(host_pc), lbi.code_size, a32::A32); // check jump distance - const s32 displacement = GetPCDisplacement(lbi.host_pc, lbi.host_slowmem_pc); - if (!IsPCDisplacementInImmediateRange(displacement)) + const s32 displacement = armGetPCDisplacement(host_pc, lbi.thunk_address); + if (!armIsPCDisplacementInImmediateRange(displacement)) { - emit.Mov(GetHostReg32(RSCRATCH), reinterpret_cast(lbi.host_slowmem_pc)); + armMoveAddressToReg(&emit, GetHostReg32(RSCRATCH), lbi.thunk_address); emit.bx(GetHostReg32(RSCRATCH)); } else @@ -1572,56 +1702,12 @@ bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) emit.b(&label); } - const s32 nops = (static_cast(lbi.host_code_size) - static_cast(emit.GetCursorOffset())) / 4; + const s32 nops = (static_cast(lbi.code_size) - static_cast(emit.GetCursorOffset())) / 4; Assert(nops >= 0); for (s32 i = 0; i < nops; i++) emit.nop(); - JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size); - return true; -} - -void CodeGenerator::BackpatchReturn(void* pc, u32 pc_size) -{ - Log_ProfilePrintf("Backpatching %p to return", pc); - - 
vixl::aarch32::MacroAssembler emit(static_cast(pc), pc_size, a32::A32); - emit.bx(a32::lr); - - const s32 nops = (static_cast(pc_size) - static_cast(emit.GetCursorOffset())) / 4; - Assert(nops >= 0); - for (s32 i = 0; i < nops; i++) - emit.nop(); - - JitCodeBuffer::FlushInstructionCache(pc, pc_size); -} - -void CodeGenerator::BackpatchBranch(void* pc, u32 pc_size, void* target) -{ - Log_ProfilePrintf("Backpatching %p to %p [branch]", pc, target); - - vixl::aarch32::MacroAssembler emit(static_cast(pc), pc_size, a32::A32); - - // check jump distance - const s32 displacement = GetPCDisplacement(pc, target); - if (!IsPCDisplacementInImmediateRange(displacement)) - { - emit.Mov(GetHostReg32(RSCRATCH), reinterpret_cast(target)); - emit.bx(GetHostReg32(RSCRATCH)); - } - else - { - a32::Label label(displacement + emit.GetCursorOffset()); - emit.b(&label); - } - - // shouldn't have any nops - const s32 nops = (static_cast(pc_size) - static_cast(emit.GetCursorOffset())) / 4; - Assert(nops >= 0); - for (s32 i = 0; i < nops; i++) - emit.nop(); - - JitCodeBuffer::FlushInstructionCache(pc, pc_size); + JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size); } void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) @@ -1751,7 +1837,8 @@ void CodeGenerator::EmitICacheCheckAndUpdate() { if (GetSegmentForAddress(m_pc) >= Segment::KSEG1) { - EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_block->uncached_fetch_ticks))); + EmitAddCPUStructField(offsetof(State, pending_ticks), + Value::FromConstantU32(static_cast(m_block->uncached_fetch_ticks))); } else { @@ -1789,6 +1876,82 @@ void CodeGenerator::EmitICacheCheckAndUpdate() } } +void CodeGenerator::EmitBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) +{ + // store it first to reduce code size, because we can offset + armMoveAddressToReg(m_emit, GetHostReg32(RARG1), ram_ptr); + armMoveAddressToReg(m_emit, GetHostReg32(RARG2), shadow_ptr); + + u32 offset = 0; + a32::Label block_changed; + +#if 0 + /* TODO: Vectorize +#include +#include + +bool foo(const void* a, const void* b) +{ + uint8x16_t v1 = vld1q_u8((const uint8_t*)a); + uint8x16_t v2 = vld1q_u8((const uint8_t*)b); + uint8x16_t v3 = vld1q_u8((const uint8_t*)a + 16); + uint8x16_t v4 = vld1q_u8((const uint8_t*)a + 16); + uint8x16_t r = vceqq_u8(v1, v2); + uint8x16_t r2 = vceqq_u8(v2, v3); + uint8x16_t r3 = vandq_u8(r, r2); + uint32x2_t rr = vpmin_u32(vget_low_u32(vreinterpretq_u32_u8(r3)), vget_high_u32(vreinterpretq_u32_u8(r3))); + if ((vget_lane_u32(rr, 0) & vget_lane_u32(rr, 1)) != 0xFFFFFFFFu) + return false; + else + return true; +} +*/ + bool first = true; + + while (size >= 16) + { + const a32::VRegister vtmp = a32::v2.V4S(); + const a32::VRegister dst = first ? 
a32::v0.V4S() : a32::v1.V4S(); + m_emit->ldr(dst, a32::MemOperand(RXARG1, offset)); + m_emit->ldr(vtmp, a32::MemOperand(RXARG2, offset)); + m_emit->cmeq(dst, dst, vtmp); + if (!first) + m_emit->and_(dst.V16B(), dst.V16B(), vtmp.V16B()); + else + first = false; + + offset += 16; + size -= 16; + } + + if (!first) + { + // TODO: make sure this doesn't choke on ffffffff + m_emit->uminv(a32::s0, a32::v0.V4S()); + m_emit->fcmp(a32::s0, 0.0); + m_emit->b(&block_changed, a32::eq); + } +#endif + + while (size >= 4) + { + m_emit->ldr(GetHostReg32(RARG3), a32::MemOperand(GetHostReg32(RARG1), offset)); + m_emit->ldr(GetHostReg32(RARG4), a32::MemOperand(GetHostReg32(RARG2), offset)); + m_emit->cmp(GetHostReg32(RARG3), GetHostReg32(RARG4)); + m_emit->b(a32::ne, &block_changed); + offset += 4; + size -= 4; + } + + DebugAssert(size == 0); + + a32::Label block_unchanged; + m_emit->b(&block_unchanged); + m_emit->bind(&block_changed); + armEmitJmp(m_emit, CodeCache::g_discard_and_recompile_block, false); + m_emit->bind(&block_unchanged); +} + void CodeGenerator::EmitStallUntilGTEComplete() { static_assert(offsetof(State, pending_ticks) + sizeof(u32) == offsetof(State, gte_completion_tick)); @@ -1809,8 +1972,8 @@ void CodeGenerator::EmitStallUntilGTEComplete() void CodeGenerator::EmitBranch(const void* address, bool allow_scratch) { - const s32 displacement = GetPCDisplacement(GetCurrentCodePointer(), address); - if (IsPCDisplacementInImmediateRange(displacement)) + const s32 displacement = armGetPCDisplacement(GetCurrentCodePointer(), address); + if (armIsPCDisplacementInImmediateRange(displacement)) { a32::Label label(displacement + m_emit->GetCursorOffset()); m_emit->b(&label); @@ -2057,81 +2220,4 @@ void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) m_emit->Mov(GetHostReg32(host_reg), reinterpret_cast(ptr)); } -CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher() -{ - m_emit->sub(a32::sp, a32::sp, FUNCTION_STACK_SIZE); - m_register_cache.ReserveCalleeSavedRegisters(); - const u32 stack_adjust = PrepareStackForCall(); - - EmitLoadGlobalAddress(RCPUPTR, &g_state); - - a32::Label event_test; - m_emit->b(&event_test); - - // main dispatch loop - a32::Label main_loop; - m_emit->Bind(&main_loop); - - // time to lookup the block - // r0 <- pc - m_emit->ldr(a32::r0, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, pc))); - - // r1 <- s_fast_map[pc >> 16] - EmitLoadGlobalAddress(2, CodeCache::GetFastMapPointer()); - m_emit->lsr(a32::r1, a32::r0, 16); - m_emit->ldr(a32::r1, a32::MemOperand(a32::r2, a32::r1, a32::LSL, 2)); - - // blr(r1[pc]) (fast_map[pc >> 2]) - m_emit->ldr(a32::r0, a32::MemOperand(a32::r1, a32::r0)); - m_emit->blx(a32::r0); - - // r0 <- pending_ticks - // r1 <- downcount - m_emit->ldr(a32::r0, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, pending_ticks))); - m_emit->ldr(a32::r1, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, downcount))); - - // while downcount < pending_ticks - a32::Label downcount_hit; - m_emit->cmp(a32::r0, a32::r1); - m_emit->b(a32::lt, &main_loop); - - // end while - m_emit->Bind(&event_test); - EmitCall(reinterpret_cast(&TimingEvents::RunEvents)); - m_emit->b(&main_loop); - - RestoreStackAfterCall(stack_adjust); - m_register_cache.PopCalleeSavedRegisters(true); - m_emit->add(a32::sp, a32::sp, FUNCTION_STACK_SIZE); - m_emit->bx(a32::lr); - - CodeBlock::HostCodePointer ptr; - u32 code_size; - FinalizeBlock(&ptr, &code_size); - Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr); - return 
reinterpret_cast(ptr); -} - -CodeCache::SingleBlockDispatcherFunction CodeGenerator::CompileSingleBlockDispatcher() -{ - m_emit->sub(a32::sp, a32::sp, FUNCTION_STACK_SIZE); - m_register_cache.ReserveCalleeSavedRegisters(); - const u32 stack_adjust = PrepareStackForCall(); - - EmitLoadGlobalAddress(RCPUPTR, &g_state); - - m_emit->blx(GetHostReg32(RARG1)); - - RestoreStackAfterCall(stack_adjust); - m_register_cache.PopCalleeSavedRegisters(true); - m_emit->add(a32::sp, a32::sp, FUNCTION_STACK_SIZE); - m_emit->bx(a32::lr); - - CodeBlock::HostCodePointer ptr; - u32 code_size; - FinalizeBlock(&ptr, &code_size); - Log_DevPrintf("Single block dispatcher is %u bytes at %p", code_size, ptr); - return reinterpret_cast(ptr); -} - } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 76d62d09b..57853376f 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -1,9 +1,10 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "common/align.h" #include "common/assert.h" #include "common/log.h" +#include "cpu_code_cache_private.h" #include "cpu_core.h" #include "cpu_core_private.h" #include "cpu_recompiler_code_generator.h" @@ -12,8 +13,399 @@ #include "timing_event.h" Log_SetChannel(CPU::Recompiler); +#ifdef ENABLE_HOST_DISASSEMBLY +#include "vixl/aarch64/disasm-aarch64.h" +#endif + namespace a64 = vixl::aarch64; +namespace CPU::Recompiler { +constexpr u64 FUNCTION_CALLEE_SAVED_SPACE_RESERVE = 80; // 8 registers +constexpr u64 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers -> 224 bytes +constexpr u64 FUNCTION_STACK_SIZE = FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE; + +static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024; +static std::unordered_map s_trampoline_targets; +static u8* s_trampoline_start_ptr = nullptr; +static u32 s_trampoline_used = 0; +} // namespace CPU::Recompiler + +bool CPU::Recompiler::armIsCallerSavedRegister(u32 id) +{ + // same on both linux and windows + return (id <= 18); +} + +void CPU::Recompiler::armEmitMov(a64::Assembler* armAsm, const a64::Register& rd, u64 imm) +{ + DebugAssert(vixl::IsUint32(imm) || vixl::IsInt32(imm) || rd.Is64Bits()); + DebugAssert(rd.GetCode() != a64::sp.GetCode()); + + if (imm == 0) + { + armAsm->mov(rd, a64::Assembler::AppropriateZeroRegFor(rd)); + return; + } + + // The worst case for size is mov 64-bit immediate to sp: + // * up to 4 instructions to materialise the constant + // * 1 instruction to move to sp + + // Immediates on Aarch64 can be produced using an initial value, and zero to + // three move keep operations. + // + // Initial values can be generated with: + // 1. 64-bit move zero (movz). + // 2. 32-bit move inverted (movn). + // 3. 64-bit move inverted. + // 4. 32-bit orr immediate. + // 5. 64-bit orr immediate. + // Move-keep may then be used to modify each of the 16-bit half words. + // + // The code below supports all five initial value generators, and + // applying move-keep operations to move-zero and move-inverted initial + // values. + + // Try to move the immediate in one instruction, and if that fails, switch to + // using multiple instructions. 
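  // Worked examples of the strategy described above (illustrative only):
  //   0x0000'00AB'0000'00CD has two non-zero halfwords, so the generic path below emits a
  //   move-zero for the first and a move-keep for the second:
  //     movz xd, #0x00CD              // halfword 0
  //     movk xd, #0x00AB, lsl #32     // halfword 2
  //   0xFFFF'FFFF'FFFF'1234 is mostly ones, so the move-inverted fast path below covers it:
  //     movn xd, #0xEDCB              // ~0xEDCB == 0xFFFF'FFFF'FFFF'1234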
+ const unsigned reg_size = rd.GetSizeInBits(); + + if (a64::Assembler::IsImmMovz(imm, reg_size) && !rd.IsSP()) + { + // Immediate can be represented in a move zero instruction. Movz can't write + // to the stack pointer. + armAsm->movz(rd, imm); + return; + } + else if (a64::Assembler::IsImmMovn(imm, reg_size) && !rd.IsSP()) + { + // Immediate can be represented in a move negative instruction. Movn can't + // write to the stack pointer. + armAsm->movn(rd, rd.Is64Bits() ? ~imm : (~imm & a64::kWRegMask)); + return; + } + else if (a64::Assembler::IsImmLogical(imm, reg_size)) + { + // Immediate can be represented in a logical orr instruction. + DebugAssert(!rd.IsZero()); + armAsm->orr(rd, a64::Assembler::AppropriateZeroRegFor(rd), imm); + return; + } + + // Generic immediate case. Imm will be represented by + // [imm3, imm2, imm1, imm0], where each imm is 16 bits. + // A move-zero or move-inverted is generated for the first non-zero or + // non-0xffff immX, and a move-keep for subsequent non-zero immX. + + uint64_t ignored_halfword = 0; + bool invert_move = false; + // If the number of 0xffff halfwords is greater than the number of 0x0000 + // halfwords, it's more efficient to use move-inverted. + if (vixl::CountClearHalfWords(~imm, reg_size) > vixl::CountClearHalfWords(imm, reg_size)) + { + ignored_halfword = 0xffff; + invert_move = true; + } + + // Iterate through the halfwords. Use movn/movz for the first non-ignored + // halfword, and movk for subsequent halfwords. + DebugAssert((reg_size % 16) == 0); + bool first_mov_done = false; + for (unsigned i = 0; i < (reg_size / 16); i++) + { + uint64_t imm16 = (imm >> (16 * i)) & 0xffff; + if (imm16 != ignored_halfword) + { + if (!first_mov_done) + { + if (invert_move) + armAsm->movn(rd, ~imm16 & 0xffff, 16 * i); + else + armAsm->movz(rd, imm16, 16 * i); + first_mov_done = true; + } + else + { + // Construct a wider constant. 
+ armAsm->movk(rd, imm16, 16 * i); + } + } + } + + DebugAssert(first_mov_done); +} + +s64 CPU::Recompiler::armGetPCDisplacement(const void* current, const void* target) +{ + // pxAssert(Common::IsAlignedPow2(reinterpret_cast(current), 4)); + // pxAssert(Common::IsAlignedPow2(reinterpret_cast(target), 4)); + return static_cast((reinterpret_cast(target) - reinterpret_cast(current)) >> 2); +} + +void CPU::Recompiler::armMoveAddressToReg(a64::Assembler* armAsm, const a64::XRegister& reg, const void* addr) +{ + const void* cur = armAsm->GetCursorAddress(); + const void* current_code_ptr_page = + reinterpret_cast(reinterpret_cast(cur) & ~static_cast(0xFFF)); + const void* ptr_page = + reinterpret_cast(reinterpret_cast(addr) & ~static_cast(0xFFF)); + const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10; + const u32 page_offset = static_cast(reinterpret_cast(addr) & 0xFFFu); + if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmAddSub(page_offset)) + { + armAsm->adrp(reg, page_displacement); + armAsm->add(reg, reg, page_offset); + } + else if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64)) + { + armAsm->adrp(reg, page_displacement); + armAsm->orr(reg, reg, page_offset); + } + else + { + armEmitMov(armAsm, reg, reinterpret_cast(addr)); + } +} +void CPU::Recompiler::armEmitJmp(a64::Assembler* armAsm, const void* ptr, bool force_inline) +{ + const void* cur = armAsm->GetCursorAddress(); + s64 displacement = armGetPCDisplacement(cur, ptr); + bool use_blr = !vixl::IsInt26(displacement); + if (use_blr && !force_inline) + { + if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline) + { + displacement = armGetPCDisplacement(cur, trampoline); + use_blr = !vixl::IsInt26(displacement); + } + } + + if (use_blr) + { + armMoveAddressToReg(armAsm, RXSCRATCH, ptr); + armAsm->br(RXSCRATCH); + } + else + { + armAsm->b(displacement); + } +} + +void CPU::Recompiler::armEmitCall(a64::Assembler* armAsm, const void* ptr, bool force_inline) +{ + const void* cur = armAsm->GetCursorAddress(); + s64 displacement = armGetPCDisplacement(cur, ptr); + bool use_blr = !vixl::IsInt26(displacement); + if (use_blr && !force_inline) + { + if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline) + { + displacement = armGetPCDisplacement(cur, trampoline); + use_blr = !vixl::IsInt26(displacement); + } + } + + if (use_blr) + { + armMoveAddressToReg(armAsm, RXSCRATCH, ptr); + armAsm->blr(RXSCRATCH); + } + else + { + armAsm->bl(displacement); + } +} + +void CPU::Recompiler::armEmitCondBranch(a64::Assembler* armAsm, a64::Condition cond, const void* ptr) +{ + const s64 jump_distance = static_cast(reinterpret_cast(ptr) - + reinterpret_cast(armAsm->GetCursorAddress())); + // pxAssert(Common::IsAligned(jump_distance, 4)); + + if (a64::Instruction::IsValidImmPCOffset(a64::CondBranchType, jump_distance >> 2)) + { + armAsm->b(jump_distance >> 2, cond); + } + else + { + a64::Label branch_not_taken; + armAsm->b(&branch_not_taken, InvertCondition(cond)); + + const s64 new_jump_distance = static_cast(reinterpret_cast(ptr) - + reinterpret_cast(armAsm->GetCursorAddress())); + armAsm->b(new_jump_distance >> 2); + armAsm->bind(&branch_not_taken); + } +} + +u8* CPU::Recompiler::armGetJumpTrampoline(const void* target) +{ + auto it = s_trampoline_targets.find(target); + if (it != s_trampoline_targets.end()) + return s_trampoline_start_ptr + it->second; + + // align to 16 bytes? 
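  // (A trampoline is just "materialise the target into the scratch register, then br" -- at
  // most four moves plus the branch, hence the 20-byte check below -- so a call whose
  // displacement no longer fits the +/-128 MiB B/BL range costs one short bl to the shared
  // trampoline instead of materialising the full address inline at every call site.)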
+ const u32 offset = s_trampoline_used; // Common::AlignUpPow2(s_trampoline_used, 16); + + // 4 movs plus a jump + if (TRAMPOLINE_AREA_SIZE - offset < 20) + { + Panic("Ran out of space in constant pool"); + return nullptr; + } + + u8* start = s_trampoline_start_ptr + offset; + a64::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset); + armMoveAddressToReg(&armAsm, RXSCRATCH, target); + armAsm.br(RXSCRATCH); + + const u32 size = static_cast(armAsm.GetSizeOfCodeGenerated()); + DebugAssert(size < 20); + s_trampoline_targets.emplace(target, offset); + s_trampoline_used = offset + static_cast(size); + + JitCodeBuffer::FlushInstructionCache(start, size); + return start; +} + +void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size) +{ +#ifdef ENABLE_HOST_DISASSEMBLY + class MyDisassembler : public a64::Disassembler + { + protected: + void ProcessOutput(const a64::Instruction* instr) override + { + Log_DebugPrintf("0x%016" PRIx64 " %08" PRIx32 "\t\t%s", reinterpret_cast(instr), + instr->GetInstructionBits(), GetOutput()); + } + }; + + a64::Decoder decoder; + MyDisassembler disas; + decoder.AppendVisitor(&disas); + decoder.Decode(static_cast(start), + reinterpret_cast(static_cast(start) + size)); +#else + Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY."); +#endif +} + +u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size) +{ + return size / a64::kInstructionSize; +} + +u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache) +{ + using namespace a64; + using namespace CPU::Recompiler; + + const s64 disp = armGetPCDisplacement(code, dst); + DebugAssert(vixl::IsInt26(disp)); + + const u32 new_code = B | Assembler::ImmUncondBranch(disp); + std::memcpy(code, &new_code, sizeof(new_code)); + if (flush_icache) + JitCodeBuffer::FlushInstructionCache(code, kInstructionSize); + + return kInstructionSize; +} + +u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size) +{ + using namespace vixl::aarch64; + using namespace CPU::Recompiler; + +#define PTR(x) a64::MemOperand(RSTATE, (s64)(((u8*)(x)) - ((u8*)&g_state))) + + Assembler actual_asm(static_cast(code), code_size); + Assembler* armAsm = &actual_asm; + +#ifdef VIXL_DEBUG + vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace); +#endif + + Label dispatch; + + g_enter_recompiler = armAsm->GetCursorAddress(); + { + // reserve some space for saving caller-saved registers + armAsm->sub(sp, sp, CPU::Recompiler::FUNCTION_STACK_SIZE); + + // Need the CPU state for basically everything :-) + armMoveAddressToReg(armAsm, RSTATE, &g_state); + + // Fastmem setup, oldrec doesn't need it + if (IsUsingFastmem() && g_settings.cpu_execution_mode != CPUExecutionMode::Recompiler) + armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base)); + + // Fall through to event dispatcher + } + + // check events then for frame done + g_check_events_and_dispatch = armAsm->GetCursorAddress(); + { + Label skip_event_check; + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + armAsm->ldr(RWARG2, PTR(&g_state.downcount)); + armAsm->cmp(RWARG1, RWARG2); + armAsm->b(&skip_event_check, lt); + + g_run_events_and_dispatch = armAsm->GetCursorAddress(); + armEmitCall(armAsm, reinterpret_cast(&TimingEvents::RunEvents), true); + + armAsm->bind(&skip_event_check); + } + + // TODO: align? 
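  // The dispatcher below boils down to two dependent loads and an indirect call, roughly
  // (assuming the per-page pointers stored in g_code_lut are biased so the full pc >> 2
  // index needs no masking):
  //   block = g_code_lut[g_state.pc >> 16][g_state.pc >> 2];
  //   block();   // blr; compiled blocks jump back into the dispatcher when they finish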
+ g_dispatcher = armAsm->GetCursorAddress(); + { + armAsm->bind(&dispatch); + + // x9 <- s_fast_map[pc >> 16] + armAsm->ldr(RWARG1, PTR(&g_state.pc)); + armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data()); + armAsm->lsr(RWARG2, RWARG1, 16); + armAsm->lsr(RWARG1, RWARG1, 2); + armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3)); + + // blr(x9[pc * 2]) (fast_map[pc >> 2]) + armAsm->ldr(RXARG1, MemOperand(RXARG2, RXARG1, LSL, 3)); + armAsm->blr(RXARG1); + } + + g_compile_or_revalidate_block = armAsm->GetCursorAddress(); + { + armAsm->ldr(RWARG1, PTR(&g_state.pc)); + armEmitCall(armAsm, reinterpret_cast(&CompileOrRevalidateBlock), true); + armAsm->b(&dispatch); + } + + g_discard_and_recompile_block = armAsm->GetCursorAddress(); + { + armAsm->ldr(RWARG1, PTR(&g_state.pc)); + armEmitCall(armAsm, reinterpret_cast(&DiscardAndRecompileBlock), true); + armAsm->b(&dispatch); + } + + g_interpret_block = armAsm->GetCursorAddress(); + { + armEmitCall(armAsm, reinterpret_cast(GetInterpretUncachedBlockFunction()), true); + armAsm->b(&dispatch); + } + + armAsm->FinalizeCode(); + + // TODO: align? + s_trampoline_targets.clear(); + s_trampoline_start_ptr = static_cast(code) + armAsm->GetCursorOffset(); + s_trampoline_used = 0; + +#undef PTR + return static_cast(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE; +} + namespace CPU::Recompiler { constexpr HostReg RCPUPTR = 19; @@ -24,18 +416,6 @@ constexpr HostReg RARG2 = 1; constexpr HostReg RARG3 = 2; constexpr HostReg RARG4 = 3; constexpr HostReg RSCRATCH = 8; -constexpr u64 FUNCTION_CALL_SHADOW_SPACE = 32; -constexpr u64 FUNCTION_CALLEE_SAVED_SPACE_RESERVE = 80; // 8 registers -constexpr u64 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers -> 224 bytes -constexpr u64 FUNCTION_STACK_SIZE = - FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE + FUNCTION_CALL_SHADOW_SPACE; - -static s64 GetPCDisplacement(const void* current, const void* target) -{ - Assert(Common::IsAlignedPow2(reinterpret_cast(current), 4)); - Assert(Common::IsAlignedPow2(reinterpret_cast(target), 4)); - return static_cast((reinterpret_cast(target) - reinterpret_cast(current)) >> 2); -} static const a64::WRegister GetHostReg8(HostReg reg) { @@ -158,6 +538,11 @@ void CodeGenerator::SwitchToNearCode() m_emit = &m_near_emitter; } +void* CodeGenerator::GetStartNearCodePointer() const +{ + return static_cast(m_code_buffer->GetFreeCodePointer()); +} + void* CodeGenerator::GetCurrentNearCodePointer() const { return static_cast(m_code_buffer->GetFreeCodePointer()) + m_near_emitter.GetCursorOffset(); @@ -196,8 +581,6 @@ Value CodeGenerator::GetValueInHostOrScratchRegister(const Value& value, bool al void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) { - m_emit->Sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE); - if (allocate_registers) { // Save the link register, since we'll be calling functions. @@ -213,7 +596,7 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) UNREFERENCED_VARIABLE(cpu_reg_allocated); // If there's loadstore instructions, preload the fastmem base. 
- if (m_block->contains_loadstore_instructions) + if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) { const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); Assert(fastmem_reg_allocated); @@ -222,11 +605,11 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) } } -void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_return /* = true */) +void CodeGenerator::EmitEndBlock(bool free_registers, const void* jump_to) { if (free_registers) { - if (m_block->contains_loadstore_instructions) + if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) m_register_cache.FreeHostReg(RMEMBASEPTR); m_register_cache.FreeHostReg(RCPUPTR); @@ -235,10 +618,8 @@ void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_ret m_register_cache.PopCalleeSavedRegisters(true); } - m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); - - if (emit_return) - m_emit->Ret(); + if (jump_to) + armEmitJmp(m_emit, jump_to, true); } void CodeGenerator::EmitExceptionExit() @@ -252,8 +633,7 @@ void CodeGenerator::EmitExceptionExit() m_register_cache.PopCalleeSavedRegisters(false); - m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); - m_emit->Ret(); + armEmitJmp(m_emit, CodeCache::g_check_events_and_dispatch, true); } void CodeGenerator::EmitExceptionExitOnBool(const Value& value) @@ -275,19 +655,22 @@ void CodeGenerator::EmitExceptionExitOnBool(const Value& value) m_register_cache.PopState(); } -void CodeGenerator::FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size) +const void* CodeGenerator::FinalizeBlock(u32* out_host_code_size, u32* out_host_far_code_size) { m_near_emitter.FinalizeCode(); m_far_emitter.FinalizeCode(); - *out_host_code = reinterpret_cast(m_code_buffer->GetFreeCodePointer()); + const void* code = m_code_buffer->GetFreeCodePointer(); *out_host_code_size = static_cast(m_near_emitter.GetSizeOfCodeGenerated()); + *out_host_far_code_size = static_cast(m_far_emitter.GetSizeOfCodeGenerated()); m_code_buffer->CommitCode(static_cast(m_near_emitter.GetSizeOfCodeGenerated())); m_code_buffer->CommitFarCode(static_cast(m_far_emitter.GetSizeOfCodeGenerated())); m_near_emitter.Reset(); m_far_emitter.Reset(); + + return code; } void CodeGenerator::EmitSignExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size) @@ -1028,7 +1411,7 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size) void CodeGenerator::EmitCall(const void* ptr) { - const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = armGetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1173,25 +1556,25 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co void CodeGenerator::EmitPushHostReg(HostReg reg, u32 position) { - const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 8)); + const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - (position * 8)); m_emit->str(GetHostReg64(reg), addr); } void CodeGenerator::EmitPushHostRegPair(HostReg reg, HostReg reg2, u32 position) { - const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - ((position + 1) * 8)); + const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - ((position + 1) * 8)); m_emit->stp(GetHostReg64(reg2), GetHostReg64(reg), addr); } void CodeGenerator::EmitPopHostReg(HostReg reg, u32 position) { - const 
a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 8)); + const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - (position * 8)); m_emit->ldr(GetHostReg64(reg), addr); } void CodeGenerator::EmitPopHostRegPair(HostReg reg, HostReg reg2, u32 position) { - const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 8)); + const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - (position * 8)); m_emit->ldp(GetHostReg64(reg2), GetHostReg64(reg), addr); } @@ -1399,15 +1782,11 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size, } } -void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - Value& result) +void CodeGenerator::EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, Value& result) { // fastmem - LoadStoreBackpatchInfo bpi; - bpi.address_host_reg = HostReg_Invalid; - bpi.value_host_reg = result.host_reg; - bpi.guest_pc = m_current_instruction->pc; - bpi.fault_count = 0; + void* host_pc = GetCurrentNearCodePointer(); HostReg address_reg; if (address.IsConstant()) @@ -1424,7 +1803,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, if (g_settings.cpu_fastmem_mode == CPUFastmemMode::MMap) { - bpi.host_pc = GetCurrentNearCodePointer(); + host_pc = GetCurrentNearCodePointer(); switch (size) { @@ -1451,7 +1830,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_MASK); m_emit->ldr(GetHostReg64(RARG1), a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a64::LSL, 3)); - bpi.host_pc = GetCurrentNearCodePointer(); + host_pc = GetCurrentNearCodePointer(); switch (size) { @@ -1473,11 +1852,11 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, } } - bpi.host_code_size = static_cast( - static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + const u32 host_code_size = + static_cast(static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(host_pc))); // generate slowmem fallback - bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + const void* host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); // we add the ticks *after* the add here, since we counted incorrectly, then correct for it below @@ -1485,7 +1864,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); m_delayed_cycles_add += Bus::RAM_READ_TICKS; - EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + EmitLoadGuestMemorySlowmem(instruction, info, address, size, result, true); EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); @@ -1496,11 +1875,11 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, SwitchToNearCode(); m_register_cache.UninhibitAllocation(); - m_block->loadstore_backpatch_info.push_back(bpi); + CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc); } -void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - Value& result, bool in_far_code) +void CodeGenerator::EmitLoadGuestMemorySlowmem(Instruction 
instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, Value& result, bool in_far_code) { if (g_settings.cpu_recompiler_memory_exceptions) { @@ -1540,7 +1919,7 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2); EmitOr(result.host_reg, result.host_reg, Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException( - static_cast(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n))); + static_cast(0), info.is_branch_delay_slot, false, instruction.cop.cop_n))); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); @@ -1573,17 +1952,13 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, } } -void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - const Value& value) +void CodeGenerator::EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, const Value& value) { Value value_in_hr = GetValueInHostRegister(value); // fastmem - LoadStoreBackpatchInfo bpi; - bpi.address_host_reg = HostReg_Invalid; - bpi.value_host_reg = value.host_reg; - bpi.guest_pc = m_current_instruction->pc; - bpi.fault_count = 0; + void* host_pc = GetCurrentNearCodePointer(); HostReg address_reg; if (address.IsConstant()) @@ -1599,7 +1974,7 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, m_register_cache.InhibitAllocation(); if (g_settings.cpu_fastmem_mode == CPUFastmemMode::MMap) { - bpi.host_pc = GetCurrentNearCodePointer(); + host_pc = GetCurrentNearCodePointer(); switch (size) { @@ -1627,7 +2002,7 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, m_emit->add(GetHostReg64(RARG3), GetFastmemBasePtrReg(), Bus::FASTMEM_LUT_NUM_PAGES * sizeof(u32*)); m_emit->ldr(GetHostReg64(RARG1), a64::MemOperand(GetHostReg64(RARG3), GetHostReg32(RARG1), a64::LSL, 3)); - bpi.host_pc = GetCurrentNearCodePointer(); + host_pc = GetCurrentNearCodePointer(); switch (size) { @@ -1649,17 +2024,17 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, } } - bpi.host_code_size = static_cast( - static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + const u32 host_code_size = + static_cast(static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(host_pc))); // generate slowmem fallback - bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + void* host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); DebugAssert(m_delayed_cycles_add > 0); EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); - EmitStoreGuestMemorySlowmem(cbi, address, size, value_in_hr, true); + EmitStoreGuestMemorySlowmem(instruction, info, address, size, value_in_hr, true); EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); @@ -1670,11 +2045,12 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, SwitchToNearCode(); m_register_cache.UninhibitAllocation(); - m_block->loadstore_backpatch_info.push_back(bpi); + CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc); } -void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, 
RegSize size, - const Value& value, bool in_far_code) +void CodeGenerator::EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, const Value& value, + bool in_far_code) { Value value_in_hr = GetValueInHostRegister(value); @@ -1717,7 +2093,7 @@ void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2); EmitOr(result.host_reg, result.host_reg, Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException( - static_cast(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n))); + static_cast(0), info.is_branch_delay_slot, false, instruction.cop.cop_n))); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); if (!in_far_code) @@ -1754,64 +2130,26 @@ void CodeGenerator::EmitUpdateFastmemBase() m_emit->Ldr(GetFastmemBasePtrReg(), a64::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base))); } -bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) +void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::LoadstoreBackpatchInfo& lbi) { - Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem at %p", lbi.host_pc, lbi.guest_pc, lbi.host_slowmem_pc); + Log_DevFmt("Backpatching {} (guest PC 0x{:08X}) to slowmem at {}", host_pc, lbi.guest_pc, lbi.thunk_address); // check jump distance const s64 jump_distance = - static_cast(reinterpret_cast(lbi.host_slowmem_pc) - reinterpret_cast(lbi.host_pc)); + static_cast(reinterpret_cast(lbi.thunk_address) - reinterpret_cast(host_pc)); Assert(Common::IsAligned(jump_distance, 4)); Assert(a64::Instruction::IsValidImmPCOffset(a64::UncondBranchType, jump_distance >> 2)); // turn it into a jump to the slowmem handler - vixl::aarch64::MacroAssembler emit(static_cast(lbi.host_pc), lbi.host_code_size, - a64::PositionDependentCode); + vixl::aarch64::MacroAssembler emit(static_cast(host_pc), lbi.code_size, a64::PositionDependentCode); emit.b(jump_distance >> 2); - const s32 nops = (static_cast(lbi.host_code_size) - static_cast(emit.GetCursorOffset())) / 4; + const s32 nops = (static_cast(lbi.code_size) - static_cast(emit.GetCursorOffset())) / 4; Assert(nops >= 0); for (s32 i = 0; i < nops; i++) emit.nop(); - JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size); - return true; -} - -void CodeGenerator::BackpatchReturn(void* pc, u32 pc_size) -{ - Log_ProfilePrintf("Backpatching %p to return", pc); - - vixl::aarch64::MacroAssembler emit(static_cast(pc), pc_size, a64::PositionDependentCode); - emit.ret(); - - const s32 nops = (static_cast(pc_size) - static_cast(emit.GetCursorOffset())) / 4; - Assert(nops >= 0); - for (s32 i = 0; i < nops; i++) - emit.nop(); - - JitCodeBuffer::FlushInstructionCache(pc, pc_size); -} - -void CodeGenerator::BackpatchBranch(void* pc, u32 pc_size, void* target) -{ - Log_ProfilePrintf("Backpatching %p to %p [branch]", pc, target); - - // check jump distance - const s64 jump_distance = static_cast(reinterpret_cast(target) - reinterpret_cast(pc)); - Assert(Common::IsAligned(jump_distance, 4)); - Assert(a64::Instruction::IsValidImmPCOffset(a64::UncondBranchType, jump_distance >> 2)); - - vixl::aarch64::MacroAssembler emit(static_cast(pc), pc_size, a64::PositionDependentCode); - emit.b(jump_distance >> 2); - - // shouldn't have any nops - const s32 nops = (static_cast(pc_size) - static_cast(emit.GetCursorOffset())) / 4; - Assert(nops >= 0); - for (s32 i = 0; i < nops; 
i++) - emit.nop(); - - JitCodeBuffer::FlushInstructionCache(pc, pc_size); + JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size); } void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) @@ -1980,6 +2318,69 @@ void CodeGenerator::EmitICacheCheckAndUpdate() } } +void CodeGenerator::EmitBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) +{ + // store it first to reduce code size, because we can offset + armMoveAddressToReg(m_emit, RXARG1, ram_ptr); + armMoveAddressToReg(m_emit, RXARG2, shadow_ptr); + + bool first = true; + u32 offset = 0; + a64::Label block_changed; + + while (size >= 16) + { + const a64::VRegister vtmp = a64::v2.V4S(); + const a64::VRegister dst = first ? a64::v0.V4S() : a64::v1.V4S(); + m_emit->ldr(dst, a64::MemOperand(RXARG1, offset)); + m_emit->ldr(vtmp, a64::MemOperand(RXARG2, offset)); + m_emit->cmeq(dst, dst, vtmp); + if (!first) + m_emit->and_(dst.V16B(), dst.V16B(), vtmp.V16B()); + else + first = false; + + offset += 16; + size -= 16; + } + + if (!first) + { + // TODO: make sure this doesn't choke on ffffffff + m_emit->uminv(a64::s0, a64::v0.V4S()); + m_emit->fcmp(a64::s0, 0.0); + m_emit->b(&block_changed, a64::eq); + } + + while (size >= 8) + { + m_emit->ldr(RXARG3, a64::MemOperand(RXARG1, offset)); + m_emit->ldr(RXSCRATCH, a64::MemOperand(RXARG2, offset)); + m_emit->cmp(RXARG3, RXSCRATCH); + m_emit->b(&block_changed, a64::ne); + offset += 8; + size -= 8; + } + + while (size >= 4) + { + m_emit->ldr(RWARG3, a64::MemOperand(RXARG1, offset)); + m_emit->ldr(RWSCRATCH, a64::MemOperand(RXARG2, offset)); + m_emit->cmp(RWARG3, RWSCRATCH); + m_emit->b(&block_changed, a64::ne); + offset += 4; + size -= 4; + } + + DebugAssert(size == 0); + + a64::Label block_unchanged; + m_emit->b(&block_unchanged); + m_emit->bind(&block_changed); + armEmitJmp(m_emit, CodeCache::g_discard_and_recompile_block, false); + m_emit->bind(&block_unchanged); +} + void CodeGenerator::EmitStallUntilGTEComplete() { static_assert(offsetof(State, pending_ticks) + sizeof(u32) == offsetof(State, gte_completion_tick)); @@ -2253,7 +2654,7 @@ void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) reinterpret_cast(GetCurrentCodePointer()) & ~static_cast(0xFFF)); const void* ptr_page = reinterpret_cast(reinterpret_cast(ptr) & ~static_cast(0xFFF)); - const s64 page_displacement = GetPCDisplacement(current_code_ptr_page, ptr_page) >> 10; + const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10; const u32 page_offset = static_cast(reinterpret_cast(ptr) & 0xFFFu); if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64)) { @@ -2266,81 +2667,4 @@ void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) } } -CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher() -{ - m_emit->sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE); - m_register_cache.ReserveCalleeSavedRegisters(); - const u32 stack_adjust = PrepareStackForCall(); - - EmitLoadGlobalAddress(RCPUPTR, &g_state); - - a64::Label event_test; - m_emit->b(&event_test); - - // main dispatch loop - a64::Label main_loop; - m_emit->Bind(&main_loop); - - // time to lookup the block - // w8 <- pc - m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, pc))); - - // x9 <- s_fast_map[pc >> 16] - EmitLoadGlobalAddress(10, CodeCache::GetFastMapPointer()); - m_emit->lsr(a64::w9, a64::w8, 16); - m_emit->lsr(a64::w8, a64::w8, 2); - m_emit->ldr(a64::x9, a64::MemOperand(a64::x10, a64::x9, a64::LSL, 
3)); - - // blr(x9[pc * 2]) (fast_map[pc >> 2]) - m_emit->ldr(a64::x8, a64::MemOperand(a64::x9, a64::x8, a64::LSL, 3)); - m_emit->blr(a64::x8); - - // w8 <- pending_ticks - // w9 <- downcount - m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, pending_ticks))); - m_emit->ldr(a64::w9, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, downcount))); - - // while downcount < pending_ticks - m_emit->cmp(a64::w8, a64::w9); - m_emit->b(&main_loop, a64::lt); - - m_emit->Bind(&event_test); - EmitCall(reinterpret_cast(&TimingEvents::RunEvents)); - m_emit->b(&main_loop); - - // all done - RestoreStackAfterCall(stack_adjust); - m_register_cache.PopCalleeSavedRegisters(true); - m_emit->add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); - m_emit->ret(); - - CodeBlock::HostCodePointer ptr; - u32 code_size; - FinalizeBlock(&ptr, &code_size); - Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr); - return reinterpret_cast(ptr); -} - -CodeCache::SingleBlockDispatcherFunction CodeGenerator::CompileSingleBlockDispatcher() -{ - m_emit->sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE); - m_register_cache.ReserveCalleeSavedRegisters(); - const u32 stack_adjust = PrepareStackForCall(); - - EmitLoadGlobalAddress(RCPUPTR, &g_state); - - m_emit->blr(GetHostReg64(RARG1)); - - RestoreStackAfterCall(stack_adjust); - m_register_cache.PopCalleeSavedRegisters(true); - m_emit->add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); - m_emit->ret(); - - CodeBlock::HostCodePointer ptr; - u32 code_size; - FinalizeBlock(&ptr, &code_size); - Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr); - return reinterpret_cast(ptr); -} - } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_code_generator_generic.cpp b/src/core/cpu_recompiler_code_generator_generic.cpp index 5177292b8..4521fd671 100644 --- a/src/core/cpu_recompiler_code_generator_generic.cpp +++ b/src/core/cpu_recompiler_code_generator_generic.cpp @@ -29,8 +29,8 @@ void CodeGenerator::EmitStoreInterpreterLoadDelay(Reg reg, const Value& value) m_load_delay_dirty = true; } -Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, - const SpeculativeValue& address_spec, RegSize size) +Value CodeGenerator::EmitLoadGuestMemory(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, const SpeculativeValue& address_spec, RegSize size) { if (address.IsConstant() && !SpeculativeIsCacheIsolated()) { @@ -44,7 +44,8 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const { Value result = m_register_cache.AllocateScratch(size); - if (g_settings.IsUsingFastmem() && Bus::IsRAMAddress(static_cast(address.constant_value))) + // TODO: mask off... + if (CodeCache::IsUsingFastmem() && Bus::IsRAMAddress(static_cast(address.constant_value))) { // have to mask away the high bits for mirrors, since we don't map them in fastmem EmitLoadGuestRAMFastmem(Value::FromConstantU32(static_cast(address.constant_value) & Bus::g_ram_mask), @@ -68,25 +69,25 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const { if (!use_fastmem) { - Log_ProfilePrintf("Non-constant load at 0x%08X, speculative address 0x%08X, using fastmem = %s", cbi.pc, + Log_ProfilePrintf("Non-constant load at 0x%08X, speculative address 0x%08X, using fastmem = %s", info.pc, *address_spec, use_fastmem ? 
"yes" : "no"); } } else { - Log_ProfilePrintf("Non-constant load at 0x%08X, speculative address UNKNOWN, using fastmem = %s", cbi.pc, + Log_ProfilePrintf("Non-constant load at 0x%08X, speculative address UNKNOWN, using fastmem = %s", info.pc, use_fastmem ? "yes" : "no"); } - if (g_settings.IsUsingFastmem() && use_fastmem) + if (CodeCache::IsUsingFastmem() && use_fastmem) { - EmitLoadGuestMemoryFastmem(cbi, address, size, result); + EmitLoadGuestMemoryFastmem(instruction, info, address, size, result); } else { AddPendingCycles(true); m_register_cache.FlushCallerSavedGuestRegisters(true, true); - EmitLoadGuestMemorySlowmem(cbi, address, size, result, false); + EmitLoadGuestMemorySlowmem(instruction, info, address, size, result, false); } // Downcast to ignore upper 56/48/32 bits. This should be a noop. @@ -115,8 +116,9 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const return result; } -void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const Value& address, - const SpeculativeValue& address_spec, RegSize size, const Value& value) +void CodeGenerator::EmitStoreGuestMemory(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, const SpeculativeValue& address_spec, RegSize size, + const Value& value) { if (address.IsConstant() && !SpeculativeIsCacheIsolated()) { @@ -141,25 +143,25 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const { if (!use_fastmem) { - Log_ProfilePrintf("Non-constant store at 0x%08X, speculative address 0x%08X, using fastmem = %s", cbi.pc, + Log_ProfilePrintf("Non-constant store at 0x%08X, speculative address 0x%08X, using fastmem = %s", info.pc, *address_spec, use_fastmem ? "yes" : "no"); } } else { - Log_ProfilePrintf("Non-constant store at 0x%08X, speculative address UNKNOWN, using fastmem = %s", cbi.pc, + Log_ProfilePrintf("Non-constant store at 0x%08X, speculative address UNKNOWN, using fastmem = %s", info.pc, use_fastmem ? 
"yes" : "no"); } - if (g_settings.IsUsingFastmem() && use_fastmem) + if (CodeCache::IsUsingFastmem() && use_fastmem) { - EmitStoreGuestMemoryFastmem(cbi, address, size, value); + EmitStoreGuestMemoryFastmem(instruction, info, address, size, value); } else { AddPendingCycles(true); m_register_cache.FlushCallerSavedGuestRegisters(true, true); - EmitStoreGuestMemorySlowmem(cbi, address, size, value, false); + EmitStoreGuestMemorySlowmem(instruction, info, address, size, value, false); } } diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index aa73b672e..46cb4beb1 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -1,38 +1,315 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) -#include "common/align.h" -#include "common/assert.h" -#include "common/log.h" +#include "cpu_code_cache_private.h" #include "cpu_core.h" #include "cpu_core_private.h" #include "cpu_recompiler_code_generator.h" #include "cpu_recompiler_thunks.h" #include "settings.h" #include "timing_event.h" + +#include "common/align.h" +#include "common/assert.h" +#include "common/log.h" + Log_SetChannel(Recompiler::CodeGenerator); +#ifdef ENABLE_HOST_DISASSEMBLY +#include "Zycore/Format.h" +#include "Zycore/Status.h" +#include "Zydis/Zydis.h" +#endif + +bool CPU::Recompiler::IsCallerSavedRegister(u32 id) +{ +#ifdef _WIN32 + // The x64 ABI considers the registers RAX, RCX, RDX, R8, R9, R10, R11, and XMM0-XMM5 volatile. + return (id <= 2 || (id >= 8 && id <= 11)); +#else + // rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 are scratch registers. + return (id <= 2 || id == 6 || id == 7 || (id >= 8 && id <= 11)); +#endif +} + +u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size) +{ + using namespace Xbyak; + +#define PTR(x) (cg->rbp + (((u8*)(x)) - ((u8*)&g_state))) + +#ifdef _WIN32 + // Shadow space for Win32 + constexpr u32 stack_size = 32 + 8; +#else + // Stack still needs to be aligned + constexpr u32 stack_size = 8; +#endif + + DebugAssert(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler); + + CodeGenerator acg(code_size, static_cast(code)); + CodeGenerator* cg = &acg; + + Label dispatch; + Label exit_recompiler; + + g_enter_recompiler = reinterpret_cast(const_cast(cg->getCurr())); + { + // Don't need to save registers, because we fastjmp out when execution is interrupted. + cg->sub(cg->rsp, stack_size); + + // CPU state pointer + cg->lea(cg->rbp, cg->qword[cg->rip + &g_state]); + + // newrec preloads fastmem base + if (g_settings.cpu_execution_mode != CPUExecutionMode::Recompiler && CodeCache::IsUsingFastmem()) + cg->mov(cg->rbx, cg->qword[PTR(&g_state.fastmem_base)]); + + // Fall through to event dispatcher + } + + // check events then for frame done + g_check_events_and_dispatch = cg->getCurr(); + { + Label skip_event_check; + cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); + cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]); + cg->jl(skip_event_check); + + g_run_events_and_dispatch = cg->getCurr(); + cg->call(reinterpret_cast(&TimingEvents::RunEvents)); + + cg->L(skip_event_check); + } + + // TODO: align? 
+ g_dispatcher = cg->getCurr(); + { + cg->L(dispatch); + + // rcx <- s_fast_map[pc >> 16] + cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]); + cg->lea(RXARG2, cg->dword[PTR(g_code_lut.data())]); + cg->mov(RWARG3, RWARG1); + cg->shr(RWARG3, 16); + cg->mov(RXARG2, cg->qword[RXARG2 + RXARG3 * 8]); + + // call(rcx[pc * 2]) (fast_map[pc >> 2]) + cg->jmp(cg->qword[RXARG2 + RXARG1 * 2]); + } + + g_compile_or_revalidate_block = cg->getCurr(); + { + cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]); + cg->call(&CompileOrRevalidateBlock); + cg->jmp(dispatch); + } + + g_discard_and_recompile_block = cg->getCurr(); + { + cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]); + cg->call(&DiscardAndRecompileBlock); + cg->jmp(dispatch); + } + + g_interpret_block = cg->getCurr(); + { + cg->call(CodeCache::GetInterpretUncachedBlockFunction()); + cg->jmp(dispatch); + } + +#undef PTR + + return static_cast(cg->getSize()); +} + +u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache) +{ + u8* ptr = static_cast(code); + *(ptr++) = 0xE9; // jmp + + const ptrdiff_t disp = (reinterpret_cast(dst) - reinterpret_cast(code)) - 5; + DebugAssert(disp >= static_cast(std::numeric_limits::min()) && + disp <= static_cast(std::numeric_limits::max())); + + const s32 disp32 = static_cast(disp); + std::memcpy(ptr, &disp32, sizeof(disp32)); + return 5; +} + +#ifdef ENABLE_HOST_DISASSEMBLY + +static ZydisFormatterFunc s_old_print_address; + +static ZyanStatus ZydisFormatterPrintAddressAbsolute(const ZydisFormatter* formatter, ZydisFormatterBuffer* buffer, + ZydisFormatterContext* context) +{ + using namespace CPU; + + ZyanU64 address; + ZYAN_CHECK(ZydisCalcAbsoluteAddress(context->instruction, context->operand, context->runtime_address, &address)); + + char buf[128]; + u32 len = 0; + +#define A(x) static_cast(reinterpret_cast(x)) + + if (address >= A(Bus::g_ram) && address < A(Bus::g_ram + Bus::g_ram_size)) + { + len = snprintf(buf, sizeof(buf), "g_ram+0x%08X", static_cast(address - A(Bus::g_ram))); + } + else if (address >= A(&g_state.regs) && + address < A(reinterpret_cast(&g_state.regs) + sizeof(CPU::Registers))) + { + len = snprintf(buf, sizeof(buf), "g_state.regs.%s", + GetRegName(static_cast(((address - A(&g_state.regs.r[0])) / 4u)))); + } + else if (address >= A(&g_state.cop0_regs) && + address < A(reinterpret_cast(&g_state.cop0_regs) + sizeof(CPU::Cop0Registers))) + { + for (const DebuggerRegisterListEntry& rle : g_debugger_register_list) + { + if (address == static_cast(reinterpret_cast(rle.value_ptr))) + { + len = snprintf(buf, sizeof(buf), "g_state.cop0_regs.%s", rle.name); + break; + } + } + } + else if (address >= A(&g_state.gte_regs) && + address < A(reinterpret_cast(&g_state.gte_regs) + sizeof(GTE::Regs))) + { + for (const DebuggerRegisterListEntry& rle : g_debugger_register_list) + { + if (address == static_cast(reinterpret_cast(rle.value_ptr))) + { + len = snprintf(buf, sizeof(buf), "g_state.gte_regs.%s", rle.name); + break; + } + } + } + else if (address == A(&g_state.load_delay_reg)) + { + len = snprintf(buf, sizeof(buf), "g_state.load_delay_reg"); + } + else if (address == A(&g_state.next_load_delay_reg)) + { + len = snprintf(buf, sizeof(buf), "g_state.next_load_delay_reg"); + } + else if (address == A(&g_state.load_delay_value)) + { + len = snprintf(buf, sizeof(buf), "g_state.load_delay_value"); + } + else if (address == A(&g_state.next_load_delay_value)) + { + len = snprintf(buf, sizeof(buf), "g_state.next_load_delay_value"); + } + else if (address == A(&g_state.pending_ticks)) + { + len = snprintf(buf, 
sizeof(buf), "g_state.pending_ticks"); + } + else if (address == A(&g_state.downcount)) + { + len = snprintf(buf, sizeof(buf), "g_state.downcount"); + } + +#undef A + + if (len > 0) + { + ZYAN_CHECK(ZydisFormatterBufferAppend(buffer, ZYDIS_TOKEN_SYMBOL)); + ZyanString* string; + ZYAN_CHECK(ZydisFormatterBufferGetString(buffer, &string)); + return ZyanStringAppendFormat(string, "&%s", buf); + } + + return s_old_print_address(formatter, buffer, context); +} + +void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size) +{ + ZydisDecoder disas_decoder; + ZydisFormatter disas_formatter; + ZydisDecodedInstruction disas_instruction; + ZydisDecodedOperand disas_operands[ZYDIS_MAX_OPERAND_COUNT]; + ZydisDecoderInit(&disas_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64); + ZydisFormatterInit(&disas_formatter, ZYDIS_FORMATTER_STYLE_INTEL); + s_old_print_address = (ZydisFormatterFunc)&ZydisFormatterPrintAddressAbsolute; + ZydisFormatterSetHook(&disas_formatter, ZYDIS_FORMATTER_FUNC_PRINT_ADDRESS_ABS, (const void**)&s_old_print_address); + + const u8* ptr = static_cast(start); + TinyString hex; + ZyanUSize remaining = size; + while (ZYAN_SUCCESS(ZydisDecoderDecodeFull(&disas_decoder, ptr, remaining, &disas_instruction, disas_operands))) + { + char buffer[256]; + if (ZYAN_SUCCESS(ZydisFormatterFormatInstruction(&disas_formatter, &disas_instruction, disas_operands, + ZYDIS_MAX_OPERAND_COUNT, buffer, sizeof(buffer), + static_cast(reinterpret_cast(ptr)), nullptr))) + { + hex.clear(); + for (u32 i = 0; i < 10; i++) + { + if (i < disas_instruction.length) + hex.append_fmt(" {:02X}", ptr[i]); + else + hex.append(" "); + } + Log::WriteFmt("HostCode", "", LOGLEVEL_DEBUG, " {:016X} {} {}", + static_cast(reinterpret_cast(ptr)), hex, buffer); + } + + ptr += disas_instruction.length; + remaining -= disas_instruction.length; + } +} + +u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size) +{ + ZydisDecoder disas_decoder; + ZydisDecodedInstruction disas_instruction; + ZydisDecoderContext disas_context; + ZydisDecoderInit(&disas_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64); + + const u8* ptr = static_cast(start); + ZyanUSize remaining = size; + u32 inst_count = 0; + while ( + ZYAN_SUCCESS(ZydisDecoderDecodeInstruction(&disas_decoder, &disas_context, ptr, remaining, &disas_instruction))) + { + ptr += disas_instruction.length; + remaining -= disas_instruction.length; + inst_count++; + } + + return inst_count; +} + +#else + +void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size) +{ + Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY."); +} + +u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size) +{ + Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY."); + return 0; +} + +#endif // ENABLE_HOST_DISASSEMBLY + namespace CPU::Recompiler { -#if defined(ABI_WIN64) -constexpr HostReg RCPUPTR = Xbyak::Operand::RBP; -constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX; -constexpr HostReg RRETURN = Xbyak::Operand::RAX; -constexpr HostReg RARG1 = Xbyak::Operand::RCX; -constexpr HostReg RARG2 = Xbyak::Operand::RDX; -constexpr HostReg RARG3 = Xbyak::Operand::R8; -constexpr HostReg RARG4 = Xbyak::Operand::R9; -constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 32; -#elif defined(ABI_SYSV) -constexpr HostReg RCPUPTR = Xbyak::Operand::RBP; -constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX; -constexpr HostReg RRETURN = Xbyak::Operand::RAX; -constexpr HostReg RARG1 = Xbyak::Operand::RDI; -constexpr HostReg RARG2 = 
Xbyak::Operand::RSI; -constexpr HostReg RARG3 = Xbyak::Operand::RDX; -constexpr HostReg RARG4 = Xbyak::Operand::RCX; -constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 0; -#endif +static constexpr HostReg RCPUPTR = Xbyak::Operand::RBP; +static constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX; +static constexpr HostReg RRETURN = RXRET.getIdx(); +static constexpr HostReg RARG1 = RXARG1.getIdx(); +static constexpr HostReg RARG2 = RXARG2.getIdx(); +static constexpr HostReg RARG3 = RXARG3.getIdx(); +static constexpr HostReg RARG4 = RXARG4.getIdx(); static const Xbyak::Reg8 GetHostReg8(HostReg reg) { @@ -80,7 +357,7 @@ static const Xbyak::Reg64 GetHostReg64(const Value& value) static const Xbyak::Reg64 GetCPUPtrReg() { - return GetHostReg64(RCPUPTR); + return Xbyak::Reg64(RCPUPTR); } static const Xbyak::Reg64 GetFastmemBasePtrReg() @@ -177,6 +454,11 @@ void CodeGenerator::SwitchToNearCode() m_emit = &m_near_emitter; } +void* CodeGenerator::GetStartNearCodePointer() const +{ + return m_near_emitter.getCode(); +} + void* CodeGenerator::GetCurrentNearCodePointer() const { return m_near_emitter.getCurr(); @@ -217,10 +499,9 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); UNREFERENCED_VARIABLE(cpu_reg_allocated); - // m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); // If there's loadstore instructions, preload the fastmem base. - if (m_block->contains_loadstore_instructions) + if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) { const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); DebugAssert(fastmem_reg_allocated); @@ -230,19 +511,19 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) } } -void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_return /* = true */) +void CodeGenerator::EmitEndBlock(bool free_registers, const void* jump_to) { if (free_registers) { m_register_cache.FreeHostReg(RCPUPTR); - if (m_block->contains_loadstore_instructions) + if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) m_register_cache.FreeHostReg(RMEMBASEPTR); m_register_cache.PopCalleeSavedRegisters(true); } - if (emit_return) - m_emit->ret(); + if (jump_to) + m_emit->jmp(jump_to); } void CodeGenerator::EmitExceptionExit() @@ -257,7 +538,7 @@ void CodeGenerator::EmitExceptionExit() m_register_cache.FlushLoadDelay(false); m_register_cache.PopCalleeSavedRegisters(false); - m_emit->ret(); + m_emit->jmp(CodeCache::g_check_events_and_dispatch); } void CodeGenerator::EmitExceptionExitOnBool(const Value& value) @@ -276,20 +557,23 @@ void CodeGenerator::EmitExceptionExitOnBool(const Value& value) m_register_cache.PopState(); } -void CodeGenerator::FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size) +const void* CodeGenerator::FinalizeBlock(u32* out_host_code_size, u32* out_host_far_code_size) { m_near_emitter.ready(); m_far_emitter.ready(); const u32 near_size = static_cast(m_near_emitter.getSize()); const u32 far_size = static_cast(m_far_emitter.getSize()); - *out_host_code = m_near_emitter.getCode(); + const void* code = m_near_emitter.getCode(); *out_host_code_size = near_size; + *out_host_far_code_size = far_size; m_code_buffer->CommitCode(near_size); m_code_buffer->CommitFarCode(far_size); m_near_emitter.reset(); m_far_emitter.reset(); + + return code; } void CodeGenerator::EmitSignExtend(HostReg to_reg, RegSize to_size, HostReg 
from_reg, RegSize from_size) @@ -1461,8 +1745,9 @@ u32 CodeGenerator::PrepareStackForCall() // we assume that the stack is unaligned at this point const u32 num_callee_saved = m_register_cache.GetActiveCalleeSavedRegisterCount(); const u32 num_caller_saved = m_register_cache.PushCallerSavedRegisters(); - const u32 current_offset = 8 + (num_callee_saved + num_caller_saved) * 8; - const u32 aligned_offset = Common::AlignUp(current_offset + FUNCTION_CALL_SHADOW_SPACE, 16); + const u32 current_offset = (num_callee_saved + num_caller_saved) * 8; + const u32 aligned_offset = + (current_offset == 0) ? 0 : Common::AlignUp(current_offset + FUNCTION_CALL_SHADOW_SPACE, 16); const u32 adjust_size = aligned_offset - current_offset; if (adjust_size > 0) m_emit->sub(m_emit->rsp, adjust_size); @@ -1902,16 +2187,11 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size, } } -void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - Value& result) +void CodeGenerator::EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, Value& result) { // fastmem - LoadStoreBackpatchInfo bpi; - bpi.host_pc = GetCurrentNearCodePointer(); - bpi.address_host_reg = HostReg_Invalid; - bpi.value_host_reg = result.host_reg; - bpi.guest_pc = m_current_instruction->pc; - bpi.fault_count = 0; + void* host_pc = GetCurrentNearCodePointer(); if (g_settings.cpu_fastmem_mode == CPUFastmemMode::MMap) { @@ -1921,7 +2201,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, { actual_address = &result; m_emit->mov(GetHostReg32(result.host_reg), address.constant_value); - bpi.host_pc = GetCurrentNearCodePointer(); + host_pc = GetCurrentNearCodePointer(); } m_register_cache.InhibitAllocation(); @@ -1988,7 +2268,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, m_emit->shr(GetHostReg32(RARG1), Bus::FASTMEM_LUT_PAGE_SHIFT); m_emit->and_(GetHostReg32(RARG2), Bus::FASTMEM_LUT_PAGE_MASK); m_emit->mov(GetHostReg64(RARG1), m_emit->qword[GetFastmemBasePtrReg() + GetHostReg64(RARG1) * 8]); - bpi.host_pc = GetCurrentNearCodePointer(); + host_pc = GetCurrentNearCodePointer(); switch (size) { @@ -2011,18 +2291,17 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, } // insert nops, we need at least 5 bytes for a relative jump - const u32 fastmem_size = - static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc)); + const u32 fastmem_size = static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(host_pc)); const u32 nops = (fastmem_size < 5 ? 
5 - fastmem_size : 0); for (u32 i = 0; i < nops; i++) m_emit->nop(); - bpi.host_code_size = static_cast( - static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + const u32 host_code_size = + static_cast(static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(host_pc))); // generate slowmem fallback m_far_emitter.align(16); - bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + void* thunk_host_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); // we add the ticks *after* the add here, since we counted incorrectly, then correct for it below @@ -2030,7 +2309,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); m_delayed_cycles_add += Bus::RAM_READ_TICKS; - EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + EmitLoadGuestMemorySlowmem(instruction, info, address, size, result, true); EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); @@ -2041,11 +2320,11 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, SwitchToNearCode(); m_register_cache.UninhibitAllocation(); - m_block->loadstore_backpatch_info.push_back(bpi); + CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, thunk_host_pc); } -void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - Value& result, bool in_far_code) +void CodeGenerator::EmitLoadGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, Value& result, bool in_far_code) { if (g_settings.cpu_recompiler_memory_exceptions) { @@ -2082,8 +2361,8 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, m_emit->neg(GetHostReg32(result.host_reg)); m_emit->shl(GetHostReg32(result.host_reg), 2); m_emit->or_(GetHostReg32(result.host_reg), - Cop0Registers::CAUSE::MakeValueForException(static_cast(0), cbi.is_branch_delay_slot, false, - cbi.instruction.cop.cop_n)); + Cop0Registers::CAUSE::MakeValueForException(static_cast(0), info.is_branch_delay_slot, false, + instruction.cop.cop_n)); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); @@ -2116,16 +2395,11 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, } } -void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - const Value& value) +void CodeGenerator::EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, const Value& value) { // fastmem - LoadStoreBackpatchInfo bpi; - bpi.host_pc = GetCurrentNearCodePointer(); - bpi.address_host_reg = HostReg_Invalid; - bpi.value_host_reg = value.host_reg; - bpi.guest_pc = m_current_instruction->pc; - bpi.fault_count = 0; + void* host_pc = GetCurrentNearCodePointer(); if (g_settings.cpu_fastmem_mode == CPUFastmemMode::MMap) { @@ -2137,7 +2411,7 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, temp_address.SetHostReg(&m_register_cache, RRETURN, RegSize_32); actual_address = &temp_address; m_emit->mov(GetHostReg32(temp_address), address.constant_value); - bpi.host_pc = GetCurrentNearCodePointer(); + host_pc = GetCurrentNearCodePointer(); } m_register_cache.InhibitAllocation(); 
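For context, the host_pc / code_size / thunk_address records created by AddLoadStoreInfo() are only consumed when a fastmem access actually faults. A minimal sketch of that path, where LookupLoadStoreInfo() and DoBackpatch() are hypothetical stand-ins for the real lookup and for the CodeGenerator::BackpatchLoadStore() rewrite shown further below:

bool HandleFastmemFault(void* fault_host_pc)
{
  // Hypothetical: map the faulting host PC back to the record stored by AddLoadStoreInfo().
  const CPU::CodeCache::LoadstoreBackpatchInfo* info = LookupLoadStoreInfo(fault_host_pc);
  if (!info)
    return false; // the fault did not come from a fastmem load/store

  // Hypothetical wrapper: overwrite the inline fast path with a jump to info->thunk_address,
  // pad the remaining bytes with nops, then flush the instruction cache.
  DoBackpatch(fault_host_pc, *info);
  return true; // resume at fault_host_pc; execution now takes the slowmem thunk
}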
@@ -2252,7 +2526,7 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, m_emit->and_(GetHostReg32(RARG2), Bus::FASTMEM_LUT_PAGE_MASK); m_emit->mov(GetHostReg64(RARG1), m_emit->qword[GetFastmemBasePtrReg() + GetHostReg64(RARG1) * 8 + (Bus::FASTMEM_LUT_NUM_PAGES * 8)]); - bpi.host_pc = GetCurrentNearCodePointer(); + host_pc = GetCurrentNearCodePointer(); switch (size) { @@ -2290,24 +2564,23 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, } // insert nops, we need at least 5 bytes for a relative jump - const u32 fastmem_size = - static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc)); + const u32 fastmem_size = static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(host_pc)); const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0); for (u32 i = 0; i < nops; i++) m_emit->nop(); - bpi.host_code_size = static_cast( - static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + const u32 host_code_size = + static_cast(static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(host_pc))); // generate slowmem fallback m_far_emitter.align(); - bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + const void* host_thunk_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); DebugAssert(m_delayed_cycles_add > 0); EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); - EmitStoreGuestMemorySlowmem(cbi, address, size, value, true); + EmitStoreGuestMemorySlowmem(instruction, info, address, size, value, true); EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); @@ -2318,11 +2591,12 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, SwitchToNearCode(); m_register_cache.UninhibitAllocation(); - m_block->loadstore_backpatch_info.push_back(bpi); + CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_thunk_pc); } -void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, - const Value& value, bool in_far_code) +void CodeGenerator::EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info, + const Value& address, RegSize size, const Value& value, + bool in_far_code) { if (g_settings.cpu_recompiler_memory_exceptions) { @@ -2360,8 +2634,8 @@ void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, // cause_bits = (result << 2) | BD | cop_n m_emit->shl(GetHostReg32(result), 2); m_emit->or_(GetHostReg32(result), - Cop0Registers::CAUSE::MakeValueForException(static_cast(0), cbi.is_branch_delay_slot, false, - cbi.instruction.cop.cop_n)); + Cop0Registers::CAUSE::MakeValueForException(static_cast(0), info.is_branch_delay_slot, false, + instruction.cop.cop_n)); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); @@ -2398,55 +2672,21 @@ void CodeGenerator::EmitUpdateFastmemBase() m_emit->mov(GetFastmemBasePtrReg(), m_emit->qword[GetCPUPtrReg() + offsetof(CPU::State, fastmem_base)]); } -bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) +void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::LoadstoreBackpatchInfo& lbi) { - Log_ProfilePrintf("Backpatching %p (guest PC 0x%08X) to slowmem", lbi.host_pc, lbi.guest_pc); + Log_ProfileFmt("Backpatching {} (guest PC 0x{:08X}) to slowmem", host_pc, 
lbi.guest_pc); // turn it into a jump to the slowmem handler - Xbyak::CodeGenerator cg(lbi.host_code_size, lbi.host_pc); - cg.jmp(lbi.host_slowmem_pc); + Xbyak::CodeGenerator cg(lbi.code_size, host_pc); + cg.jmp(lbi.thunk_address); - const s32 nops = static_cast(lbi.host_code_size) - - static_cast(static_cast(cg.getCurr() - static_cast(lbi.host_pc))); + const s32 nops = static_cast(lbi.code_size) - + static_cast(static_cast(cg.getCurr() - static_cast(host_pc))); Assert(nops >= 0); for (s32 i = 0; i < nops; i++) cg.nop(); - JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size); - return true; -} - -void CodeGenerator::BackpatchReturn(void* pc, u32 pc_size) -{ - Log_ProfilePrintf("Backpatching %p to return", pc); - - Xbyak::CodeGenerator cg(pc_size, pc); - cg.ret(); - - const s32 nops = - static_cast(pc_size) - static_cast(static_cast(cg.getCurr() - static_cast(pc))); - Assert(nops >= 0); - for (s32 i = 0; i < nops; i++) - cg.nop(); - - JitCodeBuffer::FlushInstructionCache(pc, pc_size); -} - -void CodeGenerator::BackpatchBranch(void* pc, u32 pc_size, void* target) -{ - Log_ProfilePrintf("Backpatching %p to %p [branch]", pc, target); - - Xbyak::CodeGenerator cg(pc_size, pc); - cg.jmp(target); - - // shouldn't have any nops - const s32 nops = - static_cast(pc_size) - static_cast(static_cast(cg.getCurr() - static_cast(pc))); - Assert(nops >= 0); - for (s32 i = 0; i < nops; i++) - cg.nop(); - - JitCodeBuffer::FlushInstructionCache(pc, pc_size); + JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size); } void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) @@ -2737,6 +2977,62 @@ void CodeGenerator::EmitICacheCheckAndUpdate() } } +void CodeGenerator::EmitBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) +{ + const auto ram_ptr_reg = GetHostReg64(RARG1); + const auto shadow_ptr_reg = GetHostReg64(RARG2); + const auto temp_reg = GetHostReg64(RARG3); + const auto temp_reg32 = GetHostReg32(RARG3); + + // store it first to reduce code size, because we can offset + m_emit->mov(ram_ptr_reg, static_cast(reinterpret_cast(ram_ptr))); + m_emit->mov(shadow_ptr_reg, static_cast(reinterpret_cast(shadow_ptr))); + + bool first = true; + u32 offset = 0; + while (size >= 16) + { + const Xbyak::Xmm& dst = first ? 
m_emit->xmm0 : m_emit->xmm1; + m_emit->movups(dst, m_emit->xword[ram_ptr_reg + offset]); + m_emit->pcmpeqd(dst, m_emit->xword[shadow_ptr_reg + offset]); + if (!first) + m_emit->pand(m_emit->xmm0, dst); + else + first = false; + + offset += 16; + size -= 16; + } + + // TODO: better codegen for 16 byte aligned blocks + if (!first) + { + m_emit->movmskps(temp_reg32, m_emit->xmm0); + m_emit->cmp(temp_reg32, 0xf); + m_emit->jne(CodeCache::g_discard_and_recompile_block); + } + + while (size >= 8) + { + m_emit->mov(temp_reg, m_emit->qword[ram_ptr_reg + offset]); + m_emit->cmp(temp_reg, m_emit->qword[shadow_ptr_reg + offset]); + m_emit->jne(CodeCache::g_discard_and_recompile_block); + offset += 8; + size -= 8; + } + + while (size >= 4) + { + m_emit->mov(temp_reg32, m_emit->dword[ram_ptr_reg + offset]); + m_emit->cmp(temp_reg32, m_emit->dword[shadow_ptr_reg + offset]); + m_emit->jne(CodeCache::g_discard_and_recompile_block); + offset += 4; + size -= 4; + } + + DebugAssert(size == 0); +} + void CodeGenerator::EmitStallUntilGTEComplete() { m_emit->mov(GetHostReg32(RRETURN), m_emit->dword[GetCPUPtrReg() + offsetof(State, pending_ticks)]); @@ -2759,7 +3055,7 @@ void CodeGenerator::EmitBranch(const void* address, bool allow_scratch) static_cast(reinterpret_cast(address) - reinterpret_cast(GetCurrentCodePointer())); if (Xbyak::inner::IsInInt32(static_cast(jump_distance))) { - m_emit->jmp(address); + m_emit->jmp(address, Xbyak::CodeGenerator::T_NEAR); return; } @@ -3068,77 +3364,4 @@ void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) else m_emit->mov(GetHostReg64(host_reg), reinterpret_cast(ptr)); } - -CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher() -{ - m_register_cache.ReserveCalleeSavedRegisters(); - const u32 stack_adjust = PrepareStackForCall(); - - EmitLoadGlobalAddress(Xbyak::Operand::RBP, &g_state); - - Xbyak::Label event_test; - m_emit->jmp(event_test); - - // main dispatch loop - Xbyak::Label main_loop; - m_emit->align(16); - m_emit->L(main_loop); - - // time to lookup the block - // eax <- pc - m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, pc)]); - - // rcx <- s_fast_map[pc >> 16] - EmitLoadGlobalAddress(Xbyak::Operand::RBX, CodeCache::GetFastMapPointer()); - m_emit->mov(m_emit->ecx, m_emit->eax); - m_emit->shr(m_emit->ecx, 16); - m_emit->mov(m_emit->rcx, m_emit->qword[m_emit->rbx + m_emit->rcx * 8]); - - // call(rcx[pc * 2]) (fast_map[pc >> 2]) - m_emit->call(m_emit->qword[m_emit->rcx + m_emit->rax * 2]); - - // eax <- pending_ticks - m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, pending_ticks)]); - - // while eax < downcount - Xbyak::Label downcount_hit; - m_emit->cmp(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, downcount)]); - m_emit->jl(main_loop); - - m_emit->L(event_test); - EmitCall(reinterpret_cast(&TimingEvents::RunEvents)); - m_emit->jmp(main_loop); - - // all done - RestoreStackAfterCall(stack_adjust); - m_register_cache.PopCalleeSavedRegisters(true); - m_emit->ret(); - - CodeBlock::HostCodePointer ptr; - u32 code_size; - FinalizeBlock(&ptr, &code_size); - Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr); - return ptr; -} - -CodeCache::SingleBlockDispatcherFunction CodeGenerator::CompileSingleBlockDispatcher() -{ - m_register_cache.ReserveCalleeSavedRegisters(); - const u32 stack_adjust = PrepareStackForCall(); - - EmitLoadGlobalAddress(Xbyak::Operand::RBP, &g_state); - - m_emit->call(GetHostReg64(RARG1)); - - RestoreStackAfterCall(stack_adjust); - 
m_register_cache.PopCalleeSavedRegisters(true); - m_emit->ret(); - - CodeBlock::HostCodePointer ptr; - u32 code_size; - FinalizeBlock(&ptr, &code_size); - Log_DevPrintf("Single block dispatcher is %u bytes at %p", code_size, ptr); - return reinterpret_cast(ptr); -} - } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_register_cache.h b/src/core/cpu_recompiler_register_cache.h index b8bfb2600..0f63053ea 100644 --- a/src/core/cpu_recompiler_register_cache.h +++ b/src/core/cpu_recompiler_register_cache.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #pragma once @@ -6,6 +6,12 @@ #include "cpu_recompiler_types.h" #include "cpu_types.h" +#if defined(CPU_ARCH_ARM32) +#include "vixl/aarch32/macro-assembler-aarch32.h" +#elif defined(CPU_ARCH_ARM64) +#include "vixl/aarch64/macro-assembler-aarch64.h" +#endif + #include #include #include @@ -13,6 +19,59 @@ namespace CPU::Recompiler { +enum RegSize : u8 +{ + RegSize_8, + RegSize_16, + RegSize_32, + RegSize_64, +}; + +#if defined(CPU_ARCH_X64) + +using HostReg = unsigned; +using CodeEmitter = Xbyak::CodeGenerator; +using LabelType = Xbyak::Label; +enum : u32 +{ + HostReg_Count = 16 +}; +constexpr HostReg HostReg_Invalid = static_cast(HostReg_Count); +constexpr RegSize HostPointerSize = RegSize_64; + +#elif defined(CPU_ARCH_ARM32) + +using HostReg = unsigned; +using CodeEmitter = vixl::aarch32::MacroAssembler; +using LabelType = vixl::aarch32::Label; +enum : u32 +{ + HostReg_Count = vixl::aarch32::kNumberOfRegisters +}; +constexpr HostReg HostReg_Invalid = static_cast(HostReg_Count); +constexpr RegSize HostPointerSize = RegSize_32; + +#elif defined(CPU_ARCH_ARM64) + +using HostReg = unsigned; +using CodeEmitter = vixl::aarch64::MacroAssembler; +using LabelType = vixl::aarch64::Label; +enum : u32 +{ + HostReg_Count = vixl::aarch64::kNumberOfRegisters +}; +constexpr HostReg HostReg_Invalid = static_cast(HostReg_Count); +constexpr RegSize HostPointerSize = RegSize_64; + +#else + +#error Unknown architecture. + +#endif + +class CodeGenerator; +class RegisterCache; + enum class HostRegState : u8 { None = 0, diff --git a/src/core/cpu_recompiler_thunks.h b/src/core/cpu_recompiler_thunks.h index 31145aea2..b29971f0f 100644 --- a/src/core/cpu_recompiler_thunks.h +++ b/src/core/cpu_recompiler_thunks.h @@ -1,15 +1,11 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #pragma once #include "cpu_code_cache.h" #include "cpu_types.h" -namespace CPU { -struct CodeBlock; -struct CodeBlockInstruction; - -namespace Recompiler::Thunks { +namespace CPU::Recompiler::Thunks { ////////////////////////////////////////////////////////////////////////// // Trampolines for calling back from the JIT @@ -18,7 +14,6 @@ namespace Recompiler::Thunks { ////////////////////////////////////////////////////////////////////////// bool InterpretInstruction(); bool InterpretInstructionPGXP(); -void CheckAndUpdateICache(u32 pc, u32 line_count); // Memory access functions for the JIT - MSB is set on exception. 
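// i.e. a JIT-side caller of the checked variants is expected to do something like (sketch):
//
//   const u64 result = ReadMemoryWord(address);
//   if (result & (u64(1) << 63))          // MSB set -> an exception was raised
//     /* bail out of the block and let the exception path run */;
//   const u32 value = static_cast<u32>(result);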
u64 ReadMemoryByte(u32 address); @@ -36,9 +31,6 @@ void UncheckedWriteMemoryByte(u32 address, u32 value); void UncheckedWriteMemoryHalfWord(u32 address, u32 value); void UncheckedWriteMemoryWord(u32 address, u32 value); -void ResolveBranch(CodeBlock* block, void* host_pc, void* host_resolve_pc, u32 host_pc_size); void LogPC(u32 pc); -} // namespace Recompiler::Thunks - -} // namespace CPU +} // namespace CPU::Recompiler::Thunks diff --git a/src/core/cpu_recompiler_types.h b/src/core/cpu_recompiler_types.h index 432446c36..58273691c 100644 --- a/src/core/cpu_recompiler_types.h +++ b/src/core/cpu_recompiler_types.h @@ -1,6 +1,8 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) +// Shared code between recompiler backends. + #pragma once #include "cpu_types.h" @@ -14,158 +16,118 @@ #define XBYAK_NO_OP_NAMES 1 #include "xbyak.h" -#elif defined(CPU_ARCH_ARM32) - -#include "vixl/aarch32/constants-aarch32.h" -#include "vixl/aarch32/instructions-aarch32.h" -#include "vixl/aarch32/macro-assembler-aarch32.h" - -#elif defined(CPU_ARCH_ARM64) - -#include "vixl/aarch64/constants-aarch64.h" -#include "vixl/aarch64/macro-assembler-aarch64.h" - -#endif - -namespace CPU { - -namespace Recompiler { - -class CodeGenerator; -class RegisterCache; - -enum RegSize : u8 -{ - RegSize_8, - RegSize_16, - RegSize_32, - RegSize_64, -}; - -enum class Condition : u8 -{ - Always, - NotEqual, - Equal, - Overflow, - Greater, - GreaterEqual, - LessEqual, - Less, - Negative, - PositiveOrZero, - Above, // unsigned variant of Greater - AboveEqual, // unsigned variant of GreaterEqual - Below, // unsigned variant of Less - BelowEqual, // unsigned variant of LessEqual - - NotZero, - Zero -}; - -#if defined(CPU_ARCH_X64) - -using HostReg = unsigned; -using CodeEmitter = Xbyak::CodeGenerator; -using LabelType = Xbyak::Label; -enum : u32 -{ - HostReg_Count = 16 -}; -constexpr HostReg HostReg_Invalid = static_cast(HostReg_Count); -constexpr RegSize HostPointerSize = RegSize_64; +namespace CPU::Recompiler { // A reasonable "maximum" number of bytes per instruction. constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64; constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128; -// Alignment of code stoarge. 
-constexpr u32 CODE_STORAGE_ALIGNMENT = 4096; - // ABI selection #if defined(_WIN32) #define ABI_WIN64 1 -#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__HAIKU__) || defined(__FreeBSD__) + +#define RWRET Xbyak::Reg32(Xbyak::Operand::EAX) +#define RWARG1 Xbyak::Reg32(Xbyak::Operand::RCX) +#define RWARG2 Xbyak::Reg32(Xbyak::Operand::RDX) +#define RWARG3 Xbyak::Reg32(Xbyak::Operand::R8D) +#define RWARG4 Xbyak::Reg32(Xbyak::Operand::R9D) +#define RXRET Xbyak::Reg64(Xbyak::Operand::RAX) +#define RXARG1 Xbyak::Reg64(Xbyak::Operand::RCX) +#define RXARG2 Xbyak::Reg64(Xbyak::Operand::RDX) +#define RXARG3 Xbyak::Reg64(Xbyak::Operand::R8) +#define RXARG4 Xbyak::Reg64(Xbyak::Operand::R9) + +static constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 32; + +#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__) #define ABI_SYSV 1 + +#define RWRET Xbyak::Reg32(Xbyak::Operand::EAX) +#define RWARG1 Xbyak::Reg32(Xbyak::Operand::EDI) +#define RWARG2 Xbyak::Reg32(Xbyak::Operand::ESI) +#define RWARG3 Xbyak::Reg32(Xbyak::Operand::EDX) +#define RWARG4 Xbyak::Reg32(Xbyak::Operand::ECX) +#define RXRET Xbyak::Reg64(Xbyak::Operand::RAX) +#define RXARG1 Xbyak::Reg64(Xbyak::Operand::RDI) +#define RXARG2 Xbyak::Reg64(Xbyak::Operand::RSI) +#define RXARG3 Xbyak::Reg64(Xbyak::Operand::RDX) +#define RXARG4 Xbyak::Reg64(Xbyak::Operand::RCX) + +static constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 0; + #else #error Unknown ABI. #endif +bool IsCallerSavedRegister(u32 id); + +} // namespace CPU::Recompiler + #elif defined(CPU_ARCH_ARM32) -using HostReg = unsigned; -using CodeEmitter = vixl::aarch32::MacroAssembler; -using LabelType = vixl::aarch32::Label; -enum : u32 -{ - HostReg_Count = vixl::aarch32::kNumberOfRegisters -}; -constexpr HostReg HostReg_Invalid = static_cast(HostReg_Count); -constexpr RegSize HostPointerSize = RegSize_32; +#include "vixl/aarch32/assembler-aarch32.h" +#include "vixl/aarch32/constants-aarch32.h" +#include "vixl/aarch32/instructions-aarch32.h" + +namespace CPU::Recompiler { // A reasonable "maximum" number of bytes per instruction. constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64; constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128; -// Alignment of code stoarge. 
-constexpr u32 CODE_STORAGE_ALIGNMENT = 4096; +#define RRET vixl::aarch32::r0 +#define RARG1 vixl::aarch32::r0 +#define RARG2 vixl::aarch32::r1 +#define RARG3 vixl::aarch32::r2 +#define RARG4 vixl::aarch32::r3 +#define RSCRATCH vixl::aarch32::r12 +#define RSTATE vixl::aarch32::r4 +#define RMEMBASE vixl::aarch32::r5 + +s32 armGetPCDisplacement(const void* current, const void* target); +bool armIsPCDisplacementInImmediateRange(s32 displacement); +void armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr); +void armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm); +void armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline); +void armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline); + +} // namespace CPU::Recompiler #elif defined(CPU_ARCH_ARM64) -using HostReg = unsigned; -using CodeEmitter = vixl::aarch64::MacroAssembler; -using LabelType = vixl::aarch64::Label; -enum : u32 -{ - HostReg_Count = vixl::aarch64::kNumberOfRegisters -}; -constexpr HostReg HostReg_Invalid = static_cast(HostReg_Count); -constexpr RegSize HostPointerSize = RegSize_64; +#include "vixl/aarch64/assembler-aarch64.h" +#include "vixl/aarch64/constants-aarch64.h" + +namespace CPU::Recompiler { // A reasonable "maximum" number of bytes per instruction. constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64; constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128; -// Alignment of code stoarge. -constexpr u32 CODE_STORAGE_ALIGNMENT = 4096; +#define RWRET vixl::aarch64::w0 +#define RXRET vixl::aarch64::x0 +#define RWARG1 vixl::aarch64::w0 +#define RXARG1 vixl::aarch64::x0 +#define RWARG2 vixl::aarch64::w1 +#define RXARG2 vixl::aarch64::x1 +#define RWARG3 vixl::aarch64::w2 +#define RXARG3 vixl::aarch64::x2 +#define RWARG4 vixl::aarch64::w3 +#define RXARG4 vixl::aarch64::x3 +#define RWSCRATCH vixl::aarch64::w16 +#define RXSCRATCH vixl::aarch64::x16 +#define RSTATE vixl::aarch64::x19 +#define RMEMBASE vixl::aarch64::x20 -#elif defined(CPU_ARCH_RISCV64) +bool armIsCallerSavedRegister(u32 id); +s64 armGetPCDisplacement(const void* current, const void* target); +void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::XRegister& reg, const void* addr); +void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm); +void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline); +void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline); +void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr); +u8* armGetJumpTrampoline(const void* target); -using HostReg = unsigned; - -// Alignment of code stoarge. 
-constexpr u32 CODE_STORAGE_ALIGNMENT = 4096; - -#else - -using HostReg = int; - -class CodeEmitter -{ -}; - -enum : u32 -{ - HostReg_Count = 1 -}; - -constexpr HostReg HostReg_Invalid = static_cast(HostReg_Count); -constexpr RegSize HostPointerSize = RegSize_64; -constexpr bool SHIFTS_ARE_IMPLICITLY_MASKED = false; +} // namespace CPU::Recompiler #endif - -struct LoadStoreBackpatchInfo -{ - void* host_pc; // pointer to instruction which will fault - void* host_slowmem_pc; // pointer to slowmem callback code - u32 host_code_size; // size of the fastmem load as well as the add for cycles - HostReg address_host_reg; // register containing the guest address to load/store - HostReg value_host_reg; // register containing the source/destination - PhysicalMemoryAddress guest_pc; - u32 fault_count; -}; - -} // namespace Recompiler - -} // namespace CPU diff --git a/src/core/hotkeys.cpp b/src/core/hotkeys.cpp index fbb3ae28b..2c97dbecb 100644 --- a/src/core/hotkeys.cpp +++ b/src/core/hotkeys.cpp @@ -315,8 +315,7 @@ DEFINE_HOTKEY("TogglePGXP", TRANSLATE_NOOP("Hotkeys", "Graphics"), TRANSLATE_NOO PGXP::Shutdown(); // we need to recompile all blocks if pgxp is toggled on/off - if (g_settings.IsUsingCodeCache()) - CPU::CodeCache::Flush(); + CPU::CodeCache::Reset(); // need to swap interpreters System::InterruptExecution(); @@ -407,8 +406,7 @@ DEFINE_HOTKEY("TogglePGXPCPU", TRANSLATE_NOOP("Hotkeys", "Graphics"), TRANSLATE_ PGXP::Initialize(); // we need to recompile all blocks if pgxp is toggled on/off - if (g_settings.IsUsingCodeCache()) - CPU::CodeCache::Flush(); + CPU::CodeCache::Reset(); } }) diff --git a/src/core/imgui_overlays.cpp b/src/core/imgui_overlays.cpp index b1d8685ba..a52b9a5c5 100644 --- a/src/core/imgui_overlays.cpp +++ b/src/core/imgui_overlays.cpp @@ -349,8 +349,9 @@ void ImGuiManager::DrawPerformanceOverlay() System::GetMaximumFrameTime()); DRAW_LINE(fixed_font, text, IM_COL32(255, 255, 255, 255)); - if (g_settings.cpu_overclock_active || (!g_settings.IsUsingRecompiler() || g_settings.cpu_recompiler_icache || - g_settings.cpu_recompiler_memory_exceptions)) + if (g_settings.cpu_overclock_active || + (g_settings.cpu_execution_mode != CPUExecutionMode::Recompiler || g_settings.cpu_recompiler_icache || + g_settings.cpu_recompiler_memory_exceptions)) { first = true; text.assign("CPU["); diff --git a/src/core/settings.h b/src/core/settings.h index efaf0da48..7fc54a3fb 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -254,8 +254,6 @@ struct Settings bool log_to_window = false; bool log_to_file = false; - ALWAYS_INLINE bool IsUsingCodeCache() const { return (cpu_execution_mode != CPUExecutionMode::Interpreter); } - ALWAYS_INLINE bool IsUsingRecompiler() const { return (cpu_execution_mode == CPUExecutionMode::Recompiler); } ALWAYS_INLINE bool IsUsingSoftwareRenderer() const { return (gpu_renderer == GPURenderer::Software); } ALWAYS_INLINE bool IsRunaheadEnabled() const { return (runahead_frames > 0); } @@ -275,12 +273,6 @@ struct Settings gpu_pgxp_depth_clear_threshold = value / GPU_PGXP_DEPTH_THRESHOLD_SCALE; } - ALWAYS_INLINE bool IsUsingFastmem() const - { - return (cpu_fastmem_mode != CPUFastmemMode::Disabled && cpu_execution_mode == CPUExecutionMode::Recompiler && - !cpu_recompiler_memory_exceptions); - } - ALWAYS_INLINE s32 GetAudioOutputVolume(bool fast_forwarding) const { return audio_output_muted ? 0 : (fast_forwarding ? 
audio_fast_forward_volume : audio_output_volume); diff --git a/src/core/system.cpp b/src/core/system.cpp index 5f0831651..da854247b 100644 --- a/src/core/system.cpp +++ b/src/core/system.cpp @@ -244,6 +244,8 @@ void System::Internal::ProcessStartup() if (!Bus::AllocateMemory()) Panic("Failed to allocate memory for emulated bus."); + CPU::CodeCache::ProcessStartup(); + // This will call back to Host::LoadSettings() -> ReloadSources(). LoadSettings(false); @@ -265,6 +267,7 @@ void System::Internal::ProcessShutdown() InputManager::CloseSources(); + CPU::CodeCache::ProcessShutdown(); Bus::ReleaseMemory(); } @@ -1508,6 +1511,8 @@ bool System::Initialize(bool force_software_renderer) return false; } + CPU::CodeCache::Initialize(); + if (!CreateGPU(force_software_renderer ? GPURenderer::Software : g_settings.gpu_renderer, false)) { Bus::Shutdown(); @@ -1536,9 +1541,6 @@ bool System::Initialize(bool force_software_renderer) return false; } - // CPU code cache must happen after GPU, because it might steal our address space. - CPU::CodeCache::Initialize(); - DMA::Initialize(); InterruptController::Initialize(); @@ -1704,6 +1706,7 @@ void System::Execute() // TODO: Purge reset/restore g_gpu->RestoreDeviceContext(); + TimingEvents::UpdateCPUDowncount(); if (s_rewind_load_counter >= 0) DoRewind(); @@ -2037,9 +2040,9 @@ bool System::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di if (sw.IsReading()) { if (is_memory_state) - CPU::CodeCache::InvalidateAll(); + CPU::CodeCache::InvalidateAllRAMBlocks(); else - CPU::CodeCache::Flush(); + CPU::CodeCache::Reset(); } // only reset pgxp if we're not runahead-rollbacking. the value checks will save us from broken rendering, and it @@ -2158,7 +2161,7 @@ void System::InternalReset() return; CPU::Reset(); - CPU::CodeCache::Flush(); + CPU::CodeCache::Reset(); if (g_settings.gpu_pgxp_enable) PGXP::Initialize(); @@ -3522,7 +3525,10 @@ void System::CheckForSettingsChanges(const Settings& old_settings) g_settings.cpu_execution_mode))), 5.0f); CPU::ExecutionModeChanged(); - CPU::CodeCache::Reinitialize(); + if (old_settings.cpu_execution_mode != CPUExecutionMode::Interpreter) + CPU::CodeCache::Shutdown(); + if (g_settings.cpu_execution_mode != CPUExecutionMode::Interpreter) + CPU::CodeCache::Initialize(); CPU::ClearICache(); } @@ -3534,12 +3540,7 @@ void System::CheckForSettingsChanges(const Settings& old_settings) { Host::AddOSDMessage(TRANSLATE_STR("OSDMessage", "Recompiler options changed, flushing all blocks."), 5.0f); CPU::ExecutionModeChanged(); - - // changing memory exceptions can re-enable fastmem - if (g_settings.cpu_recompiler_memory_exceptions != old_settings.cpu_recompiler_memory_exceptions) - CPU::CodeCache::Reinitialize(); - else - CPU::CodeCache::Flush(); + CPU::CodeCache::Reset(); if (g_settings.cpu_recompiler_icache != old_settings.cpu_recompiler_icache) CPU::ClearICache(); @@ -3597,20 +3598,13 @@ void System::CheckForSettingsChanges(const Settings& old_settings) g_settings.gpu_pgxp_vertex_cache != old_settings.gpu_pgxp_vertex_cache || g_settings.gpu_pgxp_cpu != old_settings.gpu_pgxp_cpu))) { - if (g_settings.IsUsingCodeCache()) - { - Host::AddOSDMessage(g_settings.gpu_pgxp_enable ? 
- TRANSLATE_STR("OSDMessage", "PGXP enabled, recompiling all blocks.") : - TRANSLATE_STR("OSDMessage", "PGXP disabled, recompiling all blocks."), - 5.0f); - CPU::CodeCache::Flush(); - } - if (old_settings.gpu_pgxp_enable) PGXP::Shutdown(); if (g_settings.gpu_pgxp_enable) PGXP::Initialize(); + + CPU::CodeCache::Reset(); } if (g_settings.cdrom_readahead_sectors != old_settings.cdrom_readahead_sectors) diff --git a/src/util/page_fault_handler.cpp b/src/util/page_fault_handler.cpp index 08db476b6..4ee2a4222 100644 --- a/src/util/page_fault_handler.cpp +++ b/src/util/page_fault_handler.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "page_fault_handler.h" @@ -28,24 +28,12 @@ struct RegisteredHandler { Callback callback; const void* owner; - void* start_pc; - u32 code_size; }; static std::vector m_handlers; static std::mutex m_handler_lock; static thread_local bool s_in_handler; -#if defined(CPU_ARCH_ARM32) -static bool IsStoreInstruction(const void* ptr) -{ - u32 bits; - std::memcpy(&bits, ptr, sizeof(bits)); - - // TODO - return false; -} - -#elif defined(CPU_ARCH_ARM64) +#if defined(CPU_ARCH_ARM64) static bool IsStoreInstruction(const void* ptr) { u32 bits; @@ -146,7 +134,7 @@ static void SIGSEGVHandler(int sig, siginfo_t* info, void* ctx) const bool is_write = (static_cast(ctx)->uc_mcontext.gregs[REG_ERR] & 2) != 0; #elif defined(CPU_ARCH_ARM32) void* const exception_pc = reinterpret_cast(static_cast(ctx)->uc_mcontext.arm_pc); - const bool is_write = IsStoreInstruction(exception_pc); + const bool is_write = (static_cast(ctx)->uc_mcontext.error_code & (1 << 11)) != 0; // DFSR.WnR #elif defined(CPU_ARCH_ARM64) void* const exception_pc = reinterpret_cast(static_cast(ctx)->uc_mcontext.pc); const bool is_write = IsStoreInstruction(exception_pc); @@ -221,7 +209,7 @@ static void SIGSEGVHandler(int sig, siginfo_t* info, void* ctx) #endif -bool InstallHandler(const void* owner, void* start_pc, u32 code_size, Callback callback) +bool InstallHandler(const void* owner, Callback callback) { bool was_empty; { @@ -267,7 +255,7 @@ bool InstallHandler(const void* owner, void* start_pc, u32 code_size, Callback c #endif } - m_handlers.push_back(RegisteredHandler{callback, owner, start_pc, code_size}); + m_handlers.push_back(RegisteredHandler{callback, owner}); return true; } diff --git a/src/util/page_fault_handler.h b/src/util/page_fault_handler.h index 6eb00dfb1..0bc52310f 100644 --- a/src/util/page_fault_handler.h +++ b/src/util/page_fault_handler.h @@ -14,7 +14,7 @@ enum class HandlerResult using Callback = HandlerResult (*)(void* exception_pc, void* fault_address, bool is_write); using Handle = void*; -bool InstallHandler(const void* owner, void* start_pc, u32 code_size, Callback callback); +bool InstallHandler(const void* owner, Callback callback); bool RemoveHandler(const void* owner); } // namespace Common::PageFaultHandler