JIT optimizations and refactoring (#675)

* CPU/Recompiler: Use rel32 call where possible for no-args
* JitCodeBuffer: Support using preallocated buffer
* CPU/Recompiler/AArch64: Use bl instead of blr for short branches
* CPU/CodeCache: Allocate recompiler buffer in program space
  This means we don't need 64-bit moves for every call out of the recompiler.
* GTE: Don't store as u16 and load as u32
* CPU/Recompiler: Add methods to emit global load/stores
* GTE: Convert class to namespace
* CPU/Recompiler: Call GTE functions directly
* Settings: Turn into a global variable
* GPU: Replace local pointers with global
* InterruptController: Turn into a global pointer
* System: Replace local pointers with global
* Timers: Turn into a global instance
* DMA: Turn into a global instance
* SPU: Turn into a global instance
* CDROM: Turn into a global instance
* MDEC: Turn into a global instance
* Pad: Turn into a global instance
* SIO: Turn into a global instance
* CDROM: Move audio FIFO to the heap
* CPU/Recompiler: Drop ASMFunctions
  No longer needed since we have code in the same 4GB window.
* CPUCodeCache: Turn class into namespace
* Bus: Local pointer -> global pointers
* CPU: Turn class into namespace
* Bus: Turn into namespace
* GTE: Store registers in CPU state struct
  Allows relative addressing on ARM.
* CPU/Recompiler: Align code storage to page size
* CPU/Recompiler: Fix relative branches on A64
* HostInterface: Local references to global
* System: Turn into a namespace, move events out
* Add guard pages
* Android: Fix build
2025-06-18 03:25:46 -04:00 · 2020-07-31 17:09:18 +10:00
parent 1f9fc6ab74
commit b6f871d2b9
88 changed files with 4993 additions and 5045 deletions
--- a/src/core/gte.h
+++ b/src/core/gte.h
@@ -1,119 +1,24 @@
 #pragma once
-#include "common/state_wrapper.h"
 #include "gte_types.h"

-namespace CPU {
-class Core;
-
-namespace Recompiler {
-class CodeGenerator;
-}
-} // namespace CPU
+class StateWrapper;

 namespace GTE {

-class Core
-{
-public:
-  friend CPU::Core;
-  friend CPU::Recompiler::CodeGenerator;
+void Initialize();
+void Reset();
+bool DoState(StateWrapper& sw);

-  Core();
-  ~Core();
+// control registers are offset by +32
+u32 ReadRegister(u32 index);
+void WriteRegister(u32 index, u32 value);

-  ALWAYS_INLINE void SetWidescreenHack(bool enabled) { m_widescreen_hack = enabled; }
+// use with care, direct register access
+u32* GetRegisterPtr(u32 index);

-  void Initialize();
-  void Reset();
-  bool DoState(StateWrapper& sw);
+void ExecuteInstruction(u32 inst_bits);

-  // control registers are offset by +32
-  u32 ReadRegister(u32 index) const;
-  void WriteRegister(u32 index, u32 value);
+using InstructionImpl = void (*)(Instruction);
+InstructionImpl GetInstructionImpl(u32 inst_bits);

-  void ExecuteInstruction(Instruction inst);
-
-private:
-  static constexpr s64 MAC0_MIN_VALUE = -(INT64_C(1) << 31);
-  static constexpr s64 MAC0_MAX_VALUE = (INT64_C(1) << 31) - 1;
-  static constexpr s64 MAC123_MIN_VALUE = -(INT64_C(1) << 43);
-  static constexpr s64 MAC123_MAX_VALUE = (INT64_C(1) << 43) - 1;
-  static constexpr s32 IR0_MIN_VALUE = 0x0000;
-  static constexpr s32 IR0_MAX_VALUE = 0x1000;
-  static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15);
-  static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1;
-
-  // Checks for underflow/overflow.
-  template<u32 index>
-  void CheckMACOverflow(s64 value);
-
-  // Checks for underflow/overflow, sign-extending to 31/43 bits.
-  template<u32 index>
-  s64 SignExtendMACResult(s64 value);
-
-  template<u32 index>
-  void TruncateAndSetMAC(s64 value, u8 shift);
-
-  template<u32 index>
-  void TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm);
-
-  template<u32 index>
-  void TruncateAndSetIR(s32 value, bool lm);
-
-  template<u32 index>
-  u32 TruncateRGB(s32 value);
-
-  void SetOTZ(s32 value);
-  void PushSXY(s32 x, s32 y);
-  void PushSZ(s32 value);
-  void PushRGBFromMAC();
-
-  // Divide using Unsigned Newton-Raphson algorithm.
-  u32 UNRDivide(u32 lhs, u32 rhs);
-
-  // 3x3 matrix * 3x1 vector, updates MAC[1-3] and IR[1-3]
-  void MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
-
-  // 3x3 matrix * 3x1 vector with translation, updates MAC[1-3] and IR[1-3]
-  void MulMatVec(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
-  void MulMatVecBuggy(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
-
-  // Interpolate colour, or as in nocash "MAC+(FC-MAC)*IR0".
-  void InterpolateColor(s64 in_MAC1, s64 in_MAC2, s64 in_MAC3, u8 shift, bool lm);
-
-  void RTPS(const s16 V[3], u8 shift, bool lm, bool last);
-  void NCS(const s16 V[3], u8 shift, bool lm);
-  void NCCS(const s16 V[3], u8 shift, bool lm);
-  void NCDS(const s16 V[3], u8 shift, bool lm);
-  void DPCS(const u8 color[3], u8 shift, bool lm);
-
-  void Execute_MVMVA(Instruction inst);
-  void Execute_SQR(Instruction inst);
-  void Execute_OP(Instruction inst);
-  void Execute_RTPS(Instruction inst);
-  void Execute_RTPT(Instruction inst);
-  void Execute_NCLIP(Instruction inst);
-  void Execute_AVSZ3(Instruction inst);
-  void Execute_AVSZ4(Instruction inst);
-  void Execute_NCS(Instruction inst);
-  void Execute_NCT(Instruction inst);
-  void Execute_NCCS(Instruction inst);
-  void Execute_NCCT(Instruction inst);
-  void Execute_NCDS(Instruction inst);
-  void Execute_NCDT(Instruction inst);
-  void Execute_CC(Instruction inst);
-  void Execute_CDP(Instruction inst);
-  void Execute_DPCS(Instruction inst);
-  void Execute_DPCT(Instruction inst);
-  void Execute_DCPL(Instruction inst);
-  void Execute_INTPL(Instruction inst);
-  void Execute_GPL(Instruction inst);
-  void Execute_GPF(Instruction inst);
-
-  Regs m_regs = {};
-  bool m_widescreen_hack = false;
-};
-
-#include "gte.inl"
-
-} // namespace GTE
+} // namespace GTE