diff --git a/README.md b/README.md
index a667aafa8..b747fbf86 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ A "BIOS" ROM image is required to to start the emulator and to play games. You c
 
 ## Latest News
 
+- 2020/08/01: Initial PGXP (geometry/perspective correction) support.
 - 2020/07/28: Qt frontend supports displaying interface in multiple languages.
 - 2020/07/23: m3u multi-disc support for libretro core.
 - 2020/07/22: Support multiple bindings for each controller button/axis.
diff --git a/android/app/src/main/res/xml/root_preferences.xml b/android/app/src/main/res/xml/root_preferences.xml
index c4a10f0a5..6589670a5 100644
--- a/android/app/src/main/res/xml/root_preferences.xml
+++ b/android/app/src/main/res/xml/root_preferences.xml
@@ -129,6 +129,50 @@
 
     </PreferenceCategory>
   
+    <PreferenceCategory app:title="Enhancements">
+        <!-- Rendering enhancement toggles; every key below lives under the GPU/ prefix. -->
+        <SwitchPreferenceCompat
+            app:key="GPU/TrueColor"
+            app:title="True Color Rendering (24-bit, disables dithering)"
+            app:defaultValue="false"/>
+
+        <SwitchPreferenceCompat
+            app:key="GPU/ScaledDithering"
+            app:title="Scaled Dithering (scale dither pattern to resolution)"
+            app:defaultValue="true"/>
+
+        <SwitchPreferenceCompat
+            app:key="GPU/DisableInterlacing"
+            app:title="Disable Interlacing (force progressive render/scan)"
+            app:defaultValue="true"/>
+
+        <SwitchPreferenceCompat
+            app:key="GPU/ForceNTSCTimings"
+            app:title="Force NTSC Timings (60hz-on-PAL)"
+            app:defaultValue="false"/>
+
+        <SwitchPreferenceCompat
+            app:key="GPU/PGXPEnable"
+            app:title="PGXP Geometry Correction"
+            app:defaultValue="false"/>
+
+        <SwitchPreferenceCompat
+            app:key="GPU/PGXPCulling"
+            app:title="PGXP Culling Correction"
+            app:defaultValue="true"/>
+
+        <SwitchPreferenceCompat
+            app:key="GPU/PGXPTextureCorrection"
+            app:title="PGXP Texture Correction"
+            app:defaultValue="true"/>
+
+        <SwitchPreferenceCompat
+            app:key="GPU/PGXPVertexCache"
+            app:title="PGXP Vertex Cache"
+            app:defaultValue="false"/>
+
+    </PreferenceCategory>
+  
     <PreferenceCategory app:title="Display">
          <ListPreference
             app:key="Display/CropMode"
@@ -144,7 +188,7 @@
             app:entries="@array/settings_display_aspect_ratio_names"
             app:entryValues="@array/settings_display_aspect_ratio_values"
             app:defaultValue="4:3"
-            app:useSimpleSummaryProvider="true" />]
+            app:useSimpleSummaryProvider="true" />
             
          <SwitchPreferenceCompat
             app:key="Display/LinearFiltering"
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index ddac448f6..73b7af22c 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -59,6 +59,8 @@ add_library(core
     negcon.h
     pad.cpp
     pad.h
+    pgxp.cpp
+    pgxp.h
     playstation_mouse.cpp
     playstation_mouse.h
     psf_loader.cpp
diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj
index 0d621ecb4..d8b9669ec 100644
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@@ -80,6 +80,7 @@
     <ClCompile Include="negcon.cpp" />
     <ClCompile Include="pad.cpp" />
     <ClCompile Include="controller.cpp" />
+    <ClCompile Include="pgxp.cpp" />
     <ClCompile Include="playstation_mouse.cpp" />
     <ClCompile Include="psf_loader.cpp" />
     <ClCompile Include="resources.cpp" />
@@ -126,6 +127,7 @@
     <ClInclude Include="negcon.h" />
     <ClInclude Include="pad.h" />
     <ClInclude Include="controller.h" />
+    <ClInclude Include="pgxp.h" />
     <ClInclude Include="playstation_mouse.h" />
     <ClInclude Include="psf_loader.h" />
     <ClInclude Include="resources.h" />
diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters
index 9261a37ae..775cff690 100644
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@@ -46,6 +46,7 @@
     <ClCompile Include="gpu_hw_vulkan.cpp" />
     <ClCompile Include="resources.cpp" />
     <ClCompile Include="host_interface_progress_callback.cpp" />
+    <ClCompile Include="pgxp.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="types.h" />
@@ -94,5 +95,6 @@
     <ClInclude Include="resources.h" />
     <ClInclude Include="host_interface_progress_callback.h" />
     <ClInclude Include="gte_types.h" />
+    <ClInclude Include="pgxp.h" />
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp
index 6a541aa7d..251d8810d 100644
--- a/src/core/cpu_core.cpp
+++ b/src/core/cpu_core.cpp
@@ -6,6 +6,8 @@
 #include "cpu_disasm.h"
 #include "cpu_recompiler_thunks.h"
 #include "gte.h"
+#include "pgxp.h"
+#include "settings.h"
 #include "timing_event.h"
 #include <cstdio>
 Log_SetChannel(CPU::Core);
@@ -73,6 +75,9 @@ void Initialize()
   g_state.cop0_regs.PRID = UINT32_C(0x00000002);
 
   GTE::Initialize();
+
+  if (g_settings.gpu_pgxp_enable)
+    PGXP::Initialize();
 }
 
 void Shutdown()
@@ -100,6 +105,9 @@ void Reset()
   GTE::Reset();
 
   SetPC(RESET_VECTOR);
+
+  if (g_settings.gpu_pgxp_enable)
+    PGXP::Initialize();
 }
 
 bool DoState(StateWrapper& sw)
@@ -137,6 +145,9 @@ bool DoState(StateWrapper& sw)
   if (!GTE::DoState(sw))
     return false;
 
+  if (sw.IsReading() && g_settings.gpu_pgxp_enable) // match Initialize()/Reset(): only touch PGXP when enabled
+    PGXP::Initialize();
+
   return !sw.HasError();
 }
 
@@ -893,7 +904,12 @@ void ExecuteInstruction()
       if (!ReadMemoryByte(addr, &value))
         return;
 
-      WriteRegDelayed(inst.i.rt, SignExtend32(value));
+      const u32 sxvalue = SignExtend32(value);
+
+      WriteRegDelayed(inst.i.rt, sxvalue);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_LBx(inst.bits, sxvalue, addr);
     }
     break;
 
@@ -904,7 +920,11 @@ void ExecuteInstruction()
       if (!ReadMemoryHalfWord(addr, &value))
         return;
 
-      WriteRegDelayed(inst.i.rt, SignExtend32(value));
+      const u32 sxvalue = SignExtend32(value);
+      WriteRegDelayed(inst.i.rt, sxvalue);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_LHx(inst.bits, sxvalue, addr);
     }
     break;
 
@@ -916,6 +936,9 @@ void ExecuteInstruction()
         return;
 
       WriteRegDelayed(inst.i.rt, value);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_LW(inst.bits, value, addr);
     }
     break;
 
@@ -926,7 +949,11 @@ void ExecuteInstruction()
       if (!ReadMemoryByte(addr, &value))
         return;
 
-      WriteRegDelayed(inst.i.rt, ZeroExtend32(value));
+      const u32 zxvalue = ZeroExtend32(value);
+      WriteRegDelayed(inst.i.rt, zxvalue);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_LBx(inst.bits, zxvalue, addr);
     }
     break;
 
@@ -937,7 +964,11 @@ void ExecuteInstruction()
       if (!ReadMemoryHalfWord(addr, &value))
         return;
 
-      WriteRegDelayed(inst.i.rt, ZeroExtend32(value));
+      const u32 zxvalue = ZeroExtend32(value);
+      WriteRegDelayed(inst.i.rt, zxvalue);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_LHx(inst.bits, zxvalue, addr);
     }
     break;
 
@@ -966,6 +997,9 @@ void ExecuteInstruction()
       }
 
       WriteRegDelayed(inst.i.rt, new_value);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_LW(inst.bits, new_value, addr);
     }
     break;
 
@@ -974,6 +1008,9 @@ void ExecuteInstruction()
       const VirtualMemoryAddress addr = ReadReg(inst.i.rs) + inst.i.imm_sext32();
       const u8 value = Truncate8(ReadReg(inst.i.rt));
       WriteMemoryByte(addr, value);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_SB(inst.bits, value, addr);
     }
     break;
 
@@ -982,6 +1019,9 @@ void ExecuteInstruction()
       const VirtualMemoryAddress addr = ReadReg(inst.i.rs) + inst.i.imm_sext32();
       const u16 value = Truncate16(ReadReg(inst.i.rt));
       WriteMemoryHalfWord(addr, value);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_SH(inst.bits, value, addr);
     }
     break;
 
@@ -990,6 +1030,9 @@ void ExecuteInstruction()
       const VirtualMemoryAddress addr = ReadReg(inst.i.rs) + inst.i.imm_sext32();
       const u32 value = ReadReg(inst.i.rt);
       WriteMemoryWord(addr, value);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_SW(inst.bits, value, addr);
     }
     break;
 
@@ -1017,6 +1060,9 @@ void ExecuteInstruction()
       }
 
       WriteMemoryWord(aligned_addr, new_value);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_SW(inst.bits, new_value, addr);
     }
     break;
 
@@ -1132,6 +1178,9 @@ void ExecuteInstruction()
         return;
 
       GTE::WriteRegister(ZeroExtend32(static_cast<u8>(inst.i.rt.GetValue())), value);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_LWC2(inst.bits, value, addr);
     }
     break;
 
@@ -1147,6 +1196,9 @@ void ExecuteInstruction()
       const VirtualMemoryAddress addr = ReadReg(inst.i.rs) + inst.i.imm_sext32();
       const u32 value = GTE::ReadRegister(ZeroExtend32(static_cast<u8>(inst.i.rt.GetValue())));
       WriteMemoryWord(addr, value);
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::CPU_SWC2(inst.bits, value, addr);
     }
     break;
 
@@ -1230,20 +1282,44 @@ void ExecuteCop2Instruction()
     switch (inst.cop.CommonOp())
     {
       case CopCommonInstruction::cfcn:
-        WriteRegDelayed(inst.r.rt, GTE::ReadRegister(static_cast<u32>(inst.r.rd.GetValue()) + 32));
-        break;
+      {
+        const u32 value = GTE::ReadRegister(static_cast<u32>(inst.r.rd.GetValue()) + 32);
+        WriteRegDelayed(inst.r.rt, value);
+
+        if (g_settings.gpu_pgxp_enable)
+          PGXP::CPU_CFC2(inst.bits, value, value);
+      }
+      break;
 
       case CopCommonInstruction::ctcn:
-        GTE::WriteRegister(static_cast<u32>(inst.r.rd.GetValue()) + 32, ReadReg(inst.r.rt));
-        break;
+      {
+        const u32 value = ReadReg(inst.r.rt);
+        GTE::WriteRegister(static_cast<u32>(inst.r.rd.GetValue()) + 32, value);
+
+        if (g_settings.gpu_pgxp_enable)
+          PGXP::CPU_CTC2(inst.bits, value, value);
+      }
+      break;
 
       case CopCommonInstruction::mfcn:
-        WriteRegDelayed(inst.r.rt, GTE::ReadRegister(static_cast<u32>(inst.r.rd.GetValue())));
-        break;
+      {
+        const u32 value = GTE::ReadRegister(static_cast<u32>(inst.r.rd.GetValue()));
+        WriteRegDelayed(inst.r.rt, value);
+
+        if (g_settings.gpu_pgxp_enable)
+          PGXP::CPU_MFC2(inst.bits, value, value);
+      }
+      break;
 
       case CopCommonInstruction::mtcn:
-        GTE::WriteRegister(static_cast<u32>(inst.r.rd.GetValue()), ReadReg(inst.r.rt));
-        break;
+      {
+        const u32 value = ReadReg(inst.r.rt);
+        GTE::WriteRegister(static_cast<u32>(inst.r.rd.GetValue()), value);
+
+        if (g_settings.gpu_pgxp_enable)
+          PGXP::CPU_MTC2(inst.bits, value, value);
+      }
+      break;
 
       case CopCommonInstruction::bcnc:
       default:
diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp
index 42523791a..47fbc8afd 100644
--- a/src/core/cpu_recompiler_code_generator.cpp
+++ b/src/core/cpu_recompiler_code_generator.cpp
@@ -3,6 +3,8 @@
 #include "cpu_core.h"
 #include "cpu_disasm.h"
 #include "gte.h"
+#include "pgxp.h"
+#include "settings.h"
 Log_SetChannel(CPU::Recompiler);
 
 // TODO: Turn load+sext/zext into a single signed/unsigned load
@@ -1115,19 +1117,32 @@ bool CodeGenerator::Compile_Load(const CodeBlockInstruction& cbi)
   {
     case InstructionOp::lb:
     case InstructionOp::lbu:
+    {
       result = EmitLoadGuestMemory(cbi, address, RegSize_8);
       ConvertValueSizeInPlace(&result, RegSize_32, (cbi.instruction.op == InstructionOp::lb));
-      break;
+      if (g_settings.gpu_pgxp_enable)
+        EmitFunctionCall(nullptr, PGXP::CPU_LBx, Value::FromConstantU32(cbi.instruction.bits), result, address);
+    }
+    break;
 
     case InstructionOp::lh:
     case InstructionOp::lhu:
+    {
       result = EmitLoadGuestMemory(cbi, address, RegSize_16);
       ConvertValueSizeInPlace(&result, RegSize_32, (cbi.instruction.op == InstructionOp::lh));
-      break;
+
+      if (g_settings.gpu_pgxp_enable)
+        EmitFunctionCall(nullptr, PGXP::CPU_LHx, Value::FromConstantU32(cbi.instruction.bits), result, address);
+    }
+    break;
 
     case InstructionOp::lw:
+    {
       result = EmitLoadGuestMemory(cbi, address, RegSize_32);
-      break;
+      if (g_settings.gpu_pgxp_enable)
+        EmitFunctionCall(nullptr, PGXP::CPU_LW, Value::FromConstantU32(cbi.instruction.bits), result, address);
+    }
+    break;
 
     default:
       UnreachableCode();
@@ -1153,16 +1168,34 @@ bool CodeGenerator::Compile_Store(const CodeBlockInstruction& cbi)
   switch (cbi.instruction.op)
   {
     case InstructionOp::sb:
+    {
       EmitStoreGuestMemory(cbi, address, value.ViewAsSize(RegSize_8));
-      break;
+      if (g_settings.gpu_pgxp_enable)
+      {
+        EmitFunctionCall(nullptr, PGXP::CPU_SB, Value::FromConstantU32(cbi.instruction.bits),
+                         value.ViewAsSize(RegSize_8), address);
+      }
+    }
+    break;
 
     case InstructionOp::sh:
+    {
       EmitStoreGuestMemory(cbi, address, value.ViewAsSize(RegSize_16));
-      break;
+      if (g_settings.gpu_pgxp_enable)
+      {
+        EmitFunctionCall(nullptr, PGXP::CPU_SH, Value::FromConstantU32(cbi.instruction.bits),
+                         value.ViewAsSize(RegSize_16), address);
+      }
+    }
+    break;
 
     case InstructionOp::sw:
+    {
       EmitStoreGuestMemory(cbi, address, value);
-      break;
+      if (g_settings.gpu_pgxp_enable)
+        EmitFunctionCall(nullptr, PGXP::CPU_SW, Value::FromConstantU32(cbi.instruction.bits), value, address);
+    }
+    break;
 
     default:
       UnreachableCode();
@@ -1827,11 +1860,17 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi)
     {
       Value value = EmitLoadGuestMemory(cbi, address, RegSize_32);
       DoGTERegisterWrite(reg, value);
+
+      if (g_settings.gpu_pgxp_enable)
+        EmitFunctionCall(nullptr, PGXP::CPU_LWC2, Value::FromConstantU32(cbi.instruction.bits), value, address);
     }
     else
     {
       Value value = DoGTERegisterRead(reg);
       EmitStoreGuestMemory(cbi, address, value);
+
+      if (g_settings.gpu_pgxp_enable)
+        EmitFunctionCall(nullptr, PGXP::CPU_SWC2, Value::FromConstantU32(cbi.instruction.bits), value, address);
     }
 
     InstructionEpilogue(cbi);
@@ -1851,7 +1890,19 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi)
                         ((cbi.instruction.cop.CommonOp() == CopCommonInstruction::cfcn) ? 32 : 0);
 
         InstructionPrologue(cbi, 1);
-        m_register_cache.WriteGuestRegisterDelayed(cbi.instruction.r.rt, DoGTERegisterRead(reg));
+
+        Value value = DoGTERegisterRead(reg);
+
+        // PGXP done first here before ownership is transferred.
+        if (g_settings.gpu_pgxp_enable)
+        {
+          EmitFunctionCall(
+            nullptr, (cbi.instruction.cop.CommonOp() == CopCommonInstruction::cfcn) ? PGXP::CPU_CFC2 : PGXP::CPU_MFC2,
+            Value::FromConstantU32(cbi.instruction.bits), value, value);
+        }
+
+        m_register_cache.WriteGuestRegisterDelayed(cbi.instruction.r.rt, std::move(value));
+
         InstructionEpilogue(cbi);
         return true;
       }
@@ -1863,7 +1914,17 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi)
                         ((cbi.instruction.cop.CommonOp() == CopCommonInstruction::ctcn) ? 32 : 0);
 
         InstructionPrologue(cbi, 1);
-        DoGTERegisterWrite(reg, m_register_cache.ReadGuestRegister(cbi.instruction.r.rt));
+
+        Value value = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt);
+        DoGTERegisterWrite(reg, value);
+
+        if (g_settings.gpu_pgxp_enable)
+        {
+          EmitFunctionCall(
+            nullptr, (cbi.instruction.cop.CommonOp() == CopCommonInstruction::ctcn) ? PGXP::CPU_CTC2 : PGXP::CPU_MTC2,
+            Value::FromConstantU32(cbi.instruction.bits), value, value);
+        }
+
         InstructionEpilogue(cbi);
         return true;
       }
diff --git a/src/core/dma.cpp b/src/core/dma.cpp
index f6157f4cf..d685f5158 100644
--- a/src/core/dma.cpp
+++ b/src/core/dma.cpp
@@ -429,7 +429,8 @@ void DMA::UnhaltTransfer(TickCount ticks)
 TickCount DMA::TransferMemoryToDevice(Channel channel, u32 address, u32 increment, u32 word_count)
 {
   const u32* src_pointer = reinterpret_cast<u32*>(Bus::g_ram + address);
-  if (static_cast<s32>(increment) < 0 || ((address + (increment * word_count)) & ADDRESS_MASK) <= address)
+  if (channel != Channel::GPU &&
+      (static_cast<s32>(increment) < 0 || ((address + (increment * word_count)) & ADDRESS_MASK) <= address))
   {
     // Use temp buffer if it's wrapping around
     if (m_transfer_buffer.size() < word_count)
@@ -447,8 +448,21 @@ TickCount DMA::TransferMemoryToDevice(Channel channel, u32 address, u32 incremen
   switch (channel)
   {
     case Channel::GPU:
-      g_gpu->DMAWrite(src_pointer, word_count);
-      break;
+    {
+      if (g_gpu->BeginDMAWrite())
+      {
+        u8* ram_pointer = Bus::g_ram;
+        for (u32 i = 0; i < word_count; i++)
+        {
+          u32 value;
+          std::memcpy(&value, &ram_pointer[address], sizeof(u32));
+          g_gpu->DMAWrite(address, value);
+          address = (address + increment) & ADDRESS_MASK;
+        }
+        g_gpu->EndDMAWrite();
+      }
+    }
+    break;
 
     case Channel::SPU:
       g_spu.DMAWrite(src_pointer, word_count);
diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index deabe216f..4f1a74407 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -349,32 +349,17 @@ void GPU::DMARead(u32* words, u32 word_count)
     words[i] = ReadGPUREAD();
 }
 
-void GPU::DMAWrite(const u32* words, u32 word_count)
+void GPU::EndDMAWrite()
 {
-  switch (m_GPUSTAT.dma_direction)
+  m_fifo_pushed = true;
+  if (!m_syncing)
   {
-    case DMADirection::CPUtoGP0:
-    {
-      m_fifo.PushRange(words, word_count);
-      m_fifo_pushed = true;
-      if (!m_syncing)
-      {
-        ExecuteCommands();
-        UpdateCommandTickEvent();
-      }
-      else
-      {
-        UpdateDMARequest();
-      }
-    }
-    break;
-
-    default:
-    {
-      Log_ErrorPrintf("Unhandled GPU DMA write mode %u for %u words",
-                      static_cast<u32>(m_GPUSTAT.dma_direction.GetValue()), word_count);
-    }
-    break;
+    ExecuteCommands();
+    UpdateCommandTickEvent();
+  }
+  else
+  {
+    UpdateDMARequest();
   }
 }
 
diff --git a/src/core/gpu.h b/src/core/gpu.h
index 668adde1b..62f0a4817 100644
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@@ -136,7 +136,13 @@ public:
 
   // DMA access
   void DMARead(u32* words, u32 word_count);
-  void DMAWrite(const u32* words, u32 word_count);
+
+  ALWAYS_INLINE bool BeginDMAWrite() const { return (m_GPUSTAT.dma_direction == DMADirection::CPUtoGP0); }
+  ALWAYS_INLINE void DMAWrite(u32 address, u32 value) // queues one word together with its source address
+  {
+    m_fifo.Push((ZeroExtend64(address) << 32) | ZeroExtend64(value)); // address in bits 63..32, data in bits 31..0
+  }
+  void EndDMAWrite(); // marks the FIFO as pushed, then executes commands (or updates the DMA request while syncing)
 
   /// Returns the number of pending GPU ticks.
   TickCount GetPendingCRTCTicks() const;
@@ -276,6 +282,14 @@ protected:
   // Sprites/rectangles should be clipped to 12 bits before drawing.
   static constexpr s32 TruncateVertexPosition(s32 x) { return SignExtendN<11, s32>(x); }
 
+  struct NativeVertex
+  {
+    s16 x;
+    s16 y;
+    u32 color;
+    u16 texcoord;
+  };
+
   union VRAMPixel
   {
     u16 bits;
@@ -700,11 +714,15 @@ protected:
     u16 row;
   } m_vram_transfer = {};
 
-  HeapFIFOQueue<u32, MAX_FIFO_SIZE> m_fifo;
+  HeapFIFOQueue<u64, MAX_FIFO_SIZE> m_fifo;
   std::vector<u32> m_blit_buffer;
   u32 m_blit_remaining_words;
   RenderCommand m_render_command{};
 
+  ALWAYS_INLINE u32 FifoPop() { return Truncate32(m_fifo.Pop()); }
+  ALWAYS_INLINE u32 FifoPeek() { return Truncate32(m_fifo.Peek()); }
+  ALWAYS_INLINE u32 FifoPeek(u32 i) { return Truncate32(m_fifo.Peek(i)); }
+
   TickCount m_max_run_ahead = 128;
   u32 m_fifo_size = 128;
 
diff --git a/src/core/gpu_commands.cpp b/src/core/gpu_commands.cpp
index 6d658ac18..c9334822c 100644
--- a/src/core/gpu_commands.cpp
+++ b/src/core/gpu_commands.cpp
@@ -33,7 +33,7 @@ void GPU::ExecuteCommands()
       {
         case BlitterState::Idle:
         {
-          const u32 command = m_fifo.Peek(0) >> 24;
+          const u32 command = FifoPeek(0) >> 24;
           if ((this->*s_GP0_command_handler_table[command])())
             continue;
           else
@@ -45,8 +45,11 @@ void GPU::ExecuteCommands()
           DebugAssert(m_blit_remaining_words > 0);
           const u32 words_to_copy = std::min(m_blit_remaining_words, m_fifo.GetSize());
           const size_t old_size = m_blit_buffer.size();
-          m_blit_buffer.resize(m_blit_buffer.size() + words_to_copy);
-          m_fifo.PopRange(&m_blit_buffer[old_size], words_to_copy);
+          // FIFO entries are 64 bits wide, so they cannot be bulk-copied into the 32-bit blit buffer;
+          // append the low word of each entry instead (same result as the old resize + PopRange).
+          m_blit_buffer.reserve(m_blit_buffer.size() + words_to_copy);
+          for (u32 i = 0; i < words_to_copy; i++)
+            m_blit_buffer.push_back(FifoPop());
           m_blit_remaining_words -= words_to_copy;
           AddCommandTicks(words_to_copy);
 
@@ -72,7 +75,7 @@ void GPU::ExecuteCommands()
           {
             // polyline must have at least two vertices, and the terminator is (word & 0xf000f000) == 0x50005000.
             // terminator is on the first word for the vertex
-            if ((m_fifo.Peek(terminator_index) & UINT32_C(0xF000F000)) == UINT32_C(0x50005000))
+            if ((FifoPeek(terminator_index) & UINT32_C(0xF000F000)) == UINT32_C(0x50005000))
               break;
           }
 
@@ -81,8 +84,11 @@ void GPU::ExecuteCommands()
           if (words_to_copy > 0)
           {
             const size_t old_size = m_blit_buffer.size();
-            m_blit_buffer.resize(m_blit_buffer.size() + words_to_copy);
-            m_fifo.PopRange(&m_blit_buffer[old_size], words_to_copy);
+            // FIFO entries are 64 bits wide, so they cannot be bulk-copied into the 32-bit blit buffer;
+            // append the low word of each entry instead (same result as the old resize + PopRange).
+            m_blit_buffer.reserve(m_blit_buffer.size() + words_to_copy);
+            for (u32 i = 0; i < words_to_copy; i++)
+              m_blit_buffer.push_back(FifoPop());
           }
 
           Log_DebugPrintf("Added %u words to polyline", words_to_copy);
@@ -170,12 +176,12 @@ GPU::GP0CommandHandlerTable GPU::GenerateGP0CommandHandlerTable()
 
 bool GPU::HandleUnknownGP0Command()
 {
-  const u32 command = m_fifo.Peek() >> 24;
+  const u32 command = FifoPeek() >> 24;
   Log_ErrorPrintf("Unimplemented GP0 command 0x%02X", command);
 
   SmallString dump;
   for (u32 i = 0; i < m_fifo.GetSize(); i++)
-    dump.AppendFormattedString("%s0x%08X", (i > 0) ? " " : "", m_fifo.Peek(i));
+    dump.AppendFormattedString("%s0x%08X", (i > 0) ? " " : "", FifoPeek(i));
   Log_ErrorPrintf("FIFO: %s", dump.GetCharArray());
 
   m_fifo.RemoveOne();
@@ -216,7 +222,7 @@ bool GPU::HandleInterruptRequestCommand()
 
 bool GPU::HandleSetDrawModeCommand()
 {
-  const u32 param = m_fifo.Pop() & 0x00FFFFFFu;
+  const u32 param = FifoPop() & 0x00FFFFFFu;
   Log_DebugPrintf("Set draw mode %08X", param);
   SetDrawMode(Truncate16(param));
   AddCommandTicks(1);
@@ -226,7 +232,7 @@ bool GPU::HandleSetDrawModeCommand()
 
 bool GPU::HandleSetTextureWindowCommand()
 {
-  const u32 param = m_fifo.Pop() & 0x00FFFFFFu;
+  const u32 param = FifoPop() & 0x00FFFFFFu;
   SetTextureWindow(param);
   Log_DebugPrintf("Set texture window %02X %02X %02X %02X", m_draw_mode.texture_window_mask_x,
                   m_draw_mode.texture_window_mask_y, m_draw_mode.texture_window_offset_x,
@@ -239,7 +245,7 @@ bool GPU::HandleSetTextureWindowCommand()
 
 bool GPU::HandleSetDrawingAreaTopLeftCommand()
 {
-  const u32 param = m_fifo.Pop() & 0x00FFFFFFu;
+  const u32 param = FifoPop() & 0x00FFFFFFu;
   const u32 left = param & VRAM_WIDTH_MASK;
   const u32 top = (param >> 10) & VRAM_HEIGHT_MASK;
   Log_DebugPrintf("Set drawing area top-left: (%u, %u)", left, top);
@@ -259,7 +265,7 @@ bool GPU::HandleSetDrawingAreaTopLeftCommand()
 
 bool GPU::HandleSetDrawingAreaBottomRightCommand()
 {
-  const u32 param = m_fifo.Pop() & 0x00FFFFFFu;
+  const u32 param = FifoPop() & 0x00FFFFFFu;
 
   const u32 right = param & VRAM_WIDTH_MASK;
   const u32 bottom = (param >> 10) & VRAM_HEIGHT_MASK;
@@ -280,7 +286,7 @@ bool GPU::HandleSetDrawingAreaBottomRightCommand()
 
 bool GPU::HandleSetDrawingOffsetCommand()
 {
-  const u32 param = m_fifo.Pop() & 0x00FFFFFFu;
+  const u32 param = FifoPop() & 0x00FFFFFFu;
   const s32 x = SignExtendN<11, s32>(param & 0x7FFu);
   const s32 y = SignExtendN<11, s32>((param >> 11) & 0x7FFu);
   Log_DebugPrintf("Set drawing offset (%d, %d)", m_drawing_offset.x, m_drawing_offset.y);
@@ -299,7 +305,7 @@ bool GPU::HandleSetDrawingOffsetCommand()
 
 bool GPU::HandleSetMaskBitCommand()
 {
-  const u32 param = m_fifo.Pop() & 0x00FFFFFFu;
+  const u32 param = FifoPop() & 0x00FFFFFFu;
 
   constexpr u32 gpustat_mask = (1 << 11) | (1 << 12);
   const u32 gpustat_bits = (param & 0x03) << 11;
@@ -318,7 +324,7 @@ bool GPU::HandleSetMaskBitCommand()
 
 bool GPU::HandleRenderPolygonCommand()
 {
-  const RenderCommand rc{m_fifo.Peek(0)};
+  const RenderCommand rc{FifoPeek(0)};
 
   // shaded vertices use the colour from the first word for the first vertex
   const u32 words_per_vertex = 1 + BoolToUInt32(rc.texture_enable) + BoolToUInt32(rc.shading_enable);
@@ -344,10 +350,10 @@ bool GPU::HandleRenderPolygonCommand()
   // set draw state up
   if (rc.texture_enable)
   {
-    const u16 texpage_attribute = Truncate16((rc.shading_enable ? m_fifo.Peek(5) : m_fifo.Peek(4)) >> 16);
+    const u16 texpage_attribute = Truncate16((rc.shading_enable ? FifoPeek(5) : FifoPeek(4)) >> 16);
     SetDrawMode((texpage_attribute & DrawMode::Reg::POLYGON_TEXPAGE_MASK) |
                 (m_draw_mode.mode_reg.bits & ~DrawMode::Reg::POLYGON_TEXPAGE_MASK));
-    SetTexturePalette(Truncate16(m_fifo.Peek(2) >> 16));
+    SetTexturePalette(Truncate16(FifoPeek(2) >> 16));
   }
 
   m_stats.num_vertices += num_vertices;
@@ -362,7 +368,7 @@ bool GPU::HandleRenderPolygonCommand()
 
 bool GPU::HandleRenderRectangleCommand()
 {
-  const RenderCommand rc{m_fifo.Peek(0)};
+  const RenderCommand rc{FifoPeek(0)};
   const u32 total_words =
     2 + BoolToUInt32(rc.texture_enable) + BoolToUInt32(rc.rectangle_size == DrawRectangleSize::Variable);
 
@@ -372,7 +378,7 @@ bool GPU::HandleRenderRectangleCommand()
     SynchronizeCRTC();
 
   if (rc.texture_enable)
-    SetTexturePalette(Truncate16(m_fifo.Peek(2) >> 16));
+    SetTexturePalette(Truncate16(FifoPeek(2) >> 16));
 
   const TickCount setup_ticks = 16;
   AddCommandTicks(setup_ticks);
@@ -394,7 +400,7 @@ bool GPU::HandleRenderRectangleCommand()
 
 bool GPU::HandleRenderLineCommand()
 {
-  const RenderCommand rc{m_fifo.Peek(0)};
+  const RenderCommand rc{FifoPeek(0)};
   const u32 total_words = rc.shading_enable ? 4 : 3;
   CHECK_COMMAND_SIZE(total_words);
 
@@ -417,7 +423,7 @@ bool GPU::HandleRenderLineCommand()
 bool GPU::HandleRenderPolyLineCommand()
 {
   // always read the first two vertices, we test for the terminator after that
-  const RenderCommand rc{m_fifo.Peek(0)};
+  const RenderCommand rc{FifoPeek(0)};
   const u32 min_words = rc.shading_enable ? 3 : 4;
   CHECK_COMMAND_SIZE(min_words);
 
@@ -434,8 +440,11 @@ bool GPU::HandleRenderPolyLineCommand()
   m_fifo.RemoveOne();
 
   const u32 words_to_pop = min_words - 1;
-  m_blit_buffer.resize(words_to_pop);
-  m_fifo.PopRange(m_blit_buffer.data(), words_to_pop);
+  // 64-bit FIFO entries are popped one by one; clear first so stale words can't precede this polyline.
+  m_blit_buffer.clear();
+  m_blit_buffer.reserve(words_to_pop);
+  for (u32 i = 0; i < words_to_pop; i++)
+    m_blit_buffer.push_back(FifoPop());
 
   // polyline goes via a different path through the blit buffer
   m_blitter_state = BlitterState::DrawingPolyLine;
@@ -452,11 +461,11 @@ bool GPU::HandleFillRectangleCommand()
 
   FlushRender();
 
-  const u32 color = m_fifo.Pop() & 0x00FFFFFF;
-  const u32 dst_x = m_fifo.Peek() & 0x3F0;
-  const u32 dst_y = (m_fifo.Pop() >> 16) & VRAM_COORD_MASK;
-  const u32 width = ((m_fifo.Peek() & VRAM_WIDTH_MASK) + 0xF) & ~0xF;
-  const u32 height = (m_fifo.Pop() >> 16) & VRAM_HEIGHT_MASK;
+  const u32 color = FifoPop() & 0x00FFFFFF;
+  const u32 dst_x = FifoPeek() & 0x3F0;
+  const u32 dst_y = (FifoPop() >> 16) & VRAM_COORD_MASK;
+  const u32 width = ((FifoPeek() & VRAM_WIDTH_MASK) + 0xF) & ~0xF;
+  const u32 height = (FifoPop() >> 16) & VRAM_HEIGHT_MASK;
 
   Log_DebugPrintf("Fill VRAM rectangle offset=(%u,%u), size=(%u,%u)", dst_x, dst_y, width, height);
 
@@ -472,10 +481,10 @@ bool GPU::HandleCopyRectangleCPUToVRAMCommand()
   CHECK_COMMAND_SIZE(3);
   m_fifo.RemoveOne();
 
-  const u32 dst_x = m_fifo.Peek() & VRAM_COORD_MASK;
-  const u32 dst_y = (m_fifo.Pop() >> 16) & VRAM_COORD_MASK;
-  const u32 copy_width = ReplaceZero(m_fifo.Peek() & VRAM_WIDTH_MASK, 0x400);
-  const u32 copy_height = ReplaceZero((m_fifo.Pop() >> 16) & VRAM_HEIGHT_MASK, 0x200);
+  const u32 dst_x = FifoPeek() & VRAM_COORD_MASK;
+  const u32 dst_y = (FifoPop() >> 16) & VRAM_COORD_MASK;
+  const u32 copy_width = ReplaceZero(FifoPeek() & VRAM_WIDTH_MASK, 0x400);
+  const u32 copy_height = ReplaceZero((FifoPop() >> 16) & VRAM_HEIGHT_MASK, 0x200);
   const u32 num_pixels = copy_width * copy_height;
   const u32 num_words = ((num_pixels + 1) / 2);
 
@@ -520,10 +529,10 @@ bool GPU::HandleCopyRectangleVRAMToCPUCommand()
   CHECK_COMMAND_SIZE(3);
   m_fifo.RemoveOne();
 
-  m_vram_transfer.x = Truncate16(m_fifo.Peek() & VRAM_COORD_MASK);
-  m_vram_transfer.y = Truncate16((m_fifo.Pop() >> 16) & VRAM_COORD_MASK);
-  m_vram_transfer.width = ((Truncate16(m_fifo.Peek()) - 1) & VRAM_WIDTH_MASK) + 1;
-  m_vram_transfer.height = ((Truncate16(m_fifo.Pop() >> 16) - 1) & VRAM_HEIGHT_MASK) + 1;
+  m_vram_transfer.x = Truncate16(FifoPeek() & VRAM_COORD_MASK);
+  m_vram_transfer.y = Truncate16((FifoPop() >> 16) & VRAM_COORD_MASK);
+  m_vram_transfer.width = ((Truncate16(FifoPeek()) - 1) & VRAM_WIDTH_MASK) + 1;
+  m_vram_transfer.height = ((Truncate16(FifoPop() >> 16) - 1) & VRAM_HEIGHT_MASK) + 1;
 
   Log_DebugPrintf("Copy rectangle from VRAM to CPU offset=(%u,%u), size=(%u,%u)", m_vram_transfer.x, m_vram_transfer.y,
                   m_vram_transfer.width, m_vram_transfer.height);
@@ -554,12 +563,12 @@ bool GPU::HandleCopyRectangleVRAMToVRAMCommand()
   CHECK_COMMAND_SIZE(4);
   m_fifo.RemoveOne();
 
-  const u32 src_x = m_fifo.Peek() & VRAM_COORD_MASK;
-  const u32 src_y = (m_fifo.Pop() >> 16) & VRAM_COORD_MASK;
-  const u32 dst_x = m_fifo.Peek() & VRAM_COORD_MASK;
-  const u32 dst_y = (m_fifo.Pop() >> 16) & VRAM_COORD_MASK;
-  const u32 width = ReplaceZero(m_fifo.Peek() & VRAM_WIDTH_MASK, 0x400);
-  const u32 height = ReplaceZero((m_fifo.Pop() >> 16) & VRAM_HEIGHT_MASK, 0x200);
+  const u32 src_x = FifoPeek() & VRAM_COORD_MASK;
+  const u32 src_y = (FifoPop() >> 16) & VRAM_COORD_MASK;
+  const u32 dst_x = FifoPeek() & VRAM_COORD_MASK;
+  const u32 dst_y = (FifoPop() >> 16) & VRAM_COORD_MASK;
+  const u32 width = ReplaceZero(FifoPeek() & VRAM_WIDTH_MASK, 0x400);
+  const u32 height = ReplaceZero((FifoPop() >> 16) & VRAM_HEIGHT_MASK, 0x200);
 
   Log_DebugPrintf("Copy rectangle from VRAM to VRAM src=(%u,%u), dst=(%u,%u), size=(%u,%u)", src_x, src_y, dst_x, dst_y,
                   width, height);
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index aaf93604f..f544451c8 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -2,17 +2,25 @@
 #include "common/assert.h"
 #include "common/log.h"
 #include "common/state_wrapper.h"
+#include "cpu_core.h"
+#include "pgxp.h"
 #include "settings.h"
 #include "system.h"
 #include <imgui.h>
 #include <sstream>
 Log_SetChannel(GPU_HW);
 
-GPU_HW::GPU_HW() : GPU() { m_vram_ptr = m_vram_shadow.data(); }
+GPU_HW::GPU_HW() : GPU()
+{
+  m_vram_ptr = m_vram_shadow.data();
+}
 
 GPU_HW::~GPU_HW() = default;
 
-bool GPU_HW::IsHardwareRenderer() const { return true; }
+bool GPU_HW::IsHardwareRenderer() const
+{
+  return true;
+}
 
 bool GPU_HW::Initialize(HostDisplay* host_display)
 {
@@ -110,35 +118,39 @@ void GPU_HW::HandleFlippedQuadTextureCoordinates(BatchVertex* vertices)
 
   // It might be faster to do more direct checking here, but the code below handles primitives in any order and
   // orientation, and is far more SIMD-friendly if needed.
-  const s32 abx = vertices[1].x - vertices[0].x;
-  const s32 aby = vertices[1].y - vertices[0].y;
-  const s32 bcx = vertices[2].x - vertices[1].x;
-  const s32 bcy = vertices[2].y - vertices[1].y;
-  const s32 cax = vertices[0].x - vertices[2].x;
-  const s32 cay = vertices[0].y - vertices[2].y;
+  const float abx = vertices[1].x - vertices[0].x;
+  const float aby = vertices[1].y - vertices[0].y;
+  const float bcx = vertices[2].x - vertices[1].x;
+  const float bcy = vertices[2].y - vertices[1].y;
+  const float cax = vertices[0].x - vertices[2].x;
+  const float cay = vertices[0].y - vertices[2].y;
 
   // Compute static derivatives, just assume W is uniform across the primitive and that the plane equation remains the
   // same across the quad. (which it is, there is no Z.. yet).
-  const s32 dudx = -aby * vertices[2].u - bcy * vertices[0].u - cay * vertices[1].u;
-  const s32 dvdx = -aby * vertices[2].v - bcy * vertices[0].v - cay * vertices[1].v;
-  const s32 dudy = +abx * vertices[2].u + bcx * vertices[0].u + cax * vertices[1].u;
-  const s32 dvdy = +abx * vertices[2].v + bcx * vertices[0].v + cax * vertices[1].v;
-  const s32 area = bcx * cay - bcy * cax;
+  const float dudx = -aby * static_cast<float>(vertices[2].u) - bcy * static_cast<float>(vertices[0].u) -
+                     cay * static_cast<float>(vertices[1].u);
+  const float dvdx = -aby * static_cast<float>(vertices[2].v) - bcy * static_cast<float>(vertices[0].v) -
+                     cay * static_cast<float>(vertices[1].v);
+  const float dudy = +abx * static_cast<float>(vertices[2].u) + bcx * static_cast<float>(vertices[0].u) +
+                     cax * static_cast<float>(vertices[1].u);
+  const float dvdy = +abx * static_cast<float>(vertices[2].v) + bcx * static_cast<float>(vertices[0].v) +
+                     cax * static_cast<float>(vertices[1].v);
+  const float area = bcx * cay - bcy * cax;
 
   // Detect and reject any triangles with 0 size texture area
   const s32 texArea = (vertices[1].u - vertices[0].u) * (vertices[2].v - vertices[0].v) -
                       (vertices[2].u - vertices[0].u) * (vertices[1].v - vertices[0].v);
 
   // Shouldn't matter as degenerate primitives will be culled anyways.
-  if (area == 0 && texArea == 0)
+  if (area == 0.0f && texArea == 0)
     return;
 
   // Use floats here as it'll be faster than integer divides.
-  const float rcp_area = 1.0f / static_cast<float>(area);
-  const float dudx_area = static_cast<float>(dudx) * rcp_area;
-  const float dudy_area = static_cast<float>(dudy) * rcp_area;
-  const float dvdx_area = static_cast<float>(dvdx) * rcp_area;
-  const float dvdy_area = static_cast<float>(dvdy) * rcp_area;
+  const float rcp_area = 1.0f / area;
+  const float dudx_area = dudx * rcp_area;
+  const float dudy_area = dudy * rcp_area;
+  const float dvdx_area = dvdx * rcp_area;
+  const float dvdy_area = dvdy * rcp_area;
   const bool neg_dudx = dudx_area < 0.0f;
   const bool neg_dudy = dudy_area < 0.0f;
   const bool neg_dvdx = dvdx_area < 0.0f;
@@ -179,22 +191,22 @@ void GPU_HW::HandleFlippedQuadTextureCoordinates(BatchVertex* vertices)
 
 // The PlayStation GPU draws lines from start to end, inclusive. Or, more specifically, inclusive of the greatest delta
 // in the x or y direction.
-void GPU_HW::FixLineVertexCoordinates(BatchVertex& start, BatchVertex& end, s32 dx, s32 dy)
+void GPU_HW::FixLineVertexCoordinates(s32& start_x, s32& start_y, s32& end_x, s32& end_y, s32 dx, s32 dy)
 {
   // deliberately not else if to catch the equal case
   if (dx >= dy)
   {
-    if (start.x > end.x)
-      start.x++;
+    if (start_x > end_x)
+      start_x++;
     else
-      end.x++;
+      end_x++;
   }
   if (dx <= dy)
   {
-    if (start.y > end.y)
-      start.y++;
+    if (start_y > end_y)
+      start_y++;
     else
-      end.y++;
+      end_y++;
   }
 }
 
@@ -202,6 +214,10 @@ void GPU_HW::LoadVertices()
 {
   const RenderCommand rc{m_render_command.bits};
   const u32 texpage = ZeroExtend32(m_draw_mode.mode_reg.bits) | (ZeroExtend32(m_draw_mode.palette_reg) << 16);
+  // The mask-test depth counter is advanced just below. Vertices used to store m_current_depth after that
+  // increment, so fold it in here instead of sampling the stale value (formula of GetCurrentNormalizedVertexDepth).
+  const u32 vertex_depth = m_current_depth + (m_GPUSTAT.check_mask_before_draw ? 1u : 0u);
+  const float depth = 1.0f - (static_cast<float>(vertex_depth) / 65535.0f);
 
   if (m_GPUSTAT.check_mask_before_draw)
     m_current_depth++;
@@ -215,17 +228,36 @@ void GPU_HW::LoadVertices()
       const u32 first_color = rc.color_for_first_vertex;
       const bool shaded = rc.shading_enable;
       const bool textured = rc.texture_enable;
+      const bool pgxp = g_settings.gpu_pgxp_enable;
 
       const u32 num_vertices = rc.quad_polygon ? 4 : 3;
       std::array<BatchVertex, 4> vertices;
+      std::array<std::array<s32, 2>, 4> native_vertex_positions;
+      bool valid_w = g_settings.gpu_pgxp_texture_correction;
       for (u32 i = 0; i < num_vertices; i++)
       {
-        const u32 color = (shaded && i > 0) ? (m_fifo.Pop() & UINT32_C(0x00FFFFFF)) : first_color;
-        const VertexPosition vp{m_fifo.Pop()};
-        const u16 packed_texcoord = textured ? Truncate16(m_fifo.Pop()) : 0;
+        const u32 color = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
+        const u64 maddr_and_pos = m_fifo.Pop();
+        const VertexPosition vp{Truncate32(maddr_and_pos)};
+        const u16 texcoord = textured ? Truncate16(FifoPop()) : 0;
+        const s32 native_x = m_drawing_offset.x + vp.x;
+        const s32 native_y = m_drawing_offset.y + vp.y;
+        native_vertex_positions[i][0] = native_x;
+        native_vertex_positions[i][1] = native_y;
+        vertices[i].Set(static_cast<float>(native_x), static_cast<float>(native_y), depth, 1.0f, color, texpage,
+                        texcoord);
 
-        vertices[i].Set(m_drawing_offset.x + vp.x, m_drawing_offset.y + vp.y, m_current_depth, color, texpage,
-                        packed_texcoord);
+        if (pgxp)
+        {
+          valid_w &=
+            PGXP::GetPreciseVertex(Truncate32(maddr_and_pos >> 32), vp.bits, native_x, native_y, m_drawing_offset.x,
+                                 m_drawing_offset.y, &vertices[i].x, &vertices[i].y, &vertices[i].w);
+        }
+      }
+      if (!valid_w)
+      {
+        for (BatchVertex& v : vertices)
+          v.w = 1.0f;
       }
 
       if (rc.quad_polygon && m_resolution_scale > 1)
@@ -235,19 +267,20 @@ void GPU_HW::LoadVertices()
         return;
 
       // Cull polygons which are too large.
-      const s32 min_x_12 = std::min(vertices[1].x, vertices[2].x);
-      const s32 max_x_12 = std::max(vertices[1].x, vertices[2].x);
-      const s32 min_y_12 = std::min(vertices[1].y, vertices[2].y);
-      const s32 max_y_12 = std::max(vertices[1].y, vertices[2].y);
-      const s32 min_x = std::min(min_x_12, vertices[0].x);
-      const s32 max_x = std::max(max_x_12, vertices[0].x);
-      const s32 min_y = std::min(min_y_12, vertices[0].y);
-      const s32 max_y = std::max(max_y_12, vertices[0].y);
+      const s32 min_x_12 = std::min(native_vertex_positions[1][0], native_vertex_positions[2][0]);
+      const s32 max_x_12 = std::max(native_vertex_positions[1][0], native_vertex_positions[2][0]);
+      const s32 min_y_12 = std::min(native_vertex_positions[1][1], native_vertex_positions[2][1]);
+      const s32 max_y_12 = std::max(native_vertex_positions[1][1], native_vertex_positions[2][1]);
+      const s32 min_x = std::min(min_x_12, native_vertex_positions[0][0]);
+      const s32 max_x = std::max(max_x_12, native_vertex_positions[0][0]);
+      const s32 min_y = std::min(min_y_12, native_vertex_positions[0][1]);
+      const s32 max_y = std::max(max_y_12, native_vertex_positions[0][1]);
 
       if ((max_x - min_x) >= MAX_PRIMITIVE_WIDTH || (max_y - min_y) >= MAX_PRIMITIVE_HEIGHT)
       {
-        Log_DebugPrintf("Culling too-large polygon: %d,%d %d,%d %d,%d", vertices[0].x, vertices[0].y, vertices[1].x,
-                        vertices[1].y, vertices[2].x, vertices[2].y);
+        Log_DebugPrintf("Culling too-large polygon: %d,%d %d,%d %d,%d", native_vertex_positions[0][0],
+                        native_vertex_positions[0][1], native_vertex_positions[1][0], native_vertex_positions[1][1],
+                        native_vertex_positions[2][0], native_vertex_positions[2][1]);
       }
       else
       {
@@ -268,16 +301,17 @@ void GPU_HW::LoadVertices()
       // quads
       if (rc.quad_polygon)
       {
-        const s32 min_x_123 = std::min(min_x_12, vertices[3].x);
-        const s32 max_x_123 = std::max(max_x_12, vertices[3].x);
-        const s32 min_y_123 = std::min(min_y_12, vertices[3].y);
-        const s32 max_y_123 = std::max(max_y_12, vertices[3].y);
+        const s32 min_x_123 = std::min(min_x_12, native_vertex_positions[3][0]);
+        const s32 max_x_123 = std::max(max_x_12, native_vertex_positions[3][0]);
+        const s32 min_y_123 = std::min(min_y_12, native_vertex_positions[3][1]);
+        const s32 max_y_123 = std::max(max_y_12, native_vertex_positions[3][1]);
 
         // Cull polygons which are too large.
         if ((max_x_123 - min_x_123) >= MAX_PRIMITIVE_WIDTH || (max_y_123 - min_y_123) >= MAX_PRIMITIVE_HEIGHT)
         {
-          Log_DebugPrintf("Culling too-large polygon (quad second half): %d,%d %d,%d %d,%d", vertices[2].x,
-                          vertices[2].y, vertices[1].x, vertices[1].y, vertices[0].x, vertices[0].y);
+          Log_DebugPrintf("Culling too-large polygon (quad second half): %d,%d %d,%d %d,%d",
+                          native_vertex_positions[2][0], native_vertex_positions[2][1], native_vertex_positions[1][0],
+                          native_vertex_positions[1][1], native_vertex_positions[0][0], native_vertex_positions[0][1]);
         }
         else
         {
@@ -303,11 +337,11 @@ void GPU_HW::LoadVertices()
     case Primitive::Rectangle:
     {
       const u32 color = rc.color_for_first_vertex;
-      const VertexPosition vp{m_fifo.Pop()};
+      const VertexPosition vp{FifoPop()};
       const s32 pos_x = TruncateVertexPosition(m_drawing_offset.x + vp.x);
       const s32 pos_y = TruncateVertexPosition(m_drawing_offset.y + vp.y);
 
-      const auto [texcoord_x, texcoord_y] = UnpackTexcoord(rc.texture_enable ? Truncate16(m_fifo.Pop()) : 0);
+      const auto [texcoord_x, texcoord_y] = UnpackTexcoord(rc.texture_enable ? Truncate16(FifoPop()) : 0);
       u16 orig_tex_left = ZeroExtend16(texcoord_x);
       u16 orig_tex_top = ZeroExtend16(texcoord_y);
       s32 rectangle_width;
@@ -328,7 +362,7 @@ void GPU_HW::LoadVertices()
           break;
         default:
         {
-          const u32 width_and_height = m_fifo.Pop();
+          const u32 width_and_height = FifoPop();
           rectangle_width = static_cast<s32>(width_and_height & VRAM_WIDTH_MASK);
           rectangle_height = static_cast<s32>((width_and_height >> 16) & VRAM_HEIGHT_MASK);
 
@@ -353,25 +387,25 @@ void GPU_HW::LoadVertices()
       for (s32 y_offset = 0; y_offset < rectangle_height;)
       {
         const s32 quad_height = std::min<s32>(rectangle_height - y_offset, TEXTURE_PAGE_WIDTH - tex_top);
-        const s32 quad_start_y = pos_y + y_offset;
-        const s32 quad_end_y = quad_start_y + quad_height;
+        const float quad_start_y = static_cast<float>(pos_y + y_offset);
+        const float quad_end_y = quad_start_y + static_cast<float>(quad_height);
         const u16 tex_bottom = tex_top + static_cast<u16>(quad_height);
 
         u16 tex_left = orig_tex_left;
         for (s32 x_offset = 0; x_offset < rectangle_width;)
         {
           const s32 quad_width = std::min<s32>(rectangle_width - x_offset, TEXTURE_PAGE_HEIGHT - tex_left);
-          const s32 quad_start_x = pos_x + x_offset;
-          const s32 quad_end_x = quad_start_x + quad_width;
+          const float quad_start_x = static_cast<float>(pos_x + x_offset);
+          const float quad_end_x = quad_start_x + static_cast<float>(quad_width);
           const u16 tex_right = tex_left + static_cast<u16>(quad_width);
 
-          AddNewVertex(quad_start_x, quad_start_y, m_current_depth, color, texpage, tex_left, tex_top);
-          AddNewVertex(quad_end_x, quad_start_y, m_current_depth, color, texpage, tex_right, tex_top);
-          AddNewVertex(quad_start_x, quad_end_y, m_current_depth, color, texpage, tex_left, tex_bottom);
+          AddNewVertex(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, tex_left, tex_top);
+          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top);
+          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom);
 
-          AddNewVertex(quad_start_x, quad_end_y, m_current_depth, color, texpage, tex_left, tex_bottom);
-          AddNewVertex(quad_end_x, quad_start_y, m_current_depth, color, texpage, tex_right, tex_top);
-          AddNewVertex(quad_end_x, quad_end_y, m_current_depth, color, texpage, tex_right, tex_bottom);
+          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom);
+          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top);
+          AddNewVertex(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, tex_right, tex_bottom);
 
           x_offset += quad_width;
           tex_left = 0;
@@ -404,41 +438,41 @@ void GPU_HW::LoadVertices()
         if (rc.shading_enable)
         {
           color0 = rc.color_for_first_vertex;
-          pos0.bits = m_fifo.Pop();
-          color1 = m_fifo.Pop() & UINT32_C(0x00FFFFFF);
-          pos1.bits = m_fifo.Pop();
+          pos0.bits = FifoPop();
+          color1 = FifoPop() & UINT32_C(0x00FFFFFF);
+          pos1.bits = FifoPop();
         }
         else
         {
           color0 = color1 = rc.color_for_first_vertex;
-          pos0.bits = m_fifo.Pop();
-          pos1.bits = m_fifo.Pop();
+          pos0.bits = FifoPop();
+          pos1.bits = FifoPop();
         }
 
         if (!IsDrawingAreaIsValid())
           return;
 
-        BatchVertex start, end;
-        start.Set(m_drawing_offset.x + pos0.x, m_drawing_offset.y + pos0.y, m_current_depth, color0, 0, 0);
-        end.Set(m_drawing_offset.x + pos1.x, m_drawing_offset.y + pos1.y, m_current_depth, color1, 0, 0);
+        s32 start_x = pos0.x + m_drawing_offset.x;
+        s32 start_y = pos0.y + m_drawing_offset.y;
+        s32 end_x = pos1.x + m_drawing_offset.x;
+        s32 end_y = pos1.y + m_drawing_offset.y;
 
-        const s32 min_x = std::min(start.x, end.x);
-        const s32 max_x = std::max(start.x, end.x);
-        const s32 min_y = std::min(start.y, end.y);
-        const s32 max_y = std::max(start.y, end.y);
+        const s32 min_x = std::min(start_x, end_x);
+        const s32 max_x = std::max(start_x, end_x);
+        const s32 min_y = std::min(start_y, end_y);
+        const s32 max_y = std::max(start_y, end_y);
         const s32 dx = max_x - min_x;
         const s32 dy = max_y - min_y;
-
         if (dx >= MAX_PRIMITIVE_WIDTH || dy >= MAX_PRIMITIVE_HEIGHT)
         {
-          Log_DebugPrintf("Culling too-large line: %d,%d - %d,%d", start.x, start.y, end.x, end.y);
+          Log_DebugPrintf("Culling too-large line: %d,%d - %d,%d", start_x, start_y, end_x, end_y);
           return;
         }
 
-        FixLineVertexCoordinates(start, end, dx, dy);
-
-        AddVertex(start);
-        AddVertex(end);
+        FixLineVertexCoordinates(start_x, start_y, end_x, end_y, dx, dy);
+        AddNewVertex(static_cast<float>(start_x), static_cast<float>(start_y), depth, 1.0f, color0, 0,
+                     static_cast<u16>(0));
+        AddNewVertex(static_cast<float>(end_x), static_cast<float>(end_y), depth, 1.0f, color1, 0, static_cast<u16>(0));
 
         const u32 clip_left = static_cast<u32>(std::clamp<s32>(min_x, m_drawing_area.left, m_drawing_area.left));
         const u32 clip_right = static_cast<u32>(std::clamp<s32>(max_x, m_drawing_area.left, m_drawing_area.right)) + 1u;
@@ -461,37 +495,38 @@ void GPU_HW::LoadVertices()
         const u32 first_color = rc.color_for_first_vertex;
         const bool shaded = rc.shading_enable;
 
-        BatchVertex last_vertex;
+        s32 last_x = 0, last_y = 0;
+        u32 last_color = 0;
         u32 buffer_pos = 0;
         for (u32 i = 0; i < num_vertices; i++)
         {
           const u32 color = (shaded && i > 0) ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color;
           const VertexPosition vp{m_blit_buffer[buffer_pos++]};
-
-          BatchVertex vertex;
-          vertex.Set(m_drawing_offset.x + vp.x, m_drawing_offset.y + vp.y, m_current_depth, color, 0, 0);
+          const s32 x = m_drawing_offset.x + vp.x;
+          const s32 y = m_drawing_offset.y + vp.y;
 
           if (i > 0)
           {
-            const s32 min_x = std::min(last_vertex.x, vertex.x);
-            const s32 max_x = std::max(last_vertex.x, vertex.x);
-            const s32 min_y = std::min(last_vertex.y, vertex.y);
-            const s32 max_y = std::max(last_vertex.y, vertex.y);
+            const s32 min_x = std::min(last_x, x);
+            const s32 max_x = std::max(last_x, x);
+            const s32 min_y = std::min(last_y, y);
+            const s32 max_y = std::max(last_y, y);
             const s32 dx = max_x - min_x;
             const s32 dy = max_y - min_y;
 
             if (dx >= MAX_PRIMITIVE_WIDTH || dy >= MAX_PRIMITIVE_HEIGHT)
             {
-              Log_DebugPrintf("Culling too-large line: %d,%d - %d,%d", last_vertex.x, last_vertex.y, vertex.x,
-                              vertex.y);
+              Log_DebugPrintf("Culling too-large line: %d,%d - %d,%d", last_x, last_y, x, y);
             }
             else
             {
-              BatchVertex start(last_vertex);
-              BatchVertex end(vertex);
-              FixLineVertexCoordinates(start, end, dx, dy);
-              AddVertex(start);
-              AddVertex(end);
+              s32 start_x = last_x, start_y = last_y;
+              s32 end_x = x, end_y = y;
+              FixLineVertexCoordinates(start_x, start_y, end_x, end_y, dx, dy);
+              AddNewVertex(static_cast<float>(start_x), static_cast<float>(start_y), depth, 1.0f, last_color, 0,
+                           static_cast<u16>(0));
+              AddNewVertex(static_cast<float>(end_x), static_cast<float>(end_y), depth, 1.0f, color, 0,
+                           static_cast<u16>(0));
 
               const u32 clip_left = static_cast<u32>(std::clamp<s32>(min_x, m_drawing_area.left, m_drawing_area.left));
               const u32 clip_right =
@@ -505,7 +540,9 @@ void GPU_HW::LoadVertices()
             }
           }
 
-          std::memcpy(&last_vertex, &vertex, sizeof(BatchVertex));
+          last_x = x;
+          last_y = y;
+          last_color = color;
         }
       }
     }
diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h
index ae978763e..b12cba164 100644
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@@ -55,24 +55,26 @@ protected:
 
   struct BatchVertex
   {
-    s32 x;
-    s32 y;
-    s32 z;
+    float x;
+    float y;
+    float z;
+    float w;
     u32 color;
     u32 texpage;
     u16 u; // 16-bit texcoords are needed for 256 extent rectangles
     u16 v;
 
-    ALWAYS_INLINE void Set(s32 x_, s32 y_, s32 z_, u32 color_, u32 texpage_, u16 packed_texcoord)
+    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 packed_texcoord)
     {
-      Set(x_, y_, z_, color_, texpage_, packed_texcoord & 0xFF, (packed_texcoord >> 8));
+      Set(x_, y_, z_, w_, color_, texpage_, packed_texcoord & 0xFF, (packed_texcoord >> 8));
     }
 
-    ALWAYS_INLINE void Set(s32 x_, s32 y_, s32 z_, u32 color_, u32 texpage_, u16 u_, u16 v_)
+    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 u_, u16 v_)
     {
       x = x_;
       y = y_;
       z = z_;
+      w = w_;
       color = color_;
       texpage = texpage_;
       u = u_;
@@ -191,7 +193,7 @@ protected:
   /// Returns the value to be written to the depth buffer for the current operation for mask bit emulation.
   ALWAYS_INLINE float GetCurrentNormalizedVertexDepth() const
   {
-    return (static_cast<float>(m_current_depth) / 65535.0f);
+    return 1.0f - (static_cast<float>(m_current_depth) / 65535.0f);
   }
 
   /// Returns the interlaced mode to use when scanning out/displaying.
@@ -234,7 +236,7 @@ protected:
 
   /// Handles quads with flipped texture coordinate directions.
   static void HandleFlippedQuadTextureCoordinates(BatchVertex* vertices);
-  static void FixLineVertexCoordinates(BatchVertex& start, BatchVertex& end, s32 dx, s32 dy);
+  static void FixLineVertexCoordinates(s32& start_x, s32& start_y, s32& end_x, s32& end_y, s32 dx, s32 dy);
 
   HeapArray<u16, VRAM_WIDTH * VRAM_HEIGHT> m_vram_shadow;
 
diff --git a/src/core/gpu_hw_d3d11.cpp b/src/core/gpu_hw_d3d11.cpp
index d0bb17e02..9f1ae2dd1 100644
--- a/src/core/gpu_hw_d3d11.cpp
+++ b/src/core/gpu_hw_d3d11.cpp
@@ -263,7 +263,7 @@ bool GPU_HW_D3D11::CreateTextureBuffer()
 bool GPU_HW_D3D11::CreateBatchInputLayout()
 {
   static constexpr std::array<D3D11_INPUT_ELEMENT_DESC, 4> attributes = {
-    {{"ATTR", 0, DXGI_FORMAT_R32G32B32_SINT, 0, offsetof(BatchVertex, x), D3D11_INPUT_PER_VERTEX_DATA, 0},
+    {{"ATTR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, offsetof(BatchVertex, x), D3D11_INPUT_PER_VERTEX_DATA, 0},
      {"ATTR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, offsetof(BatchVertex, color), D3D11_INPUT_PER_VERTEX_DATA, 0},
      {"ATTR", 2, DXGI_FORMAT_R32_UINT, 0, offsetof(BatchVertex, u), D3D11_INPUT_PER_VERTEX_DATA, 0},
      {"ATTR", 3, DXGI_FORMAT_R32_UINT, 0, offsetof(BatchVertex, texpage), D3D11_INPUT_PER_VERTEX_DATA, 0}}};
diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp
index 393a34158..152ce2442 100644
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@@ -291,7 +291,7 @@ bool GPU_HW_OpenGL::CreateVertexBuffer()
   glEnableVertexAttribArray(1);
   glEnableVertexAttribArray(2);
   glEnableVertexAttribArray(3);
-  glVertexAttribIPointer(0, 3, GL_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, x)));
+  glVertexAttribPointer(0, 4, GL_FLOAT, false, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, x)));
   glVertexAttribPointer(1, 4, GL_UNSIGNED_BYTE, true, sizeof(BatchVertex),
                         reinterpret_cast<void*>(offsetof(BatchVertex, color)));
   glVertexAttribIPointer(2, 1, GL_UNSIGNED_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, u)));
diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp
index 41ec511d7..e8108b08f 100644
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@@ -516,12 +516,12 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool upsc
   const char* output_block_suffix = upscaled_lines ? "VS" : "";
   if (textured)
   {
-    DeclareVertexEntryPoint(ss, {"int3 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1,
+    DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1,
                             {{"nointerpolation", "uint4 v_texpage"}}, false, output_block_suffix);
   }
   else
   {
-    DeclareVertexEntryPoint(ss, {"int3 a_pos", "float4 a_col0"}, 1, 0, {}, false, output_block_suffix);
+    DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0"}, 1, 0, {}, false, output_block_suffix);
   }
 
   ss << R"(
@@ -532,9 +532,10 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool upsc
   float vertex_offset = (RESOLUTION_SCALE == 1u) ? 0.5 : 0.0;
 
   // 0..+1023 -> -1..1
-  float pos_x = ((float(a_pos.x) + vertex_offset) / 512.0) - 1.0;
-  float pos_y = ((float(a_pos.y) + vertex_offset) / -256.0) + 1.0;
-  float pos_z = 1.0 - (float(a_pos.z) / 65535.0);
+  float pos_x = ((a_pos.x + vertex_offset) / 512.0) - 1.0;
+  float pos_y = ((a_pos.y + vertex_offset) / -256.0) + 1.0;
+  float pos_z = a_pos.z;
+  float pos_w = a_pos.w;
 
 #if API_OPENGL || API_OPENGL_ES
   // OpenGL seems to be off by one pixel in the Y direction due to lower-left origin, but only on
@@ -550,7 +551,7 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool upsc
   pos_y = -pos_y;
 #endif
 
-  v_pos = float4(pos_x, pos_y, pos_z, 1.0);
+  v_pos = float4(pos_x * pos_w, pos_y * pos_w, pos_z * pos_w, pos_w);
 
   v_col0 = a_col0;
   #if TEXTURED
diff --git a/src/core/gpu_hw_vulkan.cpp b/src/core/gpu_hw_vulkan.cpp
index 6eceb0b1c..26d7f452e 100644
--- a/src/core/gpu_hw_vulkan.cpp
+++ b/src/core/gpu_hw_vulkan.cpp
@@ -669,7 +669,7 @@ bool GPU_HW_Vulkan::CompilePipelines()
                 gpbuilder.SetRenderPass(m_vram_render_pass, 0);
 
                 gpbuilder.AddVertexBuffer(0, sizeof(BatchVertex), VK_VERTEX_INPUT_RATE_VERTEX);
-                gpbuilder.AddVertexAttribute(0, 0, VK_FORMAT_R32G32B32_SINT, offsetof(BatchVertex, x));
+                gpbuilder.AddVertexAttribute(0, 0, VK_FORMAT_R32G32B32A32_SFLOAT, offsetof(BatchVertex, x));
                 gpbuilder.AddVertexAttribute(1, 0, VK_FORMAT_R8G8B8A8_UNORM, offsetof(BatchVertex, color));
                 if (textured)
                 {
diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp
index 545270750..c68a4f975 100644
--- a/src/core/gpu_sw.cpp
+++ b/src/core/gpu_sw.cpp
@@ -227,18 +227,18 @@ void GPU_SW::DispatchRenderCommand()
       for (u32 i = 0; i < num_vertices; i++)
       {
         SWVertex& vert = vertices[i];
-        const u32 color_rgb = (shaded && i > 0) ? (m_fifo.Pop() & UINT32_C(0x00FFFFFF)) : first_color;
+        const u32 color_rgb = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
         vert.color_r = Truncate8(color_rgb);
         vert.color_g = Truncate8(color_rgb >> 8);
         vert.color_b = Truncate8(color_rgb >> 16);
 
-        const VertexPosition vp{m_fifo.Pop()};
+        const VertexPosition vp{FifoPop()};
         vert.x = vp.x;
         vert.y = vp.y;
 
         if (textured)
         {
-          std::tie(vert.texcoord_x, vert.texcoord_y) = UnpackTexcoord(Truncate16(m_fifo.Pop()));
+          std::tie(vert.texcoord_x, vert.texcoord_y) = UnpackTexcoord(Truncate16(FifoPop()));
         }
         else
         {
@@ -262,8 +262,8 @@ void GPU_SW::DispatchRenderCommand()
     case Primitive::Rectangle:
     {
       const auto [r, g, b] = UnpackColorRGB24(rc.color_for_first_vertex);
-      const VertexPosition vp{m_fifo.Pop()};
-      const u32 texcoord_and_palette = rc.texture_enable ? m_fifo.Pop() : 0;
+      const VertexPosition vp{FifoPop()};
+      const u32 texcoord_and_palette = rc.texture_enable ? FifoPop() : 0;
       const auto [texcoord_x, texcoord_y] = UnpackTexcoord(Truncate16(texcoord_and_palette));
 
       s32 width;
@@ -284,7 +284,7 @@ void GPU_SW::DispatchRenderCommand()
           break;
         default:
         {
-          const u32 width_and_height = m_fifo.Pop();
+          const u32 width_and_height = FifoPop();
           width = static_cast<s32>(width_and_height & VRAM_WIDTH_MASK);
           height = static_cast<s32>((width_and_height >> 16) & VRAM_HEIGHT_MASK);
 
@@ -321,7 +321,7 @@ void GPU_SW::DispatchRenderCommand()
       // first vertex
       SWVertex* p0 = &vertices[0];
       SWVertex* p1 = &vertices[1];
-      p0->SetPosition(VertexPosition{rc.polyline ? m_blit_buffer[buffer_pos++] : m_fifo.Pop()});
+      p0->SetPosition(VertexPosition{rc.polyline ? m_blit_buffer[buffer_pos++] : Truncate32(FifoPop())});
       p0->SetColorRGB24(first_color);
 
       // remaining vertices in line strip
@@ -335,8 +335,8 @@ void GPU_SW::DispatchRenderCommand()
         }
         else
         {
-          p1->SetColorRGB24(shaded ? (m_fifo.Pop() & UINT32_C(0x00FFFFFF)) : first_color);
-          p1->SetPosition(VertexPosition{m_fifo.Pop()});
+          p1->SetColorRGB24(shaded ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color);
+          p1->SetPosition(VertexPosition{Truncate32(FifoPop())});
         }
 
         // down here because of the FIFO pops
diff --git a/src/core/gte.cpp b/src/core/gte.cpp
index 1b2a0d2d5..12cda54ad 100644
--- a/src/core/gte.cpp
+++ b/src/core/gte.cpp
@@ -3,6 +3,7 @@
 #include "common/bitutils.h"
 #include "common/state_wrapper.h"
 #include "cpu_core.h"
+#include "pgxp.h"
 #include "settings.h"
 #include <algorithm>
 #include <array>
@@ -621,6 +622,21 @@ static void RTPS(const s16 V[3], u8 shift, bool lm, bool last)
   CheckMACOverflow<0>(Sy);
   PushSXY(s32(Sx >> 16), s32(Sy >> 16));
 
+  if (g_settings.gpu_pgxp_enable)
+  {
+    // this can potentially use increased precision on Z
+    const float precise_z = std::max<float>((float)REGS.H / 2.f, (float)REGS.SZ3);
+    const float precise_h_div_sz = (float)REGS.H / precise_z;
+    const float fofx = ((float)REGS.OFX / (float)(1 << 16));
+    const float fofy = ((float)REGS.OFY / (float)(1 << 16));
+    float precise_x = fofx + ((float)REGS.IR1 * precise_h_div_sz) * ((g_settings.gpu_widescreen_hack) ? 0.75f : 1.00f);
+    float precise_y = fofy + ((float)REGS.IR2 * precise_h_div_sz);
+
+    precise_x = std::clamp<float>(precise_x, -0x400, 0x3ff);
+    precise_y = std::clamp<float>(precise_y, -0x400, 0x3ff);
+    PGXP::GTE_PushSXYZ2f(precise_x, precise_y, precise_z, REGS.dr32[14]);
+  }
+
   if (last)
   {
     // MAC0=(((H*20000h/SZ3)+1)/2)*DQA+DQB, IR0=MAC0/1000h  ;Depth cueing 0..+1000h
@@ -664,6 +680,19 @@ static void Execute_NCLIP(Instruction inst)
   REGS.FLAG.UpdateError();
 }
 
+static void Execute_NCLIP_PGXP(Instruction inst)
+{
+  // PGXP::GTE_NCLIP_valid() rejected dr32[12..14]: run the regular integer NCLIP instead.
+  if (!PGXP::GTE_NCLIP_valid(REGS.dr32[12], REGS.dr32[13], REGS.dr32[14]))
+  {
+    Execute_NCLIP(inst);
+    return;
+  }
+  // Otherwise start from cleared flags and store PGXP's NCLIP value, converted to s32, in MAC0.
+  REGS.FLAG.Clear();
+  REGS.MAC0 = static_cast<s32>(PGXP::GTE_NCLIP());
+}
+
 static void Execute_AVSZ3(Instruction inst)
 {
   REGS.FLAG.Clear();
@@ -994,8 +1023,13 @@ void ExecuteInstruction(u32 inst_bits)
       break;
 
     case 0x06:
-      Execute_NCLIP(inst);
-      break;
+    {
+      if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling)
+        Execute_NCLIP_PGXP(inst);
+      else
+        Execute_NCLIP(inst);
+    }
+    break;
 
     case 0x0C:
       Execute_OP(inst);
@@ -1092,7 +1126,12 @@ InstructionImpl GetInstructionImpl(u32 inst_bits)
       return &Execute_RTPS;
 
     case 0x06:
-      return &Execute_NCLIP;
+    {
+      if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling)
+        return &Execute_NCLIP_PGXP;
+      else
+        return &Execute_NCLIP;
+    }
 
     case 0x0C:
       return &Execute_OP;
diff --git a/src/core/host_interface.cpp b/src/core/host_interface.cpp
index 4f0d3c6e6..6e945922c 100644
--- a/src/core/host_interface.cpp
+++ b/src/core/host_interface.cpp
@@ -8,12 +8,13 @@
 #include "common/log.h"
 #include "common/string_util.h"
 #include "controller.h"
-#include "cpu_core.h"
 #include "cpu_code_cache.h"
+#include "cpu_core.h"
 #include "dma.h"
 #include "gpu.h"
 #include "gte.h"
 #include "host_display.h"
+#include "pgxp.h"
 #include "save_state_version.h"
 #include "system.h"
 #include <cmath>
@@ -367,6 +368,10 @@ void HostInterface::SetDefaultSettings(SettingsInterface& si)
   si.SetBoolValue("GPU", "DisableInterlacing", false);
   si.SetBoolValue("GPU", "ForceNTSCTimings", false);
   si.SetBoolValue("GPU", "WidescreenHack", false);
+  si.SetBoolValue("GPU", "PGXPEnable", false);
+  si.SetBoolValue("GPU", "PGXPCulling", true);
+  si.SetBoolValue("GPU", "PGXPTextureCorrection", true);
+  si.SetBoolValue("GPU", "PGXPVertexCache", false);
 
   si.SetStringValue("Display", "CropMode", Settings::GetDisplayCropModeName(Settings::DEFAULT_DISPLAY_CROP_MODE));
   si.SetStringValue("Display", "AspectRatio",
@@ -485,6 +490,19 @@ void HostInterface::CheckForSettingsChanges(const Settings& old_settings)
       g_gpu->UpdateSettings();
     }
 
+    if (g_settings.gpu_pgxp_enable != old_settings.gpu_pgxp_enable ||
+        (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling != old_settings.gpu_pgxp_culling))
+    {
+      if (g_settings.IsUsingCodeCache())
+      {
+        ReportFormattedMessage("PGXP %s, recompiling all blocks.", g_settings.gpu_pgxp_enable ? "enabled" : "disabled");
+        CPU::CodeCache::Flush();
+      }
+
+      if (g_settings.gpu_pgxp_enable)
+        PGXP::Initialize();
+    }
+
     if (g_settings.cdrom_read_thread != old_settings.cdrom_read_thread)
       g_cdrom.SetUseReadThread(g_settings.cdrom_read_thread);
 
@@ -625,8 +643,7 @@ void HostInterface::ToggleSoftwareRendering()
   if (System::IsShutdown() || g_settings.gpu_renderer == GPURenderer::Software)
     return;
 
-  const GPURenderer new_renderer =
-    g_gpu->IsHardwareRenderer() ? GPURenderer::Software : g_settings.gpu_renderer;
+  const GPURenderer new_renderer = g_gpu->IsHardwareRenderer() ? GPURenderer::Software : g_settings.gpu_renderer;
 
   AddFormattedOSDMessage(2.0f, "Switching to %s renderer...", Settings::GetRendererDisplayName(new_renderer));
   System::RecreateGPU(new_renderer);
diff --git a/src/core/pgxp.cpp b/src/core/pgxp.cpp
new file mode 100644
index 000000000..7ca7c4f94
--- /dev/null
+++ b/src/core/pgxp.cpp
@@ -0,0 +1,800 @@
+/***************************************************************************
+ *   Original copyright notice from PGXP code from Beetle PSX.             *
+ *   Copyright (C) 2016 by iCatButler                                      *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.           *
+ ***************************************************************************/
+
+#include "pgxp.h"
+#include "settings.h"
+#include <cmath>
+
+namespace PGXP {
+// pgxp_types.h
+typedef struct PGXP_value_Tag
+{
+  float x; // tracked floating-point components shadowing one 32-bit PSX word
+  float y;
+  float z;
+  union
+  {
+    unsigned int flags; // validity bits, tested against the VALID_* masks
+    unsigned char compFlags[4]; // byte view of flags; code pairs compFlags[n] with VALID_n (little-endian layout)
+    unsigned short halfFlags[2];
+  };
+  unsigned int count; // sequence number set by GTE_PushSXYZ2f; InvalidLoad/InvalidStore overwrite it with a tag
+  unsigned int value; // raw PSX word this entry corresponds to; compared by Validate()/MaskValidate()
+
+  unsigned short gFlags; // the vertex cache stores 1 here for a usable entry
+  unsigned char lFlags;
+  unsigned char hFlags;
+} PGXP_value;
+
+// pgxp_value.h
+typedef union
+{
+  struct
+  {
+    u8 l, h, h2, h3;
+  } b;
+  struct
+  {
+    u16 l, h;
+  } w;
+  struct
+  {
+    s8 l, h, h2, h3;
+  } sb;
+  struct
+  {
+    s16 l, h;
+  } sw;
+  u32 d;
+  s32 sd;
+} psx_value;
+
+typedef enum
+{
+  UNINITIALISED = 0,
+  INVALID_PSX_VALUE = 1,
+  INVALID_ADDRESS = 2,
+  INVALID_BITWISE_OP = 3,
+  DIVIDE_BY_ZERO = 4,
+  INVALID_8BIT_LOAD = 5,
+  INVALID_8BIT_STORE = 6
+} PGXP_error_states;
+
+#define NONE 0
+#define ALL 0xFFFFFFFF
+#define VALID 1
+#define VALID_0 (VALID << 0)
+#define VALID_1 (VALID << 8)
+#define VALID_2 (VALID << 16)
+#define VALID_3 (VALID << 24)
+#define VALID_01 (VALID_0 | VALID_1)
+#define VALID_012 (VALID_0 | VALID_1 | VALID_2)
+#define VALID_ALL (VALID_0 | VALID_1 | VALID_2 | VALID_3)
+#define INV_VALID_ALL (ALL ^ VALID_ALL)
+
+static const PGXP_value PGXP_value_invalid_address = {0.f, 0.f, 0.f, {0}, 0, 0, INVALID_ADDRESS, 0, 0}; // no valid bits; gFlags = INVALID_ADDRESS
+static const PGXP_value PGXP_value_zero = {0.f, 0.f, 0.f, {VALID_ALL}, 0, 0, 0, 0, 0}; // flags = VALID_ALL, value = 0 (the mask belongs in the flags union, not in value)
+
+static void Validate(PGXP_value* pV, u32 psxV);
+static void MaskValidate(PGXP_value* pV, u32 psxV, u32 mask, u32 validMask);
+
+typedef union
+{
+  struct
+  {
+    s16 x;
+    s16 y;
+  };
+  struct
+  {
+    u16 ux;
+    u16 uy;
+  };
+  u32 word;
+} low_value;
+
+// pgxp_mem.h
+static u32 PGXP_ConvertAddress(u32 addr);
+static PGXP_value* GetPtr(u32 addr);
+static PGXP_value* ReadMem(u32 addr);
+
+static void ValidateAndCopyMem(PGXP_value* dest, u32 addr, u32 value);
+static void ValidateAndCopyMem16(PGXP_value* dest, u32 addr, u32 value, int sign);
+
+static void WriteMem(PGXP_value* value, u32 addr);
+static void WriteMem16(PGXP_value* src, u32 addr);
+
+// pgxp_gpu.h
+void PGXP_CacheVertex(short sx, short sy, const PGXP_value* _pVertex);
+
+// pgxp_gte.h
+static void PGXP_InitGTE();
+
+// pgxp_cpu.h
+static void PGXP_InitCPU();
+static PGXP_value CPU_reg_mem[34]; // slots 0..31 are indexed by 5-bit register fields; 32 and 33 hold Hi and Lo
+#define CPU_Hi CPU_reg[32]
+#define CPU_Lo CPU_reg[33]
+static PGXP_value CP0_reg_mem[32];
+
+static PGXP_value* CPU_reg = CPU_reg_mem;
+static PGXP_value* CP0_reg = CP0_reg_mem;
+
+// pgxp_value.c
+void Validate(PGXP_value* pV, u32 psxV)
+{
+  // pV must be non-null; a mismatch with the live PSX word strips every VALID_n bit.
+  pV->flags &= (pV->value != psxV) ? INV_VALID_ALL : ALL;
+}
+
+void MaskValidate(PGXP_value* pV, u32 psxV, u32 mask, u32 validMask)
+{
+  // pV must be non-null; only the bits selected by mask are compared, and only validMask is cleared on mismatch.
+  pV->flags &= (((pV->value ^ psxV) & mask) == 0) ? ALL : ~validMask;
+}
+
+// pgxp_mem.c
+static void PGXP_InitMem();
+static PGXP_value Mem[3 * 2048 * 1024 / 4]; // three regions of 2MB/4 entries each, one entry per 32-bit word
+static const u32 UserMemOffset = 0; // region 0: RAM (see PGXP_ConvertAddress)
+static const u32 ScratchOffset = 2048 * 1024 / 4; // region 1: scratchpad
+static const u32 RegisterOffset = 2 * 2048 * 1024 / 4; // region 2: 0x1f801000+ register space
+static const u32 InvalidAddress = 3 * 2048 * 1024 / 4; // one past the last entry; sentinel for "not tracked"
+
+void PGXP_InitMem()
+{
+  memset(Mem, 0, sizeof(Mem)); // all-zero entries carry no VALID bits
+}
+
+u32 PGXP_ConvertAddress(u32 addr) // maps a PSX address to an index into Mem, or InvalidAddress if untracked
+{
+  u32 paddr = addr;
+  switch (paddr >> 24)
+  {
+    case 0x80:
+    case 0xa0:
+    case 0x00:
+      // RAM further mirrored over 8MB
+      paddr = ((paddr & 0x7FFFFF) % 0x200000) >> 2;
+      paddr = UserMemOffset + paddr;
+      break;
+    default:
+      if ((paddr >> 16) == 0x1f80)
+      {
+        if (paddr >= 0x1f801000)
+        {
+          // Only 0x1f80xxxx reaches this point, so the low 16 bits are >= 0x1000 and the subtraction cannot wrap.
+          // (Testing just the top 12 bits let e.g. 0x1f810000 wrap and produce an index far beyond Mem.)
+          paddr = ((paddr & 0xFFFF) - 0x1000) >> 2;
+          paddr = RegisterOffset + paddr;
+          break;
+        }
+        else
+        {
+          // below 0x1f801000: scratchpad, folded into 1KB (0x3FF) and indexed per 32-bit word
+          paddr = (paddr & 0x3FF) >> 2;
+          paddr = ScratchOffset + paddr;
+          break;
+        }
+      }
+
+      paddr = InvalidAddress;
+      break;
+  }
+
+#ifdef GTE_LOG
+    // GTE_LOG("PGXP_Read %x [%x] |", addr, paddr);
+#endif
+
+  return paddr;
+}
+
+PGXP_value* GetPtr(u32 addr)
+{
+  // Translate to a slot index first; untracked addresses have no backing entry.
+  const u32 slot = PGXP_ConvertAddress(addr);
+  if (slot == InvalidAddress)
+    return NULL;
+  return &Mem[slot];
+}
+
+PGXP_value* ReadMem(u32 addr)
+{
+  return GetPtr(addr); // NULL when the address is not tracked
+}
+
+void ValidateAndCopyMem(PGXP_value* dest, u32 addr, u32 value)
+{
+  PGXP_value* const entry = GetPtr(addr);
+  if (!entry)
+  {
+    // Untracked address: hand back the invalid-address marker.
+    *dest = PGXP_value_invalid_address;
+    return;
+  }
+
+  Validate(entry, value); // drops the VALID bits if the entry no longer matches the loaded word
+  *dest = *entry;
+}
+
+void ValidateAndCopyMem16(PGXP_value* dest, u32 addr, u32 value, int sign) // 16-bit load: copies one half of the tracked word into dest
+{
+  u32 validMask = 0;
+  psx_value val, mask;
+  PGXP_value* pMem = GetPtr(addr);
+  if (pMem != NULL)
+  {
+    mask.d = val.d = 0;
+    // byte offset 2 within the word selects the w.h half, anything else the w.l half
+    if ((addr % 4) == 2)
+    {
+      val.w.h = static_cast<u16>(value);
+      mask.w.h = 0xFFFF;
+      validMask = VALID_1;
+    }
+    else
+    {
+      val.w.l = static_cast<u16>(value);
+      mask.w.l = 0xFFFF;
+      validMask = VALID_0;
+    }
+
+    // validate only the selected half in memory, then copy the whole entry
+    MaskValidate(pMem, val.d, mask.d, validMask);
+    *dest = *pMem;
+
+    // for the w.h half, move the y component and its flags down into the x slot
+    if ((addr % 4) == 2)
+    {
+      dest->x = dest->y;
+      dest->lFlags = dest->hFlags;
+      dest->compFlags[0] = dest->compFlags[1];
+    }
+
+    // the upper component of the result is synthesised rather than loaded
+    dest->y = (dest->x < 0) ? -1.f * sign : 0.f; // -sign when x is negative, otherwise 0
+    dest->hFlags = 0;
+    dest->value = value;
+    dest->compFlags[1] = VALID; // iCB: High word is valid, just 0
+    return;
+  }
+
+  *dest = PGXP_value_invalid_address; // untracked address
+}
+
+void WriteMem(PGXP_value* value, u32 addr)
+{
+  // Stores to untracked addresses are silently dropped.
+  PGXP_value* const slot = GetPtr(addr);
+  if (slot != NULL)
+    *slot = *value;
+}
+
+void WriteMem16(PGXP_value* src, u32 addr) // 16-bit store: merges src's x component into one half of the tracked word
+{
+  PGXP_value* dest = GetPtr(addr);
+  psx_value* pVal = NULL;
+
+  if (dest)
+  {
+    pVal = (psx_value*)&dest->value; // view the stored raw word as two half-words
+    // byte offset 2 within the word targets the w.h half (y slot), anything else the w.l half (x slot)
+    if ((addr % 4) == 2)
+    {
+      dest->y = src->x;
+      dest->hFlags = src->lFlags;
+      dest->compFlags[1] = src->compFlags[0];
+      pVal->w.h = (u16)src->value;
+    }
+    else
+    {
+      dest->x = src->x;
+      dest->lFlags = src->lFlags;
+      dest->compFlags[0] = src->compFlags[0];
+      pVal->w.l = (u16)src->value;
+    }
+
+    // overwrite z/w if valid
+    if (src->compFlags[2] == VALID)
+    {
+      dest->z = src->z;
+      dest->compFlags[2] = src->compFlags[2];
+    }
+
+    // dest->valid = dest->valid && src->valid;
+    dest->gFlags |= src->gFlags; // inherit flags from both values (?)
+  }
+}
+
+// pgxp_main.c
+u32 static gMode = 0;
+
+void Initialize()
+{
+  PGXP_InitMem(); // zero the memory shadow table
+  PGXP_InitCPU(); // zero the CPU and CP0 register shadows
+  PGXP_InitGTE(); // zero the GTE data and control register shadows
+}
+
+void PGXP_SetModes(u32 modes)
+{
+  gMode = modes;
+}
+
+u32 PGXP_GetModes()
+{
+  return gMode;
+}
+
+void PGXP_EnableModes(u32 modes)
+{
+  gMode |= modes;
+}
+
+void PGXP_DisableModes(u32 modes)
+{
+  gMode &= ~modes; // clear exactly the requested bits
+}
+
+// pgxp_gte.c
+
+// GTE registers
+static PGXP_value GTE_data_reg_mem[32];
+static PGXP_value GTE_ctrl_reg_mem[32];
+
+static PGXP_value* GTE_data_reg = GTE_data_reg_mem;
+static PGXP_value* GTE_ctrl_reg = GTE_ctrl_reg_mem;
+
+void PGXP_InitGTE()
+{
+  memset(GTE_data_reg_mem, 0, sizeof(GTE_data_reg_mem)); // zeroed entries carry no VALID bits
+  memset(GTE_ctrl_reg_mem, 0, sizeof(GTE_ctrl_reg_mem));
+}
+
+// Instruction register decoding
+#define op(_instr) (_instr >> 26)          // The op part of the instruction register
+#define func(_instr) ((_instr)&0x3F)       // The funct part of the instruction register
+#define sa(_instr) ((_instr >> 6) & 0x1F)  // The sa part of the instruction register
+#define rd(_instr) ((_instr >> 11) & 0x1F) // The rd part of the instruction register
+#define rt(_instr) ((_instr >> 16) & 0x1F) // The rt part of the instruction register
+#define rs(_instr) ((_instr >> 21) & 0x1F) // The rs part of the instruction register
+#define imm(_instr) (_instr & 0xFFFF)      // The immediate part of the instruction register
+
+#define SX0 (GTE_data_reg[12].x)
+#define SY0 (GTE_data_reg[12].y)
+#define SX1 (GTE_data_reg[13].x)
+#define SY1 (GTE_data_reg[13].y)
+#define SX2 (GTE_data_reg[14].x)
+#define SY2 (GTE_data_reg[14].y)
+
+#define SXY0 (GTE_data_reg[12])
+#define SXY1 (GTE_data_reg[13])
+#define SXY2 (GTE_data_reg[14])
+#define SXYP (GTE_data_reg[15])
+
+void GTE_PushSXYZ2f(float _x, float _y, float _z, unsigned int _v) // pushes a precise vertex; _v is the raw word it shadows
+{
+  static unsigned int uCount = 0; // running sequence number, stamped into each pushed entry
+  low_value temp;
+  // push values down FIFO
+  SXY0 = SXY1;
+  SXY1 = SXY2;
+
+  SXY2.x = _x;
+  SXY2.y = _y;
+  SXY2.z = _z;
+  SXY2.value = _v;
+  SXY2.flags = VALID_ALL;
+  SXY2.count = uCount++;
+
+  // split _v into signed 16-bit x/y, which key the vertex cache
+  temp.word = _v;
+  if (g_settings.gpu_pgxp_vertex_cache)
+    PGXP_CacheVertex(temp.x, temp.y, &SXY2);
+  else
+    PGXP_CacheVertex(0, 0, NULL); // a NULL vertex switches the cache to mode_fail, so lookups return NULL
+
+#ifdef GTE_LOG
+  GTE_LOG("PGXP_PUSH (%f, %f) %u %u|", SXY2.x, SXY2.y, SXY2.flags, SXY2.count);
+#endif
+}
+
+void GTE_PushSXYZ2s(s64 _x, s64 _y, s64 _z, u32 v)
+{
+  // X and Y are scaled down by 2^16; Z is converted as-is.
+  const float scale = static_cast<float>(1 << 16);
+  const float x = static_cast<float>(_x) / scale;
+  const float y = static_cast<float>(_y) / scale;
+
+  GTE_PushSXYZ2f(x, y, static_cast<float>(_z), v);
+}
+
+#define VX(n) (psxRegs.CP2D.p[n << 1].sw.l)
+#define VY(n) (psxRegs.CP2D.p[n << 1].sw.h)
+#define VZ(n) (psxRegs.CP2D.p[(n << 1) + 1].sw.l)
+
+int GTE_NCLIP_valid(u32 sxy0, u32 sxy1, u32 sxy2)
+{
+  // Re-check each FIFO entry against the real register contents before trusting it.
+  Validate(&SXY0, sxy0);
+  Validate(&SXY1, sxy1);
+  Validate(&SXY2, sxy2);
+  const unsigned int shared_flags = SXY0.flags & SXY1.flags & SXY2.flags;
+  return ((shared_flags & VALID_01) == VALID_01) ? 1 : 0; // X and Y must be valid in all three entries
+}
+
+float GTE_NCLIP() // NCLIP expression evaluated in float on the three tracked FIFO entries; validate with GTE_NCLIP_valid first
+{
+  float nclip = ((SX0 * SY1) + (SX1 * SY2) + (SX2 * SY0) - (SX0 * SY2) - (SX1 * SY0) - (SX2 * SY1));
+
+  // ensure fractional values are not incorrectly rounded to 0
+  float nclipAbs = std::abs(nclip);
+  if ((0.1f < nclipAbs) && (nclipAbs < 1.f))
+    nclip += (nclip < 0.f ? -1 : 1); // push away from zero so a later float-to-int conversion keeps the sign
+
+  // float AX = SX1 - SX0;
+  // float AY = SY1 - SY0;
+
+  // float BX = SX2 - SX0;
+  // float BY = SY2 - SY0;
+
+  //// normalise A and B
+  // float mA = sqrt((AX*AX) + (AY*AY));
+  // float mB = sqrt((BX*BX) + (BY*BY));
+
+  //// calculate AxB to get Z component of C
+  // float CZ = ((AX * BY) - (AY * BX)) * (1 << 12);
+
+  return nclip;
+}
+
+static void PGXP_MTC2_int(PGXP_value value, u32 reg) // stores value into GTE data register shadow `reg`
+{
+  switch (reg)
+  {
+    case 15:
+      // a write to register 15 (SXYP) shifts the FIFO in registers 12..14 and appends value
+      SXY0 = SXY1;
+      SXY1 = SXY2;
+      SXY2 = value;
+      SXYP = SXY2;
+      break;
+
+    case 31:
+      return; // writes to register 31 are not recorded
+  }
+
+  GTE_data_reg[reg] = value;
+}
+
+////////////////////////////////////
+// Data transfer tracking
+////////////////////////////////////
+
+void CPU_MFC2(u32 instr, u32 rtVal, u32 rdVal)
+{
+  PGXP_value& src = GTE_data_reg[rd(instr)]; // CPU[Rt] = GTE_D[Rd]
+  Validate(&src, rdVal);
+  CPU_reg[rt(instr)] = src;
+  CPU_reg[rt(instr)].value = rtVal;
+}
+
+void CPU_MTC2(u32 instr, u32 rdVal, u32 rtVal)
+{
+  const u32 dst = rd(instr); // GTE_D[Rd] = CPU[Rt]
+  Validate(&CPU_reg[rt(instr)], rtVal);
+  PGXP_MTC2_int(CPU_reg[rt(instr)], dst);
+  GTE_data_reg[dst].value = rdVal;
+}
+
+void CPU_CFC2(u32 instr, u32 rtVal, u32 rdVal)
+{
+  PGXP_value& src = GTE_ctrl_reg[rd(instr)]; // CPU[Rt] = GTE_C[Rd]
+  Validate(&src, rdVal);
+  CPU_reg[rt(instr)] = src;
+  CPU_reg[rt(instr)].value = rtVal;
+}
+
+void CPU_CTC2(u32 instr, u32 rdVal, u32 rtVal)
+{
+  const u32 dst = rd(instr); // GTE_C[Rd] = CPU[Rt]
+  Validate(&CPU_reg[rt(instr)], rtVal);
+  GTE_ctrl_reg[dst] = CPU_reg[rt(instr)];
+  GTE_ctrl_reg[dst].value = rdVal;
+}
+
+////////////////////////////////////
+// Memory Access
+////////////////////////////////////
+void CPU_LWC2(u32 instr, u32 rtVal, u32 addr)
+{
+  // GTE_D[Rt] = Mem[addr], validated against the word that was actually loaded
+  PGXP_value loaded;
+  ValidateAndCopyMem(&loaded, addr, rtVal);
+  PGXP_MTC2_int(loaded, rt(instr));
+}
+
+void CPU_SWC2(u32 instr, u32 rtVal, u32 addr)
+{
+  PGXP_value* const src = &GTE_data_reg[rt(instr)]; // Mem[addr] = GTE_D[Rt]
+  Validate(src, rtVal);
+  WriteMem(src, addr);
+}
+
+// pgxp_gpu.c
+/////////////////////////////////
+//// Blade_Arma's Vertex Cache (CatBlade?)
+/////////////////////////////////
+static const unsigned int mode_init = 0; // cache has not been cleared yet
+static const unsigned int mode_write = 1; // last access was PGXP_CacheVertex
+static const unsigned int mode_read = 2; // last access was PGXP_GetCachedVertex
+static const unsigned int mode_fail = 3; // a NULL vertex was pushed; lookups return NULL
+
+static PGXP_value vertexCache[0x800 * 2][0x800 * 2]; // 4096 x 4096 entries, indexed [sy + 0x800][sx + 0x800]
+
+static unsigned int baseID = 0; // count of the first vertex in the current write session
+static unsigned int lastID = 0; // count of the most recently cached vertex
+static unsigned int cacheMode = 0; // one of the mode_* constants; starts as mode_init
+
+unsigned int IsSessionID(unsigned int vertID)
+{
+  // IDs at or above baseID belong to the session whether or not the counter
+  // has wrapped since the session started.
+  if (vertID >= baseID)
+    return 1;
+
+  // Below baseID: only in session when the counter wrapped (lastID < baseID)
+  // and vertID is among the post-wrap IDs handed out so far.
+  const bool wrapped = (lastID < baseID);
+  if (wrapped && vertID <= lastID)
+    return 1;
+
+  // Everything else predates the session.
+  return 0;
+}
+
+void PGXP_CacheVertex(short sx, short sy, const PGXP_value* _pVertex) // records *_pVertex under screen position (sx, sy)
+{
+  const PGXP_value* pNewVertex = (const PGXP_value*)_pVertex;
+  PGXP_value* pOldVertex = NULL;
+
+  if (!pNewVertex)
+  {
+    cacheMode = mode_fail; // NULL is the caller's signal that caching is off; lookups will return NULL
+    return;
+  }
+
+  // if (bGteAccuracy)
+  {
+    if (cacheMode != mode_write)
+    {
+      // Initialise cache on first use
+      if (cacheMode == mode_init)
+        memset(vertexCache, 0x00, sizeof(vertexCache));
+
+      // First vertex of write session (frame?)
+      cacheMode = mode_write;
+      baseID = pNewVertex->count;
+    }
+
+    lastID = pNewVertex->count;
+
+    if (sx >= -0x800 && sx <= 0x7ff && sy >= -0x800 && sy <= 0x7ff) // only positions that fit the 4096x4096 table
+    {
+      pOldVertex = &vertexCache[sy + 0x800][sx + 0x800];
+
+      // Ambiguity detection is compiled out by the constant-false condition, so the newest vertex always wins
+      if (0) //(IsSessionID(pOldVertex->count) && (pOldVertex->value == pNewVertex->value))
+      {
+        // check to ensure this isn't identical
+        if ((fabsf(pOldVertex->x - pNewVertex->x) > 0.1f) || (fabsf(pOldVertex->y - pNewVertex->y) > 0.1f) ||
+            (fabsf(pOldVertex->z - pNewVertex->z) > 0.1f))
+        {
+          *pOldVertex = *pNewVertex;
+          pOldVertex->gFlags = 5;
+          return;
+        }
+      }
+
+      // Write vertex into cache; gFlags == 1 is what GetPreciseVertex accepts as usable
+      *pOldVertex = *pNewVertex;
+      pOldVertex->gFlags = 1;
+    }
+  }
+}
+
+PGXP_value* PGXP_GetCachedVertex(short sx, short sy) // returns the cache slot for (sx, sy), or NULL if unavailable
+{
+  // if (bGteAccuracy)
+  {
+    if (cacheMode != mode_read)
+    {
+      if (cacheMode == mode_fail)
+        return NULL; // caching was switched off by a NULL push
+
+      // Initialise cache on first use
+      if (cacheMode == mode_init)
+        memset(vertexCache, 0x00, sizeof(vertexCache));
+
+      // First vertex of read session (frame?)
+      cacheMode = mode_read;
+    }
+
+    if (sx >= -0x800 && sx <= 0x7ff && sy >= -0x800 && sy <= 0x7ff)
+    {
+      // Return pointer to cache entry (the caller must still check gFlags; the slot may never have been written)
+      return &vertexCache[sy + 0x800][sx + 0x800];
+    }
+  }
+
+  return NULL; // position outside the table
+}
+
+static float TruncateVertexPosition(float p) // wraps the integer part of p to a signed 11-bit value, keeping the fraction
+{
+  const s32 int_part = static_cast<s32>(p); // conversion truncates toward zero
+  const float int_part_f = static_cast<float>(int_part);
+  return static_cast<float>(static_cast<s16>(int_part << 5) >> 5) + (p - int_part_f); // <<5 then >>5 on s16 keeps the low 11 bits, sign-extended
+}
+
+bool GetPreciseVertex(u32 addr, u32 value, int x, int y, int xOffs, int yOffs, float* out_x, float* out_y, float* out_w)
+{
+  const PGXP_value* vert = ReadMem(addr);
+  if (vert && ((vert->flags & VALID_01) == VALID_01) && (vert->value == value))
+  {
+    // The entry tracked at addr still matches the raw word and has valid X and Y coordinates
+    *out_x = TruncateVertexPosition(vert->x) + static_cast<float>(xOffs);
+    *out_y = TruncateVertexPosition(vert->y) + static_cast<float>(yOffs);
+    *out_w = vert->z / 32768.0f;
+
+    // Report whether the third component (used for out_w) is valid as well
+    return ((vert->flags & VALID_2) == VALID_2);
+  }
+  else
+  {
+    const short psx_x = (short)(value & 0xFFFFu);
+    const short psx_y = (short)(value >> 16);
+
+    // Look in cache for valid vertex
+    vert = PGXP_GetCachedVertex(psx_x, psx_y);
+    if ((vert) && /*(IsSessionID(vert->count)) &&*/ (vert->gFlags == 1))
+    {
+      // a cached value is found and marked usable (gFlags == 1); the session check is currently disabled, so it may
+      // come from an earlier session
+      *out_x = TruncateVertexPosition(vert->x) + static_cast<float>(xOffs);
+      *out_y = TruncateVertexPosition(vert->y) + static_cast<float>(yOffs);
+      *out_w = vert->z / 32768.0f;
+      return false; // iCB: Getting the wrong w component causes too great an error when using perspective correction
+                    // so disable it
+    }
+    else
+    {
+      // no valid value can be found anywhere, use the native PSX data
+      *out_x = static_cast<float>(x);
+      *out_y = static_cast<float>(y);
+      *out_w = 1.0f;
+      return false;
+    }
+  }
+}
+
+// pgxp_cpu.c
+
+// Instruction register decoding
+#define op(_instr) (_instr >> 26)          // The op part of the instruction register
+#define func(_instr) ((_instr)&0x3F)       // The funct part of the instruction register
+#define sa(_instr) ((_instr >> 6) & 0x1F)  // The sa part of the instruction register
+#define rd(_instr) ((_instr >> 11) & 0x1F) // The rd part of the instruction register
+#define rt(_instr) ((_instr >> 16) & 0x1F) // The rt part of the instruction register
+#define rs(_instr) ((_instr >> 21) & 0x1F) // The rs part of the instruction register
+#define imm(_instr) (_instr & 0xFFFF)      // The immediate part of the instruction register
+
+void PGXP_InitCPU()
+{
+  memset(CPU_reg_mem, 0, sizeof(CPU_reg_mem)); // zeroed entries carry no VALID bits
+  memset(CP0_reg_mem, 0, sizeof(CP0_reg_mem));
+}
+
+// invalidate register (invalid 8 bit read): Rt receives the tracked entry at addr (if any) with all flags cleared
+static void InvalidLoad(u32 addr, u32 code, u32 value)
+{
+  u32 reg = ((code >> 16) & 0x1F); // The rt part of the instruction register
+  PGXP_value* pD = NULL;
+  PGXP_value p = {}; // zero every field so nothing indeterminate is copied into the register shadow
+
+  p.x = p.y = -1337; // default values
+
+  // p.valid = 0;
+  // p.count = value;
+  pD = ReadMem(addr);
+
+  if (pD)
+  {
+    p = *pD;
+    p.count = addr; // tag after the copy; assigning it first would be overwritten by the copy
+  }
+  else
+  {
+    p.count = value;
+  }
+
+  p.flags = 0;
+
+  // invalidate register
+  CPU_reg[reg] = p;
+}
+
+// invalidate memory address (invalid 8 bit write): the tracked entry at addr keeps its data but loses all flags
+static void InvalidStore(u32 addr, u32 code, u32 value)
+{
+  u32 reg = ((code >> 16) & 0x1F); // The rt part of the instruction register
+  PGXP_value* pD = NULL;
+  PGXP_value p;
+
+  pD = ReadMem(addr);
+
+  p.x = p.y = -2337; // placeholder; only reaches memory if overwritten by the copy below, since WriteMem drops untracked addresses
+
+  if (pD)
+    p = *pD;
+
+  p.flags = 0;
+  p.count = (reg * 1000) + value; // tag recording the register index and the caller-supplied code
+
+  // invalidate memory
+  WriteMem(&p, addr);
+}
+
+void CPU_LW(u32 instr, u32 rtVal, u32 addr)
+{
+  // Rt = Mem[Rs + Im]; the tracked entry is validated against the loaded word rtVal before being copied
+  ValidateAndCopyMem(&CPU_reg[rt(instr)], addr, rtVal);
+}
+
+void CPU_LBx(u32 instr, u32 rtVal, u32 addr)
+{
+  InvalidLoad(addr, instr, 116); // byte loads are not tracked: Rt loses its flags; rtVal is unused and 116 is only a tag
+}
+
+void CPU_LHx(u32 instr, u32 rtVal, u32 addr)
+{
+  // Rt = Mem[Rs + Im] (sign/zero extended); sign is always passed as 1 here
+  ValidateAndCopyMem16(&CPU_reg[rt(instr)], addr, rtVal, 1);
+}
+
+void CPU_SB(u32 instr, u8 rtVal, u32 addr)
+{
+  InvalidStore(addr, instr, 208); // byte stores are not tracked: the entry at addr loses its flags; 208 is only a tag
+}
+
+void CPU_SH(u32 instr, u16 rtVal, u32 addr)
+{
+  PGXP_value* const src = &CPU_reg[rt(instr)];
+  MaskValidate(src, rtVal, 0xFFFF, VALID_0); // only the low half-word has to match what is being stored
+  WriteMem16(src, addr);
+}
+
+void CPU_SW(u32 instr, u32 rtVal, u32 addr)
+{
+  PGXP_value* const src = &CPU_reg[rt(instr)]; // Mem[Rs + Im] = Rt
+  Validate(src, rtVal);
+  WriteMem(src, addr);
+}
+
+} // namespace PGXP
\ No newline at end of file
diff --git a/src/core/pgxp.h b/src/core/pgxp.h
new file mode 100644
index 000000000..02b996615
--- /dev/null
+++ b/src/core/pgxp.h
@@ -0,0 +1,54 @@
+/***************************************************************************
+ *   Original copyright notice from PGXP code from Beetle PSX.             *
+ *   Copyright (C) 2016 by iCatButler                                      *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.           *
+ ***************************************************************************/
+
+#pragma once
+#include "types.h"
+
+namespace PGXP {
+
+void Initialize(); // zeroes the memory, CPU and GTE shadow state
+
+// -- GTE functions
+// Transforms
+void GTE_PushSXYZ2f(float _x, float _y, float _z, unsigned int _v); // push a precise vertex; _v is the raw word it shadows
+void GTE_PushSXYZ2s(s64 _x, s64 _y, s64 _z, u32 v); // same, with x/y divided by 2^16 first
+int GTE_NCLIP_valid(u32 sxy0, u32 sxy1, u32 sxy2); // 1 when all three tracked FIFO entries match and have valid X/Y
+float GTE_NCLIP(); // NCLIP computed from the tracked FIFO entries
+
+// Data transfer tracking
+void CPU_MFC2(u32 instr, u32 rtVal, u32 rdVal); // copy GTE data reg to GPR reg (MFC2)
+void CPU_MTC2(u32 instr, u32 rdVal, u32 rtVal); // copy GPR reg to GTE data reg (MTC2)
+void CPU_CFC2(u32 instr, u32 rtVal, u32 rdVal); // copy GTE ctrl reg to GPR reg (CFC2)
+void CPU_CTC2(u32 instr, u32 rdVal, u32 rtVal); // copy GPR reg to GTE ctrl reg (CTC2)
+// Memory Access
+void CPU_LWC2(u32 instr, u32 rtVal, u32 addr); // copy memory to GTE reg
+void CPU_SWC2(u32 instr, u32 rtVal, u32 addr); // copy GTE reg to memory
+
+bool GetPreciseVertex(u32 addr, u32 value, int x, int y, int xOffs, int yOffs, float* out_x, float* out_y, float* out_w);
+
+// -- CPU functions (instr is the instruction word, rtVal the value loaded/stored, addr the effective address)
+void CPU_LW(u32 instr, u32 rtVal, u32 addr);
+void CPU_LHx(u32 instr, u32 rtVal, u32 addr);
+void CPU_LBx(u32 instr, u32 rtVal, u32 addr);
+void CPU_SB(u32 instr, u8 rtVal, u32 addr);
+void CPU_SH(u32 instr, u16 rtVal, u32 addr);
+void CPU_SW(u32 instr, u32 rtVal, u32 addr);
+
+} // namespace PGXP
\ No newline at end of file
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index ac8f209f1..dac561636 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -101,6 +101,10 @@ void Settings::Load(SettingsInterface& si)
   gpu_disable_interlacing = si.GetBoolValue("GPU", "DisableInterlacing", false);
   gpu_force_ntsc_timings = si.GetBoolValue("GPU", "ForceNTSCTimings", false);
   gpu_widescreen_hack = si.GetBoolValue("GPU", "WidescreenHack", false);
+  gpu_pgxp_enable = si.GetBoolValue("GPU", "PGXPEnable", false);
+  gpu_pgxp_culling = si.GetBoolValue("GPU", "PGXPCulling", true);
+  gpu_pgxp_texture_correction = si.GetBoolValue("GPU", "PGXPTextureCorrection", true);
+  gpu_pgxp_vertex_cache = si.GetBoolValue("GPU", "PGXPVertexCache", false);
 
   display_crop_mode =
     ParseDisplayCropMode(
@@ -203,6 +207,10 @@ void Settings::Save(SettingsInterface& si) const
   si.SetBoolValue("GPU", "DisableInterlacing", gpu_disable_interlacing);
   si.SetBoolValue("GPU", "ForceNTSCTimings", gpu_force_ntsc_timings);
   si.SetBoolValue("GPU", "WidescreenHack", gpu_widescreen_hack);
+  si.SetBoolValue("GPU", "PGXPEnable", gpu_pgxp_enable);
+  si.SetBoolValue("GPU", "PGXPCulling", gpu_pgxp_culling);
+  si.SetBoolValue("GPU", "PGXPTextureCorrection", gpu_pgxp_texture_correction);
+  si.SetBoolValue("GPU", "PGXPVertexCache", gpu_pgxp_vertex_cache);
 
   si.SetStringValue("Display", "CropMode", GetDisplayCropModeName(display_crop_mode));
   si.SetStringValue("Display", "AspectRatio", GetDisplayAspectRatioName(display_aspect_ratio));
diff --git a/src/core/settings.h b/src/core/settings.h
index 2e6b24d25..d0a010811 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -88,6 +88,10 @@ struct Settings
   bool gpu_disable_interlacing = false;
   bool gpu_force_ntsc_timings = false;
   bool gpu_widescreen_hack = false;
+  bool gpu_pgxp_enable = false;
+  bool gpu_pgxp_culling = true;
+  bool gpu_pgxp_texture_correction = true;
+  bool gpu_pgxp_vertex_cache = false;
   DisplayCropMode display_crop_mode = DisplayCropMode::None;
   DisplayAspectRatio display_aspect_ratio = DisplayAspectRatio::R4_3;
   bool display_linear_filtering = true;
@@ -146,6 +150,7 @@ struct Settings
   bool log_to_window = false;
   bool log_to_file = false;
 
+  ALWAYS_INLINE bool IsUsingCodeCache() const { return (cpu_execution_mode != CPUExecutionMode::Interpreter); }
   ALWAYS_INLINE bool IsUsingRecompiler() const { return (cpu_execution_mode == CPUExecutionMode::Recompiler); }
   ALWAYS_INLINE bool IsUsingSoftwareRenderer() const { return (gpu_renderer == GPURenderer::Software); }
 
diff --git a/src/duckstation-libretro/libretro_host_interface.cpp b/src/duckstation-libretro/libretro_host_interface.cpp
index 41481984a..d60664d20 100644
--- a/src/duckstation-libretro/libretro_host_interface.cpp
+++ b/src/duckstation-libretro/libretro_host_interface.cpp
@@ -352,7 +352,7 @@ void LibretroHostInterface::OnSystemDestroyed()
   m_using_hardware_renderer = false;
 }
 
-static std::array<retro_core_option_definition, 23> s_option_definitions = {{
+static std::array<retro_core_option_definition, 27> s_option_definitions = {{
   {"Console.Region",
    "Console Region",
    "Determines which region/hardware to emulate. Auto-Detect will use the region of the disc inserted.",
@@ -447,6 +447,29 @@ static std::array<retro_core_option_definition, 23> s_option_definitions = {{
    "backgrounds, this enhancement will not work as expected.",
    {{"true", "Enabled"}, {"false", "Disabled"}},
    "false"},
+  {"GPU.PGXPEnable",
+   "PGXP Geometry Correction",
+   "Reduces \"wobbly\" polygons by attempting to preserve the fractional component through memory transfers. Only "
+   "works with the hardware renderers, and may not be compatible with all games.",
+   {{"true", "Enabled"}, {"false", "Disabled"}},
+   "false"},
+  {"GPU.PGXPCulling",
+   "PGXP Culling Correction",
+   "Increases the precision of polygon culling, reducing the number of holes in geometry. Requires geometry correction "
+   "enabled.",
+   {{"true", "Enabled"}, {"false", "Disabled"}},
+   "true"},
+  {"GPU.PGXPTextureCorrection",
+   "PGXP Texture Correction",
+   "Uses perspective-correct interpolation for texture coordinates and colors, straightening out warped textures. "
+   "Requires geometry correction enabled.",
+   {{"true", "Enabled"}, {"false", "Disabled"}},
+   "true"},
+  {"GPU.PGXPVertexCache",
+   "PGXP Vertex Cache",
+   "Uses screen coordinates as a fallback when tracking vertices through memory fails. May improve PGXP compatibility.",
+   {{"true", "Enabled"}, {"false", "Disabled"}},
+   "false"},
   {"Display.CropMode",
    "Crop Mode",
    "Changes how much of the image is cropped. Some games display garbage in the overscan area which is typically "
diff --git a/src/duckstation-qt/gpusettingswidget.cpp b/src/duckstation-qt/gpusettingswidget.cpp
index 82af959b1..cf61e5f65 100644
--- a/src/duckstation-qt/gpusettingswidget.cpp
+++ b/src/duckstation-qt/gpusettingswidget.cpp
@@ -40,11 +40,20 @@ GPUSettingsWidget::GPUSettingsWidget(QtHostInterface* host_interface, QWidget* p
                                                "TextureFiltering");
   SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.widescreenHack, "GPU", "WidescreenHack");
 
+  SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.pgxpEnable, "GPU", "PGXPEnable", false);
+  SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.pgxpCulling, "GPU", "PGXPCulling", true);
+  SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.pgxpTextureCorrection, "GPU",
+                                               "PGXPTextureCorrection", true);
+  SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.pgxpVertexCache, "GPU", "PGXPVertexCache", false);
+
   connect(m_ui.resolutionScale, QOverload<int>::of(&QComboBox::currentIndexChanged), this,
           &GPUSettingsWidget::updateScaledDitheringEnabled);
   connect(m_ui.trueColor, &QCheckBox::stateChanged, this, &GPUSettingsWidget::updateScaledDitheringEnabled);
   updateScaledDitheringEnabled();
 
+  connect(m_ui.pgxpEnable, &QCheckBox::stateChanged, this, &GPUSettingsWidget::updatePGXPSettingsEnabled);
+  updatePGXPSettingsEnabled();
+
   connect(m_ui.renderer, QOverload<int>::of(&QComboBox::currentIndexChanged), this,
           &GPUSettingsWidget::populateGPUAdapters);
   connect(m_ui.adapter, QOverload<int>::of(&QComboBox::currentIndexChanged), this,
@@ -126,6 +135,19 @@ GPUSettingsWidget::GPUSettingsWidget(QtHostInterface* host_interface, QWidget* p
     tr("Scales vertex positions in screen-space to a widescreen aspect ratio, essentially "
        "increasing the field of view from 4:3 to 16:9 in 3D games. For 2D games, or games which "
        "use pre-rendered backgrounds, this enhancement will not work as expected."));
+  dialog->registerWidgetHelp(
+    m_ui.pgxpEnable, tr("Geometry Correction"), tr("Unchecked"),
+    tr("Reduces \"wobbly\" polygons by attempting to preserve the fractional component through memory transfers. Only "
+       "works with the hardware renderers, and may not be compatible with all games."));
+  dialog->registerWidgetHelp(m_ui.pgxpCulling, tr("Culling Correction"), tr("Checked"),
+                             tr("Increases the precision of polygon culling, reducing the number of holes in geometry. "
+                                "Requires geometry correction enabled."));
+  dialog->registerWidgetHelp(m_ui.pgxpTextureCorrection, tr("Texture Correction"), tr("Checked"),
+                             tr("Uses perspective-correct interpolation for texture coordinates and colors, "
+                                "straightening out warped textures. Requires geometry correction enabled."));
+  dialog->registerWidgetHelp(m_ui.pgxpVertexCache, tr("Vertex Cache"), tr("Unchecked"),
+                             tr("Uses screen coordinates as a fallback when tracking vertices through memory fails. "
+                                "May improve PGXP compatibility."));
 }
 
 GPUSettingsWidget::~GPUSettingsWidget() = default;
@@ -232,3 +254,11 @@ void GPUSettingsWidget::onGPUAdapterIndexChanged()
 
   m_host_interface->SetStringSettingValue("GPU", "Adapter", m_ui.adapter->currentText().toUtf8().constData());
 }
+
+// Slot: the dependent PGXP checkboxes are enabled only while the pgxpEnable checkbox is checked.
+void GPUSettingsWidget::updatePGXPSettingsEnabled()
+{
+  const bool pgxp_active = m_ui.pgxpEnable->isChecked();
+  for (QCheckBox* dependent : {m_ui.pgxpCulling, m_ui.pgxpTextureCorrection, m_ui.pgxpVertexCache})
+    dependent->setEnabled(pgxp_active);
+}
diff --git a/src/duckstation-qt/gpusettingswidget.h b/src/duckstation-qt/gpusettingswidget.h
index 433c204e4..5206b77c9 100644
--- a/src/duckstation-qt/gpusettingswidget.h
+++ b/src/duckstation-qt/gpusettingswidget.h
@@ -19,6 +19,7 @@ private Q_SLOTS:
   void updateScaledDitheringEnabled();
   void populateGPUAdapters();
   void onGPUAdapterIndexChanged();
+  void updatePGXPSettingsEnabled();
 
 private:
   void setupAdditionalUi();
diff --git a/src/duckstation-qt/gpusettingswidget.ui b/src/duckstation-qt/gpusettingswidget.ui
index 43a299d77..93aa03e49 100644
--- a/src/duckstation-qt/gpusettingswidget.ui
+++ b/src/duckstation-qt/gpusettingswidget.ui
@@ -7,13 +7,13 @@
     <x>0</x>
     <y>0</y>
     <width>448</width>
-    <height>307</height>
+    <height>720</height>
    </rect>
   </property>
   <property name="windowTitle">
    <string>Form</string>
   </property>
-  <layout class="QVBoxLayout" name="verticalLayout">
+  <layout class="QVBoxLayout" name="verticalLayout_2">
    <property name="leftMargin">
     <number>0</number>
    </property>
@@ -27,165 +27,221 @@
     <number>0</number>
    </property>
    <item>
-    <widget class="QGroupBox" name="groupBox">
-     <property name="title">
-      <string>Basic</string>
+    <widget class="QScrollArea" name="scrollArea">
+     <property name="widgetResizable">
+      <bool>true</bool>
      </property>
-     <layout class="QFormLayout" name="formLayout_3">
-      <item row="0" column="0">
-       <widget class="QLabel" name="label">
-        <property name="text">
-         <string>Renderer:</string>
-        </property>
-       </widget>
-      </item>
-      <item row="0" column="1">
-       <widget class="QComboBox" name="renderer"/>
-      </item>
-      <item row="1" column="0">
-       <widget class="QLabel" name="label_5">
-        <property name="text">
-         <string>Adapter:</string>
-        </property>
-       </widget>
-      </item>
-      <item row="1" column="1">
-       <widget class="QComboBox" name="adapter"/>
-      </item>
-      <item row="2" column="0" colspan="2">
-       <widget class="QCheckBox" name="useDebugDevice">
-        <property name="text">
-         <string>Use Debug Device</string>
-        </property>
-       </widget>
-      </item>
-     </layout>
+     <widget class="QWidget" name="scrollAreaWidgetContents">
+      <property name="geometry">
+       <rect>
+        <x>0</x>
+        <y>0</y>
+        <width>423</width>
+        <height>762</height>
+       </rect>
+      </property>
+      <layout class="QVBoxLayout" name="verticalLayout_3">
+       <item>
+        <widget class="QGroupBox" name="groupBox">
+         <property name="title">
+          <string>Basic</string>
+         </property>
+         <layout class="QFormLayout" name="formLayout_3">
+          <item row="0" column="0">
+           <widget class="QLabel" name="label">
+            <property name="text">
+             <string>Renderer:</string>
+            </property>
+           </widget>
+          </item>
+          <item row="0" column="1">
+           <widget class="QComboBox" name="renderer"/>
+          </item>
+          <item row="1" column="0">
+           <widget class="QLabel" name="label_5">
+            <property name="text">
+             <string>Adapter:</string>
+            </property>
+           </widget>
+          </item>
+          <item row="1" column="1">
+           <widget class="QComboBox" name="adapter"/>
+          </item>
+          <item row="2" column="0" colspan="2">
+           <widget class="QCheckBox" name="useDebugDevice">
+            <property name="text">
+             <string>Use Debug Device</string>
+            </property>
+           </widget>
+          </item>
+         </layout>
+        </widget>
+       </item>
+       <item>
+        <widget class="QGroupBox" name="groupBox_3">
+         <property name="title">
+          <string>Screen Display</string>
+         </property>
+         <layout class="QFormLayout" name="formLayout_4">
+          <item row="0" column="0">
+           <widget class="QLabel" name="label_4">
+            <property name="text">
+             <string>Aspect Ratio:</string>
+            </property>
+           </widget>
+          </item>
+          <item row="0" column="1">
+           <widget class="QComboBox" name="displayAspectRatio"/>
+          </item>
+          <item row="1" column="0">
+           <widget class="QLabel" name="label_3">
+            <property name="text">
+             <string>Crop:</string>
+            </property>
+           </widget>
+          </item>
+          <item row="1" column="1">
+           <widget class="QComboBox" name="displayCropMode"/>
+          </item>
+          <item row="2" column="0" colspan="2">
+           <widget class="QCheckBox" name="displayLinearFiltering">
+            <property name="text">
+             <string>Linear Upscaling</string>
+            </property>
+           </widget>
+          </item>
+          <item row="3" column="0" colspan="2">
+           <widget class="QCheckBox" name="displayIntegerScaling">
+            <property name="text">
+             <string>Integer Upscaling</string>
+            </property>
+           </widget>
+          </item>
+          <item row="4" column="0" colspan="2">
+           <widget class="QCheckBox" name="vsync">
+            <property name="text">
+             <string>VSync</string>
+            </property>
+           </widget>
+          </item>
+         </layout>
+        </widget>
+       </item>
+       <item>
+        <widget class="QGroupBox" name="groupBox_2">
+         <property name="title">
+          <string>Enhancements</string>
+         </property>
+         <layout class="QFormLayout" name="formLayout_2">
+          <item row="0" column="0">
+           <widget class="QLabel" name="label_2">
+            <property name="text">
+             <string>Resolution Scale:</string>
+            </property>
+           </widget>
+          </item>
+          <item row="0" column="1">
+           <widget class="QComboBox" name="resolutionScale"/>
+          </item>
+          <item row="1" column="0" colspan="2">
+           <widget class="QCheckBox" name="trueColor">
+            <property name="text">
+             <string>True Color Rendering (24-bit, disables dithering)</string>
+            </property>
+           </widget>
+          </item>
+          <item row="2" column="0" colspan="2">
+           <widget class="QCheckBox" name="scaledDithering">
+            <property name="text">
+             <string>Scaled Dithering (scale dither pattern to resolution)</string>
+            </property>
+           </widget>
+          </item>
+          <item row="3" column="0" colspan="2">
+           <widget class="QCheckBox" name="disableInterlacing">
+            <property name="text">
+             <string>Disable Interlacing (force progressive render/scan)</string>
+            </property>
+           </widget>
+          </item>
+          <item row="4" column="0" colspan="2">
+           <widget class="QCheckBox" name="forceNTSCTimings">
+            <property name="text">
+             <string>Force NTSC Timings (60hz-on-PAL)</string>
+            </property>
+           </widget>
+          </item>
+          <item row="5" column="0" colspan="2">
+           <widget class="QCheckBox" name="linearTextureFiltering">
+            <property name="text">
+             <string>Bilinear Texture Filtering</string>
+            </property>
+           </widget>
+          </item>
+          <item row="6" column="0" colspan="2">
+           <widget class="QCheckBox" name="widescreenHack">
+            <property name="text">
+             <string>Widescreen Hack</string>
+            </property>
+           </widget>
+          </item>
+         </layout>
+        </widget>
+       </item>
+       <item>
+        <widget class="QGroupBox" name="groupBox_4">
+         <property name="title">
+          <string>PGXP</string>
+         </property>
+         <layout class="QVBoxLayout" name="verticalLayout">
+          <item>
+           <widget class="QCheckBox" name="pgxpEnable">
+            <property name="text">
+             <string>Geometry Correction</string>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <widget class="QCheckBox" name="pgxpCulling">
+            <property name="text">
+             <string>Culling Correction</string>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <widget class="QCheckBox" name="pgxpTextureCorrection">
+            <property name="text">
+             <string>Texture Correction</string>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <widget class="QCheckBox" name="pgxpVertexCache">
+            <property name="text">
+             <string>Vertex Cache</string>
+            </property>
+           </widget>
+          </item>
+         </layout>
+        </widget>
+       </item>
+       <item>
+        <spacer name="verticalSpacer">
+         <property name="orientation">
+          <enum>Qt::Vertical</enum>
+         </property>
+         <property name="sizeHint" stdset="0">
+          <size>
+           <width>20</width>
+           <height>40</height>
+          </size>
+         </property>
+        </spacer>
+       </item>
+      </layout>
+     </widget>
     </widget>
    </item>
-   <item>
-    <widget class="QGroupBox" name="groupBox_3">
-     <property name="title">
-      <string>Screen Display</string>
-     </property>
-     <layout class="QFormLayout" name="formLayout_4">
-      <item row="0" column="0">
-       <widget class="QLabel" name="label_4">
-        <property name="text">
-         <string>Aspect Ratio:</string>
-        </property>
-       </widget>
-      </item>
-      <item row="0" column="1">
-       <widget class="QComboBox" name="displayAspectRatio"/>
-      </item>
-      <item row="1" column="0">
-       <widget class="QLabel" name="label_3">
-        <property name="text">
-         <string>Crop:</string>
-        </property>
-       </widget>
-      </item>
-      <item row="1" column="1">
-       <widget class="QComboBox" name="displayCropMode"/>
-      </item>
-      <item row="2" column="0" colspan="2">
-       <widget class="QCheckBox" name="displayLinearFiltering">
-        <property name="text">
-         <string>Linear Upscaling</string>
-        </property>
-       </widget>
-      </item>
-      <item row="3" column="0" colspan="2">
-       <widget class="QCheckBox" name="displayIntegerScaling">
-        <property name="text">
-         <string>Integer Upscaling</string>
-        </property>
-       </widget>
-      </item>
-      <item row="4" column="0" colspan="2">
-       <widget class="QCheckBox" name="vsync">
-        <property name="text">
-         <string>VSync</string>
-        </property>
-       </widget>
-      </item>
-     </layout>
-    </widget>
-   </item>
-   <item>
-    <widget class="QGroupBox" name="groupBox_2">
-     <property name="title">
-      <string>Enhancements</string>
-     </property>
-     <layout class="QFormLayout" name="formLayout_2">
-      <item row="0" column="0">
-       <widget class="QLabel" name="label_2">
-        <property name="text">
-         <string>Resolution Scale:</string>
-        </property>
-       </widget>
-      </item>
-      <item row="0" column="1">
-       <widget class="QComboBox" name="resolutionScale"/>
-      </item>
-      <item row="1" column="0" colspan="2">
-       <widget class="QCheckBox" name="trueColor">
-        <property name="text">
-         <string>True Color Rendering (24-bit, disables dithering)</string>
-        </property>
-       </widget>
-      </item>
-      <item row="2" column="0" colspan="2">
-       <widget class="QCheckBox" name="scaledDithering">
-        <property name="text">
-         <string>Scaled Dithering (scale dither pattern to resolution)</string>
-        </property>
-       </widget>
-      </item>
-      <item row="3" column="0" colspan="2">
-       <widget class="QCheckBox" name="disableInterlacing">
-        <property name="text">
-         <string>Disable Interlacing (force progressive render/scan)</string>
-        </property>
-       </widget>
-      </item>
-      <item row="4" column="0" colspan="2">
-       <widget class="QCheckBox" name="forceNTSCTimings">
-        <property name="text">
-         <string>Force NTSC Timings (60hz-on-PAL)</string>
-        </property>
-       </widget>
-      </item>
-      <item row="5" column="0" colspan="2">
-       <widget class="QCheckBox" name="linearTextureFiltering">
-        <property name="text">
-         <string>Bilinear Texture Filtering</string>
-        </property>
-       </widget>
-      </item>
-      <item row="6" column="0" colspan="2">
-       <widget class="QCheckBox" name="widescreenHack">
-        <property name="text">
-         <string>Widescreen Hack</string>
-        </property>
-       </widget>
-      </item>
-     </layout>
-    </widget>
-   </item>
-   <item>
-    <spacer name="verticalSpacer">
-     <property name="orientation">
-      <enum>Qt::Vertical</enum>
-     </property>
-     <property name="sizeHint" stdset="0">
-      <size>
-       <width>20</width>
-       <height>40</height>
-      </size>
-     </property>
-    </spacer>
-   </item>
   </layout>
  </widget>
  <resources/>
diff --git a/src/duckstation-sdl/sdl_host_interface.cpp b/src/duckstation-sdl/sdl_host_interface.cpp
index 3b2a89dd4..ec2b9cd56 100644
--- a/src/duckstation-sdl/sdl_host_interface.cpp
+++ b/src/duckstation-sdl/sdl_host_interface.cpp
@@ -858,6 +858,18 @@ void SDLHostInterface::DrawQuickSettingsMenu()
     ImGui::EndMenu();
   }
 
+  if (ImGui::BeginMenu("PGXP"))
+  {
+    settings_changed |= ImGui::MenuItem("PGXP Enabled", nullptr, &m_settings_copy.gpu_pgxp_enable);
+    settings_changed |=
+      ImGui::MenuItem("PGXP Culling", nullptr, &m_settings_copy.gpu_pgxp_culling, m_settings_copy.gpu_pgxp_enable);
+    settings_changed |= ImGui::MenuItem("PGXP Texture Correction", nullptr,
+                                        &m_settings_copy.gpu_pgxp_texture_correction, m_settings_copy.gpu_pgxp_enable);
+    settings_changed |= ImGui::MenuItem("PGXP Vertex Cache", nullptr, &m_settings_copy.gpu_pgxp_vertex_cache,
+                                        m_settings_copy.gpu_pgxp_enable);
+    ImGui::EndMenu();
+  }
+
   settings_changed |= ImGui::MenuItem("True (24-Bit) Color", nullptr, &m_settings_copy.gpu_true_color);
   settings_changed |= ImGui::MenuItem("Scaled Dithering", nullptr, &m_settings_copy.gpu_scaled_dithering);
   settings_changed |= ImGui::MenuItem("Texture Filtering", nullptr, &m_settings_copy.gpu_texture_filtering);
@@ -1316,6 +1328,11 @@ void SDLHostInterface::DrawSettingsWindow()
         settings_changed |= ImGui::Checkbox("Disable Interlacing", &m_settings_copy.gpu_disable_interlacing);
         settings_changed |= ImGui::Checkbox("Force NTSC Timings", &m_settings_copy.gpu_force_ntsc_timings);
         settings_changed |= ImGui::Checkbox("Widescreen Hack", &m_settings_copy.gpu_widescreen_hack);
+
+        settings_changed |= ImGui::Checkbox("PGXP Enabled", &m_settings_copy.gpu_pgxp_enable);
+        settings_changed |= ImGui::Checkbox("PGXP Culling", &m_settings_copy.gpu_pgxp_culling);
+        settings_changed |= ImGui::Checkbox("PGXP Texture Correction", &m_settings_copy.gpu_pgxp_texture_correction);
+        settings_changed |= ImGui::Checkbox("PGXP Vertex Cache", &m_settings_copy.gpu_pgxp_vertex_cache);
       }
 
       ImGui::EndTabItem();
diff --git a/src/frontend-common/common_host_interface.cpp b/src/frontend-common/common_host_interface.cpp
index fb2572904..56faf45ac 100644
--- a/src/frontend-common/common_host_interface.cpp
+++ b/src/frontend-common/common_host_interface.cpp
@@ -8,11 +8,13 @@
 #include "controller_interface.h"
 #include "core/cdrom.h"
 #include "core/controller.h"
+#include "core/cpu_code_cache.h"
 #include "core/dma.h"
 #include "core/game_list.h"
 #include "core/gpu.h"
 #include "core/host_display.h"
 #include "core/mdec.h"
+#include "core/pgxp.h"
 #include "core/save_state_version.h"
 #include "core/spu.h"
 #include "core/system.h"
@@ -1295,6 +1297,22 @@ void CommonHostInterface::RegisterGraphicsHotkeys()
                      ToggleSoftwareRendering();
                  });
 
+  RegisterHotkey(StaticString("Graphics"), StaticString("TogglePGXP"), StaticString("Toggle PGXP"),
+                 [this](bool pressed) {
+                   if (!pressed)
+                   {
+                     g_settings.gpu_pgxp_enable = !g_settings.gpu_pgxp_enable;
+                     ReportFormattedMessage("PGXP is now %s.", g_settings.gpu_pgxp_enable ? "enabled" : "disabled");
+
+                     if (g_settings.gpu_pgxp_enable)
+                       PGXP::Initialize();
+
+                     // we need to recompile all blocks if pgxp is toggled on/off
+                     if (g_settings.IsUsingCodeCache())
+                       CPU::CodeCache::Flush();
+                   }
+                 });
+
   RegisterHotkey(StaticString("Graphics"), StaticString("IncreaseResolutionScale"),
                  StaticString("Increase Resolution Scale"), [this](bool pressed) {
                    if (!pressed)