diff --git a/src/common/types.h b/src/common/types.h
index 2e67fcc62..5b8ee1d09 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -190,6 +190,15 @@ constexpr bool ConvertToBoolUnchecked(TValue value)
   return ret;
 }
 
+// Generic sign extension
+template<int NBITS, typename T>
+constexpr T SignExtendN(T value)
+{
+  // http://graphics.stanford.edu/~seander/bithacks.html#VariableSignExtend
+  constexpr int shift = 8 * sizeof(T) - NBITS;
+  return static_cast<T>((static_cast<std::make_signed_t<T>>(value) << shift) >> shift);
+}
+
 // Enum class bitwise operators
 #define IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(type_) \
   inline constexpr type_ operator&(type_ lhs, type_ rhs) \
diff --git a/src/pse/gte.cpp b/src/pse/gte.cpp
index c6a7f7047..83ec9f3e1 100644
--- a/src/pse/gte.cpp
+++ b/src/pse/gte.cpp
@@ -405,11 +405,12 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)
 {
   const u8 shift = sf ? 12 : 0;
 #define dot3(i) \
-  CheckMACResult<i + 1>( \
+  SignExtendMACResult<i + 1>( \
     (s64(m_regs.TR[i]) << 12) + \
-    CheckMACResult<i + 1>(CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(m_regs.RT[i][0]) * s32(V[0]))) + \
-                                                s64(s32(m_regs.RT[i][1]) * s32(V[1]))) + \
-                          s64(s32(m_regs.RT[i][2]) * s32(V[2]))))
+    SignExtendMACResult<i + 1>( \
+      SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>(s64(s32(m_regs.RT[i][0]) * s32(V[0]))) + \
+                                 s64(s32(m_regs.RT[i][1]) * s32(V[1]))) + \
+      s64(s32(m_regs.RT[i][2]) * s32(V[2]))))
 
   // IR1 = MAC1 = (TRX*1000h + RT11*VX0 + RT12*VY0 + RT13*VZ0) SAR (sf*12)
   // IR2 = MAC2 = (TRY*1000h + RT21*VX0 + RT22*VY0 + RT23*VZ0) SAR (sf*12)
@@ -451,14 +452,17 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)
 
   // MAC0=(((H*20000h/SZ3)+1)/2)*IR1+OFX, SX2=MAC0/10000h ;ScrX FIFO -400h..+3FFh
   // MAC0=(((H*20000h/SZ3)+1)/2)*IR2+OFY, SY2=MAC0/10000h ;ScrY FIFO -400h..+3FFh
-  const s64 Sx = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX), 0);
-  const s64 Sy = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY), 0);
+  const s64 Sx = s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX);
+  const s64 Sy = s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY);
+  TruncateAndSetMAC<0>(Sx, 0);
+  TruncateAndSetMAC<0>(Sy, 0);
   PushSXY(s32(Sx >> 16), s32(Sy >> 16));
 
   if (last)
   {
     // MAC0=(((H*20000h/SZ3)+1)/2)*DQA+DQB, IR0=MAC0/1000h ;Depth cueing 0..+1000h
-    const s64 Sz = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.DQA) + s64(m_regs.DQB), 0);
+    const s64 Sz = s64(result) * s64(m_regs.DQA) + s64(m_regs.DQB);
+    TruncateAndSetMAC<0>(Sz, 0);
     TruncateAndSetIR<0>(s32(Sz >> 12), true);
   }
 }
@@ -517,8 +521,7 @@ void Core::Execute_AVSZ3(Instruction inst)
 {
   m_regs.FLAG.Clear();
 
-  const s64 result =
-    TruncateAndSetMAC<0>(s64(m_regs.ZSF3) * s32(u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3)), 0);
-
+  const s64 result = s64(m_regs.ZSF3) * s32(u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3));
+  TruncateAndSetMAC<0>(result, 0);
   SetOTZ(s32(result >> 12));
 
@@ -529,8 +532,7 @@ void Core::Execute_AVSZ4(Instruction inst)
 {
   m_regs.FLAG.Clear();
 
-  const s64 result = TruncateAndSetMAC<0>(
-    s64(m_regs.ZSF4) * s32(u32(m_regs.SZ0) + u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3)), 0);
-
+  const s64 result = s64(m_regs.ZSF4) * s32(u32(m_regs.SZ0) + u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3));
+  TruncateAndSetMAC<0>(result, 0);
   SetOTZ(s32(result >> 12));
 
@@ -540,41 +542,31 @@ void Core::Execute_AVSZ4(Instruction inst)
 
 void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
 {
 #define dot3(i) \
-  TruncateAndSetMAC<i + 1>( \
-    CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) + \
-      s64(s32(M[i][2]) * s32(Vz)), \
-    shift)
+  TruncateAndSetMACAndIR<i + 1>(SignExtendMACResult<i + 1>((s64(M[i][0]) * s64(Vx)) + (s64(M[i][1]) * s64(Vy))) + \
+                                  (s64(M[i][2]) * s64(Vz)), \
+                                shift, lm)
 
   dot3(0);
   dot3(1);
   dot3(2);
 #undef dot3
-
-  TruncateAndSetIR<1>(m_regs.MAC1, lm);
-  TruncateAndSetIR<2>(m_regs.MAC2, lm);
-  TruncateAndSetIR<3>(m_regs.MAC3, lm);
 }
 
 void Core::MulMatVec(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
 {
 #define dot3(i) \
-  TruncateAndSetMAC<i + 1>( \
-    (s64(T[i]) << 12) + \
-    CheckMACResult<i + 1>( \
-      CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) + \
-      s64(s32(M[i][2]) * s32(Vz))), \
-    shift)
+  TruncateAndSetMACAndIR<i + 1>( \
+    SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>((s64(T[i]) << 12) + (s64(M[i][0]) * s64(Vx))) + \
+                               (s64(M[i][1]) * s64(Vy))) + \
+      (s64(M[i][2]) * s64(Vz)), \
+    shift, lm)
 
   dot3(0);
   dot3(1);
   dot3(2);
 #undef dot3
-
-  TruncateAndSetIR<1>(m_regs.MAC1, lm);
-  TruncateAndSetIR<2>(m_regs.MAC2, lm);
-  TruncateAndSetIR<3>(m_regs.MAC3, lm);
 }
 
 void Core::NCCS(const s16 V[3], bool sf, bool lm)
diff --git a/src/pse/gte.h b/src/pse/gte.h
index 28b36db9e..ef2c019de 100644
--- a/src/pse/gte.h
+++ b/src/pse/gte.h
@@ -35,15 +35,22 @@ private:
   static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15);
   static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1;
 
-  // Checks for underflow/overflow. Returns the value untouched so it can be threaded through an expression.
+  // Checks for underflow/overflow.
   template<u32 index>
-  s64 CheckMACResult(s64 value);
+  void CheckMACOverflow(s64 value);
+
+  // Checks for underflow/overflow, sign-extending to 31/43 bits.
+  template<u32 index>
+  s64 SignExtendMACResult(s64 value);
 
   template<u32 index>
-  s64 TruncateAndSetMAC(s64 value, u8 shift);
+  void TruncateAndSetMAC(s64 value, u8 shift);
 
   template<u32 index>
-  s16 TruncateAndSetIR(s32 value, bool lm);
+  void TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm);
+
+  template<u32 index>
+  void TruncateAndSetIR(s32 value, bool lm);
 
   template<u32 index>
   u8 TruncateRGB(s32 value);
@@ -55,7 +62,7 @@ private:
 
   // 3x3 matrix * 3x1 vector, updates MAC[1-3] and IR[1-3]
   void MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
-
+  // 3x3 matrix * 3x1 vector with translation, updates MAC[1-3] and IR[1-3]
   void MulMatVec(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
 
diff --git a/src/pse/gte.inl b/src/pse/gte.inl
index 260ecd7c3..adfa8f55b 100644
--- a/src/pse/gte.inl
+++ b/src/pse/gte.inl
@@ -1,7 +1,7 @@
 #include "gte.h"
 
 template<u32 index>
-s64 GTE::Core::CheckMACResult(s64 value)
+void GTE::Core::CheckMACOverflow(s64 value)
 {
   constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE;
   constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE;
@@ -27,24 +27,28 @@ s64 GTE::Core::CheckMACResult(s64 value)
     else if constexpr (index == 3)
       m_regs.FLAG.mac3_overflow = true;
   }
-
-  return value;
 }
 
 template<u32 index>
-s64 GTE::Core::TruncateAndSetMAC(s64 value, u8 shift)
+s64 GTE::Core::SignExtendMACResult(s64 value)
 {
-  value = CheckMACResult<index>(value);
+  CheckMACOverflow<index>(value);
+  return SignExtendN < index == 0 ? 31 : 44 > (value);
+}
+
+template<u32 index>
+void GTE::Core::TruncateAndSetMAC(s64 value, u8 shift)
+{
+  CheckMACOverflow<index>(value);
 
   // shift should be done before storing to avoid losing precision
   value >>= shift;
 
   m_regs.dr32[24 + index] = Truncate32(static_cast<u64>(value));
-  return value;
 }
 
 template<u32 index>
-s16 GTE::Core::TruncateAndSetIR(s32 value, bool lm)
+void GTE::Core::TruncateAndSetIR(s32 value, bool lm)
 {
   constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE;
   constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE;
@@ -76,7 +80,22 @@ s64 GTE::Core::TruncateAndSetIR(s32 value, bool lm)
 
   // store sign-extended 16-bit value as 32-bit
   m_regs.dr32[8 + index] = value;
-  return static_cast<s16>(value);
+}
+
+template<u32 index>
+void GTE::Core::TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm)
+{
+  CheckMACOverflow<index>(value);
+
+  // shift should be done before storing to avoid losing precision
+  value >>= shift;
+
+  // set MAC
+  const s32 value32 = static_cast<s32>(value);
+  m_regs.dr32[24 + index] = value32;
+
+  // set IR
+  TruncateAndSetIR<index>(value32, lm);
 }
 
 template<u32 index>