diff --git a/src/common/types.h b/src/common/types.h
index 2e67fcc62..5b8ee1d09 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -190,6 +190,15 @@ constexpr bool ConvertToBoolUnchecked(TValue value)
   return ret;
 }
 
+// Generic sign extension
+template<int NBITS, typename T>
+constexpr T SignExtendN(T value)
+{
+  // http://graphics.stanford.edu/~seander/bithacks.html#VariableSignExtend
+  constexpr int shift = 8 * sizeof(T) - NBITS;
+  return static_cast<T>((static_cast<std::make_signed_t<T>>(value) << shift) >> shift);
+}
+
 // Enum class bitwise operators
 #define IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(type_) \
   inline constexpr type_ operator&(type_ lhs, type_ rhs) \
diff --git a/src/pse/gte.cpp b/src/pse/gte.cpp
index c6a7f7047..83ec9f3e1 100644
--- a/src/pse/gte.cpp
+++ b/src/pse/gte.cpp
@@ -405,11 +405,12 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)
 {
   const u8 shift = sf ? 12 : 0;
 #define dot3(i) \
-  CheckMACResult<i + 1>( \
+  SignExtendMACResult<i + 1>( \
     (s64(m_regs.TR[i]) << 12) + \
-    CheckMACResult<i + 1>(CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(m_regs.RT[i][0]) * s32(V[0]))) + \
-                                                s64(s32(m_regs.RT[i][1]) * s32(V[1]))) + \
-                          s64(s32(m_regs.RT[i][2]) * s32(V[2]))))
+    SignExtendMACResult<i + 1>( \
+      SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>(s64(s32(m_regs.RT[i][0]) * s32(V[0]))) + \
+                                 s64(s32(m_regs.RT[i][1]) * s32(V[1]))) + \
+      s64(s32(m_regs.RT[i][2]) * s32(V[2]))))
 
   // IR1 = MAC1 = (TRX*1000h + RT11*VX0 + RT12*VY0 + RT13*VZ0) SAR (sf*12)
   // IR2 = MAC2 = (TRY*1000h + RT21*VX0 + RT22*VY0 + RT23*VZ0) SAR (sf*12)
@@ -451,14 +452,17 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)
 
   // MAC0=(((H*20000h/SZ3)+1)/2)*IR1+OFX, SX2=MAC0/10000h ;ScrX FIFO -400h..+3FFh
   // MAC0=(((H*20000h/SZ3)+1)/2)*IR2+OFY, SY2=MAC0/10000h ;ScrY FIFO -400h..+3FFh
-  const s64 Sx = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX), 0);
-  const s64 Sy = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY), 0);
+  const s64 Sx = s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX);
+  const s64 Sy = s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY);
+  TruncateAndSetMAC<0>(Sx, 0);
+  TruncateAndSetMAC<0>(Sy, 0);
   PushSXY(s32(Sx >> 16), s32(Sy >> 16));
 
   if (last)
   {
     // MAC0=(((H*20000h/SZ3)+1)/2)*DQA+DQB, IR0=MAC0/1000h ;Depth cueing 0..+1000h
-    const s64 Sz = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.DQA) + s64(m_regs.DQB), 0);
+    const s64 Sz = s64(result) * s64(m_regs.DQA) + s64(m_regs.DQB);
+    TruncateAndSetMAC<0>(Sz, 0);
     TruncateAndSetIR<0>(s32(Sz >> 12), true);
   }
 }
@@ -517,8 +521,7 @@ void Core::Execute_AVSZ3(Instruction inst)
 {
   m_regs.FLAG.Clear();
 
-  const s64 result =
-    TruncateAndSetMAC<0>(s64(m_regs.ZSF3) * s32(u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3)), 0);
-
+  const s64 result = s64(m_regs.ZSF3) * s32(u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3));
+  TruncateAndSetMAC<0>(result, 0);
   SetOTZ(s32(result >> 12));
 
@@ -529,8 +532,7 @@ void Core::Execute_AVSZ4(Instruction inst)
 {
   m_regs.FLAG.Clear();
 
-  const s64 result = TruncateAndSetMAC<0>(
-    s64(m_regs.ZSF4) * s32(u32(m_regs.SZ0) + u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3)), 0);
-
+  const s64 result = s64(m_regs.ZSF4) * s32(u32(m_regs.SZ0) + u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3));
+  TruncateAndSetMAC<0>(result, 0);
   SetOTZ(s32(result >> 12));
 
@@ -540,41 +542,31 @@ void Core::Execute_AVSZ4(Instruction inst)
 
 void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
 {
 #define dot3(i) \
-  TruncateAndSetMAC<i + 1>( \
-    CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) + \
-      s64(s32(M[i][2]) * s32(Vz)), \
-    shift)
+  TruncateAndSetMACAndIR<i + 1>(SignExtendMACResult<i + 1>((s64(M[i][0]) * s64(Vx)) + (s64(M[i][1]) * s64(Vy))) + \
+                                  (s64(M[i][2]) * s64(Vz)), \
+                                shift, lm)
 
   dot3(0);
   dot3(1);
   dot3(2);
 #undef dot3
-
-  TruncateAndSetIR<1>(m_regs.MAC1, lm);
-  TruncateAndSetIR<2>(m_regs.MAC2, lm);
-  TruncateAndSetIR<3>(m_regs.MAC3, lm);
 }
 
 void Core::MulMatVec(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
 {
 #define dot3(i) \
-  TruncateAndSetMAC<i + 1>( \
-    (s64(T[i]) << 12) + \
-    CheckMACResult<i + 1>( \
-      CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) + \
-      s64(s32(M[i][2]) * s32(Vz))), \
-    shift)
+  TruncateAndSetMACAndIR<i + 1>( \
+    SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>((s64(T[i]) << 12) + (s64(M[i][0]) * s64(Vx))) + \
+                               (s64(M[i][1]) * s64(Vy))) + \
+      (s64(M[i][2]) * s64(Vz)), \
+    shift, lm)
 
   dot3(0);
   dot3(1);
   dot3(2);
 #undef dot3
-
-  TruncateAndSetIR<1>(m_regs.MAC1, lm);
-  TruncateAndSetIR<2>(m_regs.MAC2, lm);
-  TruncateAndSetIR<3>(m_regs.MAC3, lm);
 }
 
 void Core::NCCS(const s16 V[3], bool sf, bool lm)
diff --git a/src/pse/gte.h b/src/pse/gte.h
index 28b36db9e..ef2c019de 100644
--- a/src/pse/gte.h
+++ b/src/pse/gte.h
@@ -35,15 +35,22 @@ private:
   static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15);
   static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1;
 
-  // Checks for underflow/overflow. Returns the value untouched so it can be threaded through an expression.
+  // Checks for underflow/overflow.
   template<u32 index>
-  s64 CheckMACResult(s64 value);
+  void CheckMACOverflow(s64 value);
+
+  // Checks for underflow/overflow, sign-extending to 31/43 bits.
+  template<u32 index>
+  s64 SignExtendMACResult(s64 value);
 
   template<u32 index>
-  s64 TruncateAndSetMAC(s64 value, u8 shift);
+  void TruncateAndSetMAC(s64 value, u8 shift);
 
   template<u32 index>
-  s16 TruncateAndSetIR(s32 value, bool lm);
+  void TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm);
+
+  template<u32 index>
+  void TruncateAndSetIR(s32 value, bool lm);
 
   template<u32 index>
   u8 TruncateRGB(s32 value);
@@ -55,7 +62,7 @@ private:
 
   // 3x3 matrix * 3x1 vector, updates MAC[1-3] and IR[1-3]
   void MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
-
+  // 3x3 matrix * 3x1 vector with translation, updates MAC[1-3] and IR[1-3]
   void MulMatVec(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
 
diff --git a/src/pse/gte.inl b/src/pse/gte.inl
index 260ecd7c3..adfa8f55b 100644
--- a/src/pse/gte.inl
+++ b/src/pse/gte.inl
@@ -1,7 +1,7 @@
 #include "gte.h"
 
 template<u32 index>
-s64 GTE::Core::CheckMACResult(s64 value)
+void GTE::Core::CheckMACOverflow(s64 value)
 {
   constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE;
   constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE;
@@ -27,24 +27,28 @@ s64 GTE::Core::CheckMACResult(s64 value)
     else if constexpr (index == 3)
       m_regs.FLAG.mac3_overflow = true;
   }
-
-  return value;
 }
 
 template<u32 index>
-s64 GTE::Core::TruncateAndSetMAC(s64 value, u8 shift)
+s64 GTE::Core::SignExtendMACResult(s64 value)
 {
-  value = CheckMACResult<index>(value);
+  CheckMACOverflow<index>(value);
+  return SignExtendN < index == 0 ? 31 : 44 > (value);
+}
+
+template<u32 index>
+void GTE::Core::TruncateAndSetMAC(s64 value, u8 shift)
+{
+  CheckMACOverflow<index>(value);
 
   // shift should be done before storing to avoid losing precision
   value >>= shift;
 
   m_regs.dr32[24 + index] = Truncate32(static_cast<u64>(value));
-  return value;
 }
 
 template<u32 index>
-s16 GTE::Core::TruncateAndSetIR(s32 value, bool lm)
+void GTE::Core::TruncateAndSetIR(s32 value, bool lm)
 {
   constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE;
   constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE;
@@ -76,7 +80,22 @@ s64 GTE::Core::TruncateAndSetIR(s32 value, bool lm)
 
   // store sign-extended 16-bit value as 32-bit
   m_regs.dr32[8 + index] = value;
-  return static_cast<s16>(value);
+}
+
+template<u32 index>
+void GTE::Core::TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm)
+{
+  CheckMACOverflow<index>(value);
+
+  // shift should be done before storing to avoid losing precision
+  value >>= shift;
+
+  // set MAC
+  const s32 value32 = static_cast<s32>(value);
+  m_regs.dr32[24 + index] = value32;
+
+  // set IR
+  TruncateAndSetIR<index>(value32, lm);
 }
 
 template<u32 index>