CPU/NewRec: Fix lwl/lwr on ARM

Stenzek 2024-03-31 21:26:13 +10:00
parent 86927ea3eb
commit 71a07a5a8e
6 changed files with 91 additions and 95 deletions

View File

@@ -616,8 +616,8 @@ u32 CPU::NewRec::Compiler::GetFreeHostReg(u32 flags)
         }
       }
-      Log_DebugPrintf("Freeing register %s in host register %s for allocation", GetHostRegName(lowest),
-                      GetRegName(ra.reg));
+      Log_DebugPrintf("Freeing register %s in host register %s for allocation", GetRegName(ra.reg),
+                      GetHostRegName(lowest));
     }
     break;
     case HR_TYPE_LOAD_DELAY_VALUE:
@@ -628,8 +628,8 @@ u32 CPU::NewRec::Compiler::GetFreeHostReg(u32 flags)
     break;
     case HR_TYPE_NEXT_LOAD_DELAY_VALUE:
     {
-      Log_DebugPrintf("Freeing next load delay register %s in host register %s due for allocation",
-                      GetHostRegName(lowest), GetRegName(ra.reg));
+      Log_DebugPrintf("Freeing next load delay register %s in host register %s due for allocation", GetRegName(ra.reg),
+                      GetHostRegName(lowest));
    }
    break;
    default:
@@ -875,6 +875,7 @@ void CPU::NewRec::Compiler::FlushHostReg(u32 reg)
 void CPU::NewRec::Compiler::FreeHostReg(u32 reg)
 {
   DebugAssert(IsHostRegAllocated(reg));
+  Log_DebugPrintf("Freeing host register %s", GetHostRegName(reg));
   FlushHostReg(reg);
   ClearHostReg(reg);
 }

View File

@@ -284,8 +284,8 @@ bool foo(const void* a, const void* b)
   while (size >= 4)
   {
     armAsm->ldr(RARG3, MemOperand(RARG1, offset));
-    armAsm->ldr(RARG4, MemOperand(RARG2, offset));
-    armAsm->cmp(RARG3, RARG4);
+    armAsm->ldr(RSCRATCH, MemOperand(RARG2, offset));
+    armAsm->cmp(RARG3, RSCRATCH);
     armAsm->b(ne, &block_changed);
     offset += 4;
     size -= 4;
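
Note: this loop is the emitted block-verification check; it walks two buffers a word at a time and branches to block_changed on the first mismatch. The change only moves the second load into RSCRATCH, since the RARG4 alias is removed later in this commit. A rough, self-contained C++ equivalent of the emitted loop (illustrative only, hypothetical helper name, not DuckStation code):

#include <cstdint>

// Word-by-word compare matching the ldr/ldr/cmp/b.ne sequence above.
bool WordsMatch(const uint32_t* a, const uint32_t* b, uint32_t size)
{
  uint32_t offset = 0;
  while (size >= 4)
  {
    if (a[offset / 4] != b[offset / 4]) // ldr RARG3 / ldr RSCRATCH / cmp
      return false;                     // b.ne block_changed
    offset += 4;
    size -= 4;
  }
  return true;
}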
@@ -723,7 +723,7 @@ void CPU::NewRec::AArch32Compiler::Compile_Fallback()
 {
   Flush(FLUSH_FOR_INTERPRETER);
-  EmitCall(armAsm, reinterpret_cast<const void*>(&CPU::Recompiler::Thunks::InterpretInstruction));
+  EmitCall(reinterpret_cast<const void*>(&CPU::Recompiler::Thunks::InterpretInstruction));

   // TODO: make me less garbage
   // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
@@ -1637,9 +1637,9 @@ void CPU::NewRec::AArch32Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize
 {
   // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
   // new_value = (value & mask) | (RWRET << (24 - shift));
-  EmitMov(RARG4, 0xFFFFFFu);
-  armAsm->lsr(RARG4, RARG4, RARG2);
-  armAsm->and_(value, value, RARG4);
+  EmitMov(RSCRATCH, 0xFFFFFFu);
+  armAsm->lsr(RSCRATCH, RSCRATCH, RARG2);
+  armAsm->and_(value, value, RSCRATCH);
   armAsm->lsl(RRET, RRET, RARG3);
   armAsm->orr(value, value, RRET);
 }
@@ -1648,9 +1648,9 @@ void CPU::NewRec::AArch32Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize
   // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
   // new_value = (value & mask) | (RWRET >> shift);
   armAsm->lsr(RRET, RRET, RARG2);
-  EmitMov(RARG4, 0xFFFFFF00u);
-  armAsm->lsl(RARG4, RARG4, RARG3);
-  armAsm->and_(value, value, RARG4);
+  EmitMov(RSCRATCH, 0xFFFFFF00u);
+  armAsm->lsl(RSCRATCH, RSCRATCH, RARG3);
+  armAsm->and_(value, value, RSCRATCH);
   armAsm->orr(value, value, RRET);
 }
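
Note: both Compile_lwx hunks only change which temporary holds the merge mask — RARG4 becomes RSCRATCH, since the RARG4/RWARG4 aliases are deleted at the end of this commit. The commented-out formulas above are the usual MIPS lwl/lwr merge; a self-contained C++ sketch of that merge, assuming shift = (addr & 3) * 8 (illustrative, not DuckStation code):

#include <cstdint>

// lwl: keep the low bytes of rt, splice in the high bytes from the aligned word.
uint32_t MergeLWL(uint32_t rt_value, uint32_t mem_word, uint32_t shift)
{
  const uint32_t mask = UINT32_C(0x00FFFFFF) >> shift;
  return (rt_value & mask) | (mem_word << (24 - shift));
}

// lwr: keep the high bytes of rt, splice in the low bytes from the aligned word.
uint32_t MergeLWR(uint32_t rt_value, uint32_t mem_word, uint32_t shift)
{
  const uint32_t mask = UINT32_C(0xFFFFFF00) << (24 - shift);
  return (rt_value & mask) | (mem_word >> shift);
}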
@@ -1857,15 +1857,20 @@ void CPU::NewRec::AArch32Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize
 void CPU::NewRec::AArch32Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                                 const std::optional<VirtualMemoryAddress>& address)
 {
-  FlushForLoadStore(address, true, use_fastmem);
-
   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   const auto [ptr, action] = GetGTERegisterPointer(index, false);
+  const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
+                          Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
+                          RARG1;
+  const Register data = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
+
+  FlushForLoadStore(address, true, use_fastmem);
+  ComputeLoadStoreAddressArg(cf, address, addr);
+
   switch (action)
   {
     case GTERegisterAccessAction::Direct:
     {
-      armAsm->ldr(RARG2, PTR(ptr));
+      armAsm->ldr(data, PTR(ptr));
     }
     break;
@@ -1875,7 +1880,7 @@ void CPU::NewRec::AArch32Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize
       Flush(FLUSH_FOR_C_CALL);
       EmitMov(RARG1, index);
       EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
-      armAsm->mov(RARG2, RRET);
+      armAsm->mov(data, RRET);
     }
     break;
@@ -1886,30 +1891,24 @@ void CPU::NewRec::AArch32Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize
     break;
   }

-  // PGXP makes this a giant pain.
+  GenerateStore(addr, data, size, use_fastmem);
+
   if (!g_settings.gpu_pgxp_enable)
   {
-    const Register addr = ComputeLoadStoreAddressArg(cf, address);
-    GenerateStore(addr, RARG2, size, use_fastmem);
-    return;
+    if (addr.GetCode() != RARG1.GetCode())
+      FreeHostReg(addr.GetCode());
   }
-
-  // TODO: This can be simplified because we don't need to validate in PGXP..
-  const Register addr_reg = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const Register data_backup = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
-  FlushForLoadStore(address, true, use_fastmem);
-  ComputeLoadStoreAddressArg(cf, address, addr_reg);
-  armAsm->mov(data_backup, RARG2);
-  GenerateStore(addr_reg, RARG2, size, use_fastmem);
-
-  Flush(FLUSH_FOR_C_CALL);
-  armAsm->mov(RARG3, data_backup);
-  armAsm->mov(RARG2, addr_reg);
-  FreeHostReg(addr_reg.GetCode());
-  FreeHostReg(data_backup.GetCode());
-  EmitMov(RARG1, inst->bits);
-  EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
+  else
+  {
+    // TODO: This can be simplified because we don't need to validate in PGXP..
+    Flush(FLUSH_FOR_C_CALL);
+    armAsm->mov(RARG3, data);
+    FreeHostReg(data.GetCode());
+    armAsm->mov(RARG2, addr);
+    FreeHostReg(addr.GetCode());
+    EmitMov(RARG1, inst->bits);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
+  }
 }

 void CPU::NewRec::AArch32Compiler::Compile_mtc0(CompileFlags cf)
 {
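
Note: this and the matching AArch64/RISCV64 hunks below restructure Compile_swc2 the same way. The address and data registers are chosen up front (callee-saved only when PGXP or a GTE handler call needs them to survive a C call), the address is computed once, both the PGXP and non-PGXP paths store through a single GenerateStore(addr, data, ...), and PGXP::CPU_SWC2 is then handed the same addr/data pair instead of recomputing the address. At runtime the generated code amounts to roughly this (illustrative stubs; only GTE::ReadRegister, PGXP::CPU_SWC2, and the inst->bits/addr/data argument order come from the diff):

#include <cstdint>

namespace GTE { uint32_t ReadRegister(uint32_t index); }                              // API used by the CallHandler path
namespace PGXP { void CPU_SWC2(uint32_t inst_bits, uint32_t addr, uint32_t value); }  // argument order per the diff
void WriteMemoryWord(uint32_t addr, uint32_t value);  // hypothetical stand-in for what GenerateStore() emits
extern bool g_pgxp_enabled;                           // stand-in for g_settings.gpu_pgxp_enable

// Roughly what one recompiled swc2 does when executed.
void EmulateSWC2(uint32_t inst_bits, uint32_t gte_reg_index, uint32_t addr)
{
  const uint32_t data = GTE::ReadRegister(gte_reg_index); // or a direct load for simple GTE registers
  WriteMemoryWord(addr, data);                            // single store path, fastmem or slowmem
  if (g_pgxp_enabled)
    PGXP::CPU_SWC2(inst_bits, addr, data);                // same addr/data, no address recomputation
}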

View File

@@ -693,7 +693,7 @@ void CPU::NewRec::AArch64Compiler::Compile_Fallback()
 {
   Flush(FLUSH_FOR_INTERPRETER);
-  EmitCall(armAsm, &CPU::Recompiler::Thunks::InterpretInstruction);
+  EmitCall(reinterpret_cast<const void*>(&CPU::Recompiler::Thunks::InterpretInstruction));

   // TODO: make me less garbage
   // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
@@ -1616,9 +1616,9 @@ void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize
 {
   // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
   // new_value = (value & mask) | (RWRET << (24 - shift));
-  EmitMov(RWARG4, 0xFFFFFFu);
-  armAsm->lsrv(RWARG4, RWARG4, RWARG2);
-  armAsm->and_(value, value, RWARG4);
+  EmitMov(RWSCRATCH, 0xFFFFFFu);
+  armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG2);
+  armAsm->and_(value, value, RWSCRATCH);
   armAsm->lslv(RWRET, RWRET, RWARG3);
   armAsm->orr(value, value, RWRET);
 }
@@ -1627,9 +1627,9 @@ void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize
   // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
   // new_value = (value & mask) | (RWRET >> shift);
   armAsm->lsrv(RWRET, RWRET, RWARG2);
-  EmitMov(RWARG4, 0xFFFFFF00u);
-  armAsm->lslv(RWARG4, RWARG4, RWARG3);
-  armAsm->and_(value, value, RWARG4);
+  EmitMov(RWSCRATCH, 0xFFFFFF00u);
+  armAsm->lslv(RWSCRATCH, RWSCRATCH, RWARG3);
+  armAsm->and_(value, value, RWSCRATCH);
   armAsm->orr(value, value, RWRET);
 }
@@ -1836,15 +1836,20 @@ void CPU::NewRec::AArch64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize
 void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                                 const std::optional<VirtualMemoryAddress>& address)
 {
-  FlushForLoadStore(address, true, use_fastmem);
-
   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   const auto [ptr, action] = GetGTERegisterPointer(index, false);
+  const WRegister addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
+                           WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
+                           RWARG1;
+  const WRegister data = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
+
+  FlushForLoadStore(address, true, use_fastmem);
+  ComputeLoadStoreAddressArg(cf, address, addr);
+
   switch (action)
   {
     case GTERegisterAccessAction::Direct:
     {
-      armAsm->ldr(RWARG2, PTR(ptr));
+      armAsm->ldr(data, PTR(ptr));
     }
     break;
@@ -1854,7 +1859,7 @@ void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize
       Flush(FLUSH_FOR_C_CALL);
       EmitMov(RWARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
-      armAsm->mov(RWARG2, RWRET);
+      armAsm->mov(data, RWRET);
     }
     break;
@@ -1865,29 +1870,23 @@ void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize
     break;
   }

-  // PGXP makes this a giant pain.
+  GenerateStore(addr, data, size, use_fastmem);
+
   if (!g_settings.gpu_pgxp_enable)
   {
-    const WRegister addr = ComputeLoadStoreAddressArg(cf, address);
-    GenerateStore(addr, RWARG2, size, use_fastmem);
-    return;
+    if (addr.GetCode() != RWARG1.GetCode())
+      FreeHostReg(addr.GetCode());
   }
-
-  // TODO: This can be simplified because we don't need to validate in PGXP..
-  const WRegister addr_reg = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const WRegister data_backup = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
-  FlushForLoadStore(address, true, use_fastmem);
-  ComputeLoadStoreAddressArg(cf, address, addr_reg);
-  armAsm->mov(data_backup, RWARG2);
-  GenerateStore(addr_reg, RWARG2, size, use_fastmem);
-
-  Flush(FLUSH_FOR_C_CALL);
-  armAsm->mov(RWARG3, data_backup);
-  armAsm->mov(RWARG2, addr_reg);
-  EmitMov(RWARG1, inst->bits);
-  EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
-  FreeHostReg(addr_reg.GetCode());
-  FreeHostReg(data_backup.GetCode());
+  else
+  {
+    // TODO: This can be simplified because we don't need to validate in PGXP..
+    Flush(FLUSH_FOR_C_CALL);
+    armAsm->mov(RWARG3, data);
+    FreeHostReg(data.GetCode());
+    armAsm->mov(RWARG2, addr);
+    FreeHostReg(addr.GetCode());
+    EmitMov(RWARG1, inst->bits);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
+  }
 }

 void CPU::NewRec::AArch64Compiler::Compile_mtc0(CompileFlags cf)

View File

@@ -2143,15 +2143,20 @@ void CPU::NewRec::RISCV64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize
 void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                                 const std::optional<VirtualMemoryAddress>& address)
 {
-  FlushForLoadStore(address, true, use_fastmem);
-
   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   const auto [ptr, action] = GetGTERegisterPointer(index, false);
+  const GPR addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
+                     GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) :
+                     RARG1;
+  const GPR data = g_settings.gpu_pgxp_enable ? GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
+
+  FlushForLoadStore(address, true, use_fastmem);
+  ComputeLoadStoreAddressArg(cf, address, addr);
+
   switch (action)
   {
     case GTERegisterAccessAction::Direct:
     {
-      rvAsm->LW(RARG2, PTR(ptr));
+      rvAsm->LW(data, PTR(ptr));
     }
     break;
@@ -2161,7 +2166,7 @@ void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize
       Flush(FLUSH_FOR_C_CALL);
       EmitMov(RARG1, index);
       EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
-      rvAsm->MV(RARG2, RRET);
+      rvAsm->MV(data, RRET);
     }
     break;
@@ -2172,29 +2177,24 @@ void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize
     break;
   }

-  // PGXP makes this a giant pain.
+  GenerateStore(addr, data, size, use_fastmem);
+
   if (!g_settings.gpu_pgxp_enable)
   {
-    const GPR addr = ComputeLoadStoreAddressArg(cf, address);
-    GenerateStore(addr, RARG2, size, use_fastmem);
-    return;
+    if (addr.Index() != RARG1.Index())
+      FreeHostReg(addr.Index());
   }
-
-  // TODO: This can be simplified because we don't need to validate in PGXP..
-  const GPR addr_reg = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const GPR data_backup = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
-  FlushForLoadStore(address, true, use_fastmem);
-  ComputeLoadStoreAddressArg(cf, address, addr_reg);
-  rvAsm->MV(data_backup, RARG2);
-  GenerateStore(addr_reg, RARG2, size, use_fastmem);
-
-  Flush(FLUSH_FOR_C_CALL);
-  rvAsm->MV(RARG3, data_backup);
-  rvAsm->MV(RARG2, addr_reg);
-  EmitMov(RARG1, inst->bits);
-  EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
-  FreeHostReg(addr_reg.Index());
-  FreeHostReg(data_backup.Index());
+  else
+  {
+    // TODO: This can be simplified because we don't need to validate in PGXP..
+    Flush(FLUSH_FOR_C_CALL);
+    rvAsm->MV(RARG3, data);
+    FreeHostReg(data.Index());
+    rvAsm->MV(RARG2, addr);
+    FreeHostReg(addr.Index());
+    EmitMov(RARG1, inst->bits);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
+  }
 }

 void CPU::NewRec::RISCV64Compiler::Compile_mtc0(CompileFlags cf)

View File

@@ -299,7 +299,6 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
 #undef RARG1
 #undef RARG2
 #undef RARG3
-#undef RARG4
 #undef RSCRATCH
 #undef RSTATE

View File

@@ -117,8 +117,6 @@ constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128;
 #define RXARG2 vixl::aarch64::x1
 #define RWARG3 vixl::aarch64::w2
 #define RXARG3 vixl::aarch64::x2
-#define RWARG4 vixl::aarch64::w3
-#define RXARG4 vixl::aarch64::x3
 #define RWSCRATCH vixl::aarch64::w16
 #define RXSCRATCH vixl::aarch64::x16
 #define RSTATE vixl::aarch64::x19