System: Refactor main loop

Reduces JIT exits.
Improves runahead performance.
This commit is contained in:
Stenzek
2023-08-15 23:12:21 +10:00
parent 4ebd34fcb3
commit 5b980dafa5
43 changed files with 1343 additions and 923 deletions

View File

@ -12,6 +12,8 @@ add_library(common
dimensional_array.h
error.cpp
error.h
fastjmp.cpp
fastjmp.h
fifo_queue.h
file_system.cpp
file_system.h
@ -97,6 +99,14 @@ if(WIN32)
windows_headers.h
)
target_link_libraries(common PRIVATE d3dcompiler.lib)
if(${CPU_ARCH} STREQUAL "x64")
enable_language(ASM_MASM)
target_sources(common PRIVATE fastjmp_x86.asm)
elseif(${CPU_ARCH} STREQUAL "aarch32" OR ${CPU_ARCH} STREQUAL "aarch64")
enable_language(ASM_MARMASM)
target_sources(common PRIVATE fastjmp_arm.asm)
endif()
endif()
if(NOT WIN32 AND NOT ANDROID)

View File

@ -23,6 +23,7 @@
<ClInclude Include="dimensional_array.h" />
<ClInclude Include="easing.h" />
<ClInclude Include="error.h" />
<ClInclude Include="fastjmp.h" />
<ClInclude Include="fifo_queue.h" />
<ClInclude Include="file_system.h" />
<ClInclude Include="gl\context.h">
@ -123,6 +124,7 @@
<ClCompile Include="d3d12\stream_buffer.cpp" />
<ClCompile Include="d3d12\texture.cpp" />
<ClCompile Include="d3d12\util.cpp" />
<ClCompile Include="fastjmp.cpp" />
<ClCompile Include="file_system.cpp" />
<ClCompile Include="gl\context.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
@ -192,6 +194,16 @@
<Natvis Include="bitfield.natvis" />
</ItemGroup>
<ItemGroup>
<MARMASM Include="fastjmp_arm.asm">
<FileType>Document</FileType>
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</MARMASM>
<MASM Include="fastjmp_x86.asm">
<FileType>Document</FileType>
<ExcludedFromBuild Condition="'$(Platform)'!='Win32' And '$(Platform)'!='x64'">true</ExcludedFromBuild>
<PreprocessorDefinitions Condition="'$(Platform)'=='Win32'">_M_X86_32;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(Platform)'=='x64'">_M_X86_64;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</MASM>
<None Include="vulkan\entry_points.inl">
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
</None>
@ -219,9 +231,17 @@
<Project>{73ee0c55-6ffe-44e7-9c12-baa52434a797}</Project>
</ProjectReference>
</ItemGroup>
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\marmasm.targets" />
<Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
</ImportGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{EE054E08-3799-4A59-A422-18259C105FFD}</ProjectGuid>
</PropertyGroup>
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\marmasm.props" />
<Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
</ImportGroup>
<Import Project="..\..\dep\msvc\vsprops\StaticLibrary.props" />
<Import Project="common.props" />
<ItemDefinitionGroup>

View File

@ -129,6 +129,7 @@
<ClInclude Include="build_timestamp.h" />
<ClInclude Include="sha1_digest.h" />
<ClInclude Include="gpu_texture.h" />
<ClInclude Include="fastjmp.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="gl\program.cpp">
@ -234,6 +235,7 @@
<ClCompile Include="threading.cpp" />
<ClCompile Include="sha1_digest.cpp" />
<ClCompile Include="gpu_texture.cpp" />
<ClCompile Include="fastjmp.cpp" />
</ItemGroup>
<ItemGroup>
<Natvis Include="bitfield.natvis" />
@ -260,4 +262,10 @@
<Filter>vulkan</Filter>
</None>
</ItemGroup>
<ItemGroup>
<MASM Include="fastjmp_x86.asm" />
</ItemGroup>
<ItemGroup>
<MARMASM Include="fastjmp_arm.asm" />
</ItemGroup>
</Project>

166
src/common/fastjmp.cpp Normal file
View File

@ -0,0 +1,166 @@
// SPDX-FileCopyrightText: 2021 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#ifndef _WIN32
#include "fastjmp.h"
#if defined(__APPLE__)
#define PREFIX "_"
#else
#define PREFIX ""
#endif
#if defined(__x86_64__)
asm("\t.global " PREFIX "fastjmp_set\n"
"\t.global " PREFIX "fastjmp_jmp\n"
"\t.text\n"
"\t" PREFIX "fastjmp_set:"
R"(
movq 0(%rsp), %rax
movq %rsp, %rdx # fixup stack pointer, so it doesn't include the call to fastjmp_set
addq $8, %rdx
movq %rax, 0(%rdi) # actually rip
movq %rbx, 8(%rdi)
movq %rdx, 16(%rdi) # actually rsp
movq %rbp, 24(%rdi)
movq %r12, 32(%rdi)
movq %r13, 40(%rdi)
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
xorl %eax, %eax
ret
)"
"\t" PREFIX "fastjmp_jmp:"
R"(
movl %esi, %eax
movq 0(%rdi), %rdx # actually rip
movq 8(%rdi), %rbx
movq 16(%rdi), %rsp # actually rsp
movq 24(%rdi), %rbp
movq 32(%rdi), %r12
movq 40(%rdi), %r13
movq 48(%rdi), %r14
movq 56(%rdi), %r15
jmp *%rdx
)");
#elif defined(__aarch64__)
asm(
"\t.global " PREFIX "fastjmp_set\n"
"\t.global " PREFIX "fastjmp_jmp\n"
"\t.text\n"
"\t.align 16\n"
"\t" PREFIX "fastjmp_set:" R"(
mov x16, sp
stp x16, x30, [x0]
stp x19, x20, [x0, #16]
stp x21, x22, [x0, #32]
stp x23, x24, [x0, #48]
stp x25, x26, [x0, #64]
stp x27, x28, [x0, #80]
str x29, [x0, #96]
stp d8, d9, [x0, #112]
stp d10, d11, [x0, #128]
stp d12, d13, [x0, #144]
stp d14, d15, [x0, #160]
mov w0, wzr
br x30
)"
".align 16\n"
"\t" PREFIX "fastjmp_jmp:" R"(
ldp x16, x30, [x0]
mov sp, x16
ldp x19, x20, [x0, #16]
ldp x21, x22, [x0, #32]
ldp x23, x24, [x0, #48]
ldp x25, x26, [x0, #64]
ldp x27, x28, [x0, #80]
ldr x29, [x0, #96]
ldp d8, d9, [x0, #112]
ldp d10, d11, [x0, #128]
ldp d12, d13, [x0, #144]
ldp d14, d15, [x0, #160]
mov w0, w1
br x30
)");
#elif defined(__riscv) && __riscv_xlen == 64
asm(
"\t.global " PREFIX "fastjmp_set\n"
"\t.global " PREFIX "fastjmp_jmp\n"
"\t.text\n"
"\t.align 16\n"
"\t" PREFIX "fastjmp_set:" R"(
sd sp, 0(a0)
sd s0, 8(a0)
sd s1, 16(a0)
sd s2, 24(a0)
sd s3, 32(a0)
sd s4, 40(a0)
sd s5, 48(a0)
sd s6, 56(a0)
sd s7, 64(a0)
sd s8, 72(a0)
sd s9, 80(a0)
sd s10, 88(a0)
sd s11, 96(a0)
fsd fs0, 104(a0)
fsd fs1, 112(a0)
fsd fs2, 120(a0)
fsd fs3, 128(a0)
fsd fs4, 136(a0)
fsd fs5, 144(a0)
fsd fs6, 152(a0)
fsd fs7, 160(a0)
fsd fs8, 168(a0)
fsd fs9, 176(a0)
fsd fs10, 184(a0)
fsd fs11, 192(a0)
sd ra, 208(a0)
li a0, 0
jr ra
)"
".align 16\n"
"\t" PREFIX "fastjmp_jmp:" R"(
ld ra, 208(a0)
fld fs11, 192(a0)
fld fs10, 184(a0)
fld fs9, 176(a0)
fld fs8, 168(a0)
fld fs7, 160(a0)
fld fs6, 152(a0)
fld fs5, 144(a0)
fld fs4, 136(a0)
fld fs3, 128(a0)
fld fs2, 120(a0)
fld fs1, 112(a0)
fld fs0, 104(a0)
ld s11, 96(a0)
ld s10, 88(a0)
ld s9, 80(a0)
ld s8, 72(a0)
ld s7, 64(a0)
ld s6, 56(a0)
ld s5, 48(a0)
ld s4, 40(a0)
ld s3, 32(a0)
ld s2, 24(a0)
ld s1, 16(a0)
ld s0, 8(a0)
ld sp, 0(a0)
mv a0, a1
jr ra
)");
#else
#error Unknown platform.
#endif
#endif // __WIN32

33
src/common/fastjmp.h Normal file
View File

@ -0,0 +1,33 @@
// SPDX-FileCopyrightText: 2021 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include "types.h"
#include <cstddef>
#include <cstdint>
struct fastjmp_buf
{
#if defined(_WIN32) && defined(_M_AMD64)
static constexpr std::size_t BUF_SIZE = 240;
#elif defined(_M_ARM64) || defined(__aarch64__)
static constexpr std::size_t BUF_SIZE = 168;
#elif defined(__x86_64__)
static constexpr std::size_t BUF_SIZE = 64;
#elif defined(_M_IX86) || defined(__i386__)
static constexpr std::size_t BUF_SIZE = 24;
#elif defined(__riscv) && __riscv_xlen == 64
static constexpr std::size_t BUF_SIZE = 208;
#else
#error Unknown architecture.
#endif
alignas(16) std::uint8_t buf[BUF_SIZE];
};
extern "C" {
int fastjmp_set(fastjmp_buf* buf);
[[noreturn]] void fastjmp_jmp(const fastjmp_buf* buf, int ret);
}

View File

@ -0,0 +1,47 @@
; SPDX-FileCopyrightText: 2021 Connor McLaughlin <stenzek@gmail.com>
; SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "ksarm64.h"
EXPORT fastjmp_set
EXPORT fastjmp_jmp
TEXTAREA
; void fastjmp_set(fastjmp_buf*)
LEAF_ENTRY fastjmp_set
mov x16, sp
stp x16, x30, [x0]
stp x19, x20, [x0, #16]
stp x21, x22, [x0, #32]
stp x23, x24, [x0, #48]
stp x25, x26, [x0, #64]
stp x27, x28, [x0, #80]
str x29, [x0, #96]
stp d8, d9, [x0, #112]
stp d10, d11, [x0, #128]
stp d12, d13, [x0, #144]
stp d14, d15, [x0, #160]
mov w0, wzr
br x30
LEAF_END
; void fastjmp_jmp(fastjmp_buf*, int)
LEAF_ENTRY fastjmp_jmp
ldp x16, x30, [x0]
mov sp, x16
ldp x19, x20, [x0, #16]
ldp x21, x22, [x0, #32]
ldp x23, x24, [x0, #48]
ldp x25, x26, [x0, #64]
ldp x27, x28, [x0, #80]
ldr x29, [x0, #96]
ldp d8, d9, [x0, #112]
ldp d10, d11, [x0, #128]
ldp d12, d13, [x0, #144]
ldp d14, d15, [x0, #160]
mov w0, w1
br x30
LEAF_END
END

119
src/common/fastjmp_x86.asm Normal file
View File

@ -0,0 +1,119 @@
; SPDX-FileCopyrightText: 2021 Connor McLaughlin <stenzek@gmail.com>
; SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
IFDEF _M_X86_32
; -----------------------------------------
; 32-bit X86
; -----------------------------------------
.386
.model flat
_TEXT SEGMENT
PUBLIC @fastjmp_set@4
PUBLIC @fastjmp_jmp@8
; void fastjmp_set(fastjmp_buf*)
@fastjmp_set@4 PROC
mov eax, dword ptr [esp]
mov edx, esp ; fixup stack pointer, so it doesn't include the call to fastjmp_set
add edx, 4
mov dword ptr [ecx], eax ; actually eip
mov dword ptr [ecx + 4], ebx
mov dword ptr [ecx + 8], edx ; actually esp
mov dword ptr [ecx + 12], ebp
mov dword ptr [ecx + 16], esi
mov dword ptr [ecx + 20], edi
xor eax, eax
ret
@fastjmp_set@4 ENDP
; void __fastcall fastjmp_jmp(fastjmp_buf*, int)
@fastjmp_jmp@8 PROC
mov eax, edx ; return code
mov edx, dword ptr [ecx + 0]
mov ebx, dword ptr [ecx + 4]
mov esp, dword ptr [ecx + 8]
mov ebp, dword ptr [ecx + 12]
mov esi, dword ptr [ecx + 16]
mov edi, dword ptr [ecx + 20]
jmp edx
@fastjmp_jmp@8 ENDP
_TEXT ENDS
ENDIF ; _M_X86_32
IFDEF _M_X86_64
; -----------------------------------------
; 64-bit X86
; -----------------------------------------
_TEXT SEGMENT
PUBLIC fastjmp_set
PUBLIC fastjmp_jmp
; void fastjmp_set(fastjmp_buf*)
fastjmp_set PROC
mov rax, qword ptr [rsp]
mov rdx, rsp ; fixup stack pointer, so it doesn't include the call to fastjmp_set
add rdx, 8
mov qword ptr [rcx], rax ; actually rip
mov qword ptr [rcx + 8], rbx
mov qword ptr [rcx + 16], rdx ; actually rsp
mov qword ptr [rcx + 24], rbp
mov qword ptr [rcx + 32], rsi
mov qword ptr [rcx + 40], rdi
mov qword ptr [rcx + 48], r12
mov qword ptr [rcx + 56], r13
mov qword ptr [rcx + 64], r14
mov qword ptr [rcx + 72], r15
movaps xmmword ptr [rcx + 80], xmm6
movaps xmmword ptr [rcx + 96], xmm7
movaps xmmword ptr [rcx + 112], xmm8
add rcx, 112 ; split to two batches to fit displacement in a single byte
movaps xmmword ptr [rcx + 16], xmm9
movaps xmmword ptr [rcx + 32], xmm10
movaps xmmword ptr [rcx + 48], xmm11
movaps xmmword ptr [rcx + 64], xmm12
movaps xmmword ptr [rcx + 80], xmm13
movaps xmmword ptr [rcx + 96], xmm14
movaps xmmword ptr [rcx + 112], xmm15
xor eax, eax
ret
fastjmp_set ENDP
; void fastjmp_jmp(fastjmp_buf*, int)
fastjmp_jmp PROC
mov eax, edx ; return code
mov rdx, qword ptr [rcx + 0] ; actually rip
mov rbx, qword ptr [rcx + 8]
mov rsp, qword ptr [rcx + 16]
mov rbp, qword ptr [rcx + 24]
mov rsi, qword ptr [rcx + 32]
mov rdi, qword ptr [rcx + 40]
mov r12, qword ptr [rcx + 48]
mov r13, qword ptr [rcx + 56]
mov r14, qword ptr [rcx + 64]
mov r15, qword ptr [rcx + 72]
movaps xmm6, xmmword ptr [rcx + 80]
movaps xmm7, xmmword ptr [rcx + 96]
movaps xmm8, xmmword ptr [rcx + 112]
add rcx, 112 ; split to two batches to fit displacement in a single byte
movaps xmm9, xmmword ptr [rcx + 16]
movaps xmm10, xmmword ptr [rcx + 32]
movaps xmm11, xmmword ptr [rcx + 48]
movaps xmm12, xmmword ptr [rcx + 64]
movaps xmm13, xmmword ptr [rcx + 80]
movaps xmm14, xmmword ptr [rcx + 96]
movaps xmm15, xmmword ptr [rcx + 112]
jmp rdx
fastjmp_jmp ENDP
_TEXT ENDS
ENDIF ; _M_X86_64
END

View File

@ -27,6 +27,8 @@
#define CPU_AARCH64 1
#elif defined(__arm__)
#define CPU_AARCH32 1
#elif defined(__riscv) && __riscv_xlen == 64
#define CPU_RISCV64 1
#else
#error Unknown architecture.
#endif

View File

@ -65,6 +65,13 @@ char (&__countof_ArraySizeHelper(T (&array)[N]))[N];
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#endif
// [[noreturn]] which can be used on function pointers.
#ifdef _MSC_VER
// __declspec(noreturn) produces error C3829.
#define NORETURN_FUNCTION_POINTER
#else
#define NORETURN_FUNCTION_POINTER __attribute__((noreturn))
#endif
// disable warnings that show up at warning level 4
// TODO: Move to build system instead