diff --git a/src/common/heap_array.h b/src/common/heap_array.h index 21d98ff68..6e4d59299 100644 --- a/src/common/heap_array.h +++ b/src/common/heap_array.h @@ -11,7 +11,7 @@ #include #include -template +template class FixedHeapArray { public: @@ -24,11 +24,11 @@ public: using const_pointer = const T*; using this_type = FixedHeapArray; - FixedHeapArray() { m_data = new T[SIZE]; } + FixedHeapArray() { allocate(); } FixedHeapArray(const this_type& copy) { - m_data = new T[SIZE]; + allocate(); std::copy(copy.cbegin(), copy.cend(), begin()); } @@ -38,7 +38,7 @@ public: move.m_data = nullptr; } - ~FixedHeapArray() { delete[] m_data; } + ~FixedHeapArray() { deallocate(); } size_type size() const { return SIZE; } size_type capacity() const { return SIZE; } @@ -107,6 +107,42 @@ public: #undef RELATIONAL_OPERATOR private: + void allocate() + { + if constexpr (ALIGNMENT > 0) + { +#ifdef _MSC_VER + m_data = static_cast(_aligned_malloc(SIZE * sizeof(T), ALIGNMENT)); + if (!m_data) + Panic("Memory allocation failed."); +#else + if (posix_memalign(reinterpret_cast(&m_data), ALIGNMENT, SIZE * sizeof(T)) != 0) + Panic("Memory allocation failed."); +#endif + } + else + { + m_data = static_cast(std::malloc(SIZE * sizeof(T))); + if (!m_data) + Panic("Memory allocation failed."); + } + } + void deallocate() + { + if constexpr (ALIGNMENT > 0) + { +#ifdef _MSC_VER + _aligned_free(m_data); +#else + std::free(m_data); +#endif + } + else + { + std::free(m_data); + } + } + T* m_data; }; @@ -313,23 +349,23 @@ private: if constexpr (alignment > 0) { #ifdef _MSC_VER - m_data = _aligned_realloc(prev_ptr, size, alignment); + m_data = static_cast(_aligned_realloc(prev_ptr, size * sizeof(T), alignment)); if (!m_data) Panic("Memory allocation failed."); #else - if (posix_memalign(reinterpret_cast(&m_data), alignment, size) != 0) + if (posix_memalign(reinterpret_cast(&m_data), alignment, size * sizeof(T)) != 0) Panic("Memory allocation failed."); if (prev_ptr) { - std::memcpy(m_data, prev_ptr, prev_size); + std::memcpy(m_data, prev_ptr, prev_size * sizeof(T)); std::free(prev_ptr); } #endif } else { - m_data = static_cast(std::realloc(prev_ptr, size)); + m_data = static_cast(std::realloc(prev_ptr, size * sizeof(T))); if (!m_data) Panic("Memory allocation failed."); } diff --git a/src/util/cd_image_chd.cpp b/src/util/cd_image_chd.cpp index e5db944af..8879f635d 100644 --- a/src/util/cd_image_chd.cpp +++ b/src/util/cd_image_chd.cpp @@ -9,6 +9,8 @@ #include "common/error.h" #include "common/file_system.h" #include "common/hash_combine.h" +#include "common/heap_array.h" +#include "common/intrin.h" #include "common/log.h" #include "common/path.h" #include "common/string_util.h" @@ -76,11 +78,13 @@ private: chd_file* OpenCHD(std::string_view filename, FileSystem::ManagedCFilePtr fp, Error* error, u32 recursion_level); bool ReadHunk(u32 hunk_index); + static void CopyAndSwap(void* dst_ptr, const u8* src_ptr); + chd_file* m_chd = nullptr; u32 m_hunk_size = 0; u32 m_sectors_per_hunk = 0; - std::vector m_hunk_buffer; + DynamicHeapArray m_hunk_buffer; u32 m_current_hunk_index = static_cast(-1); bool m_precached = false; @@ -443,12 +447,39 @@ bool CDImageCHD::IsPrecached() const return m_precached; } -// There's probably a more efficient way of doing this with vectorization... -ALWAYS_INLINE static void CopyAndSwap(void* dst_ptr, const u8* src_ptr, u32 data_size) +ALWAYS_INLINE_RELEASE void CDImageCHD::CopyAndSwap(void* dst_ptr, const u8* src_ptr) { + constexpr u32 data_size = RAW_SECTOR_SIZE; + u8* dst_ptr_byte = static_cast(dst_ptr); -#if defined(CPU_ARCH_X64) || defined(CPU_ARCH_ARM64) - const u32 num_values = data_size / 8; +#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON) + static_assert((data_size % 16) == 0); + constexpr u32 num_values = data_size / 16; + +#if defined(CPU_ARCH_SSE) + // Requires SSSE3. + //const __m128i mask = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + for (u32 i = 0; i < num_values; i++) + { + __m128i value = _mm_load_si128(reinterpret_cast(src_ptr)); + //value = _mm_shuffle_epi8(value, mask); + value = _mm_or_si128(_mm_slli_epi16(value, 8), _mm_srli_epi16(value, 8)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr_byte), value); + src_ptr += sizeof(value); + dst_ptr_byte += sizeof(value); + } +#elif defined(CPU_ARCH_NEON) + for (u32 i = 0; i < num_values; i++) + { + uint16x8_t value = vld1q_u16(reinterpret_cast(src_ptr)); + value = vorrq_u16(vshlq_n_u16(value, 8), vshrq_n_u16(value, 8)); + vst1q_u16(reinterpret_cast(dst_ptr_byte), value); + src_ptr += sizeof(value); + dst_ptr_byte += sizeof(value); + } +#endif +#elif defined(CPU_ARCH_RISCV64) + constexpr u32 num_values = data_size / 8; for (u32 i = 0; i < num_values; i++) { u64 value; @@ -458,8 +489,8 @@ ALWAYS_INLINE static void CopyAndSwap(void* dst_ptr, const u8* src_ptr, u32 data src_ptr += sizeof(value); dst_ptr_byte += sizeof(value); } -#elif defined(CPU_ARCH_X86) || defined(CPU_ARCH_ARM32) - const u32 num_values = data_size / 4; +#else + constexpr u32 num_values = data_size / 4; for (u32 i = 0; i < num_values; i++) { u32 value; @@ -469,17 +500,6 @@ ALWAYS_INLINE static void CopyAndSwap(void* dst_ptr, const u8* src_ptr, u32 data src_ptr += sizeof(value); dst_ptr_byte += sizeof(value); } -#else - const u32 num_values = data_size / sizeof(u16); - for (u32 i = 0; i < num_values; i++) - { - u16 value; - std::memcpy(&value, src_ptr, sizeof(value)); - value = (value << 8) | (value >> 8); - std::memcpy(dst_ptr_byte, &value, sizeof(value)); - src_ptr += sizeof(value); - dst_ptr_byte += sizeof(value); - } #endif } @@ -495,7 +515,7 @@ bool CDImageCHD::ReadSectorFromIndex(void* buffer, const Index& index, LBA lba_i // Audio data is in big-endian, so we have to swap it for little endian hosts... if (index.mode == TrackMode::Audio) - CopyAndSwap(buffer, &m_hunk_buffer[hunk_offset], RAW_SECTOR_SIZE); + CopyAndSwap(buffer, &m_hunk_buffer[hunk_offset]); else std::memcpy(buffer, &m_hunk_buffer[hunk_offset], RAW_SECTOR_SIZE);