GPU/Software: Reduce number of copies by one, enable 16-bit scanout

This commit is contained in:
Connor McLaughlin
2020-10-22 01:25:33 +10:00
parent beffbaee39
commit d3d881aa6b
20 changed files with 875 additions and 187 deletions

View File

@ -654,15 +654,15 @@ void GPU_HW_D3D11::UpdateDisplay()
if (IsUsingMultisampling())
{
UpdateVRAMReadTexture();
m_host_display->SetDisplayTexture(m_vram_read_texture.GetD3DSRV(), m_vram_read_texture.GetWidth(),
m_vram_read_texture.GetHeight(), 0, 0, m_vram_read_texture.GetWidth(),
m_vram_read_texture.GetHeight());
m_host_display->SetDisplayTexture(m_vram_read_texture.GetD3DSRV(), HostDisplayPixelFormat::RGBA8,
m_vram_read_texture.GetWidth(), m_vram_read_texture.GetHeight(), 0, 0,
m_vram_read_texture.GetWidth(), m_vram_read_texture.GetHeight());
}
else
{
m_host_display->SetDisplayTexture(m_vram_texture.GetD3DSRV(), m_vram_texture.GetWidth(),
m_vram_texture.GetHeight(), 0, 0, m_vram_texture.GetWidth(),
m_vram_texture.GetHeight());
m_host_display->SetDisplayTexture(m_vram_texture.GetD3DSRV(), HostDisplayPixelFormat::RGBA8,
m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), 0, 0,
m_vram_texture.GetWidth(), m_vram_texture.GetHeight());
}
m_host_display->SetDisplayParameters(VRAM_WIDTH, VRAM_HEIGHT, 0, 0, VRAM_WIDTH, VRAM_HEIGHT,
@ -689,9 +689,9 @@ void GPU_HW_D3D11::UpdateDisplay()
!IsUsingMultisampling() && (scaled_vram_offset_x + scaled_display_width) <= m_vram_texture.GetWidth() &&
(scaled_vram_offset_y + scaled_display_height) <= m_vram_texture.GetHeight())
{
m_host_display->SetDisplayTexture(m_vram_texture.GetD3DSRV(), m_vram_texture.GetWidth(),
m_vram_texture.GetHeight(), scaled_vram_offset_x, scaled_vram_offset_y,
scaled_display_width, scaled_display_height);
m_host_display->SetDisplayTexture(m_vram_texture.GetD3DSRV(), HostDisplayPixelFormat::RGBA8,
m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), scaled_vram_offset_x,
scaled_vram_offset_y, scaled_display_width, scaled_display_height);
}
else
{
@ -711,9 +711,9 @@ void GPU_HW_D3D11::UpdateDisplay()
SetViewportAndScissor(0, 0, scaled_display_width, scaled_display_height);
DrawUtilityShader(display_pixel_shader, uniforms, sizeof(uniforms));
m_host_display->SetDisplayTexture(m_display_texture.GetD3DSRV(), m_display_texture.GetWidth(),
m_display_texture.GetHeight(), 0, 0, scaled_display_width,
scaled_display_height);
m_host_display->SetDisplayTexture(m_display_texture.GetD3DSRV(), HostDisplayPixelFormat::RGBA8,
m_display_texture.GetWidth(), m_display_texture.GetHeight(), 0, 0,
scaled_display_width, scaled_display_height);
RestoreGraphicsAPIState();
}

View File

@ -656,17 +656,18 @@ void GPU_HW_OpenGL::UpdateDisplay()
{
UpdateVRAMReadTexture();
m_host_display->SetDisplayTexture(
reinterpret_cast<void*>(static_cast<uintptr_t>(m_vram_read_texture.GetGLId())), m_vram_read_texture.GetWidth(),
static_cast<s32>(m_vram_read_texture.GetHeight()), 0, m_vram_read_texture.GetHeight(),
m_vram_read_texture.GetWidth(), -static_cast<s32>(m_vram_read_texture.GetHeight()));
m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_vram_read_texture.GetGLId())),
HostDisplayPixelFormat::RGBA8, m_vram_read_texture.GetWidth(),
static_cast<s32>(m_vram_read_texture.GetHeight()), 0,
m_vram_read_texture.GetHeight(), m_vram_read_texture.GetWidth(),
-static_cast<s32>(m_vram_read_texture.GetHeight()));
}
else
{
m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_vram_texture.GetGLId())),
m_vram_texture.GetWidth(), static_cast<s32>(m_vram_texture.GetHeight()), 0,
m_vram_texture.GetHeight(), m_vram_texture.GetWidth(),
-static_cast<s32>(m_vram_texture.GetHeight()));
HostDisplayPixelFormat::RGBA8, m_vram_texture.GetWidth(),
static_cast<s32>(m_vram_texture.GetHeight()), 0, m_vram_texture.GetHeight(),
m_vram_texture.GetWidth(), -static_cast<s32>(m_vram_texture.GetHeight()));
}
m_host_display->SetDisplayParameters(VRAM_WIDTH, VRAM_HEIGHT, 0, 0, VRAM_WIDTH, VRAM_HEIGHT,
static_cast<float>(VRAM_WIDTH) / static_cast<float>(VRAM_HEIGHT));
@ -693,7 +694,8 @@ void GPU_HW_OpenGL::UpdateDisplay()
(scaled_vram_offset_y + scaled_display_height) <= m_vram_texture.GetHeight())
{
m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_vram_texture.GetGLId())),
m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), scaled_vram_offset_x,
HostDisplayPixelFormat::RGBA8, m_vram_texture.GetWidth(),
m_vram_texture.GetHeight(), scaled_vram_offset_x,
m_vram_texture.GetHeight() - scaled_vram_offset_y, scaled_display_width,
-static_cast<s32>(scaled_display_height));
}
@ -723,8 +725,8 @@ void GPU_HW_OpenGL::UpdateDisplay()
glDrawArrays(GL_TRIANGLES, 0, 3);
m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_display_texture.GetGLId())),
m_display_texture.GetWidth(), m_display_texture.GetHeight(), 0,
scaled_display_height, scaled_display_width,
HostDisplayPixelFormat::RGBA8, m_display_texture.GetWidth(),
m_display_texture.GetHeight(), 0, scaled_display_height, scaled_display_width,
-static_cast<s32>(scaled_display_height));
// restore state

View File

@ -975,16 +975,17 @@ void GPU_HW_Vulkan::UpdateDisplay()
if (IsUsingMultisampling())
{
UpdateVRAMReadTexture();
m_host_display->SetDisplayTexture(&m_vram_read_texture, m_vram_read_texture.GetWidth(),
m_vram_read_texture.GetHeight(), 0, 0, m_vram_read_texture.GetWidth(),
m_vram_read_texture.GetHeight());
m_host_display->SetDisplayTexture(&m_vram_read_texture, HostDisplayPixelFormat::RGBA8,
m_vram_read_texture.GetWidth(), m_vram_read_texture.GetHeight(), 0, 0,
m_vram_read_texture.GetWidth(), m_vram_read_texture.GetHeight());
}
else
{
m_vram_texture.TransitionToLayout(g_vulkan_context->GetCurrentCommandBuffer(),
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
m_host_display->SetDisplayTexture(&m_vram_texture, m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), 0, 0,
m_vram_texture.GetWidth(), m_vram_texture.GetHeight());
m_host_display->SetDisplayTexture(&m_vram_texture, HostDisplayPixelFormat::RGBA8, m_vram_texture.GetWidth(),
m_vram_texture.GetHeight(), 0, 0, m_vram_texture.GetWidth(),
m_vram_texture.GetHeight());
}
m_host_display->SetDisplayParameters(VRAM_WIDTH, VRAM_HEIGHT, 0, 0, VRAM_WIDTH, VRAM_HEIGHT,
static_cast<float>(VRAM_WIDTH) / static_cast<float>(VRAM_HEIGHT));
@ -1012,9 +1013,9 @@ void GPU_HW_Vulkan::UpdateDisplay()
{
m_vram_texture.TransitionToLayout(g_vulkan_context->GetCurrentCommandBuffer(),
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
m_host_display->SetDisplayTexture(&m_vram_texture, m_vram_texture.GetWidth(), m_vram_texture.GetHeight(),
scaled_vram_offset_x, scaled_vram_offset_y, scaled_display_width,
scaled_display_height);
m_host_display->SetDisplayTexture(&m_vram_texture, HostDisplayPixelFormat::RGBA8, m_vram_texture.GetWidth(),
m_vram_texture.GetHeight(), scaled_vram_offset_x, scaled_vram_offset_y,
scaled_display_width, scaled_display_height);
}
else
{
@ -1047,8 +1048,9 @@ void GPU_HW_Vulkan::UpdateDisplay()
m_vram_texture.TransitionToLayout(cmdbuf, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);
m_display_texture.TransitionToLayout(cmdbuf, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
m_host_display->SetDisplayTexture(&m_display_texture, m_display_texture.GetWidth(), m_display_texture.GetHeight(),
0, 0, scaled_display_width, scaled_display_height);
m_host_display->SetDisplayTexture(&m_display_texture, HostDisplayPixelFormat::RGBA8, m_display_texture.GetWidth(),
m_display_texture.GetHeight(), 0, 0, scaled_display_width,
scaled_display_height);
RestoreGraphicsAPIState();
}

View File

@ -1,11 +1,24 @@
#include "gpu_sw.h"
#include "common/align.h"
#include "common/assert.h"
#include "common/cpu_detect.h"
#include "common/log.h"
#include "common/make_array.h"
#include "host_display.h"
#include "system.h"
#include <algorithm>
Log_SetChannel(GPU_SW);
#if defined(CPU_X64)
#include <emmintrin.h>
#elif defined(CPU_AARCH64)
#ifdef _MSC_VER
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
GPU_SW::GPU_SW()
{
m_vram.fill(0);
@ -28,9 +41,27 @@ bool GPU_SW::Initialize(HostDisplay* host_display)
if (!GPU::Initialize(host_display))
return false;
m_display_texture = host_display->CreateTexture(VRAM_WIDTH, VRAM_HEIGHT, nullptr, 0, true);
if (!m_display_texture)
return false;
static constexpr auto formats_for_16bit = make_array(HostDisplayPixelFormat::RGB565, HostDisplayPixelFormat::RGBA5551,
HostDisplayPixelFormat::RGBA8, HostDisplayPixelFormat::BGRA8);
static constexpr auto formats_for_24bit =
make_array(HostDisplayPixelFormat::RGBA8, HostDisplayPixelFormat::BGRA8, HostDisplayPixelFormat::RGB565,
HostDisplayPixelFormat::RGBA5551);
for (const HostDisplayPixelFormat format : formats_for_16bit)
{
if (m_host_display->SupportsDisplayPixelFormat(format))
{
m_16bit_display_format = format;
break;
}
}
for (const HostDisplayPixelFormat format : formats_for_24bit)
{
if (m_host_display->SupportsDisplayPixelFormat(format))
{
m_24bit_display_format = format;
break;
}
}
return true;
}
@ -42,74 +73,323 @@ void GPU_SW::Reset()
m_vram.fill(0);
}
void GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced,
bool interleaved)
template<HostDisplayPixelFormat out_format, typename out_type>
static void CopyOutRow16(const u16* src_ptr, out_type* dst_ptr, u32 width);
template<HostDisplayPixelFormat out_format, typename out_type>
static out_type VRAM16ToOutput(u16 value);
template<>
ALWAYS_INLINE u16 VRAM16ToOutput<HostDisplayPixelFormat::RGBA5551, u16>(u16 value)
{
return (value & 0x3E0) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 10);
}
template<>
ALWAYS_INLINE u16 VRAM16ToOutput<HostDisplayPixelFormat::RGB565, u16>(u16 value)
{
return ((value & 0x3E0) << 1) | ((value & 0x20) << 1) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 11);
}
template<>
ALWAYS_INLINE u32 VRAM16ToOutput<HostDisplayPixelFormat::RGBA8, u32>(u16 value)
{
u8 r = Truncate8(value & 31);
u8 g = Truncate8((value >> 5) & 31);
u8 b = Truncate8((value >> 10) & 31);
// 00012345 -> 1234545
b = (b << 3) | (b & 0b111);
g = (g << 3) | (g & 0b111);
r = (r << 3) | (r & 0b111);
return ZeroExtend32(r) | (ZeroExtend32(g) << 8) | (ZeroExtend32(b) << 16) | (0xFF000000u);
}
template<>
ALWAYS_INLINE u32 VRAM16ToOutput<HostDisplayPixelFormat::BGRA8, u32>(u16 value)
{
u8 r = Truncate8(value & 31);
u8 g = Truncate8((value >> 5) & 31);
u8 b = Truncate8((value >> 10) & 31);
// 00012345 -> 1234545
b = (b << 3) | (b & 0b111);
g = (g << 3) | (g & 0b111);
r = (r << 3) | (r & 0b111);
return ZeroExtend32(b) | (ZeroExtend32(g) << 8) | (ZeroExtend32(r) << 16) | (0xFF000000u);
}
template<>
ALWAYS_INLINE void CopyOutRow16<HostDisplayPixelFormat::RGBA5551, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
{
u32 col = 0;
#if defined(CPU_X64)
const u32 aligned_width = Common::AlignDownPow2(width, 8);
for (; col < aligned_width; col += 8)
{
const __m128i single_mask = _mm_set1_epi16(0x1F);
__m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
src_ptr += 8;
__m128i a = _mm_and_si128(value, _mm_set1_epi16(static_cast<s16>(static_cast<u16>(0x3E0))));
__m128i b = _mm_and_si128(_mm_srli_epi16(value, 10), single_mask);
__m128i c = _mm_slli_epi16(_mm_and_si128(value, single_mask), 10);
value = _mm_or_si128(_mm_or_si128(a, b), c);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), value);
dst_ptr += 8;
}
#elif defined(CPU_AARCH64)
const u32 aligned_width = Common::AlignDownPow2(width, 8);
for (; col < aligned_width; col += 8)
{
const uint16x8_t single_mask = vdupq_n_u16(0x1F);
uint16x8_t value = vld1q_u16(src_ptr);
src_ptr += 8;
uint16x8_t a = vandq_u16(value, vdupq_n_u16(0x3E0));
uint16x8_t b = vandq_u16(vshrq_n_u16(value, 10), single_mask);
uint16x8_t c = vshlq_n_u16(vandq_u16(value, single_mask), 10);
value = vorrq_u16(vorrq_u16(a, b), c);
vst1q_u16(dst_ptr, value);
dst_ptr += 8;
}
#endif
for (; col < width; col++)
*(dst_ptr++) = VRAM16ToOutput<HostDisplayPixelFormat::RGBA5551, u16>(*(src_ptr++));
}
template<>
ALWAYS_INLINE void CopyOutRow16<HostDisplayPixelFormat::RGB565, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
{
u32 col = 0;
#if defined(CPU_X64)
const u32 aligned_width = Common::AlignDownPow2(width, 8);
for (; col < aligned_width; col += 8)
{
const __m128i single_mask = _mm_set1_epi16(0x1F);
__m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
src_ptr += 8;
__m128i a = _mm_slli_epi16(_mm_and_si128(value, _mm_set1_epi16(static_cast<s16>(static_cast<u16>(0x3E0)))), 1);
__m128i b = _mm_slli_epi16(_mm_and_si128(value, _mm_set1_epi16(static_cast<s16>(static_cast<u16>(0x20)))), 1);
__m128i c = _mm_and_si128(_mm_srli_epi16(value, 10), single_mask);
__m128i d = _mm_slli_epi16(_mm_and_si128(value, single_mask), 11);
value = _mm_or_si128(_mm_or_si128(_mm_or_si128(a, b), c), d);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), value);
dst_ptr += 8;
}
#elif defined(CPU_AARCH64)
const u32 aligned_width = Common::AlignDownPow2(width, 8);
const uint16x8_t single_mask = vdupq_n_u16(0x1F);
for (; col < aligned_width; col += 8)
{
uint16x8_t value = vld1q_u16(src_ptr);
src_ptr += 8;
uint16x8_t a = vshlq_n_u16(vandq_u16(value, vdupq_n_u16(0x3E0)), 1); // (value & 0x3E0) << 1
uint16x8_t b = vshlq_n_u16(vandq_u16(value, vdupq_n_u16(0x20)), 1); // (value & 0x20) << 1
uint16x8_t c = vandq_u16(vshrq_n_u16(value, 10), single_mask); // ((value >> 10) & 0x1F)
uint16x8_t d = vshlq_n_u16(vandq_u16(value, single_mask), 11); // ((value & 0x1F) << 11)
value = vorrq_u16(vorrq_u16(vorrq_u16(a, b), c), d);
vst1q_u16(dst_ptr, value);
dst_ptr += 8;
}
#endif
for (; col < width; col++)
*(dst_ptr++) = VRAM16ToOutput<HostDisplayPixelFormat::RGB565, u16>(*(src_ptr++));
}
template<>
ALWAYS_INLINE void CopyOutRow16<HostDisplayPixelFormat::RGBA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
{
for (u32 col = 0; col < width; col++)
*(dst_ptr++) = VRAM16ToOutput<HostDisplayPixelFormat::RGBA8, u32>(*(src_ptr++));
}
template<>
ALWAYS_INLINE void CopyOutRow16<HostDisplayPixelFormat::BGRA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
{
for (u32 col = 0; col < width; col++)
*(dst_ptr++) = VRAM16ToOutput<HostDisplayPixelFormat::BGRA8, u32>(*(src_ptr++));
}
template<HostDisplayPixelFormat display_format>
void GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 field, bool interlaced, bool interleaved)
{
u8* dst_ptr;
u32 dst_stride;
using OutputPixelType = std::conditional_t<
display_format == HostDisplayPixelFormat::RGBA8 || display_format == HostDisplayPixelFormat::BGRA8, u32, u16>;
if (!interlaced)
{
if (!m_host_display->BeginSetDisplayPixels(display_format, width, height, reinterpret_cast<void**>(&dst_ptr),
&dst_stride))
{
return;
}
}
else
{
dst_stride = Common::AlignUpPow2<u32>(width * sizeof(OutputPixelType), 4);
dst_ptr = m_display_texture_buffer.data() + (field != 0 ? dst_stride : 0);
}
const u32 output_stride = dst_stride;
const u8 interlaced_shift = BoolToUInt8(interlaced);
const u8 interleaved_shift = BoolToUInt8(interleaved);
// Fast path when not wrapping around.
if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT)
{
const u32 rows = height >> interlaced_shift;
dst_stride <<= interlaced_shift;
height >>= interlaced_shift;
const u16* src_ptr = &m_vram[src_y * VRAM_WIDTH + src_x];
const u32 src_stride = VRAM_WIDTH << interleaved_shift;
for (u32 row = 0; row < height; row++)
const u32 src_step = VRAM_WIDTH << interleaved_shift;
for (u32 row = 0; row < rows; row++)
{
const u16* src_row_ptr = src_ptr;
u32* dst_row_ptr = dst_ptr;
for (u32 col = 0; col < width; col++)
*(dst_row_ptr++) = RGBA5551ToRGBA8888(*(src_row_ptr++));
src_ptr += src_stride;
CopyOutRow16<display_format>(src_ptr, reinterpret_cast<OutputPixelType*>(dst_ptr), width);
src_ptr += src_step;
dst_ptr += dst_stride;
}
}
else
{
const u32 rows = height >> interlaced_shift;
dst_stride <<= interlaced_shift;
height >>= interlaced_shift;
const u32 end_x = src_x + width;
for (u32 row = 0; row < height; row++)
for (u32 row = 0; row < rows; row++)
{
const u16* src_row_ptr = &m_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
u32* dst_row_ptr = dst_ptr;
OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
for (u32 col = src_x; col < end_x; col++)
*(dst_row_ptr++) = RGBA5551ToRGBA8888(src_row_ptr[col % VRAM_WIDTH]);
src_y += (1 << interleaved_shift);
dst_ptr += dst_stride;
{
*(dst_row_ptr++) = VRAM16ToOutput<display_format, OutputPixelType>(src_row_ptr[col % VRAM_WIDTH]);
src_y += (1 << interleaved_shift);
dst_ptr += dst_stride;
}
}
}
if (!interlaced)
{
m_host_display->EndSetDisplayPixels();
}
else
{
m_host_display->SetDisplayPixels(display_format, width, height, m_display_texture_buffer.data(), output_stride);
}
}
void GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced,
void GPU_SW::CopyOut15Bit(HostDisplayPixelFormat display_format, u32 src_x, u32 src_y, u32 width, u32 height, u32 field,
bool interlaced, bool interleaved)
{
switch (display_format)
{
case HostDisplayPixelFormat::RGBA5551:
CopyOut15Bit<HostDisplayPixelFormat::RGBA5551>(src_x, src_y, width, height, field, interlaced, interleaved);
break;
case HostDisplayPixelFormat::RGB565:
CopyOut15Bit<HostDisplayPixelFormat::RGB565>(src_x, src_y, width, height, field, interlaced, interleaved);
break;
case HostDisplayPixelFormat::RGBA8:
CopyOut15Bit<HostDisplayPixelFormat::RGBA8>(src_x, src_y, width, height, field, interlaced, interleaved);
break;
case HostDisplayPixelFormat::BGRA8:
CopyOut15Bit<HostDisplayPixelFormat::BGRA8>(src_x, src_y, width, height, field, interlaced, interleaved);
break;
default:
break;
}
}
template<HostDisplayPixelFormat display_format>
void GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 field, bool interlaced,
bool interleaved)
{
u8* dst_ptr;
u32 dst_stride;
using OutputPixelType = std::conditional_t<
display_format == HostDisplayPixelFormat::RGBA8 || display_format == HostDisplayPixelFormat::BGRA8, u32, u16>;
if (!interlaced)
{
if (!m_host_display->BeginSetDisplayPixels(display_format, width, height, reinterpret_cast<void**>(&dst_ptr),
&dst_stride))
{
return;
}
}
else
{
dst_stride = Common::AlignUpPow2<u32>(width * sizeof(OutputPixelType), 4);
dst_ptr = m_display_texture_buffer.data() + (field != 0 ? dst_stride : 0);
}
const u32 output_stride = dst_stride;
const u8 interlaced_shift = BoolToUInt8(interlaced);
const u8 interleaved_shift = BoolToUInt8(interleaved);
const u32 rows = height >> interlaced_shift;
dst_stride <<= interlaced_shift;
if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT)
if ((src_x + width) <= VRAM_WIDTH && (src_y + (rows << interleaved_shift)) <= VRAM_HEIGHT)
{
dst_stride <<= interlaced_shift;
height >>= interlaced_shift;
const u8* src_ptr = reinterpret_cast<const u8*>(&m_vram[src_y * VRAM_WIDTH + src_x]);
const u8* src_ptr = reinterpret_cast<const u8*>(&m_vram[src_y * VRAM_WIDTH + src_x]) + (skip_x * 3);
const u32 src_stride = (VRAM_WIDTH << interleaved_shift) * sizeof(u16);
for (u32 row = 0; row < height; row++)
for (u32 row = 0; row < rows; row++)
{
const u8* src_row_ptr = src_ptr;
u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
for (u32 col = 0; col < width; col++)
if constexpr (display_format == HostDisplayPixelFormat::RGBA8)
{
*(dst_row_ptr++) = *(src_row_ptr++);
*(dst_row_ptr++) = *(src_row_ptr++);
*(dst_row_ptr++) = *(src_row_ptr++);
*(dst_row_ptr++) = 0xFF;
const u8* src_row_ptr = src_ptr;
u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
for (u32 col = 0; col < width; col++)
{
*(dst_row_ptr++) = *(src_row_ptr++);
*(dst_row_ptr++) = *(src_row_ptr++);
*(dst_row_ptr++) = *(src_row_ptr++);
*(dst_row_ptr++) = 0xFF;
}
}
else if constexpr (display_format == HostDisplayPixelFormat::BGRA8)
{
const u8* src_row_ptr = src_ptr;
u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
for (u32 col = 0; col < width; col++)
{
*(dst_row_ptr++) = src_row_ptr[2];
*(dst_row_ptr++) = src_row_ptr[1];
*(dst_row_ptr++) = src_row_ptr[0];
*(dst_row_ptr++) = 0xFF;
src_row_ptr += 3;
}
}
else if constexpr (display_format == HostDisplayPixelFormat::RGB565)
{
const u8* src_row_ptr = src_ptr;
u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
for (u32 col = 0; col < width; col++)
{
*(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 11) |
((static_cast<u16>(src_row_ptr[1]) >> 2) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
src_row_ptr += 3;
}
}
else if constexpr (display_format == HostDisplayPixelFormat::RGBA5551)
{
const u8* src_row_ptr = src_ptr;
u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
for (u32 col = 0; col < width; col++)
{
*(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 10) |
((static_cast<u16>(src_row_ptr[1]) >> 3) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
src_row_ptr += 3;
}
}
src_ptr += src_stride;
@ -118,39 +398,83 @@ void GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u3
}
else
{
dst_stride <<= interlaced_shift;
height >>= interlaced_shift;
for (u32 row = 0; row < height; row++)
for (u32 row = 0; row < rows; row++)
{
const u16* src_row_ptr = &m_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
u32* dst_row_ptr = dst_ptr;
OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
for (u32 col = 0; col < width; col++)
{
const u32 offset = (src_x + ((col * 3) / 2));
const u32 offset = (src_x + (((skip_x + col) * 3) / 2));
const u16 s0 = src_row_ptr[offset % VRAM_WIDTH];
const u16 s1 = src_row_ptr[(offset + 1) % VRAM_WIDTH];
const u8 shift = static_cast<u8>(col & 1u) * 8;
*(dst_row_ptr++) = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift) | 0xFF000000u;
const u32 rgb = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift);
if constexpr (display_format == HostDisplayPixelFormat::RGBA8)
{
*(dst_row_ptr++) = rgb | 0xFF000000u;
}
else if constexpr (display_format == HostDisplayPixelFormat::BGRA8)
{
*(dst_row_ptr++) = (rgb & 0x00FF00) | ((rgb & 0xFF) << 16) | ((rgb >> 16) & 0xFF) | 0xFF000000u;
}
else if constexpr (display_format == HostDisplayPixelFormat::RGB565)
{
*(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 10) << 5) & 0x7E0) | (((rgb >> 19) << 11) & 0x3E0000);
}
else if constexpr (display_format == HostDisplayPixelFormat::RGBA5551)
{
*(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 11) << 5) & 0x3E0) | (((rgb >> 19) << 10) & 0x1F0000);
}
}
src_y += (1 << interleaved_shift);
dst_ptr += dst_stride;
}
}
if (!interlaced)
{
m_host_display->EndSetDisplayPixels();
}
else
{
m_host_display->SetDisplayPixels(display_format, width, height, m_display_texture_buffer.data(), output_stride);
}
}
void GPU_SW::CopyOut24Bit(HostDisplayPixelFormat display_format, u32 src_x, u32 src_y, u32 skip_x, u32 width,
u32 height, u32 field, bool interlaced, bool interleaved)
{
switch (display_format)
{
case HostDisplayPixelFormat::RGBA5551:
CopyOut24Bit<HostDisplayPixelFormat::RGBA5551>(src_x, src_y, skip_x, width, height, field, interlaced,
interleaved);
break;
case HostDisplayPixelFormat::RGB565:
CopyOut24Bit<HostDisplayPixelFormat::RGB565>(src_x, src_y, skip_x, width, height, field, interlaced, interleaved);
break;
case HostDisplayPixelFormat::RGBA8:
CopyOut24Bit<HostDisplayPixelFormat::RGBA8>(src_x, src_y, skip_x, width, height, field, interlaced, interleaved);
break;
case HostDisplayPixelFormat::BGRA8:
CopyOut24Bit<HostDisplayPixelFormat::BGRA8>(src_x, src_y, skip_x, width, height, field, interlaced, interleaved);
break;
default:
break;
}
}
void GPU_SW::ClearDisplay()
{
std::memset(m_display_texture_buffer.data(), 0, sizeof(u32) * m_display_texture_buffer.size());
std::memset(m_display_texture_buffer.data(), 0, m_display_texture_buffer.size());
}
void GPU_SW::UpdateDisplay()
{
// fill display texture
m_display_texture_buffer.resize(VRAM_WIDTH * VRAM_HEIGHT);
if (!g_settings.debugging.show_vram)
{
if (IsDisplayDisabled())
@ -162,39 +486,37 @@ void GPU_SW::UpdateDisplay()
const u32 vram_offset_y = m_crtc_state.display_vram_top;
const u32 display_width = m_crtc_state.display_vram_width;
const u32 display_height = m_crtc_state.display_vram_height;
const u32 texture_offset_x = m_crtc_state.display_vram_left - m_crtc_state.regs.X;
if (IsInterlacedDisplayEnabled())
{
const u32 field = GetInterlacedDisplayField();
if (m_GPUSTAT.display_area_color_depth_24)
{
CopyOut24Bit(m_crtc_state.regs.X, vram_offset_y + field, m_display_texture_buffer.data() + field * VRAM_WIDTH,
VRAM_WIDTH, display_width + texture_offset_x, display_height, true, m_GPUSTAT.vertical_resolution);
CopyOut24Bit(m_24bit_display_format, m_crtc_state.regs.X, vram_offset_y + field,
m_crtc_state.display_vram_left - m_crtc_state.regs.X, display_width, display_height, field, true,
m_GPUSTAT.vertical_resolution);
}
else
{
CopyOut15Bit(m_crtc_state.regs.X, vram_offset_y + field, m_display_texture_buffer.data() + field * VRAM_WIDTH,
VRAM_WIDTH, display_width + texture_offset_x, display_height, true, m_GPUSTAT.vertical_resolution);
CopyOut15Bit(m_16bit_display_format, m_crtc_state.display_vram_left, vram_offset_y + field, display_width,
display_height, field, true, m_GPUSTAT.vertical_resolution);
}
}
else
{
if (m_GPUSTAT.display_area_color_depth_24)
{
CopyOut24Bit(m_crtc_state.regs.X, vram_offset_y, m_display_texture_buffer.data(), VRAM_WIDTH,
display_width + texture_offset_x, display_height, false, false);
CopyOut24Bit(m_24bit_display_format, m_crtc_state.regs.X, vram_offset_y,
m_crtc_state.display_vram_left - m_crtc_state.regs.X, display_width, display_height, 0, false,
false);
}
else
{
CopyOut15Bit(m_crtc_state.regs.X, vram_offset_y, m_display_texture_buffer.data(), VRAM_WIDTH,
display_width + texture_offset_x, display_height, false, false);
CopyOut15Bit(m_16bit_display_format, m_crtc_state.display_vram_left, vram_offset_y, display_width,
display_height, 0, false, false);
}
}
m_host_display->UpdateTexture(m_display_texture.get(), 0, 0, display_width, display_height,
m_display_texture_buffer.data(), VRAM_WIDTH * sizeof(u32));
m_host_display->SetDisplayTexture(m_display_texture->GetHandle(), VRAM_WIDTH, VRAM_HEIGHT, texture_offset_x, 0,
display_width, display_height);
m_host_display->SetDisplayParameters(m_crtc_state.display_width, m_crtc_state.display_height,
m_crtc_state.display_origin_left, m_crtc_state.display_origin_top,
m_crtc_state.display_vram_width, m_crtc_state.display_vram_height,
@ -202,11 +524,7 @@ void GPU_SW::UpdateDisplay()
}
else
{
CopyOut15Bit(0, 0, m_display_texture_buffer.data(), VRAM_WIDTH, VRAM_WIDTH, VRAM_HEIGHT, false, false);
m_host_display->UpdateTexture(m_display_texture.get(), 0, 0, VRAM_WIDTH, VRAM_HEIGHT,
m_display_texture_buffer.data(), VRAM_WIDTH * sizeof(u32));
m_host_display->SetDisplayTexture(m_display_texture->GetHandle(), VRAM_WIDTH, VRAM_HEIGHT, 0, 0, VRAM_WIDTH,
VRAM_HEIGHT);
CopyOut15Bit(m_16bit_display_format, 0, 0, VRAM_WIDTH, VRAM_HEIGHT, 0, false, false);
m_host_display->SetDisplayParameters(VRAM_WIDTH, VRAM_HEIGHT, 0, 0, VRAM_WIDTH, VRAM_HEIGHT,
static_cast<float>(VRAM_WIDTH) / static_cast<float>(VRAM_HEIGHT));
}
@ -379,7 +697,8 @@ constexpr GPU_SW::DitherLUT GPU_SW::ComputeDitherLUT()
static constexpr GPU_SW::DitherLUT s_dither_lut = GPU_SW::ComputeDitherLUT();
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
void ALWAYS_INLINE_RELEASE GPU_SW::ShadePixel(u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x, u8 texcoord_y)
void ALWAYS_INLINE_RELEASE GPU_SW::ShadePixel(u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
u8 texcoord_y)
{
VRAMPixel color;
bool transparent;

View File

@ -1,5 +1,7 @@
#pragma once
#include "common/heap_array.h"
#include "gpu.h"
#include "host_display.h"
#include <array>
#include <memory>
#include <vector>
@ -47,10 +49,17 @@ protected:
//////////////////////////////////////////////////////////////////////////
// Scanout
//////////////////////////////////////////////////////////////////////////
void CopyOut15Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced,
bool interleaved);
void CopyOut24Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced,
template<HostDisplayPixelFormat display_format>
void CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 field, bool interlaced, bool interleaved);
void CopyOut15Bit(HostDisplayPixelFormat display_format, u32 src_x, u32 src_y, u32 width, u32 height, u32 field,
bool interlaced, bool interleaved);
template<HostDisplayPixelFormat display_format>
void CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 field, bool interlaced,
bool interleaved);
void CopyOut24Bit(HostDisplayPixelFormat display_format, u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height,
u32 field, bool interlaced, bool interleaved);
void ClearDisplay() override;
void UpdateDisplay() override;
@ -117,8 +126,8 @@ protected:
using DrawLineFunction = void (GPU_SW::*)(const SWVertex* p0, const SWVertex* p1);
DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable, bool dithering_enable);
std::vector<u32> m_display_texture_buffer;
std::unique_ptr<HostDisplayTexture> m_display_texture;
std::array<u16, VRAM_WIDTH * VRAM_HEIGHT> m_vram;
HeapArray<u8, VRAM_WIDTH * VRAM_HEIGHT * sizeof(u32)> m_display_texture_buffer;
HostDisplayPixelFormat m_16bit_display_format = HostDisplayPixelFormat::RGB565;
HostDisplayPixelFormat m_24bit_display_format = HostDisplayPixelFormat::RGBA8;
};

View File

@ -1,4 +1,5 @@
#include "host_display.h"
#include "common/assert.h"
#include "common/file_system.h"
#include "common/log.h"
#include "common/string_util.h"
@ -35,6 +36,53 @@ bool HostDisplay::ShouldSkipDisplayingFrame()
return false;
}
u32 HostDisplay::GetDisplayPixelFormatSize(HostDisplayPixelFormat format)
{
switch (format)
{
case HostDisplayPixelFormat::RGBA8:
case HostDisplayPixelFormat::BGRA8:
return 4;
case HostDisplayPixelFormat::RGBA5551:
case HostDisplayPixelFormat::RGB565:
return 2;
default:
return 0;
}
}
bool HostDisplay::SetDisplayPixels(HostDisplayPixelFormat format, u32 width, u32 height, const void* buffer, u32 pitch)
{
void* map_ptr;
u32 map_pitch;
if (!BeginSetDisplayPixels(format, width, height, &map_ptr, &map_pitch))
return false;
if (pitch == map_pitch)
{
std::memcpy(map_ptr, buffer, height * map_pitch);
}
else
{
const u32 copy_size = width * GetDisplayPixelFormatSize(format);
DebugAssert(pitch >= copy_size && map_pitch >= copy_size);
const u8* src_ptr = static_cast<const u8*>(buffer);
u8* dst_ptr = static_cast<u8*>(map_ptr);
for (u32 i = 0; i < height; i++)
{
std::memcpy(dst_ptr, src_ptr, copy_size);
src_ptr += pitch;
dst_ptr += map_pitch;
}
}
EndSetDisplayPixels();
return true;
}
void HostDisplay::SetSoftwareCursor(std::unique_ptr<HostDisplayTexture> texture, float scale /*= 1.0f*/)
{
m_cursor_texture = std::move(texture);
@ -327,7 +375,7 @@ bool HostDisplay::WriteTextureToFile(const void* texture_handle, u32 x, u32 y, u
bool HostDisplay::WriteDisplayTextureToFile(const char* filename, bool full_resolution /* = true */,
bool apply_aspect_ratio /* = true */)
{
if (!m_display_texture_handle)
if (!m_display_texture_handle || m_display_texture_format != HostDisplayPixelFormat::RGBA8)
return false;
apply_aspect_ratio = (m_display_aspect_ratio > 0) ? apply_aspect_ratio : false;
@ -394,7 +442,7 @@ bool HostDisplay::WriteDisplayTextureToFile(const char* filename, bool full_reso
bool HostDisplay::WriteDisplayTextureToBuffer(std::vector<u32>* buffer, u32 resize_width /* = 0 */,
u32 resize_height /* = 0 */, bool clear_alpha /* = true */)
{
if (!m_display_texture_handle)
if (!m_display_texture_handle || m_display_texture_format != HostDisplayPixelFormat::RGBA8)
return false;
const bool flip_y = (m_display_texture_view_height < 0);

View File

@ -7,6 +7,16 @@
#include <tuple>
#include <vector>
enum class HostDisplayPixelFormat : u32
{
Unknown,
RGBA8,
BGRA8,
RGB565,
RGBA5551,
Count
};
// An abstracted RGBA8 texture.
class HostDisplayTexture
{
@ -16,6 +26,8 @@ public:
virtual void* GetHandle() const = 0;
virtual u32 GetWidth() const = 0;
virtual u32 GetHeight() const = 0;
ALWAYS_INLINE HostDisplayPixelFormat GetFormat() const { return HostDisplayPixelFormat::RGBA8; }
};
// Interface to the frontend's renderer.
@ -111,10 +123,11 @@ public:
m_display_changed = true;
}
void SetDisplayTexture(void* texture_handle, s32 texture_width, s32 texture_height, s32 view_x, s32 view_y,
s32 view_width, s32 view_height)
void SetDisplayTexture(void* texture_handle, HostDisplayPixelFormat texture_format, s32 texture_width,
s32 texture_height, s32 view_x, s32 view_y, s32 view_width, s32 view_height)
{
m_display_texture_handle = texture_handle;
m_display_texture_format = texture_format;
m_display_texture_width = texture_width;
m_display_texture_height = texture_height;
m_display_texture_view_x = view_x;
@ -124,6 +137,15 @@ public:
m_display_changed = true;
}
void SetDisplayTextureRect(s32 view_x, s32 view_y, s32 view_width, s32 view_height)
{
m_display_texture_view_x = view_x;
m_display_texture_view_y = view_y;
m_display_texture_view_width = view_width;
m_display_texture_view_height = view_height;
m_display_changed = true;
}
void SetDisplayParameters(s32 display_width, s32 display_height, s32 active_left, s32 active_top, s32 active_width,
s32 active_height, float display_aspect_ratio)
{
@ -137,6 +159,15 @@ public:
m_display_changed = true;
}
static u32 GetDisplayPixelFormatSize(HostDisplayPixelFormat format);
virtual bool SupportsDisplayPixelFormat(HostDisplayPixelFormat format) const = 0;
virtual bool BeginSetDisplayPixels(HostDisplayPixelFormat format, u32 width, u32 height, void** out_buffer,
u32* out_pitch) = 0;
virtual void EndSetDisplayPixels() = 0;
virtual bool SetDisplayPixels(HostDisplayPixelFormat format, u32 width, u32 height, const void* buffer, u32 pitch);
void SetDisplayLinearFiltering(bool enabled) { m_display_linear_filtering = enabled; }
void SetDisplayTopMargin(s32 height) { m_display_top_margin = height; }
void SetDisplayIntegerScaling(bool enabled) { m_display_integer_scaling = enabled; }
@ -201,6 +232,7 @@ protected:
float m_display_frame_interval = 0.0f;
void* m_display_texture_handle = nullptr;
HostDisplayPixelFormat m_display_texture_format = HostDisplayPixelFormat::Count;
s32 m_display_texture_width = 0;
s32 m_display_texture_height = 0;
s32 m_display_texture_view_x = 0;