Review notes. This text sits before the first "diff --git" header, where patch tools skip leading text.

* Rebuilt from a copy of the patch whose line breaks, indentation and <...> template arguments
  had been stripped. Indentation and template arguments are restored from the surrounding code
  and should be checked against the tree; apply with whitespace tolerance if needed.
* BatchVertex::PackUVLimits() now masks every bound to 8 bits, so a bound above 255 cannot spill
  into the neighbouring byte of the packed word. In-range values pack exactly as before.
* Removed the FlushRender() call that was appended to the end of GPU_HW::LoadVertices(). The
  limits are a per-vertex attribute, so nothing here needs a flush after every primitive; it
  looks like a debugging leftover.
* Removed the m_wireframe_rasterizer_state member from gpu_hw_d3d11.h; nothing in this change
  references it.
* Hunk headers for gpu_hw.cpp and gpu_hw.h are recomputed for the lines added or removed above.
  The post-image blob ids on their "index" lines are the ones from the original patch.

diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index 51d617382..8e4bc9740 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -215,6 +215,35 @@ void GPU_HW::HandleFlippedQuadTextureCoordinates(BatchVertex* vertices)
   }
 }
 
+bool GPU_HW::AreUVLimitsNeeded()
+{
+  // We only need UV limits if PGXP is enabled, or texture filtering is enabled.
+  return g_settings.gpu_pgxp_enable || g_settings.gpu_texture_filtering;
+}
+
+void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices)
+{
+  // Find the bounding box of the polygon's texture coordinates. Expects num_vertices >= 1.
+  u16 min_u = vertices[0].u, max_u = vertices[0].u, min_v = vertices[0].v, max_v = vertices[0].v;
+  for (u32 i = 1; i < num_vertices; i++)
+  {
+    min_u = std::min(min_u, vertices[i].u);
+    max_u = std::max(max_u, vertices[i].u);
+    min_v = std::min(min_v, vertices[i].v);
+    max_v = std::max(max_v, vertices[i].v);
+  }
+
+  // Pull the upper bound in by one texel unless the range is degenerate (min == max).
+  if (min_u != max_u)
+    max_u--;
+  if (min_v != max_v)
+    max_v--;
+
+  // Every vertex of the polygon carries the same packed limits.
+  for (u32 i = 0; i < num_vertices; i++)
+    vertices[i].SetUVLimits(min_u, max_u, min_v, max_v);
+}
+
 void GPU_HW::DrawLine(float x0, float y0, u32 col0, float x1, float y1, u32 col1, float depth)
 {
   const float dx = x1 - x0;
@@ -223,10 +252,10 @@ void GPU_HW::DrawLine(float x0, float y0, u32 col0, float x1, float y1, u32 col1
   if (dx == 0.0f && dy == 0.0f)
   {
     // Degenerate, render a point.
-    output[0].Set(x0, y0, depth, 1.0f, col0, 0, 0);
-    output[1].Set(x0 + 1.0f, y0, depth, 1.0f, col0, 0, 0);
-    output[2].Set(x1, y1 + 1.0f, depth, 1.0f, col0, 0, 0);
-    output[3].Set(x1 + 1.0f, y1 + 1.0f, depth, 1.0f, col0, 0, 0);
+    output[0].Set(x0, y0, depth, 1.0f, col0, 0, 0, 0);
+    output[1].Set(x0 + 1.0f, y0, depth, 1.0f, col0, 0, 0, 0);
+    output[2].Set(x1, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
+    output[3].Set(x1 + 1.0f, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
   }
   else
   {
@@ -290,10 +319,10 @@ void GPU_HW::DrawLine(float x0, float y0, u32 col0, float x1, float y1, u32 col1
     const float ox1 = x1 + pad_x1;
     const float oy1 = y1 + pad_y1;
 
-    output[0].Set(ox0, oy0, depth, 1.0f, col0, 0, 0);
-    output[1].Set(ox0 + fill_dx, oy0 + fill_dy, depth, 1.0f, col0, 0, 0);
-    output[2].Set(ox1, oy1, depth, 1.0f, col1, 0, 0);
-    output[3].Set(ox1 + fill_dx, oy1 + fill_dy, depth, 1.0f, col1, 0, 0);
+    output[0].Set(ox0, oy0, depth, 1.0f, col0, 0, 0, 0);
+    output[1].Set(ox0 + fill_dx, oy0 + fill_dy, depth, 1.0f, col0, 0, 0, 0);
+    output[2].Set(ox1, oy1, depth, 1.0f, col1, 0, 0, 0);
+    output[3].Set(ox1 + fill_dx, oy1 + fill_dy, depth, 1.0f, col1, 0, 0, 0);
   }
 
   AddVertex(output[0]);
@@ -339,7 +368,8 @@ void GPU_HW::LoadVertices()
         native_vertex_positions[i][0] = native_x;
         native_vertex_positions[i][1] = native_y;
+        // 0xFFFF0000 packs min_u = min_v = 0 and max_u = max_v = 255: the full 0..255 range.
         vertices[i].Set(static_cast<float>(native_x), static_cast<float>(native_y), depth, 1.0f, color, texpage,
-                        texcoord);
+                        texcoord, 0xFFFF0000u);
 
         if (pgxp)
         {
@@ -357,6 +387,9 @@ void GPU_HW::LoadVertices()
       if (rc.quad_polygon && m_resolution_scale > 1)
         HandleFlippedQuadTextureCoordinates(vertices.data());
 
+      if (AreUVLimitsNeeded())
+        ComputePolygonUVLimits(vertices.data(), num_vertices);
+
       if (!IsDrawingAreaIsValid())
         return;
 
@@ -490,14 +523,15 @@ void GPU_HW::LoadVertices()
           const float quad_start_x = static_cast<float>(pos_x + x_offset);
           const float quad_end_x = quad_start_x + static_cast<float>(quad_width);
           const u16 tex_right = tex_left + static_cast<u16>(quad_width);
+          const u32 uv_limits = BatchVertex::PackUVLimits(tex_left, tex_right - 1, tex_top, tex_bottom - 1);
 
-          AddNewVertex(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, tex_left, tex_top);
-          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top);
-          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom);
+          AddNewVertex(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, tex_left, tex_top, uv_limits);
+          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top, uv_limits);
+          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom, uv_limits);
 
-          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom);
-          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top);
-          AddNewVertex(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, tex_right, tex_bottom);
+          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom, uv_limits);
+          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top, uv_limits);
+          AddNewVertex(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, tex_right, tex_bottom, uv_limits);
 
           x_offset += quad_width;
           tex_left = 0;
diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h
index 018342a7d..14aa2ce38 100644
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@@ -58,13 +58,16 @@ protected:
     u32 texpage;
     u16 u; // 16-bit texcoords are needed for 256 extent rectangles
     u16 v;
+    u32 uv_limits; // packed by PackUVLimits(): min_u | min_v << 8 | max_u << 16 | max_v << 24
 
-    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 packed_texcoord)
+    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 packed_texcoord,
+                           u32 uv_limits_)
     {
-      Set(x_, y_, z_, w_, color_, texpage_, packed_texcoord & 0xFF, (packed_texcoord >> 8));
+      Set(x_, y_, z_, w_, color_, texpage_, packed_texcoord & 0xFF, (packed_texcoord >> 8), uv_limits_);
     }
 
-    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 u_, u16 v_)
+    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 u_, u16 v_,
+                           u32 uv_limits_)
     {
       x = x_;
       y = y_;
@@ -74,6 +77,21 @@ protected:
       texpage = texpage_;
       u = u_;
       v = v_;
+      uv_limits = uv_limits_;
+    }
+
+    /// Packs the U/V bounds into one 32-bit word, one byte per bound:
+    /// min_u in bits 0-7, min_v in bits 8-15, max_u in bits 16-23, max_v in bits 24-31.
+    /// Each bound is masked to 8 bits so an out-of-range value cannot corrupt its neighbours.
+    ALWAYS_INLINE static u32 PackUVLimits(u32 min_u, u32 max_u, u32 min_v, u32 max_v)
+    {
+      return (min_u & 0xFFu) | ((min_v & 0xFFu) << 8) | ((max_u & 0xFFu) << 16) | ((max_v & 0xFFu) << 24);
+    }
+
+    /// Stores the packed U/V bounds for this vertex; see PackUVLimits() for the layout.
+    ALWAYS_INLINE void SetUVLimits(u32 min_u, u32 max_u, u32 min_v, u32 max_v)
+    {
+      uv_limits = PackUVLimits(min_u, max_u, min_v, max_v);
     }
   };
 
@@ -236,6 +254,12 @@ protected:
   /// Handles quads with flipped texture coordinate directions.
   static void HandleFlippedQuadTextureCoordinates(BatchVertex* vertices);
 
+  /// Computes polygon U/V boundaries.
+  static void ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices);
+
+  /// Returns true when PGXP or texture filtering is enabled, the cases where U/V limits are computed.
+  static bool AreUVLimitsNeeded();
+
   HeapArray<u16, VRAM_WIDTH * VRAM_HEIGHT> m_vram_shadow;
 
   BatchVertex* m_batch_start_vertex_ptr = nullptr;
diff --git a/src/core/gpu_hw_d3d11.cpp b/src/core/gpu_hw_d3d11.cpp
index da9eeb83d..a998bf8bd 100644
--- a/src/core/gpu_hw_d3d11.cpp
+++ b/src/core/gpu_hw_d3d11.cpp
@@ -265,11 +265,12 @@ bool GPU_HW_D3D11::CreateTextureBuffer()
 
 bool GPU_HW_D3D11::CreateBatchInputLayout()
 {
-  static constexpr std::array<D3D11_INPUT_ELEMENT_DESC, 4> attributes = {
+  static constexpr std::array<D3D11_INPUT_ELEMENT_DESC, 5> attributes = {
     {{"ATTR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, offsetof(BatchVertex, x), D3D11_INPUT_PER_VERTEX_DATA, 0},
      {"ATTR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, offsetof(BatchVertex, color), D3D11_INPUT_PER_VERTEX_DATA, 0},
      {"ATTR", 2, DXGI_FORMAT_R32_UINT, 0, offsetof(BatchVertex, u), D3D11_INPUT_PER_VERTEX_DATA, 0},
-     {"ATTR", 3, DXGI_FORMAT_R32_UINT, 0, offsetof(BatchVertex, texpage), D3D11_INPUT_PER_VERTEX_DATA, 0}}};
+     {"ATTR", 3, DXGI_FORMAT_R32_UINT, 0, offsetof(BatchVertex, texpage), D3D11_INPUT_PER_VERTEX_DATA, 0},
+     {"ATTR", 4, DXGI_FORMAT_R8G8B8A8_UNORM, 0, offsetof(BatchVertex, uv_limits), D3D11_INPUT_PER_VERTEX_DATA, 0}}};
 
   // we need a vertex shader...
   GPU_HW_ShaderGen shadergen(m_host_display->GetRenderAPI(), m_resolution_scale, m_true_color, m_scaled_dithering,
diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp
index dd1e943a6..65ed75857 100644
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@@ -297,12 +297,15 @@ bool GPU_HW_OpenGL::CreateVertexBuffer()
   glEnableVertexAttribArray(1);
   glEnableVertexAttribArray(2);
   glEnableVertexAttribArray(3);
+  glEnableVertexAttribArray(4);
   glVertexAttribPointer(0, 4, GL_FLOAT, false, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, x)));
   glVertexAttribPointer(1, 4, GL_UNSIGNED_BYTE, true, sizeof(BatchVertex),
                         reinterpret_cast<void*>(offsetof(BatchVertex, color)));
   glVertexAttribIPointer(2, 1, GL_UNSIGNED_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, u)));
   glVertexAttribIPointer(3, 1, GL_UNSIGNED_INT, sizeof(BatchVertex),
                          reinterpret_cast<void*>(offsetof(BatchVertex, texpage)));
+  glVertexAttribPointer(4, 4, GL_UNSIGNED_BYTE, true, sizeof(BatchVertex),
+                        reinterpret_cast<void*>(offsetof(BatchVertex, uv_limits)));
   glBindVertexArray(0);
 
   glGenVertexArrays(1, &m_attributeless_vao_id);
@@ -367,6 +370,7 @@ bool GPU_HW_OpenGL::CompilePrograms()
               {
                 prog.BindAttribute(2, "a_texcoord");
                 prog.BindAttribute(3, "a_texpage");
+                prog.BindAttribute(4, "a_uv_limits");
               }
 
               if (!IsGLES() || m_supports_dual_source_blend)
diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp
index d966e5ab8..06e4bfd33 100644
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@@ -508,8 +508,9 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool upsc
   const char* output_block_suffix = upscaled_lines ? "VS" : "";
   if (textured)
   {
-    DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1,
-                            {{"nointerpolation", "uint4 v_texpage"}}, false, output_block_suffix);
+    DeclareVertexEntryPoint(
+      ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1,
+      {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float4 v_uv_limits"}}, false, output_block_suffix);
   }
   else
   {
@@ -557,6 +558,8 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool upsc
     v_texpage.y = ((a_texpage >> 4) & 1u) * 256u * RESOLUTION_SCALE;
     v_texpage.z = ((a_texpage >> 16) & 63u) * 16u * RESOLUTION_SCALE;
     v_texpage.w = ((a_texpage >> 22) & 511u) * RESOLUTION_SCALE;
+
+    v_uv_limits = a_uv_limits * float4(255.0, 255.0, 255.0, 255.0);
   #endif
 }
 )";
@@ -658,13 +661,7 @@ uint2 FloatToIntegerCoords(float2 coords)
 float4 SampleFromVRAM(uint4 texpage, float2 coords)
 {
   #if PALETTE
-    // We can't currently use upscaled coordinate for palettes because of how they're packed.
-    // Not that it would be any benefit anyway, render-to-texture effects don't use palettes.
-    #if !TEXTURE_FILTERING
-      coords /= float2(RESOLUTION_SCALE, RESOLUTION_SCALE);
-    #endif
     uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
-
     uint2 index_coord = icoord;
     #if PALETTE_4_BIT
       index_coord.x /= 4u;
@@ -698,12 +695,43 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
     return SAMPLE_TEXTURE(samp0, float2(direct_icoord) * RCP_VRAM_SIZE);
   #endif
 }
+
+void BilinearSampleFromVRAM(uint4 texpage, float2 coords, float4 uv_limits,
+                            out float4 texcol, out float ialpha)
+{
+  // Compute the coordinates of the four texels we will be interpolating between.
+  // Clamp this to the triangle texture coordinates.
+  float2 texel_top_left = frac(coords) - float2(0.5, 0.5);
+  float2 texel_offset = sign(texel_top_left);
+  float4 fcoords = max(coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y),
+                       float4(0.0, 0.0, 0.0, 0.0));
+
+  // Load four texels.
+  float4 s00 = SampleFromVRAM(texpage, clamp(fcoords.xy, uv_limits.xy, uv_limits.zw));
+  float4 s10 = SampleFromVRAM(texpage, clamp(fcoords.zy, uv_limits.xy, uv_limits.zw));
+  float4 s01 = SampleFromVRAM(texpage, clamp(fcoords.xw, uv_limits.xy, uv_limits.zw));
+  float4 s11 = SampleFromVRAM(texpage, clamp(fcoords.zw, uv_limits.xy, uv_limits.zw));
+
+  // Compute alpha from how many texels aren't pixel color 0000h.
+  float a00 = float(VECTOR_NEQ(s00, TRANSPARENT_PIXEL_COLOR));
+  float a10 = float(VECTOR_NEQ(s10, TRANSPARENT_PIXEL_COLOR));
+  float a01 = float(VECTOR_NEQ(s01, TRANSPARENT_PIXEL_COLOR));
+  float a11 = float(VECTOR_NEQ(s11, TRANSPARENT_PIXEL_COLOR));
+
+  // Bilinearly interpolate.
+  float2 weights = abs(texel_top_left);
+  texcol = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y);
+  ialpha = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y);
+}
+
 #endif
 )";
 
   if (textured)
   {
-    DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "uint4 v_texpage"}}, true, use_dual_source ? 2 : 1, true);
+    DeclareFragmentEntryPoint(ss, 1, 1,
+                              {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float4 v_uv_limits"}}, true,
+                              use_dual_source ? 2 : 1, true);
   }
   else
   {
@@ -725,48 +753,35 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
   #endif
 
   #if TEXTURED
+    float2 coords = v_tex0;
+    float4 uv_limits = v_uv_limits;
+    float4 texcol;
+
+    // We can't currently use upscaled coordinate for palettes because of how they're packed.
+    // Not that it would be any benefit anyway, render-to-texture effects don't use palettes.
+    #if PALETTE
+      coords /= float2(RESOLUTION_SCALE, RESOLUTION_SCALE);
+    #else
+      uv_limits *= float4(RESOLUTION_SCALE, RESOLUTION_SCALE, RESOLUTION_SCALE, RESOLUTION_SCALE);
+    #endif
+
     #if TEXTURE_FILTERING
-      // Compute the coordinates of the four texels we will be interpolating between.
-      // TODO: Find some way to clamp this to the triangle texture coordinates?
-      float2 downscaled_coords = v_tex0;
-      #if PALETTE
-        downscaled_coords /= float2(RESOLUTION_SCALE, RESOLUTION_SCALE);
-      #endif
-      float2 texel_top_left = frac(downscaled_coords) - float2(0.5, 0.5);
-      float2 texel_offset = sign(texel_top_left);
-      float4 fcoords = max(downscaled_coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y),
-                           float4(0.0, 0.0, 0.0, 0.0));
-
-      // Load four texels.
-      float4 s00 = SampleFromVRAM(v_texpage, fcoords.xy);
-      float4 s10 = SampleFromVRAM(v_texpage, fcoords.zy);
-      float4 s01 = SampleFromVRAM(v_texpage, fcoords.xw);
-      float4 s11 = SampleFromVRAM(v_texpage, fcoords.zw);
-
-      // Compute alpha from how many texels aren't pixel color 0000h.
-      float a00 = float(VECTOR_NEQ(s00, TRANSPARENT_PIXEL_COLOR));
-      float a10 = float(VECTOR_NEQ(s10, TRANSPARENT_PIXEL_COLOR));
-      float a01 = float(VECTOR_NEQ(s01, TRANSPARENT_PIXEL_COLOR));
-      float a11 = float(VECTOR_NEQ(s11, TRANSPARENT_PIXEL_COLOR));
-
-      // Bilinearly interpolate.
-      float2 weights = abs(texel_top_left);
-      float4 texcol = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y);
-      ialpha = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y);
+      BilinearSampleFromVRAM(v_texpage, coords, uv_limits, texcol, ialpha);
       if (ialpha < 0.5)
         discard;
 
       texcol.rgb /= float3(ialpha, ialpha, ialpha);
       semitransparent = (texcol.a != 0.0);
     #else
-      float4 texcol = SampleFromVRAM(v_texpage, v_tex0);
+      texcol = SampleFromVRAM(v_texpage, clamp(coords, uv_limits.xy, uv_limits.zw));
       if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR))
         discard;
 
-      semitransparent = (texcol.a != 0.0);
       ialpha = 1.0;
     #endif
 
+    semitransparent = (texcol.a != 0.0);
+
     // If not using true color, truncate the framebuffer colors to 5-bit.
     #if !TRUE_COLOR
       icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)) >> 3;
diff --git a/src/core/gpu_hw_vulkan.cpp b/src/core/gpu_hw_vulkan.cpp
index c7c8f9149..30fd0c379 100644
--- a/src/core/gpu_hw_vulkan.cpp
+++ b/src/core/gpu_hw_vulkan.cpp
@@ -646,6 +646,7 @@ bool GPU_HW_Vulkan::CompilePipelines()
               {
                 gpbuilder.AddVertexAttribute(2, 0, VK_FORMAT_R32_UINT, offsetof(BatchVertex, u));
                 gpbuilder.AddVertexAttribute(3, 0, VK_FORMAT_R32_UINT, offsetof(BatchVertex, texpage));
+                gpbuilder.AddVertexAttribute(4, 0, VK_FORMAT_R8G8B8A8_UNORM, offsetof(BatchVertex, uv_limits));
               }
 
               gpbuilder.SetPrimitiveTopology(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);