diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index 51d617382..8e4bc9740 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -215,6 +215,32 @@ void GPU_HW::HandleFlippedQuadTextureCoordinates(BatchVertex* vertices)
   }
 }
 
+bool GPU_HW::AreUVLimitsNeeded()
+{
+  // Per-polygon UV limits are only worth computing while texture filtering or PGXP is switched on.
+  return g_settings.gpu_texture_filtering || g_settings.gpu_pgxp_enable;
+}
+
+void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices)
+{
+  // Bounding box of the polygon's texture coordinates, seeded from the first vertex.
+  u16 lo_u = vertices[0].u, hi_u = lo_u, lo_v = vertices[0].v, hi_v = lo_v;
+  for (const BatchVertex* vert = vertices + 1; vert < vertices + num_vertices; ++vert)
+  {
+    lo_u = std::min<u16>(lo_u, vert->u);
+    hi_u = std::max<u16>(hi_u, vert->u);
+    lo_v = std::min<u16>(lo_v, vert->v);
+    hi_v = std::max<u16>(hi_v, vert->v);
+  }
+
+  // Pull each maximum in by one texel, unless that axis is degenerate (min == max).
+  if (lo_u != hi_u) { --hi_u; }
+  if (lo_v != hi_v) { --hi_v; }
+
+  for (BatchVertex* vert = vertices; vert < vertices + num_vertices; ++vert)
+    vert->SetUVLimits(lo_u, hi_u, lo_v, hi_v);
+}
+
 void GPU_HW::DrawLine(float x0, float y0, u32 col0, float x1, float y1, u32 col1, float depth)
 {
   const float dx = x1 - x0;
@@ -223,10 +249,10 @@ void GPU_HW::DrawLine(float x0, float y0, u32 col0, float x1, float y1, u32 col1
   if (dx == 0.0f && dy == 0.0f)
   {
     // Degenerate, render a point.
-    output[0].Set(x0, y0, depth, 1.0f, col0, 0, 0);
-    output[1].Set(x0 + 1.0f, y0, depth, 1.0f, col0, 0, 0);
-    output[2].Set(x1, y1 + 1.0f, depth, 1.0f, col0, 0, 0);
-    output[3].Set(x1 + 1.0f, y1 + 1.0f, depth, 1.0f, col0, 0, 0);
+    output[0].Set(x0, y0, depth, 1.0f, col0, 0, 0, 0);
+    output[1].Set(x0 + 1.0f, y0, depth, 1.0f, col0, 0, 0, 0);
+    output[2].Set(x1, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
+    output[3].Set(x1 + 1.0f, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
   }
   else
   {
@@ -290,10 +316,10 @@ void GPU_HW::DrawLine(float x0, float y0, u32 col0, float x1, float y1, u32 col1
     const float ox1 = x1 + pad_x1;
     const float oy1 = y1 + pad_y1;
 
-    output[0].Set(ox0, oy0, depth, 1.0f, col0, 0, 0);
-    output[1].Set(ox0 + fill_dx, oy0 + fill_dy, depth, 1.0f, col0, 0, 0);
-    output[2].Set(ox1, oy1, depth, 1.0f, col1, 0, 0);
-    output[3].Set(ox1 + fill_dx, oy1 + fill_dy, depth, 1.0f, col1, 0, 0);
+    output[0].Set(ox0, oy0, depth, 1.0f, col0, 0, 0, 0);
+    output[1].Set(ox0 + fill_dx, oy0 + fill_dy, depth, 1.0f, col0, 0, 0, 0);
+    output[2].Set(ox1, oy1, depth, 1.0f, col1, 0, 0, 0);
+    output[3].Set(ox1 + fill_dx, oy1 + fill_dy, depth, 1.0f, col1, 0, 0, 0);
   }
 
   AddVertex(output[0]);
@@ -339,7 +365,7 @@ void GPU_HW::LoadVertices()
         native_vertex_positions[i][0] = native_x;
         native_vertex_positions[i][1] = native_y;
         vertices[i].Set(static_cast<float>(native_x), static_cast<float>(native_y), depth, 1.0f, color, texpage,
-                        texcoord);
+                        texcoord, 0xFFFF0000u);
 
         if (pgxp)
         {
@@ -357,6 +383,9 @@ void GPU_HW::LoadVertices()
       if (rc.quad_polygon && m_resolution_scale > 1)
         HandleFlippedQuadTextureCoordinates(vertices.data());
 
+      if (AreUVLimitsNeeded())
+        ComputePolygonUVLimits(vertices.data(), num_vertices);
+
       if (!IsDrawingAreaIsValid())
         return;
 
@@ -490,14 +519,15 @@ void GPU_HW::LoadVertices()
           const float quad_start_x = static_cast<float>(pos_x + x_offset);
           const float quad_end_x = quad_start_x + static_cast<float>(quad_width);
           const u16 tex_right = tex_left + static_cast<u16>(quad_width);
+          const u32 uv_limits = BatchVertex::PackUVLimits(tex_left, tex_right - 1, tex_top, tex_bottom - 1);
 
-          AddNewVertex(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, tex_left, tex_top);
-          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top);
-          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom);
+          AddNewVertex(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, tex_left, tex_top, uv_limits);
+          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top, uv_limits);
+          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom, uv_limits);
 
-          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom);
-          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top);
-          AddNewVertex(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, tex_right, tex_bottom);
+          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom, uv_limits);
+          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top, uv_limits);
+          AddNewVertex(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, tex_right, tex_bottom, uv_limits);
 
           x_offset += quad_width;
           tex_left = 0;
@@ -628,6 +658,8 @@ void GPU_HW::LoadVertices()
       UnreachableCode();
       break;
   }
+
+  FlushRender();
 }
 
 void GPU_HW::CalcScissorRect(int* left, int* top, int* right, int* bottom)
diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h
index 018342a7d..14aa2ce38 100644
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@@ -58,13 +58,16 @@ protected:
     u32 texpage;
     u16 u; // 16-bit texcoords are needed for 256 extent rectangles
     u16 v;
+    u32 uv_limits;
 
-    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 packed_texcoord)
+    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 packed_texcoord,
+                           u32 uv_limits_)
     {
-      Set(x_, y_, z_, w_, color_, texpage_, packed_texcoord & 0xFF, (packed_texcoord >> 8));
+      Set(x_, y_, z_, w_, color_, texpage_, packed_texcoord & 0xFF, (packed_texcoord >> 8), uv_limits_);
     }
 
-    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 u_, u16 v_)
+    ALWAYS_INLINE void Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 u_, u16 v_,
+                           u32 uv_limits_)
     {
       x = x_;
       y = y_;
@@ -74,6 +77,17 @@ protected:
       texpage = texpage_;
       u = u_;
       v = v_;
+      uv_limits = uv_limits_;
+    }
+
+    ALWAYS_INLINE static u32 PackUVLimits(u32 min_u, u32 max_u, u32 min_v, u32 max_v)
+    { // Byte layout (low to high): min_u, min_v, max_u, max_v. Masked so no field can spill into another.
+      return (min_u & 0xFFu) | ((min_v & 0xFFu) << 8) | ((max_u & 0xFFu) << 16) | ((max_v & 0xFFu) << 24);
+    }
+
+    ALWAYS_INLINE void SetUVLimits(u32 min_u, u32 max_u, u32 min_v, u32 max_v)
+    {
+      this->uv_limits = BatchVertex::PackUVLimits(min_u, max_u, min_v, max_v); // store the packed bounds
+    }
   };
 
@@ -236,6 +250,10 @@ protected:
   /// Handles quads with flipped texture coordinate directions.
   static void HandleFlippedQuadTextureCoordinates(BatchVertex* vertices);
 
+  /// Computes polygon U/V boundaries.
+  static void ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices);
+  static bool AreUVLimitsNeeded();
+
   HeapArray<u16, VRAM_WIDTH * VRAM_HEIGHT> m_vram_shadow;
 
   BatchVertex* m_batch_start_vertex_ptr = nullptr;
diff --git a/src/core/gpu_hw_d3d11.cpp b/src/core/gpu_hw_d3d11.cpp
index da9eeb83d..a998bf8bd 100644
--- a/src/core/gpu_hw_d3d11.cpp
+++ b/src/core/gpu_hw_d3d11.cpp
@@ -265,11 +265,12 @@ bool GPU_HW_D3D11::CreateTextureBuffer()
 
 bool GPU_HW_D3D11::CreateBatchInputLayout()
 {
-  static constexpr std::array<D3D11_INPUT_ELEMENT_DESC, 4> attributes = {
+  static constexpr std::array<D3D11_INPUT_ELEMENT_DESC, 5> attributes = {
     {{"ATTR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, offsetof(BatchVertex, x), D3D11_INPUT_PER_VERTEX_DATA, 0},
      {"ATTR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, offsetof(BatchVertex, color), D3D11_INPUT_PER_VERTEX_DATA, 0},
      {"ATTR", 2, DXGI_FORMAT_R32_UINT, 0, offsetof(BatchVertex, u), D3D11_INPUT_PER_VERTEX_DATA, 0},
-     {"ATTR", 3, DXGI_FORMAT_R32_UINT, 0, offsetof(BatchVertex, texpage), D3D11_INPUT_PER_VERTEX_DATA, 0}}};
+     {"ATTR", 3, DXGI_FORMAT_R32_UINT, 0, offsetof(BatchVertex, texpage), D3D11_INPUT_PER_VERTEX_DATA, 0},
+     {"ATTR", 4, DXGI_FORMAT_R8G8B8A8_UNORM, 0, offsetof(BatchVertex, uv_limits), D3D11_INPUT_PER_VERTEX_DATA, 0}}};
 
   // we need a vertex shader...
   GPU_HW_ShaderGen shadergen(m_host_display->GetRenderAPI(), m_resolution_scale, m_true_color, m_scaled_dithering,
diff --git a/src/core/gpu_hw_d3d11.h b/src/core/gpu_hw_d3d11.h
index 7a0b45e03..ae12c6c22 100644
--- a/src/core/gpu_hw_d3d11.h
+++ b/src/core/gpu_hw_d3d11.h
@@ -95,6 +95,7 @@ private:
   ComPtr<ID3D11ShaderResourceView> m_texture_stream_buffer_srv_r16ui;
 
   ComPtr<ID3D11RasterizerState> m_cull_none_rasterizer_state;
+  ComPtr<ID3D11RasterizerState> m_wireframe_rasterizer_state;
 
   ComPtr<ID3D11DepthStencilState> m_depth_disabled_state;
   ComPtr<ID3D11DepthStencilState> m_depth_test_always_state;
diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp
index dd1e943a6..65ed75857 100644
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@@ -297,12 +297,15 @@ bool GPU_HW_OpenGL::CreateVertexBuffer()
   glEnableVertexAttribArray(1);
   glEnableVertexAttribArray(2);
   glEnableVertexAttribArray(3);
+  glEnableVertexAttribArray(4);
   glVertexAttribPointer(0, 4, GL_FLOAT, false, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, x)));
   glVertexAttribPointer(1, 4, GL_UNSIGNED_BYTE, true, sizeof(BatchVertex),
                         reinterpret_cast<void*>(offsetof(BatchVertex, color)));
   glVertexAttribIPointer(2, 1, GL_UNSIGNED_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, u)));
   glVertexAttribIPointer(3, 1, GL_UNSIGNED_INT, sizeof(BatchVertex),
                          reinterpret_cast<void*>(offsetof(BatchVertex, texpage)));
+  glVertexAttribPointer(4, 4, GL_UNSIGNED_BYTE, true, sizeof(BatchVertex),
+                        reinterpret_cast<void*>(offsetof(BatchVertex, uv_limits)));
   glBindVertexArray(0);
 
   glGenVertexArrays(1, &m_attributeless_vao_id);
@@ -367,6 +370,7 @@ bool GPU_HW_OpenGL::CompilePrograms()
               {
                 prog.BindAttribute(2, "a_texcoord");
                 prog.BindAttribute(3, "a_texpage");
+                prog.BindAttribute(4, "a_uv_limits");
               }
 
               if (!IsGLES() || m_supports_dual_source_blend)
diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp
index d966e5ab8..06e4bfd33 100644
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@@ -508,8 +508,9 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool upsc
   const char* output_block_suffix = upscaled_lines ? "VS" : "";
   if (textured)
   {
-    DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1,
-                            {{"nointerpolation", "uint4 v_texpage"}}, false, output_block_suffix);
+    DeclareVertexEntryPoint(
+      ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1,
+      {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float4 v_uv_limits"}}, false, output_block_suffix);
   }
   else
   {
@@ -557,6 +558,8 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool upsc
     v_texpage.y = ((a_texpage >> 4) & 1u) * 256u * RESOLUTION_SCALE;
     v_texpage.z = ((a_texpage >> 16) & 63u) * 16u * RESOLUTION_SCALE;
     v_texpage.w = ((a_texpage >> 22) & 511u) * RESOLUTION_SCALE;
+
+    v_uv_limits = a_uv_limits * float4(255.0, 255.0, 255.0, 255.0);
   #endif
 }
 )";
@@ -658,13 +661,7 @@ uint2 FloatToIntegerCoords(float2 coords)
 float4 SampleFromVRAM(uint4 texpage, float2 coords)
 {
   #if PALETTE
-    // We can't currently use upscaled coordinate for palettes because of how they're packed.
-    // Not that it would be any benefit anyway, render-to-texture effects don't use palettes.
-    #if !TEXTURE_FILTERING
-      coords /= float2(RESOLUTION_SCALE, RESOLUTION_SCALE);
-    #endif
     uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
-
     uint2 index_coord = icoord;
     #if PALETTE_4_BIT
       index_coord.x /= 4u;
@@ -698,12 +695,43 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
     return SAMPLE_TEXTURE(samp0, float2(direct_icoord) * RCP_VRAM_SIZE);
   #endif
 }
+
+void BilinearSampleFromVRAM(uint4 texpage, float2 coords, float4 uv_limits,
+                            out float4 texcol, out float ialpha)
+{
+  // Compute the coordinates of the four texels we will be interpolating between.
+  // Clamp this to the triangle texture coordinates.
+  float2 texel_top_left = frac(coords) - float2(0.5, 0.5);
+  float2 texel_offset = sign(texel_top_left);
+  float4 fcoords = max(coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y),
+                        float4(0.0, 0.0, 0.0, 0.0));
+
+  // Load four texels.
+  float4 s00 = SampleFromVRAM(texpage, clamp(fcoords.xy, uv_limits.xy, uv_limits.zw));
+  float4 s10 = SampleFromVRAM(texpage, clamp(fcoords.zy, uv_limits.xy, uv_limits.zw));
+  float4 s01 = SampleFromVRAM(texpage, clamp(fcoords.xw, uv_limits.xy, uv_limits.zw));
+  float4 s11 = SampleFromVRAM(texpage, clamp(fcoords.zw, uv_limits.xy, uv_limits.zw));
+
+  // Compute alpha from how many texels aren't pixel color 0000h.
+  float a00 = float(VECTOR_NEQ(s00, TRANSPARENT_PIXEL_COLOR));
+  float a10 = float(VECTOR_NEQ(s10, TRANSPARENT_PIXEL_COLOR));
+  float a01 = float(VECTOR_NEQ(s01, TRANSPARENT_PIXEL_COLOR));
+  float a11 = float(VECTOR_NEQ(s11, TRANSPARENT_PIXEL_COLOR));
+
+  // Bilinearly interpolate.
+  float2 weights = abs(texel_top_left);
+  texcol = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y);
+  ialpha = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y);
+}
+
 #endif
 )";
 
   if (textured)
   {
-    DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "uint4 v_texpage"}}, true, use_dual_source ? 2 : 1, true);
+    DeclareFragmentEntryPoint(ss, 1, 1,
+                              {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float4 v_uv_limits"}}, true,
+                              use_dual_source ? 2 : 1, true);
   }
   else
   {
@@ -725,48 +753,35 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
   #endif
 
   #if TEXTURED
+    float2 coords = v_tex0;
+    float4 uv_limits = v_uv_limits;
+    float4 texcol;
+
+    // We can't currently use upscaled coordinate for palettes because of how they're packed.
+    // Not that it would be any benefit anyway, render-to-texture effects don't use palettes.
+    #if PALETTE
+      coords /= float2(RESOLUTION_SCALE, RESOLUTION_SCALE);
+    #else
+      uv_limits *= float4(RESOLUTION_SCALE, RESOLUTION_SCALE, RESOLUTION_SCALE, RESOLUTION_SCALE);
+    #endif
+
     #if TEXTURE_FILTERING
-      // Compute the coordinates of the four texels we will be interpolating between.
-      // TODO: Find some way to clamp this to the triangle texture coordinates?
-      float2 downscaled_coords = v_tex0;
-      #if PALETTE
-        downscaled_coords /= float2(RESOLUTION_SCALE, RESOLUTION_SCALE);
-      #endif
-      float2 texel_top_left = frac(downscaled_coords) - float2(0.5, 0.5);
-      float2 texel_offset = sign(texel_top_left);
-      float4 fcoords = max(downscaled_coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y),
-                           float4(0.0, 0.0, 0.0, 0.0));
-
-      // Load four texels.
-      float4 s00 = SampleFromVRAM(v_texpage, fcoords.xy);
-      float4 s10 = SampleFromVRAM(v_texpage, fcoords.zy);
-      float4 s01 = SampleFromVRAM(v_texpage, fcoords.xw);
-      float4 s11 = SampleFromVRAM(v_texpage, fcoords.zw);
-
-      // Compute alpha from how many texels aren't pixel color 0000h.
-      float a00 = float(VECTOR_NEQ(s00, TRANSPARENT_PIXEL_COLOR));
-      float a10 = float(VECTOR_NEQ(s10, TRANSPARENT_PIXEL_COLOR));
-      float a01 = float(VECTOR_NEQ(s01, TRANSPARENT_PIXEL_COLOR));
-      float a11 = float(VECTOR_NEQ(s11, TRANSPARENT_PIXEL_COLOR));
-
-      // Bilinearly interpolate.
-      float2 weights = abs(texel_top_left);
-      float4 texcol = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y);
-      ialpha = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y);
+      BilinearSampleFromVRAM(v_texpage, coords, uv_limits, texcol, ialpha);
       if (ialpha < 0.5)
         discard;
 
       texcol.rgb /= float3(ialpha, ialpha, ialpha);
       semitransparent = (texcol.a != 0.0);
     #else
-      float4 texcol = SampleFromVRAM(v_texpage, v_tex0);
+      texcol = SampleFromVRAM(v_texpage, clamp(coords, uv_limits.xy, uv_limits.zw));
       if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR))
         discard;
 
-      semitransparent = (texcol.a != 0.0);
       ialpha = 1.0;
     #endif
 
+    semitransparent = (texcol.a != 0.0);
+
     // If not using true color, truncate the framebuffer colors to 5-bit.
     #if !TRUE_COLOR
       icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)) >> 3;
diff --git a/src/core/gpu_hw_vulkan.cpp b/src/core/gpu_hw_vulkan.cpp
index c7c8f9149..30fd0c379 100644
--- a/src/core/gpu_hw_vulkan.cpp
+++ b/src/core/gpu_hw_vulkan.cpp
@@ -646,6 +646,7 @@ bool GPU_HW_Vulkan::CompilePipelines()
               {
                 gpbuilder.AddVertexAttribute(2, 0, VK_FORMAT_R32_UINT, offsetof(BatchVertex, u));
                 gpbuilder.AddVertexAttribute(3, 0, VK_FORMAT_R32_UINT, offsetof(BatchVertex, texpage));
+                gpbuilder.AddVertexAttribute(4, 0, VK_FORMAT_R8G8B8A8_UNORM, offsetof(BatchVertex, uv_limits));
               }
 
               gpbuilder.SetPrimitiveTopology(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);