HostDisplay: Add GPU usage statistics

2025-06-21 07:25:39 -04:00 · 2022-09-03 14:15:15 +10:00
parent baf9f641ce
commit bec0d6e7df
22 changed files with 698 additions and 101 deletions
--- a/src/common/d3d12/context.cpp
+++ b/src/common/d3d12/context.cpp
@ -138,7 +138,7 @@ bool Context::Create(IDXGIFactory* dxgi_factory, u32 adapter_index, bool enable_
  if (!g_d3d12_context->CreateDevice(dxgi_factory, adapter_index, enable_debug_layer) ||
      !g_d3d12_context->CreateCommandQueue() || !g_d3d12_context->CreateFence() ||
      !g_d3d12_context->CreateDescriptorHeaps() || !g_d3d12_context->CreateCommandLists() ||
-      !g_d3d12_context->CreateTextureStreamBuffer())
+      !g_d3d12_context->CreateTimestampQuery() || !g_d3d12_context->CreateTextureStreamBuffer())
  {
    Destroy();
    return false;
@ -326,20 +326,64 @@ void Context::MoveToNextCommandList()
  // We may have to wait if this command list hasn't finished on the GPU.
  CommandListResources& res = m_command_lists[m_current_command_list];
  WaitForFence(res.ready_fence_value);
+  res.ready_fence_value = m_current_fence_value;

  // Begin command list.
  res.command_allocator->Reset();
  res.command_list->Reset(res.command_allocator.Get(), nullptr);
+
+  if (res.has_timestamp_query)
+  {
+    // readback timestamp from the last time this cmdlist was used.
+    // we don't need to worry about disjoint in dx12, the frequency is reliable within a single cmdlist.
+    const u32 offset = (m_current_command_list * (sizeof(u64) * NUM_TIMESTAMP_QUERIES_PER_CMDLIST));
+    const D3D12_RANGE read_range = {offset, offset + (sizeof(u64) * NUM_TIMESTAMP_QUERIES_PER_CMDLIST)};
+    void* map;
+    HRESULT hr = m_timestamp_query_buffer->Map(0, &read_range, &map);
+    if (SUCCEEDED(hr))
+    {
+      u64 timestamps[2];
+      std::memcpy(timestamps, static_cast<const u8*>(map) + offset, sizeof(timestamps));
+      m_accumulated_gpu_time +=
+        static_cast<float>(static_cast<double>(timestamps[1] - timestamps[0]) / m_timestamp_frequency);
+
+      const D3D12_RANGE write_range = {};
+      m_timestamp_query_buffer->Unmap(0, &write_range);
+    }
+    else
+    {
+      Log_WarningPrintf("Map() for timestamp query failed: %08X", hr);
+    }
+  }
+
+  res.has_timestamp_query = m_gpu_timing_enabled;
+  if (m_gpu_timing_enabled)
+  {
+    res.command_list->EndQuery(m_timestamp_query_heap.Get(), D3D12_QUERY_TYPE_TIMESTAMP,
+                               m_current_command_list * NUM_TIMESTAMP_QUERIES_PER_CMDLIST);
+  }
+
  res.command_list->SetDescriptorHeaps(static_cast<UINT>(m_gpu_descriptor_heaps.size()), m_gpu_descriptor_heaps.data());
-  res.ready_fence_value = m_current_fence_value;
 }

 void Context::ExecuteCommandList(bool wait_for_completion)
 {
  CommandListResources& res = m_command_lists[m_current_command_list];
+  HRESULT hr;
+
+  if (res.has_timestamp_query)
+  {
+    // write the timestamp back at the end of the cmdlist
+    res.command_list->EndQuery(m_timestamp_query_heap.Get(), D3D12_QUERY_TYPE_TIMESTAMP,
+                               (m_current_command_list * NUM_TIMESTAMP_QUERIES_PER_CMDLIST) + 1);
+    res.command_list->ResolveQueryData(m_timestamp_query_heap.Get(), D3D12_QUERY_TYPE_TIMESTAMP,
+                                       m_current_command_list * NUM_TIMESTAMP_QUERIES_PER_CMDLIST,
+                                       NUM_TIMESTAMP_QUERIES_PER_CMDLIST, m_timestamp_query_buffer.Get(),
+                                       m_current_command_list * (sizeof(u64) * NUM_TIMESTAMP_QUERIES_PER_CMDLIST));
+  }

  // Close and queue command list.
-  HRESULT hr = res.command_list->Close();
+  hr = res.command_list->Close();
  AssertMsg(SUCCEEDED(hr), "Close command list");
  const std::array<ID3D12CommandList*, 1> execute_lists{res.command_list.Get()};
  m_command_queue->ExecuteCommandLists(static_cast<UINT>(execute_lists.size()), execute_lists.data());
@ -391,6 +435,8 @@ void Context::DestroyResources()
 {
  ExecuteCommandList(true);

+  m_timestamp_query_buffer.Reset();
+  m_timestamp_query_heap.Reset();
  m_texture_stream_buffer.Destroy(false);
  m_descriptor_heap_manager.Free(&m_null_srv_descriptor);
  m_sampler_heap_manager.Destroy();
@ -450,4 +496,61 @@ void Context::WaitForGPUIdle()
    index = (index + 1) % NUM_COMMAND_LISTS;
  }
 }
+/**
+ * Creates the D3D12 objects used for GPU timing: a timestamp query heap with
+ * NUM_TIMESTAMP_QUERIES_PER_CMDLIST slots for each of the NUM_COMMAND_LISTS command lists, and a buffer in a
+ * READBACK heap holding one u64 per slot. Also stores the command queue's timestamp frequency, divided by
+ * 1000, in m_timestamp_frequency.
+ *
+ * Returns false (after logging the HRESULT) if any of the three calls fails, true otherwise.
+ */
+bool Context::CreateTimestampQuery()
+{
+  constexpr u32 QUERY_COUNT = NUM_TIMESTAMP_QUERIES_PER_CMDLIST * NUM_COMMAND_LISTS;
+  constexpr u32 BUFFER_SIZE = sizeof(u64) * QUERY_COUNT;
+  // The query heap and the readback buffer below are both sized for QUERY_COUNT entries.
+  const D3D12_QUERY_HEAP_DESC desc = {D3D12_QUERY_HEAP_TYPE_TIMESTAMP, QUERY_COUNT};
+  HRESULT hr = m_device->CreateQueryHeap(&desc, IID_PPV_ARGS(m_timestamp_query_heap.ReleaseAndGetAddressOf()));
+  if (FAILED(hr))
+  {
+    Log_ErrorPrintf("CreateQueryHeap() for timestamp failed with %08X", hr);
+    return false;
+  }
+  /* This buffer is the destination of ResolveQueryData() in ExecuteCommandList() and is mapped for reading in
+   * MoveToNextCommandList(); it is created directly in the COPY_DEST state.
+   * NOTE(review): the error text below names CreateResource(), but the call is CreateCommittedResource(). */
+  const D3D12_HEAP_PROPERTIES heap_properties = {D3D12_HEAP_TYPE_READBACK};
+  const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER,
+                                             0,
+                                             BUFFER_SIZE,
+                                             1,
+                                             1,
+                                             1,
+                                             DXGI_FORMAT_UNKNOWN,
+                                             {1, 0},
+                                             D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
+                                             D3D12_RESOURCE_FLAG_NONE};
+  hr = m_device->CreateCommittedResource(&heap_properties, D3D12_HEAP_FLAG_NONE, &resource_desc,
+                                         D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+                                         IID_PPV_ARGS(m_timestamp_query_buffer.ReleaseAndGetAddressOf()));
+  if (FAILED(hr))
+  {
+    Log_ErrorPrintf("CreateResource() for timestamp failed with %08X", hr);
+    return false;
+  }
+  /* NOTE(review): presumably GetTimestampFrequency() reports ticks per second, in which case the value stored
+   * below is ticks per millisecond and a tick delta divided by it is a time in milliseconds - confirm. */
+  u64 frequency;
+  hr = m_command_queue->GetTimestampFrequency(&frequency);
+  if (FAILED(hr))
+  {
+    Log_ErrorPrintf("GetTimestampFrequency() failed: %08X", hr);
+    return false;
+  }
+
+  m_timestamp_frequency = static_cast<double>(frequency) / 1000.0;
+  return true;
+}
+/**
+ * Returns the GPU time that MoveToNextCommandList() has added to m_accumulated_gpu_time since the previous
+ * call, and resets that sum to zero.
+ * NOTE(review): the unit looks like milliseconds (tick deltas are divided by frequency / 1000) - confirm.
+ */
+float Context::GetAndResetAccumulatedGPUTime()
+{
+  const float time = m_accumulated_gpu_time;
+  m_accumulated_gpu_time = 0.0f;
+  return time;
+}
+/**
+ * Turns recording of GPU timestamps on or off. Only the flag is stored here; MoveToNextCommandList() reads it
+ * when it begins a command list, so a change applies from the next command list that is started.
+ */
+void Context::SetEnableGPUTiming(bool enabled)
+{
+  m_gpu_timing_enabled = enabled;
+}
 } // namespace D3D12
--- a/src/common/d3d12/context.h
+++ b/src/common/d3d12/context.h
@ -31,6 +31,9 @@ public:

    // Textures that don't fit into this buffer will be uploaded with a staging buffer.
    TEXTURE_UPLOAD_BUFFER_SIZE = 16 * 1024 * 1024,
+
+    /// Start/End timestamp queries.
+    NUM_TIMESTAMP_QUERIES_PER_CMDLIST = 2,
  };

  ~Context();
@ -92,6 +95,9 @@ public:
  void DeferDescriptorDestruction(DescriptorHeapManager& manager, u32 index);
  void DeferDescriptorDestruction(DescriptorHeapManager& manager, DescriptorHandle* handle);

+  float GetAndResetAccumulatedGPUTime();
+  void SetEnableGPUTiming(bool enabled);
+
 private:
  struct CommandListResources
  {
@ -100,6 +106,7 @@ private:
    std::vector<ID3D12Resource*> pending_resources;
    std::vector<std::pair<DescriptorHeapManager&, u32>> pending_descriptors;
    u64 ready_fence_value = 0;
+    bool has_timestamp_query = false;
  };

  Context();
@ -110,6 +117,7 @@ private:
  bool CreateDescriptorHeaps();
  bool CreateCommandLists();
  bool CreateTextureStreamBuffer();
+  bool CreateTimestampQuery();
  void MoveToNextCommandList();
  void DestroyPendingResources(CommandListResources& cmdlist);
  void DestroyResources();
@ -126,6 +134,12 @@ private:
  std::array<CommandListResources, NUM_COMMAND_LISTS> m_command_lists;
  u32 m_current_command_list = NUM_COMMAND_LISTS - 1;

+  ComPtr<ID3D12QueryHeap> m_timestamp_query_heap;
+  ComPtr<ID3D12Resource> m_timestamp_query_buffer;
+  double m_timestamp_frequency = 0.0;
+  float m_accumulated_gpu_time = 0.0f;
+  bool m_gpu_timing_enabled = false;
+
  DescriptorHeapManager m_descriptor_heap_manager;
  DescriptorHeapManager m_rtv_heap_manager;
  DescriptorHeapManager m_dsv_heap_manager;
--- a/src/common/gl/context.cpp
+++ b/src/common/gl/context.cpp
@ -55,14 +55,27 @@ static bool ShouldPreferESContext()
 #endif
 }

-static void DisableBrokenExtensions(const char* gl_vendor, const char* gl_renderer)
+static void DisableBrokenExtensions(const char* gl_vendor, const char* gl_renderer, const char* gl_version)
 { // Clears GLAD extension flags on drivers where those extensions misbehave; gl_renderer is not read here.
  if (std::strstr(gl_vendor, "ARM"))
  {
    // GL_{EXT,OES}_copy_image seem to be implemented on the CPU in the Mali drivers...
-    Log_VerbosePrintf("Mali driver detected, disabling GL_{EXT,OES}_copy_image");
-    GLAD_GL_EXT_copy_image = 0;
-    GLAD_GL_OES_copy_image = 0;
+    /* Older drivers don't implement timer queries correctly either.
+     * The pattern below expects "OpenGL ES <major>.<minor> v<n>.r<R>p<P>"; major_version receives <R> and
+     * patch_version <P>. A string that does not yield all five fields falls through to the else branch. */
+    int gl_major_version, gl_minor_version, unused_version, major_version, patch_version;
+    if (std::sscanf(gl_version, "OpenGL ES %d.%d v%d.r%dp%d", &gl_major_version, &gl_minor_version, &unused_version,
+                    &major_version, &patch_version) == 5 &&
+        gl_major_version >= 3 && gl_minor_version >= 2 && major_version >= 32)
+    {
+      /* r32p0 and beyond seem okay.
+       * NOTE(review): major and minor are tested separately, so e.g. a "4.0" version string would fail
+       * gl_minor_version >= 2 and land in the else branch - confirm this is acceptable. */
+      Log_VerbosePrintf("Keeping copy_image for driver version '%s'", gl_version);
+    }
+    else
+    {
+      Log_VerbosePrintf("Older Mali driver detected, disabling GL_{EXT,OES}_copy_image, disjoint_timer_query.");
+      GLAD_GL_EXT_copy_image = 0;
+      GLAD_GL_OES_copy_image = 0;
+      GLAD_GL_EXT_disjoint_timer_query = 0;
+    }
  }
 }

@ -173,7 +186,7 @@ std::unique_ptr<GL::Context> Context::Create(const WindowInfo& wi, const Version
  Log_InfoPrintf("GL_VERSION: %s", gl_version);
  Log_InfoPrintf("GL_SHADING_LANGUAGE_VERSION: %s", gl_shading_language_version);

-  DisableBrokenExtensions(gl_vendor, gl_renderer);
+  DisableBrokenExtensions(gl_vendor, gl_renderer, gl_version);

  return context;
 }
--- a/src/common/gl/program.cpp
+++ b/src/common/gl/program.cpp
@ -164,7 +164,7 @@ bool Program::GetBinary(std::vector<u8>* out_data, u32* out_data_format)
  }

  *out_data_format = static_cast<u32>(format);
-  Log_InfoPrintf("Program binary retrieved, %zu bytes, format %u", out_data->size(), *out_data_format);
+  Log_DevPrintf("Program binary retrieved, %zu bytes, format %u", out_data->size(), *out_data_format);
  return true;
 }

--- a/src/common/vulkan/context.cpp
+++ b/src/common/vulkan/context.cpp
@ -622,9 +622,17 @@ bool Context::CreateDevice(VkSurfaceKHR surface, bool enable_validation_layer, c
  // Grab the graphics and present queues.
  vkGetDeviceQueue(m_device, m_graphics_queue_family_index, 0, &m_graphics_queue);
  if (surface)
-  {
    vkGetDeviceQueue(m_device, m_present_queue_family_index, 0, &m_present_queue);
-  }
+
+  m_gpu_timing_supported = (m_device_properties.limits.timestampComputeAndGraphics != 0 &&
+                            queue_family_properties[m_graphics_queue_family_index].timestampValidBits > 0 &&
+                            m_device_properties.limits.timestampPeriod > 0);
+  Log_VerbosePrintf("GPU timing is %s (TS=%u TS valid bits=%u, TS period=%f)",
+                    m_gpu_timing_supported ? "supported" : "not supported",
+                    static_cast<u32>(m_device_properties.limits.timestampComputeAndGraphics),
+                    queue_family_properties[m_graphics_queue_family_index].timestampValidBits,
+                    m_device_properties.limits.timestampPeriod);
+
  return true;
 }

@ -751,6 +759,20 @@ bool Context::CreateGlobalDescriptorPool()
    return false;
  }
  Vulkan::Util::SetObjectName(g_vulkan_context->GetDevice(), m_global_descriptor_pool, "Global Descriptor Pool");
+
+  if (m_gpu_timing_supported)
+  {
+    const VkQueryPoolCreateInfo query_create_info = {
+      VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, nullptr, 0, VK_QUERY_TYPE_TIMESTAMP, NUM_COMMAND_BUFFERS * 2, 0};
+    res = vkCreateQueryPool(m_device, &query_create_info, nullptr, &m_timestamp_query_pool);
+    if (res != VK_SUCCESS)
+    {
+      LOG_VULKAN_ERROR(res, "vkCreateQueryPool failed: ");
+      m_gpu_timing_supported = false;
+      return false;
+    }
+  }
+
  return true;
 }

@ -831,6 +853,19 @@ void Context::WaitForGPUIdle()
  vkDeviceWaitIdle(m_device);
 }

+float Context::GetAndResetAccumulatedGPUTime() // returns the total gathered since the last call, then clears it
+{
+  const float time = m_accumulated_gpu_time; // ActivateCommandBuffer() adds each measured interval to this sum
+  m_accumulated_gpu_time = 0.0f;
+  return time; // NOTE(review): appears to be milliseconds (the accumulation divides ns_diff by 1000000.0)
+}
+/**
+ * Requests GPU timing to be switched on or off. Timing is only enabled while m_gpu_timing_supported is set,
+ * so a request to enable it when that flag is false leaves timing off.
+ *
+ * Returns true if the resulting state matches the request, i.e. false only when enabling was requested but
+ * timing is not supported.
+ */
+bool Context::SetEnableGPUTiming(bool enabled)
+{
+  m_gpu_timing_enabled = enabled && m_gpu_timing_supported;
+  return (enabled == m_gpu_timing_enabled);
+}
+
 void Context::WaitForCommandBufferCompletion(u32 index)
 {
  // Wait for this command buffer to be completed.
@ -868,6 +903,12 @@ void Context::SubmitCommandBuffer(VkSemaphore wait_semaphore /* = VK_NULL_HANDLE
 {
  FrameResources& resources = m_frame_resources[m_current_frame];

+  if (m_gpu_timing_enabled && resources.timestamp_written)
+  {
+    vkCmdWriteTimestamp(m_current_command_buffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, m_timestamp_query_pool,
+                        m_current_frame * 2 + 1);
+  }
+
  // End the current command buffer.
  VkResult res = vkEndCommandBuffer(resources.command_buffer);
  if (res != VK_SUCCESS)
@ -1048,9 +1089,41 @@ void Context::ActivateCommandBuffer(u32 index)
  if (res != VK_SUCCESS)
    LOG_VULKAN_ERROR(res, "vkResetDescriptorPool failed: ");

+  if (m_gpu_timing_enabled)
+  {
+    if (resources.timestamp_written)
+    {
+      std::array<u64, 2> timestamps;
+      res =
+        vkGetQueryPoolResults(m_device, m_timestamp_query_pool, index * 2, static_cast<u32>(timestamps.size()),
+                              sizeof(u64) * timestamps.size(), timestamps.data(), sizeof(u64), VK_QUERY_RESULT_64_BIT);
+      if (res == VK_SUCCESS)
+      {
+        // if we didn't write the timestamp at the start of the cmdbuffer (just enabled timing), the first TS will be
+        // zero
+        if (timestamps[0] > 0)
+        {
+          const double ns_diff =
+            (timestamps[1] - timestamps[0]) * static_cast<double>(m_device_properties.limits.timestampPeriod);
+          m_accumulated_gpu_time = static_cast<float>(static_cast<double>(m_accumulated_gpu_time) + (ns_diff / 1000000.0));
+        }
+      }
+      else
+      {
+        LOG_VULKAN_ERROR(res, "vkGetQueryPoolResults failed: ");
+      }
+    }
+
+    vkCmdResetQueryPool(resources.command_buffer, m_timestamp_query_pool, index * 2, 2);
+    vkCmdWriteTimestamp(resources.command_buffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, m_timestamp_query_pool,
+                        index * 2);
+  }
+
+  resources.fence_counter = m_next_fence_counter++;
+  resources.timestamp_written = m_gpu_timing_enabled;
+
  m_current_frame = index;
  m_current_command_buffer = resources.command_buffer;
-  resources.fence_counter = m_next_fence_counter++;
 }

 void Context::ExecuteCommandBuffer(bool wait_for_completion)
--- a/src/common/vulkan/context.h
+++ b/src/common/vulkan/context.h
@ -180,6 +180,9 @@ public:

  void WaitForGPUIdle();

+  float GetAndResetAccumulatedGPUTime();
+  bool SetEnableGPUTiming(bool enabled);
+
 private:
  Context(VkInstance instance, VkPhysicalDevice physical_device, bool owns_device);

@ -216,6 +219,7 @@ private:
    VkFence fence = VK_NULL_HANDLE;
    u64 fence_counter = 0;
    bool needs_fence_wait = false;
+    bool timestamp_written = false;

    std::vector<std::function<void()>> cleanup_resources;
  };
@ -233,6 +237,11 @@ private:
  VkQueue m_present_queue = VK_NULL_HANDLE;
  u32 m_present_queue_family_index = 0;

+  VkQueryPool m_timestamp_query_pool = VK_NULL_HANDLE;
+  float m_accumulated_gpu_time = 0.0f;
+  bool m_gpu_timing_enabled = false;
+  bool m_gpu_timing_supported = false;
+
  std::array<FrameResources, NUM_COMMAND_BUFFERS> m_frame_resources;
  u64 m_next_fence_counter = 1;
  u64 m_completed_fence_counter = 0;