13 files changed, 69 insertions, 91 deletions
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index ff024f530..2ae3639b5 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -531,14 +531,6 @@ void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
     interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
 }
 
-void GPU::ShutDown() {
-    // Signal that threads should no longer block on syncpoint fences
-    shutting_down.store(true, std::memory_order_relaxed);
-    sync_cv.notify_all();
-
-    gpu_thread.ShutDown();
-}
-
 void GPU::OnCommandListEnd() {
     if (is_async) {
         // This command only applies to asynchronous GPU mode
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a8e98e51b..e6a02a71b 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -219,9 +219,6 @@ public:
         return *shader_notify;
     }
 
-    // Stops the GPU execution and waits for the GPU to finish working
-    void ShutDown();
-
     /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
     void WaitFence(u32 syncpoint_id, u32 value);
 
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 46f642b19..9547f277a 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -17,9 +17,9 @@
 namespace VideoCommon::GPUThread {
 
 /// Runs the GPU thread
-static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
-                      Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
-                      SynchState& state) {
+static void RunThread(std::stop_token stop_token, Core::System& system,
+                      VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
+                      Tegra::DmaPusher& dma_pusher, SynchState& state) {
     std::string name = "yuzu:GPU";
     MicroProfileOnThreadCreate(name.c_str());
     SCOPE_EXIT({ MicroProfileOnThreadExit(); });
@@ -28,20 +28,14 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
     Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
     system.RegisterHostThread();
 
-    // Wait for first GPU command before acquiring the window context
-    state.queue.Wait();
-
-    // If emulation was stopped during disk shader loading, abort before trying to acquire context
-    if (!state.is_running) {
-        return;
-    }
-
     auto current_context = context.Acquire();
     VideoCore::RasterizerInterface* const rasterizer = renderer.ReadRasterizer();
 
-    CommandDataContainer next;
-    while (state.is_running) {
-        next = state.queue.PopWait();
+    while (!stop_token.stop_requested()) {
+        CommandDataContainer next = state.queue.PopWait(stop_token);
+        if (stop_token.stop_requested()) {
+            break;
+        }
         if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) {
             dma_pusher.Push(std::move(submit_list->entries));
             dma_pusher.DispatchCalls();
@@ -55,8 +49,6 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
             rasterizer->FlushRegion(flush->addr, flush->size);
         } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
             rasterizer->OnCPUWrite(invalidate->addr, invalidate->size);
-        } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
-            ASSERT(state.is_running == false);
         } else {
             UNREACHABLE();
         }
@@ -73,16 +65,14 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
 ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
     : system{system_}, is_async{is_async_} {}
 
-ThreadManager::~ThreadManager() {
-    ShutDown();
-}
+ThreadManager::~ThreadManager() = default;
 
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
                                 Core::Frontend::GraphicsContext& context,
                                 Tegra::DmaPusher& dma_pusher) {
     rasterizer = renderer.ReadRasterizer();
-    thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
-                         std::ref(dma_pusher), std::ref(state));
+    thread = std::jthread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
+                          std::ref(dma_pusher), std::ref(state));
 }
 
 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
@@ -117,26 +107,6 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
     rasterizer->OnCPUWrite(addr, size);
 }
 
-void ThreadManager::ShutDown() {
-    if (!state.is_running) {
-        return;
-    }
-
-    {
-        std::lock_guard lk(state.write_lock);
-        state.is_running = false;
-        state.cv.notify_all();
-    }
-
-    if (!thread.joinable()) {
-        return;
-    }
-
-    // Notify GPU thread that a shutdown is pending
-    PushCommand(EndProcessingCommand());
-    thread.join();
-}
-
 void ThreadManager::OnCommandListEnd() {
     PushCommand(OnCommandListEndCommand());
 }
@@ -152,9 +122,8 @@ u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) {
     state.queue.Push(CommandDataContainer(std::move(command_data), fence, block));
 
     if (block) {
-        state.cv.wait(lk, [this, fence] {
-            return fence <= state.signaled_fence.load(std::memory_order_relaxed) ||
-                   !state.is_running;
+        state.cv.wait(lk, thread.get_stop_token(), [this, fence] {
+            return fence <= state.signaled_fence.load(std::memory_order_relaxed);
         });
     }
 
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 11a648f38..91bada925 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -33,9 +33,6 @@ class RendererBase;
 
 namespace VideoCommon::GPUThread {
 
-/// Command to signal to the GPU thread that processing has ended
-struct EndProcessingCommand final {};
-
 /// Command to signal to the GPU thread that a command list is ready for processing
 struct SubmitListCommand final {
     explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {}
@@ -83,7 +80,7 @@ struct OnCommandListEndCommand final {};
 struct GPUTickCommand final {};
 
 using CommandData =
-    std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
+    std::variant<std::monostate, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
                  InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand,
                  GPUTickCommand>;
 
@@ -100,14 +97,12 @@ struct CommandDataContainer {
 
 /// Struct used to synchronize the GPU thread
 struct SynchState final {
-    std::atomic_bool is_running{true};
-
-    using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
+    using CommandQueue = Common::SPSCQueue<CommandDataContainer, true>;
     std::mutex write_lock;
     CommandQueue queue;
     u64 last_fence{};
     std::atomic<u64> signaled_fence{};
-    std::condition_variable cv;
+    std::condition_variable_any cv;
 };
 
 /// Class used to manage the GPU thread
@@ -149,7 +144,7 @@ private:
     VideoCore::RasterizerInterface* rasterizer = nullptr;
 
     SynchState state;
-    std::thread thread;
+    std::jthread thread;
 };
 
 } // namespace VideoCommon::GPUThread
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index b0e14182e..02682bd76 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -293,6 +293,8 @@ void ShaderCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading,
     }};
     LoadPipelines(stop_loading, shader_cache_filename, CACHE_VERSION, load_compute, load_graphics);
 
+    LOG_INFO(Render_OpenGL, "Total Pipeline Count: {}", state.total);
+
     std::unique_lock lock{state.mutex};
     callback(VideoCore::LoadCallbackStage::Build, 0, state.total);
     state.has_loaded = true;
diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
index adb557f60..d87da2a34 100644
--- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
@@ -19,7 +19,6 @@ namespace Vulkan {
 // Prefer small grow rates to avoid saturating the descriptor pool with barely used pipelines
 constexpr size_t SETS_GROW_RATE = 16;
 constexpr s32 SCORE_THRESHOLD = 3;
-constexpr u32 SETS_PER_POOL = 64;
 
 struct DescriptorBank {
     DescriptorBankInfo info;
@@ -59,11 +58,12 @@ static DescriptorBankInfo MakeBankInfo(std::span<const Shader::Info> infos) {
 static void AllocatePool(const Device& device, DescriptorBank& bank) {
     std::array<VkDescriptorPoolSize, 6> pool_sizes;
     size_t pool_cursor{};
+    const u32 sets_per_pool = device.GetSetsPerPool();
     const auto add = [&](VkDescriptorType type, u32 count) {
         if (count > 0) {
             pool_sizes[pool_cursor++] = {
                 .type = type,
-                .descriptorCount = count * SETS_PER_POOL,
+                .descriptorCount = count * sets_per_pool,
             };
         }
     };
@@ -78,7 +78,7 @@ static void AllocatePool(const Device& device, DescriptorBank& bank) {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
         .pNext = nullptr,
         .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
-        .maxSets = SETS_PER_POOL,
+        .maxSets = sets_per_pool,
         .poolSizeCount = static_cast<u32>(pool_cursor),
         .pPoolSizes = std::data(pool_sizes),
     }));
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 31bfbcb06..eb8b4e08b 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -447,6 +447,8 @@ void PipelineCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading
     VideoCommon::LoadPipelines(stop_loading, pipeline_cache_filename, CACHE_VERSION, load_compute,
                                load_graphics);
 
+    LOG_INFO(Render_Vulkan, "Total Pipeline Count: {}", state.total);
+
     std::unique_lock lock{state.mutex};
     callback(VideoCore::LoadCallbackStage::Build, 0, state.total);
     state.has_loaded = true;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 841a6b846..3bcd6d6cc 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -765,12 +765,7 @@ void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) {
     const Maxwell::StencilOp zpass = regs.stencil_front_op_zpass;
     const Maxwell::ComparisonOp compare = regs.stencil_front_func_func;
     if (regs.stencil_two_side_enable) {
-        scheduler.Record([fail, zfail, zpass, compare](vk::CommandBuffer cmdbuf) {
-            cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_AND_BACK, MaxwellToVK::StencilOp(fail),
-                                   MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail),
-                                   MaxwellToVK::ComparisonOp(compare));
-        });
-    } else {
+        // Separate stencil op per face
         const Maxwell::StencilOp back_fail = regs.stencil_back_op_fail;
         const Maxwell::StencilOp back_zfail = regs.stencil_back_op_zfail;
         const Maxwell::StencilOp back_zpass = regs.stencil_back_op_zpass;
@@ -785,6 +780,13 @@ void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) {
                                    MaxwellToVK::StencilOp(back_zfail),
                                    MaxwellToVK::ComparisonOp(back_compare));
         });
+    } else {
+        // Front face defines the stencil op of both faces
+        scheduler.Record([fail, zfail, zpass, compare](vk::CommandBuffer cmdbuf) {
+            cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_AND_BACK, MaxwellToVK::StencilOp(fail),
+                                   MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail),
+                                   MaxwellToVK::ComparisonOp(compare));
+        });
     }
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 1d438787a..0c11c814f 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -43,17 +43,10 @@ VKScheduler::VKScheduler(const Device& device_, StateTracker& state_tracker_)
       command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
     AcquireNewChunk();
     AllocateWorkerCommandBuffer();
-    worker_thread = std::thread(&VKScheduler::WorkerThread, this);
+    worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });
 }
 
-VKScheduler::~VKScheduler() {
-    {
-        std::lock_guard lock{work_mutex};
-        quit = true;
-    }
-    work_cv.notify_all();
-    worker_thread.join();
-}
+VKScheduler::~VKScheduler() = default;
 
 void VKScheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
     SubmitExecution(signal_semaphore, wait_semaphore);
@@ -135,7 +128,7 @@ bool VKScheduler::UpdateGraphicsPipeline(GraphicsPipeline* pipeline) {
     return true;
 }
 
-void VKScheduler::WorkerThread() {
+void VKScheduler::WorkerThread(std::stop_token stop_token) {
     Common::SetCurrentThreadName("yuzu:VulkanWorker");
     do {
         if (work_queue.empty()) {
@@ -144,8 +137,8 @@ void VKScheduler::WorkerThread() {
         std::unique_ptr<CommandChunk> work;
         {
             std::unique_lock lock{work_mutex};
-            work_cv.wait(lock, [this] { return !work_queue.empty() || quit; });
-            if (quit) {
+            work_cv.wait(lock, stop_token, [this] { return !work_queue.empty(); });
+            if (stop_token.stop_requested()) {
                 continue;
             }
             work = std::move(work_queue.front());
@@ -158,7 +151,7 @@ void VKScheduler::WorkerThread() {
         }
         std::lock_guard reserve_lock{reserve_mutex};
         chunk_reserve.push_back(std::move(work));
-    } while (!quit);
+    } while (!stop_token.stop_requested());
 }
 
 void VKScheduler::AllocateWorkerCommandBuffer() {
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 759ed5a48..bd22e4e83 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -187,7 +187,7 @@ private:
         GraphicsPipeline* graphics_pipeline = nullptr;
     };
 
-    void WorkerThread();
+    void WorkerThread(std::stop_token stop_token);
 
     void AllocateWorkerCommandBuffer();
 
@@ -212,7 +212,7 @@ private:
     vk::CommandBuffer current_cmdbuf;
 
     std::unique_ptr<CommandChunk> chunk;
-    std::thread worker_thread;
+    std::jthread worker_thread;
 
     State state;
 
@@ -224,9 +224,8 @@ private:
     std::vector<std::unique_ptr<CommandChunk>> chunk_reserve;
     std::mutex reserve_mutex;
     std::mutex work_mutex;
-    std::condition_variable work_cv;
+    std::condition_variable_any work_cv;
     std::condition_variable wait_cv;
-    std::atomic_bool quit{};
 };
 
 } // namespace Vulkan
diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp
index 0f60765bb..cf94e1d39 100644
--- a/src/video_core/vulkan_common/vulkan_debug_callback.cpp
+++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp
@@ -16,6 +16,7 @@ VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity,
     switch (static_cast<u32>(data->messageIdNumber)) {
     case 0x682a878au: // VUID-vkCmdBindVertexBuffers2EXT-pBuffers-parameter
     case 0x99fb7dfdu: // UNASSIGNED-RequiredParameter (vkCmdBindVertexBuffers2EXT pBuffers[0])
+    case 0xe8616bf2u: // Bound VkDescriptorSet 0x0[] was destroyed. Likely push_descriptor related
         return VK_FALSE;
     default:
         break;
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 24821c1a3..c2ec9f76a 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -368,8 +368,9 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
     };
     SetNext(next, demote);
 
+    VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8;
     if (is_int8_supported || is_float16_supported) {
-        VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8{
+        float16_int8 = {
             .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR,
             .pNext = nullptr,
             .shaderFloat16 = is_float16_supported,
@@ -587,6 +588,26 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             ext_extended_dynamic_state = false;
         }
     }
+
+    sets_per_pool = 64;
+    if (driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE) {
+        // AMD drivers need a higher amount of Sets per Pool in certain circunstances like in XC2.
+        sets_per_pool = 96;
+    }
+
+    const bool is_amd = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY ||
+                        driver_id == VK_DRIVER_ID_MESA_RADV ||
+                        driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE;
+    if (ext_sampler_filter_minmax && is_amd) {
+        // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken.
+        if (!is_float16_supported) {
+            LOG_WARNING(
+                Render_Vulkan,
+                "Blacklisting AMD GCN4 and lower for VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME");
+            ext_sampler_filter_minmax = false;
+        }
+    }
+
     if (ext_vertex_input_dynamic_state && driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) {
         LOG_WARNING(Render_Vulkan, "Blacklisting Intel for VK_EXT_vertex_input_dynamic_state");
         ext_vertex_input_dynamic_state = false;
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 5599c38c5..bc180a32a 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -323,6 +323,10 @@ public:
         return device_access_memory;
     }
 
+    u32 GetSetsPerPool() const {
+        return sets_per_pool;
+    }
+
 private:
     /// Checks if the physical device is suitable.
     void CheckSuitability(bool requires_swapchain) const;
@@ -376,6 +380,7 @@ private:
     VkShaderStageFlags guest_warp_stages{};     ///< Stages where the guest warp size can be forced.
     u64 device_access_memory{};                 ///< Total size of device local memory in bytes.
     u32 max_push_descriptors{};                 ///< Maximum number of push descriptors
+    u32 sets_per_pool{};                        ///< Sets per Description Pool
     bool is_optimal_astc_supported{};           ///< Support for native ASTC.
     bool is_float16_supported{};                ///< Support for float16 arithmetic.
     bool is_int8_supported{};                   ///< Support for int8 arithmetic.