diff options
Diffstat (limited to 'src/video_core')
70 files changed, 1088 insertions, 700 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index bf6439530..3b2fe01da 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -220,8 +220,8 @@ add_library(video_core STATIC surface.h texture_cache/accelerated_swizzle.cpp texture_cache/accelerated_swizzle.h - texture_cache/decode_bc4.cpp - texture_cache/decode_bc4.h + texture_cache/decode_bc.cpp + texture_cache/decode_bc.h texture_cache/descriptor_table.h texture_cache/formatter.cpp texture_cache/formatter.h @@ -279,7 +279,7 @@ add_library(video_core STATIC create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) -target_link_libraries(video_core PUBLIC glad shader_recompiler stb) +target_link_libraries(video_core PUBLIC glad shader_recompiler stb bc_decoder) if (YUZU_USE_BUNDLED_FFMPEG AND NOT (WIN32 OR ANDROID)) add_dependencies(video_core ffmpeg-build) @@ -291,7 +291,7 @@ target_link_options(video_core PRIVATE ${FFmpeg_LDFLAGS}) add_dependencies(video_core host_shaders) target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) -target_link_libraries(video_core PRIVATE sirit Vulkan::Headers) +target_link_libraries(video_core PRIVATE sirit Vulkan::Headers vma) if (ENABLE_NSIGHT_AFTERMATH) if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK}) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 45977d578..58a45ab67 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -207,7 +207,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am if (has_new_downloads) { memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); } - tmp_buffer.resize(amount); + tmp_buffer.resize_destructive(amount); cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount); return true; @@ -1279,7 +1279,7 @@ template <class P> typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu_addr, u32 wanted_size) { static constexpr int STREAM_LEAP_THRESHOLD = 16; - std::vector<BufferId> overlap_ids; + boost::container::small_vector<BufferId, 16> overlap_ids; VAddr begin = cpu_addr; VAddr end = cpu_addr + wanted_size; int stream_score = 0; diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 63a120f7a..fe6068cfe 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -229,7 +229,7 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<BufferCacheChannelInf using OverlapCounter = boost::icl::split_interval_map<VAddr, int>; struct OverlapResult { - std::vector<BufferId> ids; + boost::container::small_vector<BufferId, 16> ids; VAddr begin; VAddr end; bool has_stream_leap = false; @@ -582,7 +582,7 @@ private: BufferId inline_buffer_id; std::array<BufferId, ((1ULL << 39) >> CACHING_PAGEBITS)> page_table; - std::vector<u8> tmp_buffer; + Common::ScratchBuffer<u8> tmp_buffer; }; } // namespace VideoCommon diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 83112dfce..7d660af47 100644 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -63,7 +63,6 @@ struct ChCommand { }; using ChCommandHeaderList = std::vector<ChCommandHeader>; -using ChCommandList = std::vector<ChCommand>; struct ThiRegisters { u32_le increment_syncpt{}; diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 1cdb690ed..8a2784cdc 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -6,6 +6,7 @@ #include <array> #include <span> #include <vector> +#include <boost/container/small_vector.hpp> #include <queue> #include "common/bit_field.h" @@ -102,11 +103,12 @@ inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, Sub struct CommandList final { CommandList() = default; explicit CommandList(std::size_t size) : command_lists(size) {} - explicit CommandList(std::vector<CommandHeader>&& prefetch_command_list_) + explicit CommandList( + boost::container::small_vector<CommandHeader, 512>&& prefetch_command_list_) : prefetch_command_list{std::move(prefetch_command_list_)} {} - std::vector<CommandListHeader> command_lists; - std::vector<CommandHeader> prefetch_command_list; + boost::container::small_vector<CommandListHeader, 512> command_lists; + boost::container::small_vector<CommandHeader, 512> prefetch_command_list; }; /** diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index ebe5536de..a290d6ea7 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -108,9 +108,11 @@ void MaxwellDMA::Launch() { if (regs.launch_dma.remap_enable != 0 && is_const_a_dst) { ASSERT(regs.remap_const.component_size_minus_one == 3); accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value); - std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value); + read_buffer.resize_destructive(regs.line_length_in * sizeof(u32)); + std::span<u32> span(reinterpret_cast<u32*>(read_buffer.data()), regs.line_length_in); + std::ranges::fill(span, regs.remap_consta_value); memory_manager.WriteBlockUnsafe(regs.offset_out, - reinterpret_cast<u8*>(tmp_buffer.data()), + reinterpret_cast<u8*>(read_buffer.data()), regs.line_length_in * sizeof(u32)); } else { memory_manager.FlushCaching(); @@ -126,32 +128,33 @@ void MaxwellDMA::Launch() { UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); - std::vector<u8> tmp_buffer(16); + read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { - memory_manager.ReadBlockUnsafe( + memory_manager.ReadBlock( convert_linear_2_blocklinear_addr(regs.offset_in + offset), - tmp_buffer.data(), tmp_buffer.size()); - memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(), - tmp_buffer.size()); + read_buffer.data(), read_buffer.size()); + memory_manager.WriteBlockCached(regs.offset_out + offset, read_buffer.data(), + read_buffer.size()); } } else if (is_src_pitch && !is_dst_pitch) { UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); - std::vector<u8> tmp_buffer(16); + read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { - memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(), - tmp_buffer.size()); + memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), + read_buffer.size()); memory_manager.WriteBlockCached( convert_linear_2_blocklinear_addr(regs.offset_out + offset), - tmp_buffer.data(), tmp_buffer.size()); + read_buffer.data(), read_buffer.size()); } } else { if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { - std::vector<u8> tmp_buffer(regs.line_length_in); - memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), - regs.line_length_in); - memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(), + read_buffer.resize_destructive(regs.line_length_in); + memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), + regs.line_length_in, + VideoCommon::CacheType::NoBufferCache); + memory_manager.WriteBlockCached(regs.offset_out, read_buffer.data(), regs.line_length_in); } } @@ -171,7 +174,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() { src_operand.address = regs.offset_in; DMA::BufferOperand dst_operand; - dst_operand.pitch = regs.pitch_out; + u32 abs_pitch_out = std::abs(static_cast<s32>(regs.pitch_out)); + dst_operand.pitch = abs_pitch_out; dst_operand.width = regs.line_length_in; dst_operand.height = regs.line_count; dst_operand.address = regs.offset_out; @@ -218,7 +222,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { const size_t src_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); - const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; + const size_t dst_size = static_cast<size_t>(abs_pitch_out) * regs.line_count; read_buffer.resize_destructive(src_size); write_buffer.resize_destructive(dst_size); @@ -227,7 +231,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, - regs.pitch_out); + abs_pitch_out); memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 456f733cf..db385076d 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -193,18 +193,13 @@ struct GPU::Impl { } [[nodiscard]] u64 GetTicks() const { - // This values were reversed engineered by fincs from NVN - // The gpu clock is reported in units of 385/625 nanoseconds - constexpr u64 gpu_ticks_num = 384; - constexpr u64 gpu_ticks_den = 625; + u64 gpu_tick = system.CoreTiming().GetGPUTicks(); - u64 nanoseconds = system.CoreTiming().GetCPUTimeNs().count(); if (Settings::values.use_fast_gpu_time.GetValue()) { - nanoseconds /= 256; + gpu_tick /= 256; } - const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; - const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; - return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; + + return gpu_tick; } [[nodiscard]] bool IsAsync() const { diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index 6ce179167..ce827eb6c 100644 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -4,6 +4,7 @@ #include <array> #include <bit> +#include "common/scratch_buffer.h" #include "common/settings.h" #include "video_core/host1x/codecs/h264.h" #include "video_core/host1x/host1x.h" @@ -188,7 +189,8 @@ void H264BitWriter::WriteBit(bool state) { } void H264BitWriter::WriteScalingList(std::span<const u8> list, s32 start, s32 count) { - std::vector<u8> scan(count); + static Common::ScratchBuffer<u8> scan{}; + scan.resize_destructive(count); if (count == 16) { std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); } else { diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 2442c3c29..e61d9af80 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -33,6 +33,7 @@ set(SHADER_FILES opengl_fidelityfx_fsr.frag opengl_fidelityfx_fsr_easu.frag opengl_fidelityfx_fsr_rcas.frag + opengl_lmem_warmup.comp opengl_present.frag opengl_present.vert opengl_present_scaleforce.frag diff --git a/src/video_core/host_shaders/opengl_lmem_warmup.comp b/src/video_core/host_shaders/opengl_lmem_warmup.comp new file mode 100644 index 000000000..518268477 --- /dev/null +++ b/src/video_core/host_shaders/opengl_lmem_warmup.comp @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +// This shader is a workaround for a quirk in NVIDIA OpenGL drivers +// Shaders using local memory see a great performance benefit if a shader that was dispatched +// before it had more local memory allocated. +// This shader allocates the maximum local memory allowed on NVIDIA drivers to ensure that +// subsequent shaders see the performance boost. + +// NOTE: This shader does no actual meaningful work and returns immediately, +// it is simply a means to have the driver expect a shader using lots of local memory. + +#version 450 + +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +layout(location = 0) uniform uint uniform_data; + +layout(binding = 0, rgba8) uniform writeonly restrict image2DArray dest_image; + +#define MAX_LMEM_SIZE 4080 // Size chosen to avoid errors in Nvidia's GLSL compiler +#define NUM_LMEM_CONSTANTS 1 +#define ARRAY_SIZE MAX_LMEM_SIZE - NUM_LMEM_CONSTANTS + +uint lmem_0[ARRAY_SIZE]; +const uvec4 constant_values[NUM_LMEM_CONSTANTS] = uvec4[](uvec4(0)); + +void main() { + const uint global_id = gl_GlobalInvocationID.x; + if (global_id <= 128) { + // Since the shader is called with a dispatch of 1x1x1 + // This should always be the case, and this shader will not actually execute + return; + } + for (uint t = 0; t < uniform_data; t++) { + const uint offset = (t * uniform_data); + lmem_0[offset] = t; + } + const uint offset = (gl_GlobalInvocationID.y * uniform_data + gl_GlobalInvocationID.x); + const uint value = lmem_0[offset]; + const uint const_value = constant_values[offset / 4][offset % 4]; + const uvec4 color = uvec4(value + const_value); + + // A "side-effect" is needed so the variables don't get optimized out, + // but this should never execute so there should be no clobbering of previously bound state. + imageStore(dest_image, ivec3(gl_GlobalInvocationID), color); +} diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 7b2cde7a7..45141e488 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -111,7 +111,7 @@ GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cp [[maybe_unused]] const auto current_entry_type = GetEntry<false>(current_gpu_addr); SetEntry<false>(current_gpu_addr, entry_type); if (current_entry_type != entry_type) { - rasterizer->ModifyGPUMemory(unique_identifier, gpu_addr, page_size); + rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size); } if constexpr (entry_type == EntryType::Mapped) { const VAddr current_cpu_addr = cpu_addr + offset; @@ -134,7 +134,7 @@ GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr [[maybe_unused]] const auto current_entry_type = GetEntry<true>(current_gpu_addr); SetEntry<true>(current_gpu_addr, entry_type); if (current_entry_type != entry_type) { - rasterizer->ModifyGPUMemory(unique_identifier, gpu_addr, big_page_size); + rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size); } if constexpr (entry_type == EntryType::Mapped) { const VAddr current_cpu_addr = cpu_addr + offset; @@ -587,7 +587,7 @@ void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size, void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size, VideoCommon::CacheType which) { - std::vector<u8> tmp_buffer(size); + tmp_buffer.resize_destructive(size); ReadBlock(gpu_src_addr, tmp_buffer.data(), size, which); // The output block must be flushed in case it has data modified from the GPU. @@ -670,9 +670,9 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons return result; } -std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange( - GPUVAddr gpu_addr, std::size_t size) const { - std::vector<std::pair<GPUVAddr, std::size_t>> result{}; +boost::container::small_vector<std::pair<GPUVAddr, std::size_t>, 32> +MemoryManager::GetSubmappedRange(GPUVAddr gpu_addr, std::size_t size) const { + boost::container::small_vector<std::pair<GPUVAddr, std::size_t>, 32> result{}; GetSubmappedRangeImpl<true>(gpu_addr, size, result); return result; } @@ -680,8 +680,9 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange( template <bool is_gpu_address> void MemoryManager::GetSubmappedRangeImpl( GPUVAddr gpu_addr, std::size_t size, - std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>& - result) const { + boost::container::small_vector< + std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>, 32>& result) + const { std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>> last_segment{}; std::optional<VAddr> old_page_addr{}; diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 794535122..4202c26ff 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -8,10 +8,12 @@ #include <mutex> #include <optional> #include <vector> +#include <boost/container/small_vector.hpp> #include "common/common_types.h" #include "common/multi_level_page_table.h" #include "common/range_map.h" +#include "common/scratch_buffer.h" #include "common/virtual_buffer.h" #include "video_core/cache_types.h" #include "video_core/pte_kind.h" @@ -107,8 +109,8 @@ public: * if the region is continuous, a single pair will be returned. If it's unmapped, an empty * vector will be returned; */ - std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr, - std::size_t size) const; + boost::container::small_vector<std::pair<GPUVAddr, std::size_t>, 32> GetSubmappedRange( + GPUVAddr gpu_addr, std::size_t size) const; GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, PTEKind kind = PTEKind::INVALID, bool is_big_pages = true); @@ -165,7 +167,8 @@ private: template <bool is_gpu_address> void GetSubmappedRangeImpl( GPUVAddr gpu_addr, std::size_t size, - std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>& + boost::container::small_vector< + std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>, 32>& result) const; Core::System& system; @@ -215,8 +218,8 @@ private: Common::VirtualBuffer<u32> big_page_table_cpu; std::vector<u64> big_page_continuous; - std::vector<std::pair<VAddr, std::size_t>> page_stash{}; - std::vector<std::pair<VAddr, std::size_t>> page_stash2{}; + boost::container::small_vector<std::pair<VAddr, std::size_t>, 32> page_stash{}; + boost::container::small_vector<std::pair<VAddr, std::size_t>, 32> page_stash2{}; mutable std::mutex guard; @@ -226,6 +229,8 @@ private: std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator; static std::atomic<size_t> unique_identifier_generator; + + Common::ScratchBuffer<u8> tmp_buffer; }; } // namespace Tegra diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 3151c0db8..f9ca55c36 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -63,6 +63,7 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac writes_global_memory = !use_storage_buffers && std::ranges::any_of(info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); + uses_local_memory = info.uses_local_memory; if (force_context_flush) { std::scoped_lock lock{built_mutex}; built_fence.Create(); diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.h b/src/video_core/renderer_opengl/gl_compute_pipeline.h index 9bcc72b59..c26b4fa5e 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.h +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.h @@ -59,6 +59,10 @@ public: return writes_global_memory; } + [[nodiscard]] bool UsesLocalMemory() const noexcept { + return uses_local_memory; + } + void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_, Tegra::MemoryManager* gpu_memory_) { kepler_compute = kepler_compute_; @@ -84,6 +88,7 @@ private: bool use_storage_buffers{}; bool writes_global_memory{}; + bool uses_local_memory{}; std::mutex built_mutex; std::condition_variable built_condvar; diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 03d234f2f..33e63c17d 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -194,6 +194,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { has_bool_ref_bug = true; } } + has_lmem_perf_bug = is_nvidia; strict_context_required = emu_window.StrictContextRequired(); // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation. diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index ad27264e5..a5a6bbbba 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -192,6 +192,10 @@ public: return supports_conditional_barriers; } + bool HasLmemPerfBug() const { + return has_lmem_perf_bug; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); @@ -238,6 +242,7 @@ private: bool can_report_memory{}; bool strict_context_required{}; bool supports_conditional_barriers{}; + bool has_lmem_perf_bug{}; std::string vendor_name; }; diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index c58f760b8..23a48c6fe 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -215,6 +215,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c writes_global_memory |= std::ranges::any_of( info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); + uses_local_memory |= info.uses_local_memory; } ASSERT(num_textures <= MAX_TEXTURES); ASSERT(num_images <= MAX_IMAGES); diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h index 7bab3be0a..7b3d7eae8 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h @@ -98,6 +98,10 @@ public: return writes_global_memory; } + [[nodiscard]] bool UsesLocalMemory() const noexcept { + return uses_local_memory; + } + [[nodiscard]] bool IsBuilt() noexcept; template <typename Spec> @@ -146,6 +150,7 @@ private: bool use_storage_buffers{}; bool writes_global_memory{}; + bool uses_local_memory{}; static constexpr std::size_t XFB_ENTRY_STRIDE = 3; GLsizei num_xfb_attribs{}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index fc711c44a..edf527f2d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -222,6 +222,9 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) { gpu.TickWork(); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + if (pipeline->UsesLocalMemory()) { + program_manager.LocalMemoryWarmup(); + } pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); @@ -371,6 +374,9 @@ void RasterizerOpenGL::DispatchCompute() { if (!pipeline) { return; } + if (pipeline->UsesLocalMemory()) { + program_manager.LocalMemoryWarmup(); + } pipeline->SetEngine(kepler_compute, gpu_memory); pipeline->Configure(); const auto& qmd{kepler_compute->launch_description}; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 3f077311e..0329ed820 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -85,7 +85,9 @@ Shader::RuntimeInfo MakeRuntimeInfo(const GraphicsPipelineKey& key, case Shader::Stage::VertexB: case Shader::Stage::Geometry: if (!use_assembly_shaders && key.xfb_enabled != 0) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state); + auto [varyings, count] = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } break; case Shader::Stage::TessellationEval: diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 98841ae65..03d4b9d06 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -3,7 +3,9 @@ #include <glad/glad.h> +#include "video_core/host_shaders/opengl_lmem_warmup_comp.h" #include "video_core/renderer_opengl/gl_shader_manager.h" +#include "video_core/renderer_opengl/gl_shader_util.h" namespace OpenGL { @@ -17,6 +19,10 @@ ProgramManager::ProgramManager(const Device& device) { if (device.UseAssemblyShaders()) { glEnable(GL_COMPUTE_PROGRAM_NV); } + if (device.HasLmemPerfBug()) { + lmem_warmup_program = + CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER); + } } void ProgramManager::BindComputeProgram(GLuint program) { @@ -98,6 +104,13 @@ void ProgramManager::BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NU void ProgramManager::RestoreGuestCompute() {} +void ProgramManager::LocalMemoryWarmup() { + if (lmem_warmup_program.handle != 0) { + BindComputeProgram(lmem_warmup_program.handle); + glDispatchCompute(1, 1, 1); + } +} + void ProgramManager::BindPipeline() { if (!is_pipeline_bound) { is_pipeline_bound = true; diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 07ffab77f..852d8c88e 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -30,6 +30,8 @@ public: void RestoreGuestCompute(); + void LocalMemoryWarmup(); + private: void BindPipeline(); @@ -44,6 +46,7 @@ private: u32 current_stage_mask = 0; std::array<GLuint, NUM_STAGES> current_programs{}; GLuint current_assembly_compute_program = 0; + OGLProgram lmem_warmup_program; }; } // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index cf2964a3f..28d4b15a0 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -495,6 +495,9 @@ void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer, const Region2D& dst_region, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation) { + if (!device.IsExtShaderStencilExportSupported()) { + return; + } ASSERT(filter == Tegra::Engines::Fermi2D::Filter::Point); ASSERT(operation == Tegra::Engines::Fermi2D::Operation::SrcCopy); const BlitImagePipelineKey key{ diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 9a0b10568..a8540339d 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -259,6 +259,26 @@ FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with break; } } + // Transcode on hardware that doesn't support BCn natively + if (!device.IsOptimalBcnSupported() && VideoCore::Surface::IsPixelFormatBCn(pixel_format)) { + const bool is_srgb = with_srgb && VideoCore::Surface::IsPixelFormatSRGB(pixel_format); + if (pixel_format == PixelFormat::BC4_SNORM) { + tuple.format = VK_FORMAT_R8_SNORM; + } else if (pixel_format == PixelFormat::BC4_UNORM) { + tuple.format = VK_FORMAT_R8_UNORM; + } else if (pixel_format == PixelFormat::BC5_SNORM) { + tuple.format = VK_FORMAT_R8G8_SNORM; + } else if (pixel_format == PixelFormat::BC5_UNORM) { + tuple.format = VK_FORMAT_R8G8_UNORM; + } else if (pixel_format == PixelFormat::BC6H_SFLOAT || + pixel_format == PixelFormat::BC6H_UFLOAT) { + tuple.format = VK_FORMAT_R16G16B16A16_SFLOAT; + } else if (is_srgb) { + tuple.format = VK_FORMAT_A8B8G8R8_SRGB_PACK32; + } else { + tuple.format = VK_FORMAT_A8B8G8R8_UNORM_PACK32; + } + } const bool attachable = (tuple.usage & Attachable) != 0; const bool storage = (tuple.usage & Storage) != 0; diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 77128c6e2..454bb66a4 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -12,6 +12,7 @@ #include <fmt/format.h> #include "common/logging/log.h" +#include "common/polyfill_ranges.h" #include "common/scope_exit.h" #include "common/settings.h" #include "common/telemetry.h" @@ -65,6 +66,21 @@ std::string BuildCommaSeparatedExtensions( return fmt::format("{}", fmt::join(available_extensions, ",")); } +DebugCallback MakeDebugCallback(const vk::Instance& instance, const vk::InstanceDispatch& dld) { + if (!Settings::values.renderer_debug) { + return DebugCallback{}; + } + const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); + const auto it = std::ranges::find_if(*properties, [](const auto& prop) { + return std::strcmp(VK_EXT_DEBUG_UTILS_EXTENSION_NAME, prop.extensionName) == 0; + }); + if (it != properties->end()) { + return CreateDebugUtilsCallback(instance); + } else { + return CreateDebugReportCallback(instance); + } +} + } // Anonymous namespace Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, @@ -87,10 +103,10 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, cpu_memory(cpu_memory_), gpu(gpu_), library(OpenLibrary(context.get())), instance(CreateInstance(*library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, Settings::values.renderer_debug.GetValue())), - debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr), + debug_callback(MakeDebugCallback(instance, dld)), surface(CreateSurface(instance, render_window.GetWindowInfo())), - device(CreateDevice(instance, dld, *surface)), memory_allocator(device, false), - state_tracker(), scheduler(device, state_tracker), + device(CreateDevice(instance, dld, *surface)), memory_allocator(device), state_tracker(), + scheduler(device, state_tracker), swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width, render_window.GetFramebufferLayout().height, false), present_manager(instance, render_window, device, memory_allocator, scheduler, swapchain, @@ -173,7 +189,7 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr return; } const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; - vk::Image staging_image = device.GetLogical().CreateImage(VkImageCreateInfo{ + vk::Image staging_image = memory_allocator.CreateImage(VkImageCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, @@ -196,7 +212,6 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr .pQueueFamilyIndices = nullptr, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }); - const auto image_commit = memory_allocator.Commit(staging_image, MemoryUsage::DeviceLocal); const vk::ImageView dst_view = device.GetLogical().CreateImageView(VkImageViewCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, @@ -234,8 +249,8 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; - const vk::Buffer dst_buffer = device.GetLogical().CreateBuffer(dst_buffer_info); - MemoryCommit dst_buffer_memory = memory_allocator.Commit(dst_buffer, MemoryUsage::Download); + const vk::Buffer dst_buffer = + memory_allocator.CreateBuffer(dst_buffer_info, MemoryUsage::Download); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([&](vk::CommandBuffer cmdbuf) { @@ -309,8 +324,9 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr scheduler.Finish(); // Copy backing image data to the QImage screenshot buffer - const auto dst_memory_map = dst_buffer_memory.Map(); - std::memcpy(renderer_settings.screenshot_bits, dst_memory_map.data(), dst_memory_map.size()); + dst_buffer.Invalidate(); + std::memcpy(renderer_settings.screenshot_bits, dst_buffer.Mapped().data(), + dst_buffer.Mapped().size()); renderer_settings.screenshot_complete_callback(false); renderer_settings.screenshot_requested = false; } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index b2e8cbd1b..ca22c0baa 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -5,6 +5,7 @@ #include <memory> #include <string> +#include <variant> #include "common/dynamic_library.h" #include "video_core/renderer_base.h" @@ -33,6 +34,8 @@ class GPU; namespace Vulkan { +using DebugCallback = std::variant<vk::DebugUtilsMessenger, vk::DebugReportCallback>; + Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, VkSurfaceKHR surface); @@ -71,7 +74,7 @@ private: vk::InstanceDispatch dld; vk::Instance instance; - vk::DebugUtilsMessenger debug_callback; + DebugCallback debug_callback; vk::SurfaceKHR surface; ScreenInfo screen_info; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index acb143fc7..ad3b29f0e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -162,7 +162,7 @@ void BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, SetUniformData(data, layout); SetVertexData(data, framebuffer, layout); - const std::span<u8> mapped_span = buffer_commit.Map(); + const std::span<u8> mapped_span = buffer.Mapped(); std::memcpy(mapped_span.data(), &data, sizeof(data)); if (!use_accelerated) { @@ -1071,14 +1071,9 @@ void BlitScreen::ReleaseRawImages() { scheduler.Wait(tick); } raw_images.clear(); - raw_buffer_commits.clear(); - aa_image_view.reset(); aa_image.reset(); - aa_commit = MemoryCommit{}; - buffer.reset(); - buffer_commit = MemoryCommit{}; } void BlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer) { @@ -1094,20 +1089,18 @@ void BlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer .pQueueFamilyIndices = nullptr, }; - buffer = device.GetLogical().CreateBuffer(ci); - buffer_commit = memory_allocator.Commit(buffer, MemoryUsage::Upload); + buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::Upload); } void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { raw_images.resize(image_count); raw_image_views.resize(image_count); - raw_buffer_commits.resize(image_count); const auto create_image = [&](bool used_on_framebuffer = false, u32 up_scale = 1, u32 down_shift = 0) { u32 extra_usages = used_on_framebuffer ? VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT : VK_IMAGE_USAGE_TRANSFER_DST_BIT; - return device.GetLogical().CreateImage(VkImageCreateInfo{ + return memory_allocator.CreateImage(VkImageCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -1130,9 +1123,6 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }); }; - const auto create_commit = [&](vk::Image& image) { - return memory_allocator.Commit(image, MemoryUsage::DeviceLocal); - }; const auto create_image_view = [&](vk::Image& image, bool used_on_framebuffer = false) { return device.GetLogical().CreateImageView(VkImageViewCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, @@ -1161,7 +1151,6 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { for (size_t i = 0; i < image_count; ++i) { raw_images[i] = create_image(); - raw_buffer_commits[i] = create_commit(raw_images[i]); raw_image_views[i] = create_image_view(raw_images[i]); } @@ -1169,7 +1158,6 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { const u32 up_scale = Settings::values.resolution_info.up_scale; const u32 down_shift = Settings::values.resolution_info.down_shift; aa_image = create_image(true, up_scale, down_shift); - aa_commit = create_commit(aa_image); aa_image_view = create_image_view(aa_image, true); VkExtent2D size{ .width = (up_scale * framebuffer.width) >> down_shift, diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 68ec20253..8365b5668 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -142,13 +142,11 @@ private: vk::Sampler sampler; vk::Buffer buffer; - MemoryCommit buffer_commit; std::vector<u64> resource_ticks; std::vector<vk::Image> raw_images; std::vector<vk::ImageView> raw_image_views; - std::vector<MemoryCommit> raw_buffer_commits; vk::DescriptorPool aa_descriptor_pool; vk::DescriptorSetLayout aa_descriptor_set_layout; @@ -159,7 +157,6 @@ private: vk::DescriptorSets aa_descriptor_sets; vk::Image aa_image; vk::ImageView aa_image_view; - MemoryCommit aa_commit; u32 raw_width = 0; u32 raw_height = 0; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index e30fcb1ed..b72f95235 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -50,7 +50,7 @@ size_t BytesPerIndex(VkIndexType index_type) { } } -vk::Buffer CreateBuffer(const Device& device, u64 size) { +vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allocator, u64 size) { VkBufferUsageFlags flags = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | @@ -60,7 +60,7 @@ vk::Buffer CreateBuffer(const Device& device, u64 size) { if (device.IsExtTransformFeedbackSupported()) { flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } - return device.GetLogical().CreateBuffer({ + const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -69,7 +69,8 @@ vk::Buffer CreateBuffer(const Device& device, u64 size) { .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); + }; + return memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); } } // Anonymous namespace @@ -79,8 +80,8 @@ Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params) Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes_) : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_), - device{&runtime.device}, buffer{CreateBuffer(*device, SizeBytes())}, - commit{runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal)} { + device{&runtime.device}, buffer{ + CreateBuffer(*device, runtime.memory_allocator, SizeBytes())} { if (runtime.device.HasDebuggingToolAttached()) { buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str()); } @@ -138,7 +139,7 @@ public: const u32 num_first_offset_copies = 4; const size_t bytes_per_index = BytesPerIndex(index_type); const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies; - buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -147,14 +148,21 @@ public: .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); + }; + buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); if (device.HasDebuggingToolAttached()) { buffer.SetObjectNameEXT("Quad LUT"); } - memory_commit = memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); - const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload); - u8* staging_data = staging.mapped_span.data(); + const bool host_visible = buffer.IsHostVisible(); + const StagingBufferRef staging = [&] { + if (host_visible) { + return StagingBufferRef{}; + } + return staging_pool.Request(size_bytes, MemoryUsage::Upload); + }(); + + u8* staging_data = host_visible ? buffer.Mapped().data() : staging.mapped_span.data(); const size_t quad_size = bytes_per_index * 6; for (u32 first = 0; first < num_first_offset_copies; ++first) { @@ -164,29 +172,33 @@ public: } } - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset, - dst_buffer = *buffer, size_bytes](vk::CommandBuffer cmdbuf) { - const VkBufferCopy copy{ - .srcOffset = src_offset, - .dstOffset = 0, - .size = size_bytes, - }; - const VkBufferMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_INDEX_READ_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = dst_buffer, - .offset = 0, - .size = size_bytes, - }; - cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, write_barrier); - }); + if (!host_visible) { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset, + dst_buffer = *buffer, size_bytes](vk::CommandBuffer cmdbuf) { + const VkBufferCopy copy{ + .srcOffset = src_offset, + .dstOffset = 0, + .size = size_bytes, + }; + const VkBufferMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_INDEX_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = dst_buffer, + .offset = 0, + .size = size_bytes, + }; + cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, write_barrier); + }); + } else { + buffer.Flush(); + } } void BindBuffer(u32 first) { @@ -361,7 +373,7 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, }; // Measuring a popular game, this number never exceeds the specified size once data is warmed up - boost::container::small_vector<VkBufferCopy, 3> vk_copies(copies.size()); + boost::container::small_vector<VkBufferCopy, 8> vk_copies(copies.size()); std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([src_buffer, dst_buffer, vk_copies, barrier](vk::CommandBuffer cmdbuf) { @@ -578,7 +590,8 @@ void BufferCacheRuntime::ReserveNullBuffer() { .pNext = nullptr, .flags = 0, .size = 4, - .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, @@ -587,11 +600,10 @@ void BufferCacheRuntime::ReserveNullBuffer() { create_info.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } create_info.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; - null_buffer = device.GetLogical().CreateBuffer(create_info); + null_buffer = memory_allocator.CreateBuffer(create_info, MemoryUsage::DeviceLocal); if (device.HasDebuggingToolAttached()) { null_buffer.SetObjectNameEXT("Null buffer"); } - null_buffer_commit = memory_allocator.Commit(null_buffer, MemoryUsage::DeviceLocal); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([buffer = *null_buffer](vk::CommandBuffer cmdbuf) { diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index cdeef8846..95446c732 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -48,7 +48,6 @@ private: const Device* device{}; vk::Buffer buffer; - MemoryCommit commit; std::vector<BufferView> views; }; @@ -142,7 +141,6 @@ private: std::shared_ptr<QuadStripIndexBuffer> quad_strip_index_buffer; vk::Buffer null_buffer; - MemoryCommit null_buffer_commit; std::unique_ptr<Uint8Pass> uint8_pass; QuadIndexedPass quad_index_pass; diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp index df972cd54..9bcdca2fb 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.cpp +++ b/src/video_core/renderer_vulkan/vk_fsr.cpp @@ -205,10 +205,9 @@ void FSR::CreateDescriptorSets() { void FSR::CreateImages() { images.resize(image_count * 2); image_views.resize(image_count * 2); - buffer_commits.resize(image_count * 2); for (size_t i = 0; i < image_count * 2; ++i) { - images[i] = device.GetLogical().CreateImage(VkImageCreateInfo{ + images[i] = memory_allocator.CreateImage(VkImageCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -231,7 +230,6 @@ void FSR::CreateImages() { .pQueueFamilyIndices = nullptr, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }); - buffer_commits[i] = memory_allocator.Commit(images[i], MemoryUsage::DeviceLocal); image_views[i] = device.GetLogical().CreateImageView(VkImageViewCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .pNext = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_fsr.h b/src/video_core/renderer_vulkan/vk_fsr.h index 5d872861f..8bb9fc23a 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.h +++ b/src/video_core/renderer_vulkan/vk_fsr.h @@ -47,7 +47,6 @@ private: vk::Sampler sampler; std::vector<vk::Image> images; std::vector<vk::ImageView> image_views; - std::vector<MemoryCommit> buffer_commits; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index c1595642e..ad35cacac 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -652,13 +652,14 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) { .pNext = nullptr, .negativeOneToOne = key.state.ndc_minus_one_to_one.Value() != 0 ? VK_TRUE : VK_FALSE, }; + const u32 num_viewports = std::min<u32>(device.GetMaxViewports(), Maxwell::NumViewports); VkPipelineViewportStateCreateInfo viewport_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .viewportCount = Maxwell::NumViewports, + .viewportCount = num_viewports, .pViewports = nullptr, - .scissorCount = Maxwell::NumViewports, + .scissorCount = num_viewports, .pScissors = nullptr, }; if (device.IsNvViewportSwizzleSupported()) { diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp index 5eeda08d2..6b288b994 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -75,15 +75,9 @@ void MasterSemaphore::Refresh() { void MasterSemaphore::Wait(u64 tick) { if (!semaphore) { - // If we don't support timeline semaphores, use an atomic wait - while (true) { - u64 current_value = gpu_tick.load(std::memory_order_relaxed); - if (current_value >= tick) { - return; - } - gpu_tick.wait(current_value); - } - + // If we don't support timeline semaphores, wait for the value normally + std::unique_lock lk{free_mutex}; + free_cv.wait(lk, [&] { return gpu_tick.load(std::memory_order_relaxed) >= tick; }); return; } @@ -198,11 +192,13 @@ void MasterSemaphore::WaitThread(std::stop_token token) { fence.Wait(); fence.Reset(); - gpu_tick.store(host_tick); - gpu_tick.notify_all(); - std::scoped_lock lock{free_mutex}; - free_queue.push_front(std::move(fence)); + { + std::scoped_lock lock{free_mutex}; + free_queue.push_front(std::move(fence)); + gpu_tick.store(host_tick); + } + free_cv.notify_one(); } } diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 1e7c90215..3f599d7bd 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -72,6 +72,7 @@ private: std::atomic<u64> current_tick{1}; ///< Current logical tick. std::mutex wait_mutex; std::mutex free_mutex; + std::condition_variable free_cv; std::condition_variable_any wait_cv; std::queue<Waitable> wait_queue; ///< Queue for the fences to be waited on by the wait thread. std::deque<vk::Fence> free_queue; ///< Holds available fences for submission. diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 18e040a1b..d600c4e61 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -167,7 +167,10 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span<const Shader::IR::Program> program info.fixed_state_point_size = point_size; } if (key.state.xfb_enabled) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + auto [varyings, count] = + VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } info.convert_depth_mode = gl_ndc; } @@ -214,7 +217,10 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span<const Shader::IR::Program> program info.fixed_state_point_size = point_size; } if (key.state.xfb_enabled != 0) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + auto [varyings, count] = + VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } info.convert_depth_mode = gl_ndc; break; @@ -303,7 +309,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device .support_int16 = device.IsShaderInt16Supported(), .support_int64 = device.IsShaderInt64Supported(), .support_vertex_instance_id = false, - .support_float_controls = true, + .support_float_controls = device.IsKhrShaderFloatControlsSupported(), .support_separate_denorm_behavior = float_control.denormBehaviorIndependence == VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, .support_separate_rounding_mode = @@ -319,12 +325,13 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device .support_fp64_signed_zero_nan_preserve = float_control.shaderSignedZeroInfNanPreserveFloat64 != VK_FALSE, .support_explicit_workgroup_layout = device.IsKhrWorkgroupMemoryExplicitLayoutSupported(), - .support_vote = true, + .support_vote = device.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_VOTE_BIT), .support_viewport_index_layer_non_geometry = device.IsExtShaderViewportIndexLayerSupported(), .support_viewport_mask = device.IsNvViewportArray2Supported(), .support_typeless_image_loads = device.IsFormatlessImageLoadSupported(), - .support_demote_to_helper_invocation = true, + .support_demote_to_helper_invocation = + device.IsExtShaderDemoteToHelperInvocationSupported(), .support_int64_atomics = device.IsExtShaderAtomicInt64Supported(), .support_derivative_control = true, .support_geometry_shader_passthrough = device.IsNvGeometryShaderPassthroughSupported(), @@ -705,10 +712,7 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline( std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline( ShaderPools& pools, const ComputePipelineCacheKey& key, Shader::Environment& env, PipelineStatistics* statistics, bool build_in_parallel) try { - // TODO: Remove this when Intel fixes their shader compiler. - // https://github.com/IGCIT/Intel-GPU-Community-Issue-Tracker-IGCIT/issues/159 - if (device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS && - !Settings::values.enable_compute_pipelines.GetValue()) { + if (device.HasBrokenCompute()) { LOG_ERROR(Render_Vulkan, "Skipping 0x{:016x}", key.Hash()); return nullptr; } diff --git a/src/video_core/renderer_vulkan/vk_present_manager.cpp b/src/video_core/renderer_vulkan/vk_present_manager.cpp index 10ace0420..d681bd22a 100644 --- a/src/video_core/renderer_vulkan/vk_present_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_present_manager.cpp @@ -181,7 +181,7 @@ void PresentManager::RecreateFrame(Frame* frame, u32 width, u32 height, bool is_ frame->height = height; frame->is_srgb = is_srgb; - frame->image = dld.CreateImage({ + frame->image = memory_allocator.CreateImage({ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, @@ -204,8 +204,6 @@ void PresentManager::RecreateFrame(Frame* frame, u32 width, u32 height, bool is_ .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }); - frame->image_commit = memory_allocator.Commit(frame->image, MemoryUsage::DeviceLocal); - frame->image_view = dld.CreateImageView({ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .pNext = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_present_manager.h b/src/video_core/renderer_vulkan/vk_present_manager.h index 4ac2e2395..83e859416 100644 --- a/src/video_core/renderer_vulkan/vk_present_manager.h +++ b/src/video_core/renderer_vulkan/vk_present_manager.h @@ -29,7 +29,6 @@ struct Frame { vk::Image image; vk::ImageView image_view; vk::Framebuffer framebuffer; - MemoryCommit image_commit; vk::CommandBuffer cmdbuf; vk::Semaphore render_ready; vk::Fence present_done; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 84e3a30cc..f7c0d939a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -315,7 +315,14 @@ void RasterizerVulkan::Clear(u32 layer_count) { FlushWork(); gpu_memory->FlushCaching(); +#if ANDROID + if (Settings::IsGPULevelHigh()) { + // This is problematic on Android, disable on GPU Normal. + query_cache.UpdateCounters(); + } +#else query_cache.UpdateCounters(); +#endif auto& regs = maxwell3d->regs; const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || @@ -925,7 +932,7 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg } const bool is_rescaling{texture_cache.IsRescaling()}; const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f; - const std::array viewports{ + const std::array viewport_list{ GetViewportState(device, regs, 0, scale), GetViewportState(device, regs, 1, scale), GetViewportState(device, regs, 2, scale), GetViewportState(device, regs, 3, scale), GetViewportState(device, regs, 4, scale), GetViewportState(device, regs, 5, scale), @@ -935,7 +942,11 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg GetViewportState(device, regs, 12, scale), GetViewportState(device, regs, 13, scale), GetViewportState(device, regs, 14, scale), GetViewportState(device, regs, 15, scale), }; - scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); }); + scheduler.Record([this, viewport_list](vk::CommandBuffer cmdbuf) { + const u32 num_viewports = std::min<u32>(device.GetMaxViewports(), Maxwell::NumViewports); + const vk::Span<VkViewport> viewports(viewport_list.data(), num_viewports); + cmdbuf.SetViewport(0, viewports); + }); } void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs) { @@ -948,7 +959,7 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs up_scale = Settings::values.resolution_info.up_scale; down_shift = Settings::values.resolution_info.down_shift; } - const std::array scissors{ + const std::array scissor_list{ GetScissorState(regs, 0, up_scale, down_shift), GetScissorState(regs, 1, up_scale, down_shift), GetScissorState(regs, 2, up_scale, down_shift), @@ -966,7 +977,11 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs GetScissorState(regs, 14, up_scale, down_shift), GetScissorState(regs, 15, up_scale, down_shift), }; - scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); }); + scheduler.Record([this, scissor_list](vk::CommandBuffer cmdbuf) { + const u32 num_scissors = std::min<u32>(device.GetMaxViewports(), Maxwell::NumViewports); + const vk::Span<VkRect2D> scissors(scissor_list.data(), num_scissors); + cmdbuf.SetScissor(0, scissors); + }); } void RasterizerVulkan::UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs) { diff --git a/src/video_core/renderer_vulkan/vk_smaa.cpp b/src/video_core/renderer_vulkan/vk_smaa.cpp index f8735189d..5efd7d66e 100644 --- a/src/video_core/renderer_vulkan/vk_smaa.cpp +++ b/src/video_core/renderer_vulkan/vk_smaa.cpp @@ -25,9 +25,7 @@ namespace { #define ARRAY_TO_SPAN(a) std::span(a, (sizeof(a) / sizeof(a[0]))) -std::pair<vk::Image, MemoryCommit> CreateWrappedImage(const Device& device, - MemoryAllocator& allocator, - VkExtent2D dimensions, VkFormat format) { +vk::Image CreateWrappedImage(MemoryAllocator& allocator, VkExtent2D dimensions, VkFormat format) { const VkImageCreateInfo image_ci{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, @@ -46,11 +44,7 @@ std::pair<vk::Image, MemoryCommit> CreateWrappedImage(const Device& device, .pQueueFamilyIndices = nullptr, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }; - - auto image = device.GetLogical().CreateImage(image_ci); - auto commit = allocator.Commit(image, Vulkan::MemoryUsage::DeviceLocal); - - return std::make_pair(std::move(image), std::move(commit)); + return allocator.CreateImage(image_ci); } void TransitionImageLayout(vk::CommandBuffer& cmdbuf, VkImage image, VkImageLayout target_layout, @@ -82,7 +76,7 @@ void TransitionImageLayout(vk::CommandBuffer& cmdbuf, VkImage image, VkImageLayo void UploadImage(const Device& device, MemoryAllocator& allocator, Scheduler& scheduler, vk::Image& image, VkExtent2D dimensions, VkFormat format, std::span<const u8> initial_contents = {}) { - auto upload_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + const VkBufferCreateInfo upload_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -91,9 +85,10 @@ void UploadImage(const Device& device, MemoryAllocator& allocator, Scheduler& sc .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); - auto upload_commit = allocator.Commit(upload_buffer, MemoryUsage::Upload); - std::ranges::copy(initial_contents, upload_commit.Map().begin()); + }; + auto upload_buffer = allocator.CreateBuffer(upload_ci, MemoryUsage::Upload); + std::ranges::copy(initial_contents, upload_buffer.Mapped().begin()); + upload_buffer.Flush(); const std::array<VkBufferImageCopy, 1> regions{{{ .bufferOffset = 0, @@ -117,9 +112,6 @@ void UploadImage(const Device& device, MemoryAllocator& allocator, Scheduler& sc VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); }); scheduler.Finish(); - - // This should go out of scope before the commit - auto upload_buffer2 = std::move(upload_buffer); } vk::ImageView CreateWrappedImageView(const Device& device, vk::Image& image, VkFormat format) { @@ -531,10 +523,8 @@ void SMAA::CreateImages() { static constexpr VkExtent2D area_extent{AREATEX_WIDTH, AREATEX_HEIGHT}; static constexpr VkExtent2D search_extent{SEARCHTEX_WIDTH, SEARCHTEX_HEIGHT}; - std::tie(m_static_images[Area], m_static_buffer_commits[Area]) = - CreateWrappedImage(m_device, m_allocator, area_extent, VK_FORMAT_R8G8_UNORM); - std::tie(m_static_images[Search], m_static_buffer_commits[Search]) = - CreateWrappedImage(m_device, m_allocator, search_extent, VK_FORMAT_R8_UNORM); + m_static_images[Area] = CreateWrappedImage(m_allocator, area_extent, VK_FORMAT_R8G8_UNORM); + m_static_images[Search] = CreateWrappedImage(m_allocator, search_extent, VK_FORMAT_R8_UNORM); m_static_image_views[Area] = CreateWrappedImageView(m_device, m_static_images[Area], VK_FORMAT_R8G8_UNORM); @@ -544,12 +534,11 @@ void SMAA::CreateImages() { for (u32 i = 0; i < m_image_count; i++) { Images& images = m_dynamic_images.emplace_back(); - std::tie(images.images[Blend], images.buffer_commits[Blend]) = - CreateWrappedImage(m_device, m_allocator, m_extent, VK_FORMAT_R16G16B16A16_SFLOAT); - std::tie(images.images[Edges], images.buffer_commits[Edges]) = - CreateWrappedImage(m_device, m_allocator, m_extent, VK_FORMAT_R16G16_SFLOAT); - std::tie(images.images[Output], images.buffer_commits[Output]) = - CreateWrappedImage(m_device, m_allocator, m_extent, VK_FORMAT_R16G16B16A16_SFLOAT); + images.images[Blend] = + CreateWrappedImage(m_allocator, m_extent, VK_FORMAT_R16G16B16A16_SFLOAT); + images.images[Edges] = CreateWrappedImage(m_allocator, m_extent, VK_FORMAT_R16G16_SFLOAT); + images.images[Output] = + CreateWrappedImage(m_allocator, m_extent, VK_FORMAT_R16G16B16A16_SFLOAT); images.image_views[Blend] = CreateWrappedImageView(m_device, images.images[Blend], VK_FORMAT_R16G16B16A16_SFLOAT); diff --git a/src/video_core/renderer_vulkan/vk_smaa.h b/src/video_core/renderer_vulkan/vk_smaa.h index 99a369148..0e214258a 100644 --- a/src/video_core/renderer_vulkan/vk_smaa.h +++ b/src/video_core/renderer_vulkan/vk_smaa.h @@ -66,13 +66,11 @@ private: std::array<vk::Pipeline, MaxSMAAStage> m_pipelines{}; std::array<vk::RenderPass, MaxSMAAStage> m_renderpasses{}; - std::array<MemoryCommit, MaxStaticImage> m_static_buffer_commits; std::array<vk::Image, MaxStaticImage> m_static_images{}; std::array<vk::ImageView, MaxStaticImage> m_static_image_views{}; struct Images { vk::DescriptorSets descriptor_sets{}; - std::array<MemoryCommit, MaxDynamicImage> buffer_commits; std::array<vk::Image, MaxDynamicImage> images{}; std::array<vk::ImageView, MaxDynamicImage> image_views{}; std::array<vk::Framebuffer, MaxSMAAStage> framebuffers{}; diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 74ca77216..ce92f66ab 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -30,55 +30,6 @@ constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8_MiB; constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128_MiB; constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS; -constexpr VkMemoryPropertyFlags HOST_FLAGS = - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; -constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS; - -bool IsStreamHeap(VkMemoryHeap heap) noexcept { - return STREAM_BUFFER_SIZE < (heap.size * 2) / 3; -} - -std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask, - VkMemoryPropertyFlags flags) noexcept { - for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) { - if (((type_mask >> type_index) & 1) == 0) { - // Memory type is incompatible - continue; - } - const VkMemoryType& memory_type = props.memoryTypes[type_index]; - if ((memory_type.propertyFlags & flags) != flags) { - // Memory type doesn't have the flags we want - continue; - } - if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) { - // Memory heap is not suitable for streaming - continue; - } - // Success! - return type_index; - } - return std::nullopt; -} - -u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask, - bool try_device_local) { - std::optional<u32> type; - if (try_device_local) { - // Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this - type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS); - if (type) { - return *type; - } - } - // Otherwise try without the DEVICE_LOCAL_BIT - type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS); - if (type) { - return *type; - } - // This should never happen, and in case it does, signal it as an out of memory situation - throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY); -} - size_t Region(size_t iterator) noexcept { return iterator / REGION_SIZE; } @@ -87,58 +38,26 @@ size_t Region(size_t iterator) noexcept { StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, Scheduler& scheduler_) : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} { - const vk::Device& dev = device.GetLogical(); - stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{ + VkBufferCreateInfo stream_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, .size = STREAM_BUFFER_SIZE, .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT, + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); - if (device.HasDebuggingToolAttached()) { - stream_buffer.SetObjectNameEXT("Stream Buffer"); - } - VkMemoryDedicatedRequirements dedicated_reqs{ - .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS, - .pNext = nullptr, - .prefersDedicatedAllocation = VK_FALSE, - .requiresDedicatedAllocation = VK_FALSE, - }; - const auto requirements = dev.GetBufferMemoryRequirements(*stream_buffer, &dedicated_reqs); - const bool make_dedicated = dedicated_reqs.prefersDedicatedAllocation == VK_TRUE || - dedicated_reqs.requiresDedicatedAllocation == VK_TRUE; - const VkMemoryDedicatedAllocateInfo dedicated_info{ - .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, - .pNext = nullptr, - .image = nullptr, - .buffer = *stream_buffer, }; - const auto memory_properties = device.GetPhysical().GetMemoryProperties().memoryProperties; - VkMemoryAllocateInfo stream_memory_info{ - .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .pNext = make_dedicated ? &dedicated_info : nullptr, - .allocationSize = requirements.size, - .memoryTypeIndex = - FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, true), - }; - stream_memory = dev.TryAllocateMemory(stream_memory_info); - if (!stream_memory) { - LOG_INFO(Render_Vulkan, "Dynamic memory allocation failed, trying with system memory"); - stream_memory_info.memoryTypeIndex = - FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, false); - stream_memory = dev.AllocateMemory(stream_memory_info); + if (device.IsExtTransformFeedbackSupported()) { + stream_ci.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } - + stream_buffer = memory_allocator.CreateBuffer(stream_ci, MemoryUsage::Stream); if (device.HasDebuggingToolAttached()) { - stream_memory.SetObjectNameEXT("Stream Buffer Memory"); + stream_buffer.SetObjectNameEXT("Stream Buffer"); } - stream_buffer.BindMemory(*stream_memory, 0); - stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE); + stream_pointer = stream_buffer.Mapped(); + ASSERT_MSG(!stream_pointer.empty(), "Stream buffer must be host visible!"); } StagingBufferPool::~StagingBufferPool() = default; @@ -199,7 +118,7 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) { return StagingBufferRef{ .buffer = *stream_buffer, .offset = static_cast<VkDeviceSize>(offset), - .mapped_span = std::span<u8>(stream_pointer + offset, size), + .mapped_span = stream_pointer.subspan(offset, size), .usage{}, .log2_level{}, .index{}, @@ -247,29 +166,29 @@ std::optional<StagingBufferRef> StagingBufferPool::TryGetReservedBuffer(size_t s StagingBufferRef StagingBufferPool::CreateStagingBuffer(size_t size, MemoryUsage usage, bool deferred) { const u32 log2 = Common::Log2Ceil64(size); - vk::Buffer buffer = device.GetLogical().CreateBuffer({ + VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, .size = 1ULL << log2, .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT, + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); + }; + if (device.IsExtTransformFeedbackSupported()) { + buffer_ci.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; + } + vk::Buffer buffer = memory_allocator.CreateBuffer(buffer_ci, usage); if (device.HasDebuggingToolAttached()) { ++buffer_index; buffer.SetObjectNameEXT(fmt::format("Staging Buffer {}", buffer_index).c_str()); } - MemoryCommit commit = memory_allocator.Commit(buffer, usage); - const std::span<u8> mapped_span = IsHostVisible(usage) ? commit.Map() : std::span<u8>{}; - + const std::span<u8> mapped_span = buffer.Mapped(); StagingBuffer& entry = GetCache(usage)[log2].entries.emplace_back(StagingBuffer{ .buffer = std::move(buffer), - .commit = std::move(commit), .mapped_span = mapped_span, .usage = usage, .log2_level = log2, diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 4fd15f11a..5f69f08b1 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -46,7 +46,6 @@ private: struct StagingBuffer { vk::Buffer buffer; - MemoryCommit commit; std::span<u8> mapped_span; MemoryUsage usage; u32 log2_level; @@ -97,8 +96,7 @@ private: Scheduler& scheduler; vk::Buffer stream_buffer; - vk::DeviceMemory stream_memory; - u8* stream_pointer = nullptr; + std::span<u8> stream_pointer; size_t iterator = 0; size_t used_iterator = 0; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index f025f618b..8385b5509 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -15,7 +15,6 @@ #include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" -#include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_render_pass_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -163,11 +162,12 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { }; } -[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info) { +[[nodiscard]] vk::Image MakeImage(const Device& device, const MemoryAllocator& allocator, + const ImageInfo& info) { if (info.type == ImageType::Buffer) { return vk::Image{}; } - return device.GetLogical().CreateImage(MakeImageCreateInfo(device, info)); + return allocator.CreateImage(MakeImageCreateInfo(device, info)); } [[nodiscard]] VkImageAspectFlags ImageAspectMask(PixelFormat format) { @@ -330,9 +330,9 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { }; } -[[maybe_unused]] [[nodiscard]] std::vector<VkBufferCopy> TransformBufferCopies( - std::span<const VideoCommon::BufferCopy> copies, size_t buffer_offset) { - std::vector<VkBufferCopy> result(copies.size()); +[[maybe_unused]] [[nodiscard]] boost::container::small_vector<VkBufferCopy, 16> +TransformBufferCopies(std::span<const VideoCommon::BufferCopy> copies, size_t buffer_offset) { + boost::container::small_vector<VkBufferCopy, 16> result(copies.size()); std::ranges::transform( copies, result.begin(), [buffer_offset](const VideoCommon::BufferCopy& copy) { return VkBufferCopy{ @@ -344,7 +344,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { return result; } -[[nodiscard]] std::vector<VkBufferImageCopy> TransformBufferImageCopies( +[[nodiscard]] boost::container::small_vector<VkBufferImageCopy, 16> TransformBufferImageCopies( std::span<const BufferImageCopy> copies, size_t buffer_offset, VkImageAspectFlags aspect_mask) { struct Maker { VkBufferImageCopy operator()(const BufferImageCopy& copy) const { @@ -377,14 +377,14 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { VkImageAspectFlags aspect_mask; }; if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - std::vector<VkBufferImageCopy> result(copies.size() * 2); + boost::container::small_vector<VkBufferImageCopy, 16> result(copies.size() * 2); std::ranges::transform(copies, result.begin(), Maker{buffer_offset, VK_IMAGE_ASPECT_DEPTH_BIT}); std::ranges::transform(copies, result.begin() + copies.size(), Maker{buffer_offset, VK_IMAGE_ASPECT_STENCIL_BIT}); return result; } else { - std::vector<VkBufferImageCopy> result(copies.size()); + boost::container::small_vector<VkBufferImageCopy, 16> result(copies.size()); std::ranges::transform(copies, result.begin(), Maker{buffer_offset, aspect_mask}); return result; } @@ -839,14 +839,14 @@ bool TextureCacheRuntime::ShouldReinterpret(Image& dst, Image& src) { VkBuffer TextureCacheRuntime::GetTemporaryBuffer(size_t needed_size) { const auto level = (8 * sizeof(size_t)) - std::countl_zero(needed_size - 1ULL); - if (buffer_commits[level]) { + if (buffers[level]) { return *buffers[level]; } const auto new_size = Common::NextPow2(needed_size); static constexpr VkBufferUsageFlags flags = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT; - buffers[level] = device.GetLogical().CreateBuffer({ + const VkBufferCreateInfo temp_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -855,9 +855,8 @@ VkBuffer TextureCacheRuntime::GetTemporaryBuffer(size_t needed_size) { .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); - buffer_commits[level] = std::make_unique<MemoryCommit>( - memory_allocator.Commit(buffers[level], MemoryUsage::DeviceLocal)); + }; + buffers[level] = memory_allocator.CreateBuffer(temp_ci, MemoryUsage::DeviceLocal); return *buffers[level]; } @@ -867,8 +866,8 @@ void TextureCacheRuntime::BarrierFeedbackLoop() { void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies) { - std::vector<VkBufferImageCopy> vk_in_copies(copies.size()); - std::vector<VkBufferImageCopy> vk_out_copies(copies.size()); + boost::container::small_vector<VkBufferImageCopy, 16> vk_in_copies(copies.size()); + boost::container::small_vector<VkBufferImageCopy, 16> vk_out_copies(copies.size()); const VkImageAspectFlags src_aspect_mask = src.AspectMask(); const VkImageAspectFlags dst_aspect_mask = dst.AspectMask(); @@ -1157,7 +1156,7 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im void TextureCacheRuntime::CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies) { - std::vector<VkImageCopy> vk_copies(copies.size()); + boost::container::small_vector<VkImageCopy, 16> vk_copies(copies.size()); const VkImageAspectFlags aspect_mask = dst.AspectMask(); ASSERT(aspect_mask == src.AspectMask()); @@ -1266,8 +1265,8 @@ void TextureCacheRuntime::TickFrame() {} Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime_.scheduler}, - runtime{&runtime_}, original_image(MakeImage(runtime_.device, info)), - commit(runtime_.memory_allocator.Commit(original_image, MemoryUsage::DeviceLocal)), + runtime{&runtime_}, + original_image(MakeImage(runtime_.device, runtime_.memory_allocator, info)), aspect_mask(ImageAspectMask(info.format)) { if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { if (Settings::values.async_astc.GetValue()) { @@ -1280,6 +1279,10 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu flags |= VideoCommon::ImageFlagBits::Converted; flags |= VideoCommon::ImageFlagBits::CostlyLoad; } + if (IsPixelFormatBCn(info.format) && !runtime->device.IsOptimalBcnSupported()) { + flags |= VideoCommon::ImageFlagBits::Converted; + flags |= VideoCommon::ImageFlagBits::CostlyLoad; + } if (runtime->device.HasDebuggingToolAttached()) { original_image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); } @@ -1332,7 +1335,7 @@ void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset, ScaleDown(true); } scheduler->RequestOutsideRenderPassOperationContext(); - std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); + auto vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); const VkBuffer src_buffer = buffer; const VkImage vk_image = *original_image; const VkImageAspectFlags vk_aspect_mask = aspect_mask; @@ -1367,8 +1370,9 @@ void Image::DownloadMemory(std::span<VkBuffer> buffers_span, std::span<VkDeviceS if (is_rescaled) { ScaleDown(); } - boost::container::small_vector<VkBuffer, 1> buffers_vector{}; - boost::container::small_vector<std::vector<VkBufferImageCopy>, 1> vk_copies; + boost::container::small_vector<VkBuffer, 8> buffers_vector{}; + boost::container::small_vector<boost::container::small_vector<VkBufferImageCopy, 16>, 8> + vk_copies; for (size_t index = 0; index < buffers_span.size(); index++) { buffers_vector.emplace_back(buffers_span[index]); vk_copies.emplace_back( @@ -1467,9 +1471,7 @@ bool Image::ScaleUp(bool ignore) { auto scaled_info = info; scaled_info.size.width = scaled_width; scaled_info.size.height = scaled_height; - scaled_image = MakeImage(runtime->device, scaled_info); - auto& allocator = runtime->memory_allocator; - scaled_commit = MemoryCommit(allocator.Commit(scaled_image, MemoryUsage::DeviceLocal)); + scaled_image = MakeImage(runtime->device, runtime->memory_allocator, scaled_info); ignore = false; } current_image = *scaled_image; @@ -1858,7 +1860,7 @@ Framebuffer::~Framebuffer() = default; void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers, ImageView* depth_buffer, bool is_rescaled) { - std::vector<VkImageView> attachments; + boost::container::small_vector<VkImageView, NUM_RT + 1> attachments; RenderPassKey renderpass_key{}; s32 num_layers = 1; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index f14525dcb..220943116 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -116,7 +116,6 @@ public: static constexpr size_t indexing_slots = 8 * sizeof(size_t); std::array<vk::Buffer, indexing_slots> buffers{}; - std::array<std::unique_ptr<MemoryCommit>, indexing_slots> buffer_commits{}; }; class Image : public VideoCommon::ImageBase { @@ -180,12 +179,10 @@ private: TextureCacheRuntime* runtime{}; vk::Image original_image; - MemoryCommit commit; std::vector<vk::ImageView> storage_image_views; VkImageAspectFlags aspect_mask = 0; bool initialized = false; vk::Image scaled_image{}; - MemoryCommit scaled_commit{}; VkImage current_image{}; std::unique_ptr<Framebuffer> scale_framebuffer; diff --git a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp index a802d3c49..460d8d59d 100644 --- a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp +++ b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp @@ -18,7 +18,7 @@ using namespace Common::Literals; TurboMode::TurboMode(const vk::Instance& instance, const vk::InstanceDispatch& dld) #ifndef ANDROID - : m_device{CreateDevice(instance, dld, VK_NULL_HANDLE)}, m_allocator{m_device, false} + : m_device{CreateDevice(instance, dld, VK_NULL_HANDLE)}, m_allocator{m_device} #endif { { @@ -41,7 +41,7 @@ void TurboMode::Run(std::stop_token stop_token) { auto& dld = m_device.GetLogical(); // Allocate buffer. 2MiB should be sufficient. - auto buffer = dld.CreateBuffer(VkBufferCreateInfo{ + const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -50,10 +50,8 @@ void TurboMode::Run(std::stop_token stop_token) { .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); - - // Commit some device local memory for the buffer. - auto commit = m_allocator.Commit(buffer, MemoryUsage::DeviceLocal); + }; + vk::Buffer buffer = m_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); // Create the descriptor pool to contain our descriptor. static constexpr VkDescriptorPoolSize pool_size{ diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index c5213875b..4db948b6d 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -151,11 +151,9 @@ void ShaderCache::RemovePendingShaders() { marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()), marked_for_removal.end()); - std::vector<ShaderInfo*> removed_shaders; - removed_shaders.reserve(marked_for_removal.size()); + boost::container::small_vector<ShaderInfo*, 16> removed_shaders; std::scoped_lock lock{lookup_mutex}; - for (Entry* const entry : marked_for_removal) { removed_shaders.push_back(entry->data); diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index cb51529e4..e16cd5e73 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -269,6 +269,28 @@ bool IsPixelFormatASTC(PixelFormat format) { } } +bool IsPixelFormatBCn(PixelFormat format) { + switch (format) { + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC2_UNORM: + case PixelFormat::BC3_UNORM: + case PixelFormat::BC4_UNORM: + case PixelFormat::BC4_SNORM: + case PixelFormat::BC5_UNORM: + case PixelFormat::BC5_SNORM: + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC2_SRGB: + case PixelFormat::BC3_SRGB: + case PixelFormat::BC7_UNORM: + case PixelFormat::BC6H_UFLOAT: + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC7_SRGB: + return true; + default: + return false; + } +} + bool IsPixelFormatSRGB(PixelFormat format) { switch (format) { case PixelFormat::A8B8G8R8_SRGB: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 0225d3287..9b9c4d9bc 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -501,6 +501,8 @@ SurfaceType GetFormatType(PixelFormat pixel_format); bool IsPixelFormatASTC(PixelFormat format); +bool IsPixelFormatBCn(PixelFormat format); + bool IsPixelFormatSRGB(PixelFormat format); bool IsPixelFormatInteger(PixelFormat format); diff --git a/src/video_core/texture_cache/decode_bc.cpp b/src/video_core/texture_cache/decode_bc.cpp new file mode 100644 index 000000000..3e26474a3 --- /dev/null +++ b/src/video_core/texture_cache/decode_bc.cpp @@ -0,0 +1,129 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <algorithm> +#include <array> +#include <span> +#include <bc_decoder.h> + +#include "common/common_types.h" +#include "video_core/texture_cache/decode_bc.h" + +namespace VideoCommon { + +namespace { +constexpr u32 BLOCK_SIZE = 4; + +using VideoCore::Surface::PixelFormat; + +constexpr bool IsSigned(PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + return true; + default: + return false; + } +} + +constexpr u32 BlockSize(PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + return 8; + default: + return 16; + } +} +} // Anonymous namespace + +u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + return 1; + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + return 2; + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + return 8; + default: + return 4; + } +} + +template <auto decompress, PixelFormat pixel_format> +void DecompressBlocks(std::span<const u8> input, std::span<u8> output, Extent3D extent, + bool is_signed = false) { + const u32 out_bpp = ConvertedBytesPerBlock(pixel_format); + const u32 block_width = std::min(extent.width, BLOCK_SIZE); + const u32 block_height = std::min(extent.height, BLOCK_SIZE); + const u32 pitch = extent.width * out_bpp; + size_t input_offset = 0; + size_t output_offset = 0; + for (u32 slice = 0; slice < extent.depth; ++slice) { + for (u32 y = 0; y < extent.height; y += block_height) { + size_t row_offset = 0; + for (u32 x = 0; x < extent.width; + x += block_width, row_offset += block_width * out_bpp) { + const u8* src = input.data() + input_offset; + u8* const dst = output.data() + output_offset + row_offset; + if constexpr (IsSigned(pixel_format)) { + decompress(src, dst, x, y, extent.width, extent.height, is_signed); + } else { + decompress(src, dst, x, y, extent.width, extent.height); + } + input_offset += BlockSize(pixel_format); + } + output_offset += block_height * pitch; + } + } +} + +void DecompressBCn(std::span<const u8> input, std::span<u8> output, Extent3D extent, + VideoCore::Surface::PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC1_RGBA_SRGB: + DecompressBlocks<bcn::DecodeBc1, PixelFormat::BC1_RGBA_UNORM>(input, output, extent); + break; + case PixelFormat::BC2_UNORM: + case PixelFormat::BC2_SRGB: + DecompressBlocks<bcn::DecodeBc2, PixelFormat::BC2_UNORM>(input, output, extent); + break; + case PixelFormat::BC3_UNORM: + case PixelFormat::BC3_SRGB: + DecompressBlocks<bcn::DecodeBc3, PixelFormat::BC3_UNORM>(input, output, extent); + break; + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + DecompressBlocks<bcn::DecodeBc4, PixelFormat::BC4_UNORM>( + input, output, extent, pixel_format == PixelFormat::BC4_SNORM); + break; + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + DecompressBlocks<bcn::DecodeBc5, PixelFormat::BC5_UNORM>( + input, output, extent, pixel_format == PixelFormat::BC5_SNORM); + break; + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + DecompressBlocks<bcn::DecodeBc6, PixelFormat::BC6H_UFLOAT>( + input, output, extent, pixel_format == PixelFormat::BC6H_SFLOAT); + break; + case PixelFormat::BC7_SRGB: + case PixelFormat::BC7_UNORM: + DecompressBlocks<bcn::DecodeBc7, PixelFormat::BC7_UNORM>(input, output, extent); + break; + default: + LOG_WARNING(HW_GPU, "Unimplemented BCn decompression {}", pixel_format); + } +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.h b/src/video_core/texture_cache/decode_bc.h index ab2f735be..41d1ec0a3 100644 --- a/src/video_core/texture_cache/decode_bc4.h +++ b/src/video_core/texture_cache/decode_bc.h @@ -6,10 +6,14 @@ #include <span> #include "common/common_types.h" +#include "video_core/surface.h" #include "video_core/texture_cache/types.h" namespace VideoCommon { -void DecompressBC4(std::span<const u8> data, Extent3D extent, std::span<u8> output); +[[nodiscard]] u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format); + +void DecompressBCn(std::span<const u8> input, std::span<u8> output, Extent3D extent, + VideoCore::Surface::PixelFormat pixel_format); } // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.cpp b/src/video_core/texture_cache/decode_bc4.cpp deleted file mode 100644 index ef98afdca..000000000 --- a/src/video_core/texture_cache/decode_bc4.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include <algorithm> -#include <array> -#include <span> - -#include "common/assert.h" -#include "common/common_types.h" -#include "video_core/texture_cache/decode_bc4.h" -#include "video_core/texture_cache/types.h" - -namespace VideoCommon { - -// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt -[[nodiscard]] constexpr u32 DecompressBlock(u64 bits, u32 x, u32 y) { - const u32 code_offset = 16 + 3 * (4 * y + x); - const u32 code = (bits >> code_offset) & 7; - const u32 red0 = (bits >> 0) & 0xff; - const u32 red1 = (bits >> 8) & 0xff; - if (red0 > red1) { - switch (code) { - case 0: - return red0; - case 1: - return red1; - case 2: - return (6 * red0 + 1 * red1) / 7; - case 3: - return (5 * red0 + 2 * red1) / 7; - case 4: - return (4 * red0 + 3 * red1) / 7; - case 5: - return (3 * red0 + 4 * red1) / 7; - case 6: - return (2 * red0 + 5 * red1) / 7; - case 7: - return (1 * red0 + 6 * red1) / 7; - } - } else { - switch (code) { - case 0: - return red0; - case 1: - return red1; - case 2: - return (4 * red0 + 1 * red1) / 5; - case 3: - return (3 * red0 + 2 * red1) / 5; - case 4: - return (2 * red0 + 3 * red1) / 5; - case 5: - return (1 * red0 + 4 * red1) / 5; - case 6: - return 0; - case 7: - return 0xff; - } - } - return 0; -} - -void DecompressBC4(std::span<const u8> input, Extent3D extent, std::span<u8> output) { - UNIMPLEMENTED_IF_MSG(extent.width % 4 != 0, "Unaligned width={}", extent.width); - UNIMPLEMENTED_IF_MSG(extent.height % 4 != 0, "Unaligned height={}", extent.height); - static constexpr u32 BLOCK_SIZE = 4; - size_t input_offset = 0; - for (u32 slice = 0; slice < extent.depth; ++slice) { - for (u32 block_y = 0; block_y < extent.height / 4; ++block_y) { - for (u32 block_x = 0; block_x < extent.width / 4; ++block_x) { - u64 bits; - std::memcpy(&bits, &input[input_offset], sizeof(bits)); - input_offset += sizeof(bits); - - for (u32 y = 0; y < BLOCK_SIZE; ++y) { - for (u32 x = 0; x < BLOCK_SIZE; ++x) { - const u32 linear_z = slice; - const u32 linear_y = block_y * BLOCK_SIZE + y; - const u32 linear_x = block_x * BLOCK_SIZE + x; - const u32 offset_z = linear_z * extent.width * extent.height; - const u32 offset_y = linear_y * extent.width; - const u32 offset_x = linear_x; - const u32 output_offset = (offset_z + offset_y + offset_x) * 4ULL; - const u32 color = DecompressBlock(bits, x, y); - output[output_offset + 0] = static_cast<u8>(color); - output[output_offset + 1] = 0; - output[output_offset + 2] = 0; - output[output_offset + 3] = 0xff; - } - } - } - } - } -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 1b8a17ee8..55d49d017 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -6,6 +6,7 @@ #include <array> #include <optional> #include <vector> +#include <boost/container/small_vector.hpp> #include "common/common_funcs.h" #include "common/common_types.h" @@ -108,8 +109,8 @@ struct ImageBase { std::vector<ImageViewInfo> image_view_infos; std::vector<ImageViewId> image_view_ids; - std::vector<u32> slice_offsets; - std::vector<SubresourceBase> slice_subresources; + boost::container::small_vector<u32, 16> slice_offsets; + boost::container::small_vector<SubresourceBase, 16> slice_subresources; std::vector<AliasedImage> aliased_images; std::vector<ImageId> overlapping_images; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 4027d860b..8190f3ba1 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -186,6 +186,10 @@ void TextureCache<P>::FillComputeImageViews(std::span<ImageViewInOut> views) { template <class P> void TextureCache<P>::CheckFeedbackLoop(std::span<const ImageViewInOut> views) { + if (!Settings::values.barrier_feedback_loops.GetValue()) { + return; + } + const bool requires_barrier = [&] { for (const auto& view : views) { if (!view.id) { @@ -300,7 +304,7 @@ void TextureCache<P>::SynchronizeComputeDescriptors() { } template <class P> -bool TextureCache<P>::RescaleRenderTargets(bool is_clear) { +bool TextureCache<P>::RescaleRenderTargets() { auto& flags = maxwell3d->dirty.flags; u32 scale_rating = 0; bool rescaled = false; @@ -338,13 +342,13 @@ bool TextureCache<P>::RescaleRenderTargets(bool is_clear) { ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; if (flags[Dirty::ColorBuffer0 + index] || force) { flags[Dirty::ColorBuffer0 + index] = false; - BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear)); + BindRenderTarget(&color_buffer_id, FindColorBuffer(index)); } check_rescale(color_buffer_id, tmp_color_images[index]); } if (flags[Dirty::ZetaBuffer] || force) { flags[Dirty::ZetaBuffer] = false; - BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); + BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer()); } check_rescale(render_targets.depth_buffer_id, tmp_depth_image); @@ -409,7 +413,7 @@ void TextureCache<P>::UpdateRenderTargets(bool is_clear) { return; } - const bool rescaled = RescaleRenderTargets(is_clear); + const bool rescaled = RescaleRenderTargets(); if (is_rescaling != rescaled) { flags[Dirty::RescaleViewports] = true; flags[Dirty::RescaleScissors] = true; @@ -522,7 +526,7 @@ void TextureCache<P>::WriteMemory(VAddr cpu_addr, size_t size) { template <class P> void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { - std::vector<ImageId> images; + boost::container::small_vector<ImageId, 16> images; ForEachImageInRegion(cpu_addr, size, [&images](ImageId image_id, ImageBase& image) { if (!image.IsSafeDownload()) { return; @@ -575,7 +579,7 @@ std::optional<VideoCore::RasterizerDownloadArea> TextureCache<P>::GetFlushArea(V template <class P> void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) { - std::vector<ImageId> deleted_images; + boost::container::small_vector<ImageId, 16> deleted_images; ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { Image& image = slot_images[id]; @@ -589,19 +593,11 @@ void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) { template <class P> void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) { - std::vector<ImageId> deleted_images; + boost::container::small_vector<ImageId, 16> deleted_images; ForEachImageInRegionGPU(as_id, gpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { Image& image = slot_images[id]; - if (True(image.flags & ImageFlagBits::CpuModified)) { - return; - } - image.flags |= ImageFlagBits::CpuModified; - if (True(image.flags & ImageFlagBits::Tracked)) { - UntrackImage(image, id); - } - /* if (True(image.flags & ImageFlagBits::Remapped)) { continue; } @@ -609,7 +605,6 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz if (True(image.flags & ImageFlagBits::Tracked)) { UntrackImage(image, id); } - */ } } @@ -875,6 +870,10 @@ ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand, boo return NULL_IMAGE_ID; } auto& image = slot_images[image_id]; + if (image.info.type == ImageType::e3D) { + // Don't accelerate 3D images. + return NULL_IMAGE_ID; + } if (!is_upload && !image.info.dma_downloaded) { // Force a full sync. image.info.dma_downloaded = true; @@ -1097,7 +1096,7 @@ ImageId TextureCache<P>::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, const bool native_bgr = runtime.HasNativeBgr(); const bool flexible_formats = True(options & RelaxedOptions::Format); ImageId image_id{}; - boost::container::small_vector<ImageId, 1> image_ids; + boost::container::small_vector<ImageId, 8> image_ids; const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { if (True(existing_image.flags & ImageFlagBits::Remapped)) { return false; @@ -1618,7 +1617,7 @@ ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) } } ImageId image_id{}; - boost::container::small_vector<ImageId, 1> image_ids; + boost::container::small_vector<ImageId, 8> image_ids; const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { if (True(existing_image.flags & ImageFlagBits::Remapped)) { return false; @@ -1678,7 +1677,7 @@ SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) { } template <class P> -ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) { +ImageViewId TextureCache<P>::FindColorBuffer(size_t index) { const auto& regs = maxwell3d->regs; if (index >= regs.rt_control.count) { return ImageViewId{}; @@ -1692,11 +1691,11 @@ ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) { return ImageViewId{}; } const ImageInfo info(regs.rt[index], regs.anti_alias_samples_mode); - return FindRenderTargetView(info, gpu_addr, is_clear); + return FindRenderTargetView(info, gpu_addr); } template <class P> -ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) { +ImageViewId TextureCache<P>::FindDepthBuffer() { const auto& regs = maxwell3d->regs; if (!regs.zeta_enable) { return ImageViewId{}; @@ -1706,18 +1705,16 @@ ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) { return ImageViewId{}; } const ImageInfo info(regs.zeta, regs.zeta_size, regs.anti_alias_samples_mode); - return FindRenderTargetView(info, gpu_addr, is_clear); + return FindRenderTargetView(info, gpu_addr); } template <class P> -ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, - bool is_clear) { - const auto options = is_clear ? RelaxedOptions::Samples : RelaxedOptions{}; +ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr) { ImageId image_id{}; bool delete_state = has_deleted_images; do { has_deleted_images = false; - image_id = FindOrInsertImage(info, gpu_addr, options); + image_id = FindOrInsertImage(info, gpu_addr); delete_state |= has_deleted_images; } while (has_deleted_images); has_deleted_images = delete_state; @@ -1940,7 +1937,7 @@ void TextureCache<P>::RegisterImage(ImageId image_id) { image.map_view_id = map_id; return; } - std::vector<ImageViewId> sparse_maps{}; + boost::container::small_vector<ImageViewId, 16> sparse_maps; ForEachSparseSegment( image, [this, image_id, &sparse_maps](GPUVAddr gpu_addr, VAddr cpu_addr, size_t size) { auto map_id = slot_map_views.insert(gpu_addr, cpu_addr, size, image_id); @@ -2215,7 +2212,7 @@ void TextureCache<P>::MarkModification(ImageBase& image) noexcept { template <class P> void TextureCache<P>::SynchronizeAliases(ImageId image_id) { - boost::container::small_vector<const AliasedImage*, 1> aliased_images; + boost::container::small_vector<const AliasedImage*, 8> aliased_images; Image& image = slot_images[image_id]; bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled); bool any_modified = True(image.flags & ImageFlagBits::GpuModified); diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index d96ddea9d..e9ec91265 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -56,7 +56,7 @@ struct ImageViewInOut { struct AsyncDecodeContext { ImageId image_id; Common::ScratchBuffer<u8> decoded_data; - std::vector<BufferImageCopy> copies; + boost::container::small_vector<BufferImageCopy, 16> copies; std::mutex mutex; std::atomic_bool complete; }; @@ -178,9 +178,8 @@ public: void SynchronizeComputeDescriptors(); /// Updates the Render Targets if they can be rescaled - /// @param is_clear True when the render targets are being used for clears /// @retval True if the Render Targets have been rescaled. - bool RescaleRenderTargets(bool is_clear); + bool RescaleRenderTargets(); /// Update bound render targets and upload memory if necessary /// @param is_clear True when the render targets are being used for clears @@ -336,14 +335,13 @@ private: [[nodiscard]] SamplerId FindSampler(const TSCEntry& config); /// Find or create an image view for the given color buffer index - [[nodiscard]] ImageViewId FindColorBuffer(size_t index, bool is_clear); + [[nodiscard]] ImageViewId FindColorBuffer(size_t index); /// Find or create an image view for the depth buffer - [[nodiscard]] ImageViewId FindDepthBuffer(bool is_clear); + [[nodiscard]] ImageViewId FindDepthBuffer(); /// Find or create a view for a render target with the given image parameters - [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, - bool is_clear); + [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr); /// Iterates over all the images in a region calling func template <typename Func> @@ -431,7 +429,7 @@ private: std::unordered_map<u64, std::vector<ImageMapId>, Common::IdentityHash<u64>> page_table; std::unordered_map<u64, std::vector<ImageId>, Common::IdentityHash<u64>> sparse_page_table; - std::unordered_map<ImageId, std::vector<ImageViewId>> sparse_views; + std::unordered_map<ImageId, boost::container::small_vector<ImageViewId, 16>> sparse_views; VAddr virtual_invalid_space{}; diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 95a5b47d8..9a618a57a 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -24,7 +24,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/surface.h" -#include "video_core/texture_cache/decode_bc4.h" +#include "video_core/texture_cache/decode_bc.h" #include "video_core/texture_cache/format_lookup_table.h" #include "video_core/texture_cache/formatter.h" #include "video_core/texture_cache/samples_helper.h" @@ -61,8 +61,6 @@ using VideoCore::Surface::PixelFormatFromDepthFormat; using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using VideoCore::Surface::SurfaceType; -constexpr u32 CONVERTED_BYTES_PER_BLOCK = BytesPerBlock(PixelFormat::A8B8G8R8_UNORM); - struct LevelInfo { Extent3D size; Extent3D block; @@ -329,13 +327,13 @@ template <u32 GOB_EXTENT> [[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapRightAddress3D( const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size) { - const std::vector<u32> slice_offsets = CalculateSliceOffsets(new_info); + const auto slice_offsets = CalculateSliceOffsets(new_info); const u32 diff = static_cast<u32>(overlap.gpu_addr - gpu_addr); const auto it = std::ranges::find(slice_offsets, diff); if (it == slice_offsets.end()) { return std::nullopt; } - const std::vector subresources = CalculateSliceSubresources(new_info); + const auto subresources = CalculateSliceSubresources(new_info); const SubresourceBase base = subresources[std::distance(slice_offsets.begin(), it)]; const ImageInfo& info = overlap.info; if (!IsBlockLinearSizeCompatible(new_info, info, base.level, 0, strict_size)) { @@ -612,7 +610,8 @@ u32 CalculateConvertedSizeBytes(const ImageInfo& info) noexcept { } return output_size; } - return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * CONVERTED_BYTES_PER_BLOCK; + return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * + ConvertedBytesPerBlock(info.format); } u32 CalculateLayerStride(const ImageInfo& info) noexcept { @@ -655,9 +654,9 @@ LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept { return sizes; } -std::vector<u32> CalculateSliceOffsets(const ImageInfo& info) { +boost::container::small_vector<u32, 16> CalculateSliceOffsets(const ImageInfo& info) { ASSERT(info.type == ImageType::e3D); - std::vector<u32> offsets; + boost::container::small_vector<u32, 16> offsets; offsets.reserve(NumSlices(info)); const LevelInfo level_info = MakeLevelInfo(info); @@ -679,9 +678,10 @@ std::vector<u32> CalculateSliceOffsets(const ImageInfo& info) { return offsets; } -std::vector<SubresourceBase> CalculateSliceSubresources(const ImageInfo& info) { +boost::container::small_vector<SubresourceBase, 16> CalculateSliceSubresources( + const ImageInfo& info) { ASSERT(info.type == ImageType::e3D); - std::vector<SubresourceBase> subresources; + boost::container::small_vector<SubresourceBase, 16> subresources; subresources.reserve(NumSlices(info)); for (s32 level = 0; level < info.resources.levels; ++level) { const s32 depth = AdjustMipSize(info.size.depth, level); @@ -723,8 +723,10 @@ ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept { } } -std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src, - SubresourceBase base, u32 up_scale, u32 down_shift) { +boost::container::small_vector<ImageCopy, 16> MakeShrinkImageCopies(const ImageInfo& dst, + const ImageInfo& src, + SubresourceBase base, + u32 up_scale, u32 down_shift) { ASSERT(dst.resources.levels >= src.resources.levels); const bool is_dst_3d = dst.type == ImageType::e3D; @@ -733,7 +735,7 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn ASSERT(src.resources.levels == 1); } const bool both_2d{src.type == ImageType::e2D && dst.type == ImageType::e2D}; - std::vector<ImageCopy> copies; + boost::container::small_vector<ImageCopy, 16> copies; copies.reserve(src.resources.levels); for (s32 level = 0; level < src.resources.levels; ++level) { ImageCopy& copy = copies.emplace_back(); @@ -770,9 +772,10 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn return copies; } -std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src, u32 up_scale, - u32 down_shift) { - std::vector<ImageCopy> copies; +boost::container::small_vector<ImageCopy, 16> MakeReinterpretImageCopies(const ImageInfo& src, + u32 up_scale, + u32 down_shift) { + boost::container::small_vector<ImageCopy, 16> copies; copies.reserve(src.resources.levels); const bool is_3d = src.type == ImageType::e3D; for (s32 level = 0; level < src.resources.levels; ++level) { @@ -824,9 +827,11 @@ bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config return gpu_memory.GpuToCpuAddress(address, guest_size_bytes).has_value(); } -std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, - const ImageInfo& info, std::span<const u8> input, - std::span<u8> output) { +boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::MemoryManager& gpu_memory, + GPUVAddr gpu_addr, + const ImageInfo& info, + std::span<const u8> input, + std::span<u8> output) { const size_t guest_size_bytes = input.size_bytes(); const u32 bpp_log2 = BytesPerBlockLog2(info.format); const Extent3D size = info.size; @@ -861,7 +866,7 @@ std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GP info.tile_width_spacing); size_t guest_offset = 0; u32 host_offset = 0; - std::vector<BufferImageCopy> copies(num_levels); + boost::container::small_vector<BufferImageCopy, 16> copies(num_levels); for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); @@ -939,7 +944,8 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8 tile_size.height, output.subspan(output_offset)); output_offset += copy.image_extent.width * copy.image_extent.height * - copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK; + copy.image_subresource.num_layers * + BytesPerBlock(PixelFormat::A8B8G8R8_UNORM); } else if (astc) { // BC1 uses 0.5 bytes per texel // BC3 uses 1 byte per texel @@ -950,7 +956,8 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8 const u32 plane_dim = copy.image_extent.width * copy.image_extent.height; const u32 level_size = plane_dim * copy.image_extent.depth * - copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK; + copy.image_subresource.num_layers * + BytesPerBlock(PixelFormat::A8B8G8R8_UNORM); decode_scratch.resize_destructive(level_size); Tegra::Texture::ASTC::Decompress( @@ -970,15 +977,20 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8 bpp_div; output_offset += static_cast<u32>(copy.buffer_size); } else { - DecompressBC4(input_offset, copy.image_extent, output.subspan(output_offset)); - + const Extent3D image_extent{ + .width = copy.image_extent.width, + .height = copy.image_extent.height * copy.image_subresource.num_layers, + .depth = copy.image_extent.depth, + }; + DecompressBCn(input_offset, output.subspan(output_offset), image_extent, info.format); output_offset += copy.image_extent.width * copy.image_extent.height * - copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK; + copy.image_subresource.num_layers * + ConvertedBytesPerBlock(info.format); } } } -std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info) { +boost::container::small_vector<BufferImageCopy, 16> FullDownloadCopies(const ImageInfo& info) { const Extent3D size = info.size; const u32 bytes_per_block = BytesPerBlock(info.format); if (info.type == ImageType::Linear) { @@ -1006,7 +1018,7 @@ std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info) { u32 host_offset = 0; - std::vector<BufferImageCopy> copies(num_levels); + boost::container::small_vector<BufferImageCopy, 16> copies(num_levels); for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size); @@ -1042,10 +1054,10 @@ Extent3D MipBlockSize(const ImageInfo& info, u32 level) { return AdjustMipBlockSize(num_tiles, level_info.block, level); } -std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info) { +boost::container::small_vector<SwizzleParameters, 16> FullUploadSwizzles(const ImageInfo& info) { const Extent2D tile_size = DefaultBlockSize(info.format); if (info.type == ImageType::Linear) { - return std::vector{SwizzleParameters{ + return {SwizzleParameters{ .num_tiles = AdjustTileSize(info.size, tile_size), .block = {}, .buffer_offset = 0, @@ -1057,7 +1069,7 @@ std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info) { const s32 num_levels = info.resources.levels; u32 guest_offset = 0; - std::vector<SwizzleParameters> params(num_levels); + boost::container::small_vector<SwizzleParameters, 16> params(num_levels); for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h index 84aa6880d..ab45a43c4 100644 --- a/src/video_core/texture_cache/util.h +++ b/src/video_core/texture_cache/util.h @@ -5,6 +5,7 @@ #include <optional> #include <span> +#include <boost/container/small_vector.hpp> #include "common/common_types.h" #include "common/scratch_buffer.h" @@ -40,9 +41,10 @@ struct OverlapResult { [[nodiscard]] LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept; -[[nodiscard]] std::vector<u32> CalculateSliceOffsets(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector<u32, 16> CalculateSliceOffsets(const ImageInfo& info); -[[nodiscard]] std::vector<SubresourceBase> CalculateSliceSubresources(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector<SubresourceBase, 16> CalculateSliceSubresources( + const ImageInfo& info); [[nodiscard]] u32 CalculateLevelStrideAlignment(const ImageInfo& info, u32 level); @@ -51,21 +53,18 @@ struct OverlapResult { [[nodiscard]] ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept; -[[nodiscard]] std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, - const ImageInfo& src, - SubresourceBase base, u32 up_scale = 1, - u32 down_shift = 0); +[[nodiscard]] boost::container::small_vector<ImageCopy, 16> MakeShrinkImageCopies( + const ImageInfo& dst, const ImageInfo& src, SubresourceBase base, u32 up_scale = 1, + u32 down_shift = 0); -[[nodiscard]] std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src, - u32 up_scale = 1, - u32 down_shift = 0); +[[nodiscard]] boost::container::small_vector<ImageCopy, 16> MakeReinterpretImageCopies( + const ImageInfo& src, u32 up_scale = 1, u32 down_shift = 0); [[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); -[[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, - GPUVAddr gpu_addr, const ImageInfo& info, - std::span<const u8> input, - std::span<u8> output); +[[nodiscard]] boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage( + Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, + std::span<const u8> input, std::span<u8> output); [[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageBase& image, std::span<u8> output); @@ -73,13 +72,15 @@ struct OverlapResult { void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output, std::span<BufferImageCopy> copies); -[[nodiscard]] std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector<BufferImageCopy, 16> FullDownloadCopies( + const ImageInfo& info); [[nodiscard]] Extent3D MipSize(Extent3D size, u32 level); [[nodiscard]] Extent3D MipBlockSize(const ImageInfo& info, u32 level); -[[nodiscard]] std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector<SwizzleParameters, 16> FullUploadSwizzles( + const ImageInfo& info); void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, std::span<const BufferImageCopy> copies, std::span<const u8> memory, diff --git a/src/video_core/textures/bcn.cpp b/src/video_core/textures/bcn.cpp index 671212a49..16ddbe320 100644 --- a/src/video_core/textures/bcn.cpp +++ b/src/video_core/textures/bcn.cpp @@ -3,7 +3,6 @@ #include <stb_dxt.h> #include <string.h> - #include "common/alignment.h" #include "video_core/textures/bcn.h" #include "video_core/textures/workers.h" diff --git a/src/video_core/textures/bcn.h b/src/video_core/textures/bcn.h index 6464af885..d5d2a16c9 100644 --- a/src/video_core/textures/bcn.h +++ b/src/video_core/textures/bcn.h @@ -4,14 +4,13 @@ #pragma once #include <span> -#include <stdint.h> + +#include "common/common_types.h" namespace Tegra::Texture::BCN { -void CompressBC1(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, - std::span<uint8_t> output); +void CompressBC1(std::span<const u8> data, u32 width, u32 height, u32 depth, std::span<u8> output); -void CompressBC3(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, - std::span<uint8_t> output); +void CompressBC3(std::span<const u8> data, u32 width, u32 height, u32 depth, std::span<u8> output); } // namespace Tegra::Texture::BCN diff --git a/src/video_core/transform_feedback.cpp b/src/video_core/transform_feedback.cpp index 155599316..1f353d2df 100644 --- a/src/video_core/transform_feedback.cpp +++ b/src/video_core/transform_feedback.cpp @@ -13,7 +13,7 @@ namespace VideoCommon { -std::vector<Shader::TransformFeedbackVarying> MakeTransformFeedbackVaryings( +std::pair<std::array<Shader::TransformFeedbackVarying, 256>, u32> MakeTransformFeedbackVaryings( const TransformFeedbackState& state) { static constexpr std::array VECTORS{ 28U, // gl_Position @@ -62,7 +62,8 @@ std::vector<Shader::TransformFeedbackVarying> MakeTransformFeedbackVaryings( 216U, // gl_TexCoord[6] 220U, // gl_TexCoord[7] }; - std::vector<Shader::TransformFeedbackVarying> xfb(256); + std::array<Shader::TransformFeedbackVarying, 256> xfb{}; + u32 count{0}; for (size_t buffer = 0; buffer < state.layouts.size(); ++buffer) { const auto& locations = state.varyings[buffer]; const auto& layout = state.layouts[buffer]; @@ -103,11 +104,12 @@ std::vector<Shader::TransformFeedbackVarying> MakeTransformFeedbackVaryings( } } xfb[attribute] = varying; + count = std::max(count, attribute); highest = std::max(highest, (base_offset + varying.components) * 4); } UNIMPLEMENTED_IF(highest != layout.stride); } - return xfb; + return {xfb, count + 1}; } } // namespace VideoCommon diff --git a/src/video_core/transform_feedback.h b/src/video_core/transform_feedback.h index d13eb16c3..401b1352a 100644 --- a/src/video_core/transform_feedback.h +++ b/src/video_core/transform_feedback.h @@ -24,7 +24,7 @@ struct TransformFeedbackState { varyings; }; -std::vector<Shader::TransformFeedbackVarying> MakeTransformFeedbackVaryings( +std::pair<std::array<Shader::TransformFeedbackVarying, 256>, u32> MakeTransformFeedbackVaryings( const TransformFeedbackState& state); } // namespace VideoCommon diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp index 9de484c29..67e8065a4 100644 --- a/src/video_core/vulkan_common/vulkan_debug_callback.cpp +++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp @@ -7,10 +7,10 @@ namespace Vulkan { namespace { -VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, - VkDebugUtilsMessageTypeFlagsEXT type, - const VkDebugUtilsMessengerCallbackDataEXT* data, - [[maybe_unused]] void* user_data) { +VkBool32 DebugUtilCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, + VkDebugUtilsMessageTypeFlagsEXT type, + const VkDebugUtilsMessengerCallbackDataEXT* data, + [[maybe_unused]] void* user_data) { // Skip logging known false-positive validation errors switch (static_cast<u32>(data->messageIdNumber)) { #ifdef ANDROID @@ -62,9 +62,26 @@ VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, } return VK_FALSE; } + +VkBool32 DebugReportCallback(VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, + uint64_t object, size_t location, int32_t messageCode, + const char* pLayerPrefix, const char* pMessage, void* pUserData) { + const VkDebugReportFlagBitsEXT severity = static_cast<VkDebugReportFlagBitsEXT>(flags); + const std::string_view message{pMessage}; + if (severity & VK_DEBUG_REPORT_ERROR_BIT_EXT) { + LOG_CRITICAL(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_REPORT_WARNING_BIT_EXT) { + LOG_WARNING(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { + LOG_INFO(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { + LOG_DEBUG(Render_Vulkan, "{}", message); + } + return VK_FALSE; +} } // Anonymous namespace -vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance) { +vk::DebugUtilsMessenger CreateDebugUtilsCallback(const vk::Instance& instance) { return instance.CreateDebugUtilsMessenger(VkDebugUtilsMessengerCreateInfoEXT{ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, .pNext = nullptr, @@ -76,7 +93,18 @@ vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance) { .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, - .pfnUserCallback = Callback, + .pfnUserCallback = DebugUtilCallback, + .pUserData = nullptr, + }); +} + +vk::DebugReportCallback CreateDebugReportCallback(const vk::Instance& instance) { + return instance.CreateDebugReportCallback({ + .sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + .pNext = nullptr, + .flags = VK_DEBUG_REPORT_DEBUG_BIT_EXT | VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT, + .pfnCallback = DebugReportCallback, .pUserData = nullptr, }); } diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.h b/src/video_core/vulkan_common/vulkan_debug_callback.h index 71b1f69ec..a8af7b406 100644 --- a/src/video_core/vulkan_common/vulkan_debug_callback.h +++ b/src/video_core/vulkan_common/vulkan_debug_callback.h @@ -7,6 +7,8 @@ namespace Vulkan { -vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance); +vk::DebugUtilsMessenger CreateDebugUtilsCallback(const vk::Instance& instance); + +vk::DebugReportCallback CreateDebugReportCallback(const vk::Instance& instance); } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index dcedf4425..421e71e5a 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -22,6 +22,8 @@ #include <adrenotools/bcenabler.h> #endif +#include <vk_mem_alloc.h> + namespace Vulkan { using namespace Common::Literals; namespace { @@ -316,6 +318,7 @@ NvidiaArchitecture GetNvidiaArchitecture(vk::PhysicalDevice physical, std::vector<const char*> ExtensionListForVulkan( const std::set<std::string, std::less<>>& extensions) { std::vector<const char*> output; + output.reserve(extensions.size()); for (const auto& extension : extensions) { output.push_back(extension.c_str()); } @@ -346,7 +349,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR const bool is_s8gen2 = device_id == 0x43050a01; const bool is_arm = driver_id == VK_DRIVER_ID_ARM_PROPRIETARY; - if ((is_mvk || is_qualcomm || is_turnip) && !is_suitable) { + if ((is_mvk || is_qualcomm || is_turnip || is_arm) && !is_suitable) { LOG_WARNING(Render_Vulkan, "Unsuitable driver, continuing anyway"); } else if (!is_suitable) { throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER); @@ -525,6 +528,14 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR } sets_per_pool = 64; + if (extensions.extended_dynamic_state3 && is_amd_driver && + properties.properties.driverVersion >= VK_MAKE_API_VERSION(0, 2, 0, 270)) { + LOG_WARNING(Render_Vulkan, + "AMD drivers after 23.5.2 have broken extendedDynamicState3ColorBlendEquation"); + features.extended_dynamic_state3.extendedDynamicState3ColorBlendEnable = false; + features.extended_dynamic_state3.extendedDynamicState3ColorBlendEquation = false; + dynamic_state3_blending = false; + } if (is_amd_driver) { // AMD drivers need a higher amount of Sets per Pool in certain circumstances like in XC2. sets_per_pool = 96; @@ -562,6 +573,9 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR LOG_WARNING(Render_Vulkan, "Intel proprietary drivers do not support MSAA image blits"); cant_blit_msaa = true; } + has_broken_compute = + CheckBrokenCompute(properties.driver.driverID, properties.properties.driverVersion) && + !Settings::values.enable_compute_pipelines.GetValue(); if (is_intel_anv || (is_qualcomm && !is_s8gen2)) { LOG_WARNING(Render_Vulkan, "Driver does not support native BGR format"); must_emulate_bgr565 = true; @@ -592,9 +606,31 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR graphics_queue = logical.GetQueue(graphics_family); present_queue = logical.GetQueue(present_family); + + VmaVulkanFunctions functions{}; + functions.vkGetInstanceProcAddr = dld.vkGetInstanceProcAddr; + functions.vkGetDeviceProcAddr = dld.vkGetDeviceProcAddr; + + const VmaAllocatorCreateInfo allocator_info = { + .flags = VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT, + .physicalDevice = physical, + .device = *logical, + .preferredLargeHeapBlockSize = 0, + .pAllocationCallbacks = nullptr, + .pDeviceMemoryCallbacks = nullptr, + .pHeapSizeLimit = nullptr, + .pVulkanFunctions = &functions, + .instance = instance, + .vulkanApiVersion = VK_API_VERSION_1_1, + .pTypeExternalMemoryHandleTypes = nullptr, + }; + + vk::Check(vmaCreateAllocator(&allocator_info, &allocator)); } -Device::~Device() = default; +Device::~Device() { + vmaDestroyAllocator(allocator); +} VkFormat Device::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, FormatType format_type) const { @@ -877,6 +913,10 @@ bool Device::GetSuitability(bool requires_swapchain) { properties.driver.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES; SetNext(next, properties.driver); + // Retrieve subgroup properties. + properties.subgroup_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + SetNext(next, properties.subgroup_properties); + // Retrieve relevant extension properties. if (extensions.shader_float_controls) { properties.float_controls.sType = diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 8c7e44fcb..1f17265d5 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -10,9 +10,12 @@ #include <vector> #include "common/common_types.h" +#include "common/logging/log.h" #include "common/settings.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +VK_DEFINE_HANDLE(VmaAllocator) + // Define all features which may be used by the implementation here. // Vulkan version in the macro describes the minimum version required for feature availability. // If the Vulkan version is lower than the required version, the named extension is required. @@ -198,6 +201,11 @@ public: return dld; } + /// Returns the VMA allocator. + VmaAllocator GetAllocator() const { + return allocator; + } + /// Returns the logical device. const vk::Device& GetLogical() const { return logical; @@ -285,6 +293,11 @@ public: return features.features.textureCompressionASTC_LDR; } + /// Returns true if BCn is natively supported. + bool IsOptimalBcnSupported() const { + return features.features.textureCompressionBC; + } + /// Returns true if descriptor aliasing is natively supported. bool IsDescriptorAliasingSupported() const { return GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY; @@ -315,6 +328,11 @@ public: return properties.subgroup_size_control.requiredSubgroupSizeStages & stage; } + /// Returns true if the device supports the provided subgroup feature. + bool IsSubgroupFeatureSupported(VkSubgroupFeatureFlagBits feature) const { + return properties.subgroup_properties.supportedOperations & feature; + } + /// Returns the maximum number of push descriptors. u32 MaxPushDescriptors() const { return properties.push_descriptor.maxPushDescriptors; @@ -380,6 +398,11 @@ public: return extensions.swapchain_mutable_format; } + /// Returns true if VK_KHR_shader_float_controls is enabled. + bool IsKhrShaderFloatControlsSupported() const { + return extensions.shader_float_controls; + } + /// Returns true if the device supports VK_KHR_workgroup_memory_explicit_layout. bool IsKhrWorkgroupMemoryExplicitLayoutSupported() const { return extensions.workgroup_memory_explicit_layout; @@ -405,6 +428,11 @@ public: return extensions.sampler_filter_minmax; } + /// Returns true if the device supports VK_EXT_shader_stencil_export. + bool IsExtShaderStencilExportSupported() const { + return extensions.shader_stencil_export; + } + /// Returns true if the device supports VK_EXT_depth_range_unrestricted. bool IsExtDepthRangeUnrestrictedSupported() const { return extensions.depth_range_unrestricted; @@ -474,9 +502,9 @@ public: return extensions.vertex_input_dynamic_state; } - /// Returns true if the device supports VK_EXT_shader_stencil_export. - bool IsExtShaderStencilExportSupported() const { - return extensions.shader_stencil_export; + /// Returns true if the device supports VK_EXT_shader_demote_to_helper_invocation + bool IsExtShaderDemoteToHelperInvocationSupported() const { + return extensions.shader_demote_to_helper_invocation; } /// Returns true if the device supports VK_EXT_conservative_rasterization. @@ -510,12 +538,17 @@ public: if (extensions.spirv_1_4) { return 0x00010400U; } - return 0x00010000U; + return 0x00010300U; } /// Returns true when a known debugging tool is attached. bool HasDebuggingToolAttached() const { - return has_renderdoc || has_nsight_graphics || Settings::values.renderer_debug.GetValue(); + return has_renderdoc || has_nsight_graphics; + } + + /// @returns True if compute pipelines can cause crashing. + bool HasBrokenCompute() const { + return has_broken_compute; } /// Returns true when the device does not properly support cube compatibility. @@ -575,10 +608,30 @@ public: return properties.properties.limits.maxVertexInputBindings; } + u32 GetMaxViewports() const { + return properties.properties.limits.maxViewports; + } + bool SupportsConditionalBarriers() const { return supports_conditional_barriers; } + [[nodiscard]] static constexpr bool CheckBrokenCompute(VkDriverId driver_id, + u32 driver_version) { + if (driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { + const u32 major = VK_API_VERSION_MAJOR(driver_version); + const u32 minor = VK_API_VERSION_MINOR(driver_version); + const u32 patch = VK_API_VERSION_PATCH(driver_version); + if (major == 0 && minor == 405 && patch < 286) { + LOG_WARNING( + Render_Vulkan, + "Intel proprietary drivers 0.405.0 until 0.405.286 have broken compute"); + return true; + } + } + return false; + } + private: /// Checks if the physical device is suitable and configures the object state /// with all necessary info about its properties. @@ -608,6 +661,7 @@ private: private: VkInstance instance; ///< Vulkan instance. + VmaAllocator allocator; ///< VMA allocator. vk::DeviceDispatch dld; ///< Device function pointers. vk::PhysicalDevice physical; ///< Physical device. vk::Device logical; ///< Logical device. @@ -650,6 +704,7 @@ private: struct Properties { VkPhysicalDeviceDriverProperties driver{}; + VkPhysicalDeviceSubgroupProperties subgroup_properties{}; VkPhysicalDeviceFloatControlsProperties float_controls{}; VkPhysicalDevicePushDescriptorPropertiesKHR push_descriptor{}; VkPhysicalDeviceSubgroupSizeControlProperties subgroup_size_control{}; @@ -672,6 +727,7 @@ private: bool is_integrated{}; ///< Is GPU an iGPU. bool is_virtual{}; ///< Is GPU a virtual GPU. bool is_non_gpu{}; ///< Is SoftwareRasterizer, FPGA, non-GPU device. + bool has_broken_compute{}; ///< Compute shaders can cause crashes bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit bool has_renderdoc{}; ///< Has RenderDoc attached bool has_nsight_graphics{}; ///< Has Nsight Graphics attached diff --git a/src/video_core/vulkan_common/vulkan_instance.cpp b/src/video_core/vulkan_common/vulkan_instance.cpp index b6d83e446..7624a9b32 100644 --- a/src/video_core/vulkan_common/vulkan_instance.cpp +++ b/src/video_core/vulkan_common/vulkan_instance.cpp @@ -31,10 +31,34 @@ namespace Vulkan { namespace { + +[[nodiscard]] bool AreExtensionsSupported(const vk::InstanceDispatch& dld, + std::span<const char* const> extensions) { + const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); + if (!properties) { + LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); + return false; + } + for (const char* extension : extensions) { + const auto it = std::ranges::find_if(*properties, [extension](const auto& prop) { + return std::strcmp(extension, prop.extensionName) == 0; + }); + if (it == properties->end()) { + LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); + return false; + } + } + return true; +} + [[nodiscard]] std::vector<const char*> RequiredExtensions( - Core::Frontend::WindowSystemType window_type, bool enable_validation) { + const vk::InstanceDispatch& dld, Core::Frontend::WindowSystemType window_type, + bool enable_validation) { std::vector<const char*> extensions; extensions.reserve(6); +#ifdef __APPLE__ + extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); +#endif switch (window_type) { case Core::Frontend::WindowSystemType::Headless: break; @@ -66,35 +90,14 @@ namespace { extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); } if (enable_validation) { - extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + const bool debug_utils = + AreExtensionsSupported(dld, std::array{VK_EXT_DEBUG_UTILS_EXTENSION_NAME}); + extensions.push_back(debug_utils ? VK_EXT_DEBUG_UTILS_EXTENSION_NAME + : VK_EXT_DEBUG_REPORT_EXTENSION_NAME); } - extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); - -#ifdef __APPLE__ - extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); -#endif return extensions; } -[[nodiscard]] bool AreExtensionsSupported(const vk::InstanceDispatch& dld, - std::span<const char* const> extensions) { - const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); - if (!properties) { - LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); - return false; - } - for (const char* extension : extensions) { - const auto it = std::ranges::find_if(*properties, [extension](const auto& prop) { - return std::strcmp(extension, prop.extensionName) == 0; - }); - if (it == properties->end()) { - LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); - return false; - } - } - return true; -} - [[nodiscard]] std::vector<const char*> Layers(bool enable_validation) { std::vector<const char*> layers; if (enable_validation) { @@ -138,7 +141,8 @@ vk::Instance CreateInstance(const Common::DynamicLibrary& library, vk::InstanceD LOG_ERROR(Render_Vulkan, "Failed to load Vulkan function pointers"); throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); } - const std::vector<const char*> extensions = RequiredExtensions(window_type, enable_validation); + const std::vector<const char*> extensions = + RequiredExtensions(dld, window_type, enable_validation); if (!AreExtensionsSupported(dld, extensions)) { throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); } diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index e28a556f8..a2ef0efa4 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -6,8 +6,6 @@ #include <optional> #include <vector> -#include <glad/glad.h> - #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" @@ -17,6 +15,8 @@ #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +#include <vk_mem_alloc.h> + namespace Vulkan { namespace { struct Range { @@ -49,22 +49,45 @@ struct Range { case MemoryUsage::Download: return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + case MemoryUsage::Stream: + return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } ASSERT_MSG(false, "Invalid memory usage={}", usage); return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } -constexpr VkExportMemoryAllocateInfo EXPORT_ALLOCATE_INFO{ - .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, - .pNext = nullptr, -#ifdef _WIN32 - .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT, -#elif __unix__ - .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, -#else - .handleTypes = 0, -#endif -}; +[[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferedVmaFlags(MemoryUsage usage) { + return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT + : VkMemoryPropertyFlagBits{}; +} + +[[nodiscard]] VmaAllocationCreateFlags MemoryUsageVmaFlags(MemoryUsage usage) { + switch (usage) { + case MemoryUsage::Upload: + case MemoryUsage::Stream: + return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT; + case MemoryUsage::Download: + return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + case MemoryUsage::DeviceLocal: + return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | + VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT; + } + return {}; +} + +[[nodiscard]] VmaMemoryUsage MemoryUsageVma(MemoryUsage usage) { + switch (usage) { + case MemoryUsage::DeviceLocal: + case MemoryUsage::Stream: + return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + case MemoryUsage::Upload: + case MemoryUsage::Download: + return VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + } + return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; +} + } // Anonymous namespace class MemoryAllocation { @@ -74,14 +97,6 @@ public: : allocator{allocator_}, memory{std::move(memory_)}, allocation_size{allocation_size_}, property_flags{properties}, shifted_memory_type{1U << type} {} -#if defined(_WIN32) || defined(__unix__) - ~MemoryAllocation() { - if (owning_opengl_handle != 0) { - glDeleteMemoryObjectsEXT(1, &owning_opengl_handle); - } - } -#endif - MemoryAllocation& operator=(const MemoryAllocation&) = delete; MemoryAllocation(const MemoryAllocation&) = delete; @@ -120,31 +135,6 @@ public: return memory_mapped_span; } -#ifdef _WIN32 - [[nodiscard]] u32 ExportOpenGLHandle() { - if (!owning_opengl_handle) { - glCreateMemoryObjectsEXT(1, &owning_opengl_handle); - glImportMemoryWin32HandleEXT(owning_opengl_handle, allocation_size, - GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, - memory.GetMemoryWin32HandleKHR()); - } - return owning_opengl_handle; - } -#elif __unix__ - [[nodiscard]] u32 ExportOpenGLHandle() { - if (!owning_opengl_handle) { - glCreateMemoryObjectsEXT(1, &owning_opengl_handle); - glImportMemoryFdEXT(owning_opengl_handle, allocation_size, GL_HANDLE_TYPE_OPAQUE_FD_EXT, - memory.GetMemoryFdKHR()); - } - return owning_opengl_handle; - } -#else - [[nodiscard]] u32 ExportOpenGLHandle() { - return 0; - } -#endif - /// Returns whether this allocation is compatible with the arguments. [[nodiscard]] bool IsCompatible(VkMemoryPropertyFlags flags, u32 type_mask) const { return (flags & property_flags) == flags && (type_mask & shifted_memory_type) != 0; @@ -182,9 +172,6 @@ private: const u32 shifted_memory_type; ///< Shifted Vulkan memory type. std::vector<Range> commits; ///< All commit ranges done from this allocation. std::span<u8> memory_mapped_span; ///< Memory mapped span. Empty if not queried before. -#if defined(_WIN32) || defined(__unix__) - u32 owning_opengl_handle{}; ///< Owning OpenGL memory object handle. -#endif }; MemoryCommit::MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_, @@ -216,24 +203,70 @@ std::span<u8> MemoryCommit::Map() { return span; } -u32 MemoryCommit::ExportOpenGLHandle() const { - return allocation->ExportOpenGLHandle(); -} - void MemoryCommit::Release() { if (allocation) { allocation->Free(begin); } } -MemoryAllocator::MemoryAllocator(const Device& device_, bool export_allocations_) - : device{device_}, properties{device_.GetPhysical().GetMemoryProperties().memoryProperties}, - export_allocations{export_allocations_}, +MemoryAllocator::MemoryAllocator(const Device& device_) + : device{device_}, allocator{device.GetAllocator()}, + properties{device_.GetPhysical().GetMemoryProperties().memoryProperties}, buffer_image_granularity{ device_.GetPhysical().GetProperties().limits.bufferImageGranularity} {} MemoryAllocator::~MemoryAllocator() = default; +vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo& ci) const { + const VmaAllocationCreateInfo alloc_ci = { + .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT, + .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, + .requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .preferredFlags = 0, + .memoryTypeBits = 0, + .pool = VK_NULL_HANDLE, + .pUserData = nullptr, + .priority = 0.f, + }; + + VkImage handle{}; + VmaAllocation allocation{}; + + vk::Check(vmaCreateImage(allocator, &ci, &alloc_ci, &handle, &allocation, nullptr)); + + return vk::Image(handle, *device.GetLogical(), allocator, allocation, + device.GetDispatchLoader()); +} + +vk::Buffer MemoryAllocator::CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsage usage) const { + const VmaAllocationCreateInfo alloc_ci = { + .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT | + MemoryUsageVmaFlags(usage), + .usage = MemoryUsageVma(usage), + .requiredFlags = 0, + .preferredFlags = MemoryUsagePreferedVmaFlags(usage), + .memoryTypeBits = 0, + .pool = VK_NULL_HANDLE, + .pUserData = nullptr, + .priority = 0.f, + }; + + VkBuffer handle{}; + VmaAllocationInfo alloc_info{}; + VmaAllocation allocation{}; + VkMemoryPropertyFlags property_flags{}; + + vk::Check(vmaCreateBuffer(allocator, &ci, &alloc_ci, &handle, &allocation, &alloc_info)); + vmaGetAllocationMemoryProperties(allocator, allocation, &property_flags); + + u8* data = reinterpret_cast<u8*>(alloc_info.pMappedData); + const std::span<u8> mapped_data = data ? std::span<u8>{data, ci.size} : std::span<u8>{}; + const bool is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + + return vk::Buffer(handle, *device.GetLogical(), allocator, allocation, mapped_data, is_coherent, + device.GetDispatchLoader()); +} + MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements& requirements, MemoryUsage usage) { // Find the fastest memory flags we can afford with the current requirements const u32 type_mask = requirements.memoryTypeBits; @@ -253,25 +286,11 @@ MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements& requirements, M return TryCommit(requirements, flags).value(); } -MemoryCommit MemoryAllocator::Commit(const vk::Buffer& buffer, MemoryUsage usage) { - auto commit = Commit(device.GetLogical().GetBufferMemoryRequirements(*buffer), usage); - buffer.BindMemory(commit.Memory(), commit.Offset()); - return commit; -} - -MemoryCommit MemoryAllocator::Commit(const vk::Image& image, MemoryUsage usage) { - VkMemoryRequirements requirements = device.GetLogical().GetImageMemoryRequirements(*image); - requirements.size = Common::AlignUp(requirements.size, buffer_image_granularity); - auto commit = Commit(requirements, usage); - image.BindMemory(commit.Memory(), commit.Offset()); - return commit; -} - bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) { const u32 type = FindType(flags, type_mask).value(); vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .pNext = export_allocations ? &EXPORT_ALLOCATE_INFO : nullptr, + .pNext = nullptr, .allocationSize = size, .memoryTypeIndex = type, }); @@ -342,16 +361,4 @@ std::optional<u32> MemoryAllocator::FindType(VkMemoryPropertyFlags flags, u32 ty return std::nullopt; } -bool IsHostVisible(MemoryUsage usage) noexcept { - switch (usage) { - case MemoryUsage::DeviceLocal: - return false; - case MemoryUsage::Upload: - case MemoryUsage::Download: - return true; - } - ASSERT_MSG(false, "Invalid memory usage={}", usage); - return false; -} - } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h index a5bff03fe..f449bc8d0 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.h +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h @@ -9,6 +9,8 @@ #include "common/common_types.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +VK_DEFINE_HANDLE(VmaAllocator) + namespace Vulkan { class Device; @@ -17,9 +19,11 @@ class MemoryAllocation; /// Hints and requirements for the backing memory type of a commit enum class MemoryUsage { - DeviceLocal, ///< Hints device local usages, fastest memory type to read and write from the GPU + DeviceLocal, ///< Requests device local host visible buffer, falling back to device local + ///< memory. Upload, ///< Requires a host visible memory type optimized for CPU to GPU uploads Download, ///< Requires a host visible memory type optimized for GPU to CPU readbacks + Stream, ///< Requests device local host visible buffer, falling back host memory. }; /// Ownership handle of a memory commitment. @@ -41,9 +45,6 @@ public: /// It will map the backing allocation if it hasn't been mapped before. std::span<u8> Map(); - /// Returns an non-owning OpenGL handle, creating one if it doesn't exist. - u32 ExportOpenGLHandle() const; - /// Returns the Vulkan memory handler. VkDeviceMemory Memory() const { return memory; @@ -74,16 +75,19 @@ public: * Construct memory allocator * * @param device_ Device to allocate from - * @param export_allocations_ True when allocations have to be exported * * @throw vk::Exception on failure */ - explicit MemoryAllocator(const Device& device_, bool export_allocations_); + explicit MemoryAllocator(const Device& device_); ~MemoryAllocator(); MemoryAllocator& operator=(const MemoryAllocator&) = delete; MemoryAllocator(const MemoryAllocator&) = delete; + vk::Image CreateImage(const VkImageCreateInfo& ci) const; + + vk::Buffer CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsage usage) const; + /** * Commits a memory with the specified requirements. * @@ -97,9 +101,6 @@ public: /// Commits memory required by the buffer and binds it. MemoryCommit Commit(const vk::Buffer& buffer, MemoryUsage usage); - /// Commits memory required by the image and binds it. - MemoryCommit Commit(const vk::Image& image, MemoryUsage usage); - private: /// Tries to allocate a chunk of memory. bool TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size); @@ -117,15 +118,12 @@ private: /// Returns index to the fastest memory type compatible with the passed requirements. std::optional<u32> FindType(VkMemoryPropertyFlags flags, u32 type_mask) const; - const Device& device; ///< Device handle. - const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. - const bool export_allocations; ///< True when memory allocations have to be exported. + const Device& device; ///< Device handle. + VmaAllocator allocator; ///< Vma allocator. + const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. std::vector<std::unique_ptr<MemoryAllocation>> allocations; ///< Current allocations. VkDeviceSize buffer_image_granularity; // The granularity for adjacent offsets between buffers // and optimal images }; -/// Returns true when a memory usage is guaranteed to be host visible. -bool IsHostVisible(MemoryUsage usage) noexcept; - } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 336f53700..2fa29793a 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -12,6 +12,8 @@ #include "video_core/vulkan_common/vulkan_wrapper.h" +#include <vk_mem_alloc.h> + namespace Vulkan::vk { namespace { @@ -257,7 +259,9 @@ bool Load(VkInstance instance, InstanceDispatch& dld) noexcept { // These functions may fail to load depending on the enabled extensions. // Don't return a failure on these. X(vkCreateDebugUtilsMessengerEXT); + X(vkCreateDebugReportCallbackEXT); X(vkDestroyDebugUtilsMessengerEXT); + X(vkDestroyDebugReportCallbackEXT); X(vkDestroySurfaceKHR); X(vkGetPhysicalDeviceFeatures2); X(vkGetPhysicalDeviceProperties2); @@ -479,6 +483,11 @@ void Destroy(VkInstance instance, VkDebugUtilsMessengerEXT handle, dld.vkDestroyDebugUtilsMessengerEXT(instance, handle, nullptr); } +void Destroy(VkInstance instance, VkDebugReportCallbackEXT handle, + const InstanceDispatch& dld) noexcept { + dld.vkDestroyDebugReportCallbackEXT(instance, handle, nullptr); +} + void Destroy(VkInstance instance, VkSurfaceKHR handle, const InstanceDispatch& dld) noexcept { dld.vkDestroySurfaceKHR(instance, handle, nullptr); } @@ -547,24 +556,47 @@ DebugUtilsMessenger Instance::CreateDebugUtilsMessenger( return DebugUtilsMessenger(object, handle, *dld); } -void Buffer::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { - Check(dld->vkBindBufferMemory(owner, handle, memory, offset)); +DebugReportCallback Instance::CreateDebugReportCallback( + const VkDebugReportCallbackCreateInfoEXT& create_info) const { + VkDebugReportCallbackEXT object; + Check(dld->vkCreateDebugReportCallbackEXT(handle, &create_info, nullptr, &object)); + return DebugReportCallback(object, handle, *dld); } -void Buffer::SetObjectNameEXT(const char* name) const { - SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER, name); +void Image::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE, name); } -void BufferView::SetObjectNameEXT(const char* name) const { - SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER_VIEW, name); +void Image::Release() const noexcept { + if (handle) { + vmaDestroyImage(allocator, handle, allocation); + } } -void Image::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { - Check(dld->vkBindImageMemory(owner, handle, memory, offset)); +void Buffer::Flush() const { + if (!is_coherent) { + vmaFlushAllocation(allocator, allocation, 0, VK_WHOLE_SIZE); + } } -void Image::SetObjectNameEXT(const char* name) const { - SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE, name); +void Buffer::Invalidate() const { + if (!is_coherent) { + vmaInvalidateAllocation(allocator, allocation, 0, VK_WHOLE_SIZE); + } +} + +void Buffer::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER, name); +} + +void Buffer::Release() const noexcept { + if (handle) { + vmaDestroyBuffer(allocator, handle, allocation); + } +} + +void BufferView::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER_VIEW, name); } void ImageView::SetObjectNameEXT(const char* name) const { @@ -701,24 +733,12 @@ Queue Device::GetQueue(u32 family_index) const noexcept { return Queue(queue, *dld); } -Buffer Device::CreateBuffer(const VkBufferCreateInfo& ci) const { - VkBuffer object; - Check(dld->vkCreateBuffer(handle, &ci, nullptr, &object)); - return Buffer(object, handle, *dld); -} - BufferView Device::CreateBufferView(const VkBufferViewCreateInfo& ci) const { VkBufferView object; Check(dld->vkCreateBufferView(handle, &ci, nullptr, &object)); return BufferView(object, handle, *dld); } -Image Device::CreateImage(const VkImageCreateInfo& ci) const { - VkImage object; - Check(dld->vkCreateImage(handle, &ci, nullptr, &object)); - return Image(object, handle, *dld); -} - ImageView Device::CreateImageView(const VkImageViewCreateInfo& ci) const { VkImageView object; Check(dld->vkCreateImageView(handle, &ci, nullptr, &object)); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 4ff328a21..b5e70fcd4 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -32,6 +32,9 @@ #pragma warning(disable : 26812) // Disable prefer enum class over enum #endif +VK_DEFINE_HANDLE(VmaAllocator) +VK_DEFINE_HANDLE(VmaAllocation) + namespace Vulkan::vk { /** @@ -161,8 +164,10 @@ struct InstanceDispatch { PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties{}; PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT{}; + PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT{}; PFN_vkCreateDevice vkCreateDevice{}; PFN_vkDestroyDebugUtilsMessengerEXT vkDestroyDebugUtilsMessengerEXT{}; + PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT{}; PFN_vkDestroyDevice vkDestroyDevice{}; PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR{}; PFN_vkEnumerateDeviceExtensionProperties vkEnumerateDeviceExtensionProperties{}; @@ -363,6 +368,7 @@ void Destroy(VkDevice, VkSwapchainKHR, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkSemaphore, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkShaderModule, const DeviceDispatch&) noexcept; void Destroy(VkInstance, VkDebugUtilsMessengerEXT, const InstanceDispatch&) noexcept; +void Destroy(VkInstance, VkDebugReportCallbackEXT, const InstanceDispatch&) noexcept; void Destroy(VkInstance, VkSurfaceKHR, const InstanceDispatch&) noexcept; VkResult Free(VkDevice, VkDescriptorPool, Span<VkDescriptorSet>, const DeviceDispatch&) noexcept; @@ -578,6 +584,7 @@ private: }; using DebugUtilsMessenger = Handle<VkDebugUtilsMessengerEXT, VkInstance, InstanceDispatch>; +using DebugReportCallback = Handle<VkDebugReportCallbackEXT, VkInstance, InstanceDispatch>; using DescriptorSetLayout = Handle<VkDescriptorSetLayout, VkDevice, DeviceDispatch>; using DescriptorUpdateTemplate = Handle<VkDescriptorUpdateTemplate, VkDevice, DeviceDispatch>; using Pipeline = Handle<VkPipeline, VkDevice, DeviceDispatch>; @@ -610,12 +617,149 @@ public: DebugUtilsMessenger CreateDebugUtilsMessenger( const VkDebugUtilsMessengerCreateInfoEXT& create_info) const; + /// Creates a debug report callback. + /// @throw Exception on creation failure. + DebugReportCallback CreateDebugReportCallback( + const VkDebugReportCallbackCreateInfoEXT& create_info) const; + /// Returns dispatch table. const InstanceDispatch& Dispatch() const noexcept { return *dld; } }; +class Image { +public: + explicit Image(VkImage handle_, VkDevice owner_, VmaAllocator allocator_, + VmaAllocation allocation_, const DeviceDispatch& dld_) noexcept + : handle{handle_}, owner{owner_}, allocator{allocator_}, + allocation{allocation_}, dld{&dld_} {} + Image() = default; + + Image(const Image&) = delete; + Image& operator=(const Image&) = delete; + + Image(Image&& rhs) noexcept + : handle{std::exchange(rhs.handle, nullptr)}, owner{rhs.owner}, allocator{rhs.allocator}, + allocation{rhs.allocation}, dld{rhs.dld} {} + + Image& operator=(Image&& rhs) noexcept { + Release(); + handle = std::exchange(rhs.handle, nullptr); + owner = rhs.owner; + allocator = rhs.allocator; + allocation = rhs.allocation; + dld = rhs.dld; + return *this; + } + + ~Image() noexcept { + Release(); + } + + VkImage operator*() const noexcept { + return handle; + } + + void reset() noexcept { + Release(); + handle = nullptr; + } + + explicit operator bool() const noexcept { + return handle != nullptr; + } + + void SetObjectNameEXT(const char* name) const; + +private: + void Release() const noexcept; + + VkImage handle = nullptr; + VkDevice owner = nullptr; + VmaAllocator allocator = nullptr; + VmaAllocation allocation = nullptr; + const DeviceDispatch* dld = nullptr; +}; + +class Buffer { +public: + explicit Buffer(VkBuffer handle_, VkDevice owner_, VmaAllocator allocator_, + VmaAllocation allocation_, std::span<u8> mapped_, bool is_coherent_, + const DeviceDispatch& dld_) noexcept + : handle{handle_}, owner{owner_}, allocator{allocator_}, + allocation{allocation_}, mapped{mapped_}, is_coherent{is_coherent_}, dld{&dld_} {} + Buffer() = default; + + Buffer(const Buffer&) = delete; + Buffer& operator=(const Buffer&) = delete; + + Buffer(Buffer&& rhs) noexcept + : handle{std::exchange(rhs.handle, nullptr)}, owner{rhs.owner}, allocator{rhs.allocator}, + allocation{rhs.allocation}, mapped{rhs.mapped}, + is_coherent{rhs.is_coherent}, dld{rhs.dld} {} + + Buffer& operator=(Buffer&& rhs) noexcept { + Release(); + handle = std::exchange(rhs.handle, nullptr); + owner = rhs.owner; + allocator = rhs.allocator; + allocation = rhs.allocation; + mapped = rhs.mapped; + is_coherent = rhs.is_coherent; + dld = rhs.dld; + return *this; + } + + ~Buffer() noexcept { + Release(); + } + + VkBuffer operator*() const noexcept { + return handle; + } + + void reset() noexcept { + Release(); + handle = nullptr; + } + + explicit operator bool() const noexcept { + return handle != nullptr; + } + + /// Returns the host mapped memory, an empty span otherwise. + std::span<u8> Mapped() noexcept { + return mapped; + } + + std::span<const u8> Mapped() const noexcept { + return mapped; + } + + /// Returns true if the buffer is mapped to the host. + bool IsHostVisible() const noexcept { + return !mapped.empty(); + } + + void Flush() const; + + void Invalidate() const; + + void SetObjectNameEXT(const char* name) const; + +private: + void Release() const noexcept; + + VkBuffer handle = nullptr; + VkDevice owner = nullptr; + VmaAllocator allocator = nullptr; + VmaAllocation allocation = nullptr; + std::span<u8> mapped = {}; + bool is_coherent = false; + const DeviceDispatch* dld = nullptr; +}; + class Queue { public: /// Construct an empty queue handle. @@ -639,17 +783,6 @@ private: const DeviceDispatch* dld = nullptr; }; -class Buffer : public Handle<VkBuffer, VkDevice, DeviceDispatch> { - using Handle<VkBuffer, VkDevice, DeviceDispatch>::Handle; - -public: - /// Attaches a memory allocation. - void BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const; - - /// Set object name. - void SetObjectNameEXT(const char* name) const; -}; - class BufferView : public Handle<VkBufferView, VkDevice, DeviceDispatch> { using Handle<VkBufferView, VkDevice, DeviceDispatch>::Handle; @@ -658,17 +791,6 @@ public: void SetObjectNameEXT(const char* name) const; }; -class Image : public Handle<VkImage, VkDevice, DeviceDispatch> { - using Handle<VkImage, VkDevice, DeviceDispatch>::Handle; - -public: - /// Attaches a memory allocation. - void BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const; - - /// Set object name. - void SetObjectNameEXT(const char* name) const; -}; - class ImageView : public Handle<VkImageView, VkDevice, DeviceDispatch> { using Handle<VkImageView, VkDevice, DeviceDispatch>::Handle; @@ -840,12 +962,8 @@ public: Queue GetQueue(u32 family_index) const noexcept; - Buffer CreateBuffer(const VkBufferCreateInfo& ci) const; - BufferView CreateBufferView(const VkBufferViewCreateInfo& ci) const; - Image CreateImage(const VkImageCreateInfo& ci) const; - ImageView CreateImageView(const VkImageViewCreateInfo& ci) const; Semaphore CreateSemaphore() const; |