15 files changed, 104 insertions, 77 deletions
diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index 21e03ae98..fa7ae835f 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -5,6 +5,10 @@ function(get_timestamp _var)
 endfunction()
 
 list(APPEND CMAKE_MODULE_PATH "${SRC_DIR}/externals/cmake-modules")
+
+# Find the package here with the known path so that the GetGit commands can find it as well
+find_package(Git QUIET PATHS "${GIT_EXECUTABLE}")
+
 # generate git/build information
 include(GetGitRevisionDescription)
 get_git_head_revision(GIT_REF_SPEC GIT_REV)
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 9b0c3db68..9afc6105d 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -15,6 +15,10 @@ endif ()
 if (DEFINED ENV{DISPLAYVERSION})
   set(DISPLAY_VERSION $ENV{DISPLAYVERSION})
 endif ()
+
+# Locate git so that its path can be passed to GenerateSCMRev.cmake as well
+find_package(Git QUIET)
+
 add_custom_command(OUTPUT scm_rev.cpp
     COMMAND ${CMAKE_COMMAND}
       -DSRC_DIR="${CMAKE_SOURCE_DIR}"
@@ -23,6 +27,7 @@ add_custom_command(OUTPUT scm_rev.cpp
       -DTITLE_BAR_FORMAT_RUNNING="${TITLE_BAR_FORMAT_RUNNING}"
       -DBUILD_TAG="${BUILD_TAG}"
       -DBUILD_ID="${DISPLAY_VERSION}"
+      -DGIT_EXECUTABLE="${GIT_EXECUTABLE}"
       -P "${CMAKE_SOURCE_DIR}/CMakeModules/GenerateSCMRev.cmake"
     DEPENDS
       # WARNING! It was too much work to try and make a common location for this list,
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index 12ea4ebe3..b9035a0be 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -317,6 +317,8 @@ void Process::FreeTLSRegion(VAddr tls_address) {
 }
 
 void Process::LoadModule(CodeSet module_, VAddr base_addr) {
+    code_memory_size += module_.memory.size(); // Read the size before the memory is moved from.
+
     const auto memory = std::make_shared<PhysicalMemory>(std::move(module_.memory));
 
     const auto MapSegment = [&](const CodeSet::Segment& segment, VMAPermission permissions,
@@ -332,8 +334,6 @@ void Process::LoadModule(CodeSet module_, VAddr base_addr) {
     MapSegment(module_.CodeSegment(), VMAPermission::ReadExecute, MemoryState::Code);
     MapSegment(module_.RODataSegment(), VMAPermission::Read, MemoryState::CodeData);
     MapSegment(module_.DataSegment(), VMAPermission::ReadWrite, MemoryState::CodeData);
-
-    code_memory_size += module_.memory.size();
 }
 
 Process::Process(Core::System& system)
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index a35e7a195..16f95b77d 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1271,8 +1271,6 @@ public:
 
     } dirty{};
 
-    std::array<u8, Regs::NUM_REGS> dirty_pointers{};
-
     /// Reads a register value located at the input method address
     u32 GetRegisterValue(u32 method) const;
 
@@ -1367,6 +1365,8 @@ private:
 
     bool execute_on{true};
 
+    std::array<u8, Regs::NUM_REGS> dirty_pointers{};
+
     /// Retrieves information about a specific TIC entry from the TIC buffer.
     Texture::TICEntry GetTICEntry(u32 tic_index) const;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index de742d11c..a4acb3796 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -34,9 +34,6 @@ using VideoCommon::Shader::ShaderIR;
 
 namespace {
 
-// One UBO is always reserved for emulation values on staged shaders
-constexpr u32 STAGE_RESERVED_UBOS = 1;
-
 constexpr u32 STAGE_MAIN_OFFSET = 10;
 constexpr u32 KERNEL_MAIN_OFFSET = 0;
 
@@ -243,7 +240,6 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp
     if (!code_b.empty()) {
         ir_b.emplace(code_b, main_offset, COMPILER_SETTINGS, locker);
     }
-    const auto entries = GLShader::GetEntries(ir);
 
     std::string source = fmt::format(R"(// {}
 #version 430 core
@@ -314,9 +310,10 @@ std::unordered_set<GLenum> GetSupportedFormats() {
 
 CachedShader::CachedShader(const ShaderParameters& params, ShaderType shader_type,
                            GLShader::ShaderEntries entries, ProgramCode code, ProgramCode code_b)
-    : RasterizerCacheObject{params.host_ptr}, system{params.system}, disk_cache{params.disk_cache},
-      device{params.device}, cpu_addr{params.cpu_addr}, unique_identifier{params.unique_identifier},
-      shader_type{shader_type}, entries{entries}, code{std::move(code)}, code_b{std::move(code_b)} {
+    : RasterizerCacheObject{params.host_ptr}, system{params.system},
+      disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr},
+      unique_identifier{params.unique_identifier}, shader_type{shader_type},
+      entries{std::move(entries)}, code{std::move(code)}, code_b{std::move(code_b)} {
     if (!params.precompiled_variants) {
         return;
     }
diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp
index 9770dda1c..ac99e6385 100644
--- a/src/video_core/renderer_opengl/utils.cpp
+++ b/src/video_core/renderer_opengl/utils.cpp
@@ -6,16 +6,20 @@
 #include <vector>
 
 #include <fmt/format.h>
-
 #include <glad/glad.h>
 
-#include "common/assert.h"
 #include "common/common_types.h"
-#include "common/scope_exit.h"
 #include "video_core/renderer_opengl/utils.h"
 
 namespace OpenGL {
 
+struct VertexArrayPushBuffer::Entry { // Out-of-line definition of the nested Entry type.
+    GLuint binding_index{};
+    const GLuint* buffer{}; // NOTE(review): presumably points at a GL buffer handle; confirm.
+    GLintptr offset{};
+    GLsizei stride{};
+};
+
 VertexArrayPushBuffer::VertexArrayPushBuffer() = default;
 
 VertexArrayPushBuffer::~VertexArrayPushBuffer() = default;
@@ -47,6 +51,13 @@ void VertexArrayPushBuffer::Bind() {
     }
 }
 
+struct BindBuffersRangePushBuffer::Entry { // Out-of-line definition of the nested Entry type.
+    GLuint binding; // Members here have no default initializers; set every field explicitly.
+    const GLuint* buffer; // NOTE(review): presumably points at a GL buffer handle; confirm.
+    GLintptr offset;
+    GLsizeiptr size;
+};
+
 BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {}
 
 BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default;
diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h
index d56153fe7..3ad7c02d4 100644
--- a/src/video_core/renderer_opengl/utils.h
+++ b/src/video_core/renderer_opengl/utils.h
@@ -26,12 +26,7 @@ public:
     void Bind();
 
 private:
-    struct Entry {
-        GLuint binding_index{};
-        const GLuint* buffer{};
-        GLintptr offset{};
-        GLsizei stride{};
-    };
+    struct Entry;
 
     GLuint vao{};
     const GLuint* index_buffer{};
@@ -50,12 +45,7 @@ public:
     void Bind();
 
 private:
-    struct Entry {
-        GLuint binding;
-        const GLuint* buffer;
-        GLintptr offset;
-        GLsizeiptr size;
-    };
+    struct Entry;
 
     GLenum target;
     std::vector<Entry> entries;
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 000e3616d..331808113 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -44,7 +44,7 @@ vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filt
     return {};
 }
 
-vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode,
+vk::SamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode,
                                 Tegra::Texture::TextureFilter filter) {
     switch (wrap_mode) {
     case Tegra::Texture::WrapMode::Wrap:
@@ -56,7 +56,12 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode,
     case Tegra::Texture::WrapMode::Border:
         return vk::SamplerAddressMode::eClampToBorder;
     case Tegra::Texture::WrapMode::Clamp:
-        // TODO(Rodrigo): Emulate GL_CLAMP properly
+        if (device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) {
+            // Nvidia's Vulkan driver defaults to GL_CLAMP on invalid enumerations; we can hack
+            // this by deliberately sending an invalid enumeration.
+            return static_cast<vk::SamplerAddressMode>(0xcafe);
+        }
+        // TODO(Rodrigo): Emulate GL_CLAMP properly on other vendors
         switch (filter) {
         case Tegra::Texture::TextureFilter::Nearest:
             return vk::SamplerAddressMode::eClampToEdge;
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h
index 1534b738b..7e9678b7b 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.h
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h
@@ -22,7 +22,7 @@ vk::Filter Filter(Tegra::Texture::TextureFilter filter);
 
 vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter);
 
-vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode,
+vk::SamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode,
                                 Tegra::Texture::TextureFilter filter);
 
 vk::CompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func);
diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
index 1ce583f75..0a8ec8398 100644
--- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
@@ -46,9 +46,9 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc)
         {}, MaxwellToVK::Sampler::Filter(tsc.mag_filter),
         MaxwellToVK::Sampler::Filter(tsc.min_filter),
         MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter),
-        MaxwellToVK::Sampler::WrapMode(tsc.wrap_u, tsc.mag_filter),
-        MaxwellToVK::Sampler::WrapMode(tsc.wrap_v, tsc.mag_filter),
-        MaxwellToVK::Sampler::WrapMode(tsc.wrap_p, tsc.mag_filter), tsc.GetLodBias(),
+        MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter),
+        MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter),
+        MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), tsc.GetLodBias(),
         has_anisotropy, max_anisotropy, tsc.depth_compare_enabled,
         MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), tsc.GetMinLod(),
         tsc.GetMaxLod(), vk_border_color.value_or(vk::BorderColor::eFloatTransparentBlack),
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index b427ac873..0229733b6 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -65,7 +65,7 @@ struct BlockInfo {
 
 struct CFGRebuildState {
     explicit CFGRebuildState(const ProgramCode& program_code, u32 start, ConstBufferLocker& locker)
-        : program_code{program_code}, start{start}, locker{locker} {}
+        : program_code{program_code}, locker{locker}, start{start} {}
 
     const ProgramCode& program_code;
     ConstBufferLocker& locker;
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index c934d0719..8cc84e935 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -6,6 +6,7 @@
 #include <vector>
 #include <fmt/format.h>
 
+#include "common/alignment.h"
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
@@ -22,34 +23,39 @@ using Tegra::Shader::Register;
 
 namespace {
 
-u32 GetLdgMemorySize(Tegra::Shader::UniformType uniform_type) {
+bool IsUnaligned(Tegra::Shader::UniformType uniform_type) { // True for the sub-word types.
+    return uniform_type == Tegra::Shader::UniformType::UnsignedByte ||
+           uniform_type == Tegra::Shader::UniformType::UnsignedShort;
+}
+
+u32 GetUnalignedMask(Tegra::Shader::UniformType uniform_type) { // Address mask for sub-word types
     switch (uniform_type) {
     case Tegra::Shader::UniformType::UnsignedByte:
-    case Tegra::Shader::UniformType::Single:
-        return 1;
-    case Tegra::Shader::UniformType::Double:
-        return 2;
-    case Tegra::Shader::UniformType::Quad:
-    case Tegra::Shader::UniformType::UnsignedQuad:
-        return 4;
+        return 0b11; // Keeps both low address bits: any of the four bytes of a 32-bit word.
+    case Tegra::Shader::UniformType::UnsignedShort:
+        return 0b10; // Keeps only bit 1: the low or the high half of a 32-bit word.
     default:
-        UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
-        return 1;
+        UNREACHABLE(); // The callers visible in this file check IsUnaligned(type) first.
+        return 0;
     }
 }
 
-u32 GetStgMemorySize(Tegra::Shader::UniformType uniform_type) {
+u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) { // Access size of the type, in bits.
     switch (uniform_type) {
+    case Tegra::Shader::UniformType::UnsignedByte:
+        return 8;
+    case Tegra::Shader::UniformType::UnsignedShort:
+        return 16;
     case Tegra::Shader::UniformType::Single:
-        return 1;
+        return 32;
     case Tegra::Shader::UniformType::Double:
-        return 2;
+        return 64;
     case Tegra::Shader::UniformType::Quad:
     case Tegra::Shader::UniformType::UnsignedQuad:
-        return 4;
+        return 128;
     default:
         UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
-        return 1;
+        return 32; // Unimplemented types are treated as a single 32-bit access.
     }
 }
 
@@ -184,9 +190,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
         }();
 
         const auto [real_address_base, base_address, descriptor] =
-            TrackGlobalMemory(bb, instr, false);
+            TrackGlobalMemory(bb, instr, true, false);
 
-        const u32 count = GetLdgMemorySize(type);
+        const u32 size = GetMemorySize(type);
+        const u32 count = Common::AlignUp(size, 32) / 32;
         if (!real_address_base || !base_address) {
             // Tracking failed, load zeroes.
             for (u32 i = 0; i < count; ++i) {
@@ -200,14 +207,15 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset);
             Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
 
-            if (type == Tegra::Shader::UniformType::UnsignedByte) {
-                // To handle unaligned loads get the byte used to dereferenced global memory
-                // and extract that byte from the loaded uint32.
-                Node byte = Operation(OperationCode::UBitwiseAnd, real_address, Immediate(3));
-                byte = Operation(OperationCode::ULogicalShiftLeft, std::move(byte), Immediate(3));
+            // To handle unaligned loads get the bytes used to dereference global memory and extract
+            // those bytes from the loaded u32.
+            if (IsUnaligned(type)) {
+                Node mask = Immediate(GetUnalignedMask(type));
+                Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask));
+                offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3));
 
-                gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem), std::move(byte),
-                                 Immediate(8));
+                gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem),
+                                 std::move(offset), Immediate(size));
             }
 
             SetTemporary(bb, i, gmem);
@@ -295,19 +303,32 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             }
         }();
 
+        // Unaligned (8/16-bit) stores insert into the existing word, so memory is read as well.
+        const bool is_read = IsUnaligned(type);
         const auto [real_address_base, base_address, descriptor] =
-            TrackGlobalMemory(bb, instr, true);
+            TrackGlobalMemory(bb, instr, is_read, true);
         if (!real_address_base || !base_address) {
             // Tracking failed, skip the store.
             break;
         }
 
-        const u32 count = GetStgMemorySize(type);
+        const u32 size = GetMemorySize(type);
+        const u32 count = Common::AlignUp(size, 32) / 32;
         for (u32 i = 0; i < count; ++i) {
             const Node it_offset = Immediate(i * 4);
             const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset);
             const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
-            const Node value = GetRegister(instr.gpr0.Value() + i);
+            Node value = GetRegister(instr.gpr0.Value() + i);
+
+            if (IsUnaligned(type)) {
+                Node mask = Immediate(GetUnalignedMask(type));
+                Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask));
+                offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3));
+
+                value = Operation(OperationCode::UBitfieldInsert, gmem, std::move(value), offset,
+                                  Immediate(size));
+            }
+
             bb.push_back(Operation(OperationCode::Assign, gmem, value));
         }
         break;
@@ -336,7 +357,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
 
 std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock& bb,
                                                                      Instruction instr,
-                                                                     bool is_write) {
+                                                                     bool is_read, bool is_write) {
     const auto addr_register{GetRegister(instr.gmem.gpr)};
     const auto immediate_offset{static_cast<u32>(instr.gmem.offset)};
 
@@ -351,11 +372,8 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock&
     const GlobalMemoryBase descriptor{index, offset};
     const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);
     auto& usage = entry->second;
-    if (is_write) {
-        usage.is_written = true;
-    } else {
-        usage.is_read = true;
-    }
+    usage.is_written |= is_write;
+    usage.is_read |= is_read;
 
     const auto real_address =
         Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register);
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 4b14cdf58..cd984f763 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -794,14 +794,10 @@ std::tuple<std::size_t, std::size_t> ShaderIR::ValidateAndGetCoordinateElement(
 
 std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count,
                                                 bool is_tld4) {
-    const auto [coord_offsets, size, wrap_value,
-                diff_value] = [is_tld4]() -> std::tuple<std::array<u32, 3>, u32, s32, s32> {
-        if (is_tld4) {
-            return {{0, 8, 16}, 6, 32, 64};
-        } else {
-            return {{0, 4, 8}, 4, 8, 16};
-        }
-    }();
+    const std::array coord_offsets = is_tld4 ? std::array{0U, 8U, 16U} : std::array{0U, 4U, 8U};
+    const u32 size = is_tld4 ? 6 : 4;
+    const s32 wrap_value = is_tld4 ? 32 : 8;
+    const s32 diff_value = is_tld4 ? 64 : 16;
     const u32 mask = (1U << size) - 1;
 
     std::vector<Node> aoffi;
@@ -814,7 +810,7 @@ std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coor
         LOG_WARNING(HW_GPU,
                     "AOFFI constant folding failed, some hardware might have graphical issues");
         for (std::size_t coord = 0; coord < coord_count; ++coord) {
-            const Node value = BitfieldExtract(aoffi_reg, coord_offsets.at(coord), size);
+            const Node value = BitfieldExtract(aoffi_reg, coord_offsets[coord], size);
             const Node condition =
                 Operation(OperationCode::LogicalIGreaterEqual, value, Immediate(wrap_value));
             const Node negative = Operation(OperationCode::IAdd, value, Immediate(-diff_value));
@@ -824,7 +820,7 @@ std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coor
     }
 
     for (std::size_t coord = 0; coord < coord_count; ++coord) {
-        s32 value = (*aoffi_immediate >> coord_offsets.at(coord)) & mask;
+        s32 value = (*aoffi_immediate >> coord_offsets[coord]) & mask;
         if (value >= wrap_value) {
             value -= diff_value;
         }
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index aacd0a0da..ba1db4c11 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -394,7 +394,7 @@ private:
 
     std::tuple<Node, Node, GlobalMemoryBase> TrackGlobalMemory(NodeBlock& bb,
                                                                Tegra::Shader::Instruction instr,
-                                                               bool is_write);
+                                                               bool is_read, bool is_write);
 
     /// Register new amending code and obtain the reference id.
     std::size_t DeclareAmend(Node new_amend);
diff --git a/src/yuzu/configuration/configure_hotkeys.cpp b/src/yuzu/configuration/configure_hotkeys.cpp
index 3ea0b8d67..fa9052136 100644
--- a/src/yuzu/configuration/configure_hotkeys.cpp
+++ b/src/yuzu/configuration/configure_hotkeys.cpp
@@ -48,6 +48,7 @@ void ConfigureHotkeys::Populate(const HotkeyRegistry& registry) {
     }
 
     ui->hotkey_list->expandAll();
+    ui->hotkey_list->resizeColumnToContents(0);
 }
 
 void ConfigureHotkeys::changeEvent(QEvent* event) {