Android #139

Merge PR 12074
Merge PR 11535
2023-11-23 00:57:23 +00:00 · 2023-11-23 00:57:23 +00:00 · 2023-11-23 00:57:23 +00:00 · 2023-11-21 22:17:42 -05:00 · 2023-11-21 22:57:47 +00:00 · 2023-11-21 22:57:09 +00:00
89 changed files with 3673 additions and 947 deletions
--- a/.codespellrc
+++ b/.codespellrc
@ -3,4 +3,4 @@

 [codespell]
 skip = ./.git,./build,./dist,./Doxyfile,./externals,./LICENSES,./src/android/app/src/main/res
-ignore-words-list = aci,allright,ba,canonicalizations,deques,froms,hda,inout,lod,masia,nam,nax,nd,optin,pullrequests,pullrequest,te,transfered,unstall,uscaled,vas,zink
+ignore-words-list = aci,allright,ba,canonicalizations,deques,froms,hda,inout,lod,masia,nam,nax,nce,nd,optin,pullrequests,pullrequest,te,transfered,unstall,uscaled,vas,zink
--- a/.gitmodules
+++ b/.gitmodules
@ -61,3 +61,6 @@
 [submodule "breakpad"]
 	path = externals/breakpad
 	url = https://github.com/yuzu-emu/breakpad.git
+[submodule "oaknut"]
+	path = externals/oaknut
+	url = https://github.com/merryhime/oaknut
--- a/README.md
+++ b/README.md
@ -1,3 +1,13 @@
+| Pull Request | Commit | Title | Author | Merged? |
+|----|----|----|----|----|
+| [11535](https://github.com/yuzu-emu/yuzu//pull/11535) | [`50bcfa5fb`](https://github.com/yuzu-emu/yuzu//pull/11535/files) | renderer_vulkan: Introduce separate cmd buffer for uploads | [GPUCode](https://github.com/GPUCode/) | Yes |
+| [12074](https://github.com/yuzu-emu/yuzu//pull/12074) | [`20b671baa`](https://github.com/yuzu-emu/yuzu//pull/12074/files) | Implement Native Code Execution (NCE) | [GPUCode](https://github.com/GPUCode/) | Yes |
+
+
+End of merge log. You can find the original README.md below the break.
+
+-----
+
 <!--
 SPDX-FileCopyrightText: 2018 yuzu Emulator Project
 SPDX-License-Identifier: GPL-2.0-or-later
--- a/dist/languages/.tx/config
+++ b/dist/languages/.tx/config
@ -6,3 +6,8 @@ file_filter = <lang>.ts
 source_file = en.ts
 source_lang = en
 type = QT
+
+[o:yuzu-emulator:p:yuzu:r:yuzu-android]
+file_filter = ../../src/android/app/src/main/res/values-<lang>/strings.xml
+source_file = ../../src/android/app/src/main/res/values/strings.xml
+type = ANDROID
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@ -20,6 +20,10 @@ if ((ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) AND NOT TARGET xbyak::xbyak)
 endif()

 # Dynarmic
+if (ARCHITECTURE_arm64 AND NOT TARGET merry::oaknut)
+    add_subdirectory(oaknut)
+endif()
+
 if ((ARCHITECTURE_x86_64 OR ARCHITECTURE_arm64) AND NOT TARGET dynarmic::dynarmic)
    set(DYNARMIC_IGNORE_ASSERTS ON)
    add_subdirectory(dynarmic)
--- a/externals/oaknut
+++ b/externals/oaknut
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/NativeLibrary.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/NativeLibrary.kt
@ -301,6 +301,11 @@ object NativeLibrary {
     */
    external fun getPerfStats(): DoubleArray

+    /**
+     * Returns the current CPU backend.
+     */
+    external fun getCpuBackend(): String
+
    /**
     * Notifies the core emulation that the orientation has changed.
     */
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/IntSetting.kt
@ -10,6 +10,7 @@ enum class IntSetting(
    override val category: Settings.Category,
    override val androidDefault: Int? = null
 ) : AbstractIntSetting {
+    CPU_BACKEND("cpu_backend", Settings.Category.Cpu),
    CPU_ACCURACY("cpu_accuracy", Settings.Category.Cpu),
    REGION_INDEX("region_index", Settings.Category.System),
    LANGUAGE_INDEX("language_index", Settings.Category.System),
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/view/SettingsItem.kt
@ -77,6 +77,15 @@ abstract class SettingsItem(
                    "%"
                )
            )
+            put(
+                SingleChoiceSetting(
+                    IntSetting.CPU_BACKEND,
+                    R.string.cpu_backend,
+                    0,
+                    R.array.cpuBackendNames,
+                    R.array.cpuBackendValues
+                )
+            )
            put(
                SingleChoiceSetting(
                    IntSetting.CPU_ACCURACY,
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragmentPresenter.kt
@ -269,6 +269,7 @@ class SettingsFragmentPresenter(
            add(BooleanSetting.RENDERER_DEBUG.key)

            add(HeaderSetting(R.string.cpu))
+            add(IntSetting.CPU_BACKEND.key)
            add(IntSetting.CPU_ACCURACY.key)
            add(BooleanSetting.CPU_DEBUG_MODE.key)
            add(SettingsItem.FASTMEM_COMBINED)
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
@ -414,8 +414,10 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
            perfStatsUpdater = {
                if (emulationViewModel.emulationStarted.value) {
                    val perfStats = NativeLibrary.getPerfStats()
+                    val cpuBackend = NativeLibrary.getCpuBackend()
                    if (_binding != null) {
-                        binding.showFpsText.text = String.format("FPS: %.1f", perfStats[FPS])
+                        binding.showFpsText.text =
+                            String.format("FPS: %.1f\n%s", perfStats[FPS], cpuBackend)
                    }
                    perfStatsUpdateHandler.postDelayed(perfStatsUpdater!!, 800)
                }
--- a/src/android/app/src/main/jni/config.cpp
+++ b/src/android/app/src/main/jni/config.cpp
@ -177,6 +177,7 @@ void Config::ReadValues() {
    ReadSetting("Core", Settings::values.memory_layout_mode);

    // Cpu
+    ReadSetting("Cpu", Settings::values.cpu_backend);
    ReadSetting("Cpu", Settings::values.cpu_accuracy);
    ReadSetting("Cpu", Settings::values.cpu_debug_mode);
    ReadSetting("Cpu", Settings::values.cpuopt_page_tables);
--- a/src/android/app/src/main/jni/default_ini.h
+++ b/src/android/app/src/main/jni/default_ini.h
@ -153,6 +153,10 @@ use_multi_core =
 use_unsafe_extended_memory_layout =

 [Cpu]
+# Selects the preferred CPU backend for executing ARM instructions
+# 0 (default): Dynarmic, 1: NCE
+cpu_backend =
+
 # Adjusts various optimizations.
 # Auto-select mode enables choice unsafe optimizations.
 # Accurate enables only safe optimizations.
--- a/src/android/app/src/main/jni/native.cpp
+++ b/src/android/app/src/main/jni/native.cpp
@ -707,6 +707,14 @@ jdoubleArray Java_org_yuzu_yuzu_1emu_NativeLibrary_getPerfStats(JNIEnv* env, jcl
    return j_stats;
 }

+jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getCpuBackend(JNIEnv* env, jclass clazz) {
+    if (Settings::IsNceEnabled()) {
+        return ToJString(env, "NCE");
+    }
+
+    return ToJString(env, "JIT");
+}
+
 void Java_org_yuzu_yuzu_1emu_utils_DirectoryInitialization_setSysDirectory(JNIEnv* env,
                                                                           jclass clazz,
                                                                           jstring j_path) {}
--- a/src/android/app/src/main/res/values/arrays.xml
+++ b/src/android/app/src/main/res/values/arrays.xml
@ -175,6 +175,16 @@
        <item>2</item>
    </integer-array>

+    <string-array name="cpuBackendNames">
+        <item>@string/cpu_backend_dynarmic</item>
+        <item>@string/cpu_backend_nce</item>
+    </string-array>
+
+    <integer-array name="cpuBackendValues">
+        <item>0</item>
+        <item>1</item>
+    </integer-array>
+
    <string-array name="cpuAccuracyNames">
        <item>@string/auto</item>
        <item>@string/cpu_accuracy_accurate</item>
--- a/src/android/app/src/main/res/values/strings.xml
+++ b/src/android/app/src/main/res/values/strings.xml
@ -185,6 +185,7 @@
    <string name="frame_limit_enable_description">Limits emulation speed to a specified percentage of normal speed.</string>
    <string name="frame_limit_slider">Limit speed percent</string>
    <string name="frame_limit_slider_description">Specifies the percentage to limit emulation speed. 100% is the normal speed. Values higher or lower will increase or decrease the speed limit.</string>
+    <string name="cpu_backend">CPU Backend</string>
    <string name="cpu_accuracy">CPU accuracy</string>
    <string name="value_with_units">%1$s%2$s</string>

@ -416,6 +417,10 @@
    <string name="ratio_force_sixteen_ten">Force 16:10</string>
    <string name="ratio_stretch">Stretch to window</string>

+    <!-- CPU Backend -->
+    <string name="cpu_backend_dynarmic">Dynarmic (Slow)</string>
+    <string name="cpu_backend_nce">Native code execution (NCE)</string>
+
    <!-- CPU Accuracy -->
    <string name="cpu_accuracy_accurate">Accurate</string>
    <string name="cpu_accuracy_unsafe">Unsafe</string>
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@ -52,6 +52,7 @@ add_library(common STATIC
    fiber.cpp
    fiber.h
    fixed_point.h
+    free_region_manager.h
    fs/file.cpp
    fs/file.h
    fs/fs.cpp
@ -166,6 +167,13 @@ if (WIN32)
  target_link_libraries(common PRIVATE ntdll)
 endif()

+if (NOT WIN32)
+  target_sources(common PRIVATE
+    signal_chain.cpp
+    signal_chain.h
+  )
+endif()
+
 if(ANDROID)
    target_sources(common
        PRIVATE
--- a/src/common/free_region_manager.h
+++ b/src/common/free_region_manager.h
@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <mutex>
+#include <boost/icl/interval_set.hpp>
+
+namespace Common {
+
+class FreeRegionManager {
+public:
+    explicit FreeRegionManager() = default;
+    ~FreeRegionManager() = default;
+
+    void SetAddressSpace(void* start, size_t size) {
+        this->FreeBlock(start, size);
+    }
+
+    std::pair<void*, size_t> FreeBlock(void* block_ptr, size_t size) {
+        std::scoped_lock lk(m_mutex);
+
+        // Check to see if we are adjacent to any regions.
+        auto start_address = reinterpret_cast<uintptr_t>(block_ptr);
+        auto end_address = start_address + size;
+        auto it = m_free_regions.find({start_address - 1, end_address + 1});
+
+        // If we are, join with them, ensuring we stay in bounds.
+        if (it != m_free_regions.end()) {
+            start_address = std::min(start_address, it->lower());
+            end_address = std::max(end_address, it->upper());
+        }
+
+        // Free the relevant region.
+        m_free_regions.insert({start_address, end_address});
+
+        // Return the adjusted pointers.
+        block_ptr = reinterpret_cast<void*>(start_address);
+        size = end_address - start_address;
+        return {block_ptr, size};
+    }
+
+    void AllocateBlock(void* block_ptr, size_t size) {
+        std::scoped_lock lk(m_mutex);
+
+        auto address = reinterpret_cast<uintptr_t>(block_ptr);
+        m_free_regions.subtract({address, address + size});
+    }
+
+private:
+    std::mutex m_mutex;
+    boost::icl::interval_set<uintptr_t> m_free_regions;
+};
+
+} // namespace Common
--- a/src/common/host_memory.cpp
+++ b/src/common/host_memory.cpp
@ -21,15 +21,18 @@
 #include <boost/icl/interval_set.hpp>
 #include <fcntl.h>
 #include <sys/mman.h>
+#include <sys/random.h>
 #include <unistd.h>
 #include "common/scope_exit.h"

 #endif // ^^^ Linux ^^^

 #include <mutex>
+#include <random>

 #include "common/alignment.h"
 #include "common/assert.h"
+#include "common/free_region_manager.h"
 #include "common/host_memory.h"
 #include "common/logging/log.h"

@ -141,7 +144,7 @@ public:
        Release();
    }

-    void Map(size_t virtual_offset, size_t host_offset, size_t length) {
+    void Map(size_t virtual_offset, size_t host_offset, size_t length, MemoryPermission perms) {
        std::unique_lock lock{placeholder_mutex};
        if (!IsNiechePlaceholder(virtual_offset, length)) {
            Split(virtual_offset, length);
@ -160,7 +163,7 @@ public:
        }
    }

-    void Protect(size_t virtual_offset, size_t length, bool read, bool write) {
+    void Protect(size_t virtual_offset, size_t length, bool read, bool write, bool execute) {
        DWORD new_flags{};
        if (read && write) {
            new_flags = PAGE_READWRITE;
@ -186,6 +189,11 @@ public:
        }
    }

+    void EnableDirectMappedAddress() {
+        // TODO
+        UNREACHABLE();
+    }
+
    const size_t backing_size; ///< Size of the backing memory in bytes
    const size_t virtual_size; ///< Size of the virtual address placeholder in bytes

@ -353,6 +361,64 @@ private:

 #elif defined(__linux__) || defined(__FreeBSD__) // ^^^ Windows ^^^ vvv Linux vvv

+#ifdef ARCHITECTURE_arm64
+
+static uint64_t GetRandomU64() {
+    uint64_t ret; // Filled by getrandom(), which returns the byte count (not 0) on success.
+    ASSERT(getrandom(&ret, sizeof(ret), 0) == static_cast<ssize_t>(sizeof(ret)));
+    return ret;
+}
+
+static void* ChooseVirtualBase(size_t virtual_size) {
+    constexpr uintptr_t Map39BitSize = (1ULL << 39);
+    constexpr uintptr_t Map36BitSize = (1ULL << 36);
+
+    // Seed the MT with some initial strong randomness.
+    //
+    // This is not a cryptographic application, we just want something more
+    // random than the current time.
+    std::mt19937_64 rng(GetRandomU64());
+
+    // We want to ensure we are allocating at an address aligned to the L2 block size.
+    // For Qualcomm devices, we must also allocate memory above 36 bits.
+    const size_t lower = Map36BitSize / HugePageSize;
+    const size_t upper = (Map39BitSize - virtual_size) / HugePageSize;
+    const size_t range = upper - lower;
+
+    // Try up to 64 times to allocate memory at random addresses in the range.
+    for (int i = 0; i < 64; i++) {
+        // Calculate a possible location.
+        uintptr_t hint_address = ((rng() % range) + lower) * HugePageSize;
+
+        // Try to map.
+        // Note: we may be able to take advantage of MAP_FIXED_NOREPLACE here.
+        void* map_pointer =
+            mmap(reinterpret_cast<void*>(hint_address), virtual_size, PROT_READ | PROT_WRITE,
+                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
+
+        // If we successfully mapped, we're done.
+        if (reinterpret_cast<uintptr_t>(map_pointer) == hint_address) {
+            return map_pointer;
+        }
+
+        // Unmap if necessary, and try again.
+        if (map_pointer != MAP_FAILED) {
+            munmap(map_pointer, virtual_size);
+        }
+    }
+
+    return MAP_FAILED;
+}
+
+#else
+
+static void* ChooseVirtualBase(size_t virtual_size) {
+    return mmap(nullptr, virtual_size, PROT_READ | PROT_WRITE,
+                MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
+}
+
+#endif
+
 class HostMemory::Impl {
 public:
    explicit Impl(size_t backing_size_, size_t virtual_size_)
@ -415,8 +481,7 @@ public:
            }
        }
 #else
-        virtual_base = static_cast<u8*>(mmap(nullptr, virtual_size, PROT_NONE,
-                                             MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0));
+        virtual_base = virtual_map_base = static_cast<u8*>(ChooseVirtualBase(virtual_size));
        if (virtual_base == MAP_FAILED) {
            LOG_CRITICAL(HW_Memory, "mmap failed: {}", strerror(errno));
            throw std::bad_alloc{};
@ -424,7 +489,7 @@ public:
        madvise(virtual_base, virtual_size, MADV_HUGEPAGE);
 #endif

-        placeholders.add({0, virtual_size});
+        free_manager.SetAddressSpace(virtual_base, virtual_size);
        good = true;
    }

@ -432,14 +497,29 @@ public:
        Release();
    }

-    void Map(size_t virtual_offset, size_t host_offset, size_t length) {
-        {
-            std::scoped_lock lock{placeholder_mutex};
-            placeholders.subtract({virtual_offset, virtual_offset + length});
-        }
+    void Map(size_t virtual_offset, size_t host_offset, size_t length, MemoryPermission perms) {
+        // Intersect the range with our address space.
+        AdjustMap(&virtual_offset, &length);

-        void* ret = mmap(virtual_base + virtual_offset, length, PROT_READ | PROT_WRITE,
-                         MAP_SHARED | MAP_FIXED, fd, host_offset);
+        // We are removing a placeholder.
+        free_manager.AllocateBlock(virtual_base + virtual_offset, length);
+
+        // Deduce mapping protection flags.
+        int flags = PROT_NONE;
+        if (True(perms & MemoryPermission::Read)) {
+            flags |= PROT_READ;
+        }
+        if (True(perms & MemoryPermission::Write)) {
+            flags |= PROT_WRITE;
+        }
+#ifdef ARCHITECTURE_arm64
+        if (True(perms & MemoryPermission::Execute)) {
+            flags |= PROT_EXEC;
+        }
+#endif
+
+        void* ret = mmap(virtual_base + virtual_offset, length, flags, MAP_SHARED | MAP_FIXED, fd,
+                         host_offset);
        ASSERT_MSG(ret != MAP_FAILED, "mmap failed: {}", strerror(errno));
    }

@ -447,47 +527,54 @@ public:
        // The method name is wrong. We're still talking about the virtual range.
        // We don't want to unmap, we want to reserve this memory.

-        {
-            std::scoped_lock lock{placeholder_mutex};
-            auto it = placeholders.find({virtual_offset - 1, virtual_offset + length + 1});
+        // Intersect the range with our address space.
+        AdjustMap(&virtual_offset, &length);

-            if (it != placeholders.end()) {
-                size_t prev_upper = virtual_offset + length;
-                virtual_offset = std::min(virtual_offset, it->lower());
-                length = std::max(it->upper(), prev_upper) - virtual_offset;
-            }
+        // Merge with any adjacent placeholder mappings.
+        auto [merged_pointer, merged_size] =
+            free_manager.FreeBlock(virtual_base + virtual_offset, length);

-            placeholders.add({virtual_offset, virtual_offset + length});
-        }
-
-        void* ret = mmap(virtual_base + virtual_offset, length, PROT_NONE,
+        void* ret = mmap(merged_pointer, merged_size, PROT_NONE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
        ASSERT_MSG(ret != MAP_FAILED, "mmap failed: {}", strerror(errno));
    }

-    void Protect(size_t virtual_offset, size_t length, bool read, bool write) {
-        int flags = 0;
+    void Protect(size_t virtual_offset, size_t length, bool read, bool write, bool execute) {
+        // Intersect the range with our address space.
+        AdjustMap(&virtual_offset, &length);
+
+        int flags = PROT_NONE;
        if (read) {
            flags |= PROT_READ;
        }
        if (write) {
            flags |= PROT_WRITE;
        }
+#ifdef ARCHITECTURE_arm64
+        if (execute) {
+            flags |= PROT_EXEC;
+        }
+#endif
        int ret = mprotect(virtual_base + virtual_offset, length, flags);
        ASSERT_MSG(ret == 0, "mprotect failed: {}", strerror(errno));
    }

+    void EnableDirectMappedAddress() {
+        virtual_base = nullptr;
+    }
+
    const size_t backing_size; ///< Size of the backing memory in bytes
    const size_t virtual_size; ///< Size of the virtual address placeholder in bytes

    u8* backing_base{reinterpret_cast<u8*>(MAP_FAILED)};
    u8* virtual_base{reinterpret_cast<u8*>(MAP_FAILED)};
+    u8* virtual_map_base{reinterpret_cast<u8*>(MAP_FAILED)};

 private:
    /// Release all resources in the object
    void Release() {
-        if (virtual_base != MAP_FAILED) {
-            int ret = munmap(virtual_base, virtual_size);
+        if (virtual_map_base != MAP_FAILED) {
+            int ret = munmap(virtual_map_base, virtual_size);
            ASSERT_MSG(ret == 0, "munmap failed: {}", strerror(errno));
        }

@ -502,10 +589,29 @@ private:
        }
    }

-    int fd{-1}; // memfd file descriptor, -1 is the error value of memfd_create
+    void AdjustMap(size_t* virtual_offset, size_t* length) {
+        if (virtual_base != nullptr) {
+            return;
+        }

-    boost::icl::interval_set<size_t> placeholders; ///< Mapped placeholders
-    std::mutex placeholder_mutex;                  ///< Mutex for placeholders
+        // If we are direct mapped, we want to make sure we are operating on a region
+        // that is in range of our virtual mapping.
+        size_t intended_start = *virtual_offset;
+        size_t intended_end = intended_start + *length;
+        size_t address_space_start = reinterpret_cast<size_t>(virtual_map_base);
+        size_t address_space_end = address_space_start + virtual_size;
+
+        if (address_space_start > intended_end || intended_start > address_space_end) {
+            *virtual_offset = 0;
+            *length = 0;
+        } else {
+            *virtual_offset = std::max(intended_start, address_space_start);
+            *length = std::min(intended_end, address_space_end) - *virtual_offset;
+        }
+    }
+
+    int fd{-1}; // memfd file descriptor, -1 is the error value of memfd_create
+    FreeRegionManager free_manager{};
 };

 #else // ^^^ Linux ^^^ vvv Generic vvv
@ -518,11 +624,11 @@ public:
        throw std::bad_alloc{};
    }

-    void Map(size_t virtual_offset, size_t host_offset, size_t length) {}
+    void Map(size_t virtual_offset, size_t host_offset, size_t length, MemoryPermission perm) {}

    void Unmap(size_t virtual_offset, size_t length) {}

-    void Protect(size_t virtual_offset, size_t length, bool read, bool write) {}
+    void Protect(size_t virtual_offset, size_t length, bool read, bool write, bool execute) {}

    u8* backing_base{nullptr};
    u8* virtual_base{nullptr};
@ -535,15 +641,16 @@ HostMemory::HostMemory(size_t backing_size_, size_t virtual_size_)
    try {
        // Try to allocate a fastmem arena.
        // The implementation will fail with std::bad_alloc on errors.
-        impl = std::make_unique<HostMemory::Impl>(AlignUp(backing_size, PageAlignment),
-                                                  AlignUp(virtual_size, PageAlignment) +
-                                                      3 * HugePageSize);
+        impl =
+            std::make_unique<HostMemory::Impl>(AlignUp(backing_size, PageAlignment),
+                                               AlignUp(virtual_size, PageAlignment) + HugePageSize);
        backing_base = impl->backing_base;
        virtual_base = impl->virtual_base;

        if (virtual_base) {
-            virtual_base += 2 * HugePageSize - 1;
-            virtual_base -= reinterpret_cast<size_t>(virtual_base) & (HugePageSize - 1);
+            // Ensure the virtual base is aligned to the L2 block size.
+            virtual_base = reinterpret_cast<u8*>(
+                Common::AlignUp(reinterpret_cast<uintptr_t>(virtual_base), HugePageSize));
            virtual_base_offset = virtual_base - impl->virtual_base;
        }

@ -562,7 +669,8 @@ HostMemory::HostMemory(HostMemory&&) noexcept = default;

 HostMemory& HostMemory::operator=(HostMemory&&) noexcept = default;

-void HostMemory::Map(size_t virtual_offset, size_t host_offset, size_t length) {
+void HostMemory::Map(size_t virtual_offset, size_t host_offset, size_t length,
+                     MemoryPermission perms) {
    ASSERT(virtual_offset % PageAlignment == 0);
    ASSERT(host_offset % PageAlignment == 0);
    ASSERT(length % PageAlignment == 0);
@ -571,7 +679,7 @@ void HostMemory::Map(size_t virtual_offset, size_t host_offset, size_t length) {
    if (length == 0 || !virtual_base || !impl) {
        return;
    }
-    impl->Map(virtual_offset + virtual_base_offset, host_offset, length);
+    impl->Map(virtual_offset + virtual_base_offset, host_offset, length, perms);
 }

 void HostMemory::Unmap(size_t virtual_offset, size_t length) {
@ -584,14 +692,22 @@ void HostMemory::Unmap(size_t virtual_offset, size_t length) {
    impl->Unmap(virtual_offset + virtual_base_offset, length);
 }

-void HostMemory::Protect(size_t virtual_offset, size_t length, bool read, bool write) {
+void HostMemory::Protect(size_t virtual_offset, size_t length, bool read, bool write,
+                         bool execute) {
    ASSERT(virtual_offset % PageAlignment == 0);
    ASSERT(length % PageAlignment == 0);
    ASSERT(virtual_offset + length <= virtual_size);
    if (length == 0 || !virtual_base || !impl) {
        return;
    }
-    impl->Protect(virtual_offset + virtual_base_offset, length, read, write);
+    impl->Protect(virtual_offset + virtual_base_offset, length, read, write, execute);
+}
+
+void HostMemory::EnableDirectMappedAddress() {
+    if (impl) {
+        impl->EnableDirectMappedAddress();
+        virtual_size += reinterpret_cast<uintptr_t>(virtual_base);
+    }
 }

 } // namespace Common
--- a/src/common/host_memory.h
+++ b/src/common/host_memory.h
@ -4,11 +4,20 @@
 #pragma once

 #include <memory>
+#include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/virtual_buffer.h"

 namespace Common {

+enum class MemoryPermission : u32 {
+    Read = 1 << 0,
+    Write = 1 << 1,
+    ReadWrite = Read | Write,
+    Execute = 1 << 2,
+};
+DECLARE_ENUM_FLAG_OPERATORS(MemoryPermission)
+
 /**
 * A low level linear memory buffer, which supports multiple mappings
 * Its purpose is to rebuild a given sparse memory layout, including mirrors.
@ -31,11 +40,13 @@ public:
    HostMemory(HostMemory&& other) noexcept;
    HostMemory& operator=(HostMemory&& other) noexcept;

-    void Map(size_t virtual_offset, size_t host_offset, size_t length);
+    void Map(size_t virtual_offset, size_t host_offset, size_t length, MemoryPermission perms);

    void Unmap(size_t virtual_offset, size_t length);

-    void Protect(size_t virtual_offset, size_t length, bool read, bool write);
+    void Protect(size_t virtual_offset, size_t length, bool read, bool write, bool execute = false);
+
+    void EnableDirectMappedAddress();

    [[nodiscard]] u8* BackingBasePointer() noexcept {
        return backing_base;
--- a/src/common/settings.cpp
+++ b/src/common/settings.cpp
@ -41,6 +41,7 @@ SWITCHABLE(AspectRatio, true);
 SWITCHABLE(AstcDecodeMode, true);
 SWITCHABLE(AstcRecompression, true);
 SWITCHABLE(AudioMode, true);
+SWITCHABLE(CpuBackend, true);
 SWITCHABLE(CpuAccuracy, true);
 SWITCHABLE(FullscreenMode, true);
 SWITCHABLE(GpuAccuracy, true);
@ -155,6 +156,22 @@ bool IsFastmemEnabled() {
    return true;
 }

+static bool is_nce_enabled = false;
+
+void SetNceEnabled(bool is_39bit) {
+    const bool is_nce_selected = values.cpu_backend.GetValue() == CpuBackend::Nce;
+    is_nce_enabled = IsFastmemEnabled() && is_nce_selected && is_39bit;
+    if (is_nce_selected && !is_nce_enabled) {
+        LOG_WARNING(
+            Common,
+            "Program does not utilize 39-bit address space, unable to natively execute code");
+    }
+}
+
+bool IsNceEnabled() {
+    return is_nce_enabled;
+}
+
 bool IsDockedMode() {
    return values.use_docked_mode.GetValue() == Settings::ConsoleMode::Docked;
 }
--- a/src/common/settings.h
+++ b/src/common/settings.h
@ -63,6 +63,7 @@ SWITCHABLE(AspectRatio, true);
 SWITCHABLE(AstcDecodeMode, true);
 SWITCHABLE(AstcRecompression, true);
 SWITCHABLE(AudioMode, true);
+SWITCHABLE(CpuBackend, true);
 SWITCHABLE(CpuAccuracy, true);
 SWITCHABLE(FullscreenMode, true);
 SWITCHABLE(GpuAccuracy, true);
@ -179,6 +180,14 @@ struct Values {
                                             &use_speed_limit};

    // Cpu
+    SwitchableSetting<CpuBackend, true> cpu_backend{
+        linkage,         CpuBackend::Dynarmic, CpuBackend::Dynarmic,
+#ifdef ARCHITECTURE_arm64
+        CpuBackend::Nce,
+#else
+                                                    CpuBackend::Dynarmic,
+#endif
+        "cpu_backend",   Category::Cpu};
    SwitchableSetting<CpuAccuracy, true> cpu_accuracy{linkage,           CpuAccuracy::Auto,
                                                      CpuAccuracy::Auto, CpuAccuracy::Paranoid,
                                                      "cpu_accuracy",    Category::Cpu};
@ -358,6 +367,8 @@ struct Values {
                                          Category::RendererDebug};
    // TODO: remove this once AMDVLK supports VK_EXT_depth_bias_control
    bool renderer_amdvlk_depth_bias_workaround{};
+    Setting<bool> disable_buffer_reorder{linkage, false, "disable_buffer_reorder",
+                                         Category::RendererDebug};

    // System
    SwitchableSetting<Language, true> language_index{linkage,
@ -534,6 +545,8 @@ bool IsGPULevelExtreme();
 bool IsGPULevelHigh();

 bool IsFastmemEnabled();
+void SetNceEnabled(bool is_39bit); // NCE is only enabled for a 39-bit address space
+bool IsNceEnabled();

 bool IsDockedMode();

--- a/src/common/settings_enums.h
+++ b/src/common/settings_enums.h
@ -129,6 +129,8 @@ ENUM(ShaderBackend, Glsl, Glasm, SpirV);

 ENUM(GpuAccuracy, Normal, High, Extreme);

+ENUM(CpuBackend, Dynarmic, Nce);
+
 ENUM(CpuAccuracy, Auto, Accurate, Unsafe, Paranoid);

 ENUM(MemoryLayout, Memory_4Gb, Memory_6Gb, Memory_8Gb);
--- a/src/common/signal_chain.cpp
+++ b/src/common/signal_chain.cpp
@ -0,0 +1,42 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <dlfcn.h>
+
+#include "common/assert.h"
+#include "common/dynamic_library.h"
+#include "common/scope_exit.h"
+#include "common/signal_chain.h"
+
+namespace Common {
+
+template <typename T>
+T* LookupLibcSymbol(const char* name) {
+#if defined(__BIONIC__)
+    Common::DynamicLibrary provider("libc.so");
+    if (!provider.IsOpen()) {
+        UNREACHABLE_MSG("Failed to open libc!");
+    }
+#else
+    // For other operating environments, we assume the symbol is not overridden.
+    const char* base = nullptr;
+    Common::DynamicLibrary provider(base);
+#endif
+
+    void* sym = provider.GetSymbolAddress(name);
+    if (sym == nullptr) {
+        sym = dlsym(RTLD_DEFAULT, name);
+    }
+    if (sym == nullptr) {
+        UNREACHABLE_MSG("Unable to find symbol {}!", name);
+    }
+
+    return reinterpret_cast<T*>(sym);
+}
+
+int SigAction(int signum, const struct sigaction* act, struct sigaction* oldact) {
+    static auto libc_sigaction = LookupLibcSymbol<decltype(sigaction)>("sigaction");
+    return libc_sigaction(signum, act, oldact);
+}
+
+} // namespace Common
--- a/src/common/signal_chain.h
+++ b/src/common/signal_chain.h
@ -0,0 +1,19 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#ifndef _WIN32
+
+#include <signal.h>
+
+namespace Common {
+
+// Android's ART overrides sigaction with its own wrapper. This is problematic for SIGSEGV
+// in particular, because ART's handler accesses tpidr_el0, which conflicts with NCE.
+// This extracts the libc symbol and calls it directly.
+int SigAction(int signum, const struct sigaction* act, struct sigaction* oldact);
+
+} // namespace Common
+
+#endif
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@ -921,6 +921,23 @@ if (ENABLE_WEB_SERVICE)
    target_link_libraries(core PRIVATE web_service)
 endif()

+if (ARCHITECTURE_arm64 AND (ANDROID OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux"))
+    target_compile_definitions(core PRIVATE -DHAS_NCE)
+    enable_language(C ASM)
+    set(CMAKE_ASM_FLAGS "${CFLAGS} -x assembler-with-cpp")
+
+    target_sources(core PRIVATE
+        arm/nce/arm_nce.cpp
+        arm/nce/arm_nce.h
+        arm/nce/arm_nce.s
+        arm/nce/guest_context.h
+        arm/nce/patch.cpp
+        arm/nce/patch.h
+        arm/nce/instructions.h
+    )
+    target_link_libraries(core PRIVATE merry::oaknut)
+endif()
+
 if (ARCHITECTURE_x86_64 OR ARCHITECTURE_arm64)
    target_sources(core PRIVATE
        arm/dynarmic/arm_dynarmic.h
--- a/src/core/arm/arm_interface.cpp
+++ b/src/core/arm/arm_interface.cpp
@ -201,6 +201,8 @@ void ARM_Interface::Run() {
        if (True(hr & HaltReason::DataAbort)) {
            if (system.DebuggerEnabled()) {
                system.GetDebugger().NotifyThreadWatchpoint(current_thread, *HaltedWatchpoint());
+            } else {
+                LogBacktrace();
            }
            current_thread->RequestSuspend(SuspendType::Debug);
            break;
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@ -81,6 +81,9 @@ public:
    // thread context to be 800 bytes in size.
    static_assert(sizeof(ThreadContext64) == 0x320);

+    /// Perform any backend-specific initialization.
+    /// The default implementation does nothing; backends that need setup override it.
+    virtual void Initialize() {}
+
    /// Runs the CPU until an event happens
    void Run();

--- a/src/core/arm/nce/arm_nce.cpp
+++ b/src/core/arm/nce/arm_nce.cpp
@ -0,0 +1,400 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <cinttypes>
+#include <memory>
+
+#include "common/signal_chain.h"
+#include "core/arm/nce/arm_nce.h"
+#include "core/arm/nce/patch.h"
+#include "core/core.h"
+#include "core/memory.h"
+
+#include "core/hle/kernel/k_process.h"
+
+#include <signal.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+namespace Core {
+
+namespace {
+
+// Disposition that was registered for GuestFaultSignal before ours. It is filled in by
+// ARM_NCE::Initialize and called by ARM_NCE::HandleHostFault.
+struct sigaction g_orig_action;
+
+// Verify assembly offsets.
+// arm_nce.s addresses these members through the TpidrEl0* constants, so the constants must
+// match the real layout of NativeExecutionParameters.
+using NativeExecutionParameters = Kernel::KThread::NativeExecutionParameters;
+static_assert(offsetof(NativeExecutionParameters, native_context) == TpidrEl0NativeContext);
+static_assert(offsetof(NativeExecutionParameters, lock) == TpidrEl0Lock);
+static_assert(offsetof(NativeExecutionParameters, magic) == TpidrEl0TlsMagic);
+
+// Finds the FPSIMD record inside the reserved area of a signal frame.
+// Records are visited by advancing `size` bytes from each header until one tagged
+// FPSIMD_MAGIC is reached.
+// NOTE(review): there is no bound or terminator check; presumably an FPSIMD record is always
+// present in the frame - confirm.
+fpsimd_context* GetFloatingPointState(mcontext_t& host_ctx) {
+    auto* cursor = reinterpret_cast<char*>(&host_ctx.__reserved);
+    for (;;) {
+        auto* record = reinterpret_cast<_aarch64_ctx*>(cursor);
+        if (record->magic == FPSIMD_MAGIC) {
+            return reinterpret_cast<fpsimd_context*>(record);
+        }
+        cursor += record->size;
+    }
+}
+
+} // namespace
+
+// Called from ReturnToRunCodeByExceptionLevelChangeSignalHandler (arm_nce.s) with the signal's
+// ucontext. Stashes the host's callee-saved state in the guest context, then rewrites the
+// signal frame with the guest's register state. Returns the thread parameter block read from
+// x9, which the assembly caller installs as tpidr_el0.
+void* ARM_NCE::RestoreGuestContext(void* raw_context) {
+    // Retrieve the host context.
+    auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext;
+
+    // Thread-local parameters will be located in x9.
+    auto* tpidr = reinterpret_cast<NativeExecutionParameters*>(host_ctx.regs[9]);
+    auto* guest_ctx = static_cast<GuestContext*>(tpidr->native_context);
+
+    // Retrieve the host floating point state.
+    auto* fpctx = GetFloatingPointState(host_ctx);
+
+    // Save host callee-saved registers.
+    // This has to happen before the frame's registers are overwritten with guest values below.
+    // The copies start at v8 and x19 and are sized by the destination arrays.
+    std::memcpy(guest_ctx->host_ctx.host_saved_vregs.data(), &fpctx->vregs[8],
+                sizeof(guest_ctx->host_ctx.host_saved_vregs));
+    std::memcpy(guest_ctx->host_ctx.host_saved_regs.data(), &host_ctx.regs[19],
+                sizeof(guest_ctx->host_ctx.host_saved_regs));
+
+    // Save stack pointer.
+    guest_ctx->host_ctx.host_sp = host_ctx.sp;
+
+    // Restore all guest state except tpidr_el0.
+    host_ctx.sp = guest_ctx->sp;
+    host_ctx.pc = guest_ctx->pc;
+    host_ctx.pstate = guest_ctx->pstate;
+    fpctx->fpcr = guest_ctx->fpcr;
+    fpctx->fpsr = guest_ctx->fpsr;
+    std::memcpy(host_ctx.regs, guest_ctx->cpu_registers.data(), sizeof(host_ctx.regs));
+    std::memcpy(fpctx->vregs, guest_ctx->vector_registers.data(), sizeof(fpctx->vregs));
+
+    // Return the new thread-local storage pointer.
+    return tpidr;
+}
+
+// Counterpart of RestoreGuestContext: copies the guest register state out of the signal frame
+// into guest_ctx, then rewrites the frame with the host state saved earlier so that returning
+// from the signal resumes host code. The pending esr_el1 value is cleared and placed in x0 of
+// the frame.
+void ARM_NCE::SaveGuestContext(GuestContext* guest_ctx, void* raw_context) {
+    // Retrieve the host context.
+    auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext;
+
+    // Retrieve the host floating point state.
+    auto* fpctx = GetFloatingPointState(host_ctx);
+
+    // Save all guest registers except tpidr_el0.
+    std::memcpy(guest_ctx->cpu_registers.data(), host_ctx.regs, sizeof(host_ctx.regs));
+    std::memcpy(guest_ctx->vector_registers.data(), fpctx->vregs, sizeof(fpctx->vregs));
+    guest_ctx->fpsr = fpctx->fpsr;
+    guest_ctx->fpcr = fpctx->fpcr;
+    guest_ctx->pstate = static_cast<u32>(host_ctx.pstate);
+    guest_ctx->pc = host_ctx.pc;
+    guest_ctx->sp = host_ctx.sp;
+
+    // Restore stack pointer.
+    host_ctx.sp = guest_ctx->host_ctx.host_sp;
+
+    // Restore host callee-saved registers.
+    std::memcpy(&host_ctx.regs[19], guest_ctx->host_ctx.host_saved_regs.data(),
+                sizeof(guest_ctx->host_ctx.host_saved_regs));
+    std::memcpy(&fpctx->vregs[8], guest_ctx->host_ctx.host_saved_vregs.data(),
+                sizeof(guest_ctx->host_ctx.host_saved_vregs));
+
+    // Return from the call on exit by setting pc to x30.
+    // host_saved_regs starts at x19, so index 11 is x30.
+    host_ctx.pc = guest_ctx->host_ctx.host_saved_regs[11];
+
+    // Clear esr_el1 and return it.
+    host_ctx.regs[0] = guest_ctx->esr_el1.exchange(0);
+}
+
+// Handles a fault raised while guest code was running.
+// Returns true when execution should continue in the guest, and false when the guest context
+// has been saved and the signal frame now returns to host code.
+bool ARM_NCE::HandleGuestFault(GuestContext* guest_ctx, void* raw_info, void* raw_context) {
+    auto* const fault_info = static_cast<siginfo_t*>(raw_info);
+    auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext;
+    const u64 fault_address = reinterpret_cast<u64>(fault_info->si_addr);
+
+    // First let the memory subsystem try to resolve the access for the page containing it.
+    // TODO: handle accesses which split a page?
+    const Common::ProcessAddress fault_page = fault_address & ~Memory::YUZU_PAGEMASK;
+    if (guest_ctx->system->ApplicationMemory().InvalidateNCE(fault_page, Memory::YUZU_PAGESIZE)) {
+        // Resolved: go straight back to guest code.
+        return true;
+    }
+
+    // The access could not be resolved. A fault address different from pc means a data abort:
+    // step over the offending instruction and keep running, which lets games continue in many
+    // scenarios where they would otherwise crash.
+    if (host_ctx.pc != fault_address) {
+        host_ctx.pc += 4;
+        return true;
+    }
+
+    // Otherwise this is a prefetch abort; record it as the halt reason.
+    guest_ctx->esr_el1.fetch_or(static_cast<u64>(HaltReason::PrefetchAbort));
+
+    // Forcibly mark the context as locked. We are still running.
+    // We may race with SignalInterrupt here:
+    // - If we lose the race, then SignalInterrupt will send us a signal we are masking,
+    //   and it will do nothing when it is unmasked, as we have already left guest code.
+    // - If we win the race, then SignalInterrupt will wait for us to unlock first.
+    guest_ctx->parent->running_thread->GetNativeExecutionParameters().lock.store(SpinLockLocked);
+
+    // Hand control back to the host.
+    SaveGuestContext(guest_ctx, raw_context);
+    return false;
+}
+
+// Forwards a fault that did not originate in guest code to the handler that was installed
+// before ours.
+void ARM_NCE::HandleHostFault(int sig, void* raw_info, void* raw_context) {
+    auto* const info = static_cast<siginfo_t*>(raw_info);
+    g_orig_action.sa_sigaction(sig, info, raw_context);
+}
+
+// Enters guest code for the current thread and returns the reason execution stopped.
+// If a halt is already pending in esr_el1 it is consumed and returned without entering the guest.
+// The thread parameter lock taken here is released by the entry path (see arm_nce.s) or, on
+// exit, by the explicit unlock below; SignalInterrupt takes the same lock to synchronize.
+HaltReason ARM_NCE::RunJit() {
+    // Get the thread parameters.
+    // TODO: pass the current thread down from ::Run
+    auto* thread = Kernel::GetCurrentThreadPointer(system.Kernel());
+    auto* thread_params = &thread->GetNativeExecutionParameters();
+
+    {
+        // Lock our core context.
+        std::scoped_lock lk{lock};
+
+        // We should not be running.
+        ASSERT(running_thread == nullptr);
+
+        // Check if we need to run. If we have already been halted, we are done.
+        u64 halt = guest_ctx.esr_el1.exchange(0);
+        if (halt != 0) {
+            return static_cast<HaltReason>(halt);
+        }
+
+        // Mark that we are running.
+        running_thread = thread;
+
+        // Acquire the lock on the thread parameters.
+        // This allows us to force synchronization with SignalInterrupt.
+        LockThreadParameters(thread_params);
+    }
+
+    // Assign current members.
+    guest_ctx.parent = this;
+    thread_params->native_context = &guest_ctx;
+    thread_params->tpidr_el0 = guest_ctx.tpidr_el0;
+    thread_params->tpidrro_el0 = guest_ctx.tpidrro_el0;
+    thread_params->is_running = true;
+
+    HaltReason halt{};
+
+    // Pick the entry path: a registered post handler for the current pc is entered through its
+    // trampoline, anything else goes through the signal-based path.
+    // TODO: finding and creating the post handler needs to be locked
+    // to deal with dynamic loading of NROs.
+    const auto& post_handlers = system.ApplicationProcess()->GetPostHandlers();
+    if (auto it = post_handlers.find(guest_ctx.pc); it != post_handlers.end()) {
+        halt = ReturnToRunCodeByTrampoline(thread_params, &guest_ctx, it->second);
+    } else {
+        halt = ReturnToRunCodeByExceptionLevelChange(thread_id, thread_params);
+    }
+
+    // Unload members.
+    // The thread does not change, so we can persist the old reference.
+    guest_ctx.tpidr_el0 = thread_params->tpidr_el0;
+    thread_params->native_context = nullptr;
+    thread_params->is_running = false;
+
+    // Unlock the thread parameters.
+    UnlockThreadParameters(thread_params);
+
+    {
+        // Lock the core context.
+        std::scoped_lock lk{lock};
+
+        // On exit, we no longer have an active thread.
+        running_thread = nullptr;
+    }
+
+    // Return the halt reason.
+    return halt;
+}
+
+// Single-stepping is not implemented: this reports StepThread without executing anything.
+HaltReason ARM_NCE::StepJit() {
+    return HaltReason::StepThread;
+}
+
+// Returns the svc_swi value stored in the guest context.
+u32 ARM_NCE::GetSvcNumber() const {
+    return guest_ctx.svc_swi;
+}
+
+// Stores the core index and gives the guest context a pointer back to the owning System.
+// Signal handlers are not installed here; that happens in Initialize().
+ARM_NCE::ARM_NCE(System& system_, bool uses_wall_clock_, std::size_t core_index_)
+    : ARM_Interface{system_, uses_wall_clock_}, core_index{core_index_} {
+    guest_ctx.system = &system_;
+}
+
+ARM_NCE::~ARM_NCE() = default;
+
+// Records the calling thread's id and installs the NCE signal handlers (once per process).
+void ARM_NCE::Initialize() {
+    // Signals for this core are later sent to this thread id.
+    thread_id = gettid();
+
+    // Only the first call installs the handlers.
+    static std::once_flag handlers_installed;
+    std::call_once(handlers_installed, [] {
+        using HandlerType = decltype(sigaction::sa_sigaction);
+
+        // Every NCE signal is blocked while any one of our handlers is executing.
+        sigset_t blocked_signals;
+        sigemptyset(&blocked_signals);
+        sigaddset(&blocked_signals, ReturnToRunCodeByExceptionLevelChangeSignal);
+        sigaddset(&blocked_signals, BreakFromRunCodeSignal);
+        sigaddset(&blocked_signals, GuestFaultSignal);
+
+        // Builds a sigaction for `handler` and registers it through Common::SigAction.
+        const auto install = [&blocked_signals](int signum, HandlerType handler, int flags,
+                                                struct sigaction* previous) {
+            struct sigaction action {};
+            action.sa_flags = flags;
+            action.sa_sigaction = handler;
+            action.sa_mask = blocked_signals;
+            Common::SigAction(signum, &action, previous);
+        };
+
+        install(ReturnToRunCodeByExceptionLevelChangeSignal,
+                reinterpret_cast<HandlerType>(
+                    &ARM_NCE::ReturnToRunCodeByExceptionLevelChangeSignalHandler),
+                SA_SIGINFO | SA_ONSTACK, nullptr);
+
+        install(BreakFromRunCodeSignal,
+                reinterpret_cast<HandlerType>(&ARM_NCE::BreakFromRunCodeSignalHandler),
+                SA_SIGINFO | SA_ONSTACK, nullptr);
+
+        // The fault handler also captures the previous disposition so that faults which are
+        // not ours can be forwarded to it.
+        install(GuestFaultSignal, reinterpret_cast<HandlerType>(&ARM_NCE::GuestFaultSignalHandler),
+                SA_SIGINFO | SA_ONSTACK | SA_RESTART, &g_orig_action);
+
+        // Simplify call for g_orig_action.
+        // These fields occupy the same space in memory, so this should be a no-op in practice.
+        const bool previous_uses_siginfo = (g_orig_action.sa_flags & SA_SIGINFO) != 0;
+        if (!previous_uses_siginfo) {
+            g_orig_action.sa_sigaction =
+                reinterpret_cast<decltype(g_orig_action.sa_sigaction)>(g_orig_action.sa_handler);
+        }
+    });
+}
+
+// Register accessors. All of these read or write the cached GuestContext only.
+void ARM_NCE::SetPC(u64 pc) {
+    guest_ctx.pc = pc;
+}
+
+u64 ARM_NCE::GetPC() const {
+    return guest_ctx.pc;
+}
+
+u64 ARM_NCE::GetSP() const {
+    return guest_ctx.sp;
+}
+
+// index is not range-checked here; cpu_registers holds 31 entries.
+u64 ARM_NCE::GetReg(int index) const {
+    return guest_ctx.cpu_registers[index];
+}
+
+void ARM_NCE::SetReg(int index, u64 value) {
+    guest_ctx.cpu_registers[index] = value;
+}
+
+// index is not range-checked here; vector_registers holds 32 entries.
+u128 ARM_NCE::GetVectorReg(int index) const {
+    return guest_ctx.vector_registers[index];
+}
+
+void ARM_NCE::SetVectorReg(int index, u128 value) {
+    guest_ctx.vector_registers[index] = value;
+}
+
+u32 ARM_NCE::GetPSTATE() const {
+    return guest_ctx.pstate;
+}
+
+void ARM_NCE::SetPSTATE(u32 pstate) {
+    guest_ctx.pstate = pstate;
+}
+
+// The TLS address is kept in tpidrro_el0.
+u64 ARM_NCE::GetTlsAddress() const {
+    return guest_ctx.tpidrro_el0;
+}
+
+void ARM_NCE::SetTlsAddress(u64 address) {
+    guest_ctx.tpidrro_el0 = address;
+}
+
+u64 ARM_NCE::GetTPIDR_EL0() const {
+    return guest_ctx.tpidr_el0;
+}
+
+void ARM_NCE::SetTPIDR_EL0(u64 value) {
+    guest_ctx.tpidr_el0 = value;
+}
+
+// Copies the cached guest register state into a 64-bit thread context.
+void ARM_NCE::SaveContext(ThreadContext64& ctx) const {
+    const GuestContext& guest = guest_ctx;
+
+    // General-purpose and control state.
+    ctx.cpu_registers = guest.cpu_registers;
+    ctx.sp = guest.sp;
+    ctx.pc = guest.pc;
+    ctx.pstate = guest.pstate;
+    ctx.tpidr = guest.tpidr_el0;
+
+    // Floating-point and vector state.
+    ctx.vector_registers = guest.vector_registers;
+    ctx.fpcr = guest.fpcr;
+    ctx.fpsr = guest.fpsr;
+}
+
+// Replaces the cached guest register state with the contents of a 64-bit thread context.
+void ARM_NCE::LoadContext(const ThreadContext64& ctx) {
+    GuestContext& guest = guest_ctx;
+
+    // General-purpose and control state.
+    guest.cpu_registers = ctx.cpu_registers;
+    guest.sp = ctx.sp;
+    guest.pc = ctx.pc;
+    guest.pstate = ctx.pstate;
+    guest.tpidr_el0 = ctx.tpidr;
+
+    // Floating-point and vector state.
+    guest.vector_registers = ctx.vector_registers;
+    guest.fpcr = ctx.fpcr;
+    guest.fpsr = ctx.fpsr;
+}
+
+// Requests that this core stop running guest code.
+// The BreakLoop halt reason is always recorded; a signal is only sent when a thread is
+// actually executing guest code.
+void ARM_NCE::SignalInterrupt() {
+    // Hold the core context lock for the whole operation.
+    std::scoped_lock core_guard{lock};
+
+    // Record the break request so the next RunJit sees it.
+    guest_ctx.esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));
+
+    // Nothing is running on this core, so there is nobody to signal.
+    if (running_thread == nullptr) {
+        return;
+    }
+
+    // Take the thread context lock.
+    auto* const thread_params = &running_thread->GetNativeExecutionParameters();
+    LockThreadParameters(thread_params);
+
+    // The thread already left guest code: release the lock, there is nothing more to do.
+    if (!thread_params->is_running) {
+        UnlockThreadParameters(thread_params);
+        return;
+    }
+
+    // Signal the running thread. It is responsible for unlocking the thread context.
+    syscall(SYS_tkill, thread_id, BreakFromRunCodeSignal);
+}
+
+// Discards any pending halt reasons.
+void ARM_NCE::ClearInterrupt() {
+    guest_ctx.esr_el1.store(0);
+}
+
+// Best-effort only: issues a sequentially consistent fence and does not touch the
+// instruction cache itself (see the TODO below).
+void ARM_NCE::ClearInstructionCache() {
+    // TODO: This is not possible to implement correctly on Linux because
+    // we do not have any access to ic iallu.
+
+    // Require accesses to complete.
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+}
+
+// addr and size are ignored; this does the same as ClearInstructionCache().
+void ARM_NCE::InvalidateCacheRange(u64 addr, std::size_t size) {
+    this->ClearInstructionCache();
+}
+
+void ARM_NCE::ClearExclusiveState() {
+    // No-op.
+}
+
+void ARM_NCE::PageTableChanged(Common::PageTable& page_table,
+                               std::size_t new_address_space_size_in_bits) {
+    // No-op. Page table is never used.
+}
+
+} // namespace Core
--- a/src/core/arm/nce/arm_nce.h
+++ b/src/core/arm/nce/arm_nce.h
@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <span>
+#include <unordered_map>
+#include <vector>
+
+#include "core/arm/arm_interface.h"
+#include "core/arm/nce/guest_context.h"
+
+namespace Core::Memory {
+class Memory;
+}
+
+namespace Core {
+
+class System;
+
+/// ARM_Interface backend for Native Code Execution (NCE).
+/// Guest register state lives in `guest_ctx`; entry to and exit from guest code go through the
+/// assembly routines and signal handlers declared below (defined in arm_nce.s).
+class ARM_NCE final : public ARM_Interface {
+public:
+    ARM_NCE(System& system_, bool uses_wall_clock_, std::size_t core_index_);
+
+    ~ARM_NCE() override;
+
+    void Initialize() override;
+    void SetPC(u64 pc) override;
+    u64 GetPC() const override;
+    u64 GetSP() const override;
+    u64 GetReg(int index) const override;
+    void SetReg(int index, u64 value) override;
+    u128 GetVectorReg(int index) const override;
+    void SetVectorReg(int index, u128 value) override;
+
+    u32 GetPSTATE() const override;
+    void SetPSTATE(u32 pstate) override;
+    u64 GetTlsAddress() const override;
+    void SetTlsAddress(u64 address) override;
+    void SetTPIDR_EL0(u64 value) override;
+    u64 GetTPIDR_EL0() const override;
+
+    // This backend always reports AArch64.
+    Architecture GetArchitecture() const override {
+        return Architecture::Aarch64;
+    }
+
+    // The 32-bit context overloads are intentionally empty.
+    void SaveContext(ThreadContext32& ctx) const override {}
+    void SaveContext(ThreadContext64& ctx) const override;
+    void LoadContext(const ThreadContext32& ctx) override {}
+    void LoadContext(const ThreadContext64& ctx) override;
+
+    void SignalInterrupt() override;
+    void ClearInterrupt() override;
+    void ClearExclusiveState() override;
+    void ClearInstructionCache() override;
+    void InvalidateCacheRange(u64 addr, std::size_t size) override;
+    void PageTableChanged(Common::PageTable& new_page_table,
+                          std::size_t new_address_space_size_in_bits) override;
+
+protected:
+    HaltReason RunJit() override;
+    HaltReason StepJit() override;
+
+    u32 GetSvcNumber() const override;
+
+    // Watchpoints are not reported by this backend.
+    const Kernel::DebugWatchpoint* HaltedWatchpoint() const override {
+        return nullptr;
+    }
+
+    void RewindBreakpointInstruction() override {}
+
+private:
+    // Assembly definitions.
+    // These are implemented in arm_nce.s under their mangled names, so changing a signature
+    // here requires renaming the matching symbol there.
+    static HaltReason ReturnToRunCodeByTrampoline(void* tpidr, GuestContext* ctx,
+                                                  u64 trampoline_addr);
+    static HaltReason ReturnToRunCodeByExceptionLevelChange(int tid, void* tpidr);
+
+    static void ReturnToRunCodeByExceptionLevelChangeSignalHandler(int sig, void* info,
+                                                                   void* raw_context);
+    static void BreakFromRunCodeSignalHandler(int sig, void* info, void* raw_context);
+    static void GuestFaultSignalHandler(int sig, void* info, void* raw_context);
+
+    static void LockThreadParameters(void* tpidr);
+    static void UnlockThreadParameters(void* tpidr);
+
+private:
+    // C++ implementation functions for assembly definitions.
+    // arm_nce.s calls these by mangled name as well.
+    static void* RestoreGuestContext(void* raw_context);
+    static void SaveGuestContext(GuestContext* ctx, void* raw_context);
+    static bool HandleGuestFault(GuestContext* ctx, void* info, void* raw_context);
+    static void HandleHostFault(int sig, void* info, void* raw_context);
+
+public:
+    // Members set on initialization.
+    std::size_t core_index{};
+    // Host thread id recorded by Initialize(); -1 until then.
+    pid_t thread_id{-1};
+
+    // Core context.
+    GuestContext guest_ctx;
+
+    // Thread and invalidation info.
+    // `lock` guards `running_thread`.
+    // NOTE(review): <mutex> is not among this header's direct includes; presumably it arrives
+    // transitively through arm_interface.h - confirm.
+    std::mutex lock;
+    Kernel::KThread* running_thread{};
+};
+
+} // namespace Core
--- a/src/core/arm/nce/arm_nce.s
+++ b/src/core/arm/nce/arm_nce.s
@ -0,0 +1,222 @@
+/* SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "core/arm/nce/arm_nce_asm_definitions.h"
+
+/* Loads a 32-bit constant into reg: mov sets bits 0-15, movk then inserts bits 16-31. */
+#define LOAD_IMMEDIATE_32(reg, val)                     \
+    mov     reg, #(((val) >> 0x00) & 0xFFFF);           \
+    movk    reg, #(((val) >> 0x10) & 0xFFFF), lsl #16
+
+
+/* static HaltReason Core::ARM_NCE::ReturnToRunCodeByTrampoline(void* tpidr, Core::GuestContext* ctx, u64 trampoline_addr) */
+/* On entry: x0 = tpidr, x1 = ctx, x2 = trampoline_addr. */
+/* Saves host sp, tpidr_el0 and the callee-saved registers into ctx->host_ctx, switches to the */
+/* guest sp and tpidr_el0, then branches to the trampoline. */
+.section    .text._ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm, "ax", %progbits
+.global     _ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm
+.type       _ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm, %function
+_ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm:
+    /* Back up host sp to x3. */
+    /* Back up host tpidr_el0 to x4. */
+    mov     x3, sp
+    mrs     x4, tpidr_el0
+
+    /* Load guest sp. x5 is used as a scratch register. */
+    ldr     x5, [x1, #(GuestContextSp)]
+    mov     sp, x5
+
+    /* Offset GuestContext pointer to the host member. */
+    add     x5, x1, #(GuestContextHostContext)
+
+    /* Save original host sp and tpidr_el0 (x3, x4) to host context. */
+    stp     x3, x4, [x5, #(HostContextSpTpidrEl0)]
+
+    /* Save all callee-saved host GPRs. */
+    stp     x19, x20, [x5, #(HostContextRegs+0x0)]
+    stp     x21, x22, [x5, #(HostContextRegs+0x10)]
+    stp     x23, x24, [x5, #(HostContextRegs+0x20)]
+    stp     x25, x26, [x5, #(HostContextRegs+0x30)]
+    stp     x27, x28, [x5, #(HostContextRegs+0x40)]
+    stp     x29, x30, [x5, #(HostContextRegs+0x50)]
+
+    /* Save all callee-saved host FPRs. */
+    stp     q8, q9,   [x5, #(HostContextVregs+0x0)]
+    stp     q10, q11, [x5, #(HostContextVregs+0x20)]
+    stp     q12, q13, [x5, #(HostContextVregs+0x40)]
+    stp     q14, q15, [x5, #(HostContextVregs+0x60)]
+
+    /* Load guest tpidr_el0 from argument. */
+    msr     tpidr_el0, x0
+
+    /* Tail call the trampoline to restore guest state. */
+    br      x2
+
+
+/* static HaltReason Core::ARM_NCE::ReturnToRunCodeByExceptionLevelChange(int tid, void* tpidr) */
+/* On entry: x0 = tid, x1 = tpidr. */
+/* Issues tkill(tid, ReturnToRunCodeByExceptionLevelChangeSignal); the signal handler reads */
+/* tpidr back out of x9 in the saved context. */
+.section    .text._ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv, "ax", %progbits
+.global     _ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv
+.type       _ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv, %function
+_ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv:
+    /* This jumps to the signal handler, which will restore the entire context. */
+    /* On entry, x0 = thread id, which is already in the right place. */
+
+    /* Move tpidr to x9 so it is not trampled. */
+    mov     x9, x1
+
+    /* Set up arguments. */
+    /* x8 carries the syscall number, x1 the signal number. */
+    mov     x8, #(__NR_tkill)
+    mov     x1, #(ReturnToRunCodeByExceptionLevelChangeSignal)
+
+    /* Tail call the signal handler. */
+    svc     #0
+
+    /* Block execution from flowing here. */
+    brk     #1000
+
+
+/* static void Core::ARM_NCE::ReturnToRunCodeByExceptionLevelChangeSignalHandler(int sig, void* info, void* raw_context) */
+/* On entry: x2 = raw_context. */
+/* Rewrites the signal frame with guest state, switches tpidr_el0 to the guest's parameter */
+/* block and releases the thread parameter lock. */
+.section    .text._ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_, "ax", %progbits
+.global     _ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_
+.type       _ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_, %function
+_ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_:
+    stp     x29, x30, [sp, #-0x10]!
+    mov     x29, sp
+
+    /* Call the context restorer with the raw context. */
+    /* It returns the thread parameter block pointer in x0. */
+    mov     x0, x2
+    bl      _ZN4Core7ARM_NCE19RestoreGuestContextEPv
+
+    /* Save the old value of tpidr_el0. */
+    mrs     x8, tpidr_el0
+    ldr     x9, [x0, #(TpidrEl0NativeContext)]
+    str     x8, [x9, #(GuestContextHostContext + HostContextTpidrEl0)]
+
+    /* Set our new tpidr_el0. */
+    msr     tpidr_el0, x0
+
+    /* Unlock the context. */
+    /* x0 still holds the parameter block pointer expected by UnlockThreadParameters. */
+    bl      _ZN4Core7ARM_NCE22UnlockThreadParametersEPv
+
+    /* Returning from here will enter the guest. */
+    ldp     x29, x30, [sp], #0x10
+    ret
+
+
+/* static void Core::ARM_NCE::BreakFromRunCodeSignalHandler(int sig, void* info, void* raw_context) */
+/* On entry: x2 = raw_context. */
+/* If tpidr_el0 points at a block carrying TlsMagic, the signal interrupted guest code: the */
+/* host tpidr_el0 is restored and SaveGuestContext(ctx, raw_context) is tail called. */
+/* Otherwise the signal is ignored. */
+.section    .text._ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_, "ax", %progbits
+.global     _ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_
+.type       _ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_, %function
+_ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_:
+    /* Check to see if we have the correct TLS magic. */
+    mrs     x8, tpidr_el0
+    ldr     w9, [x8, #(TpidrEl0TlsMagic)]
+
+    LOAD_IMMEDIATE_32(w10, TlsMagic)
+
+    cmp     w9, w10
+    b.ne    1f
+
+    /* Correct TLS magic, so this is a guest interrupt. */
+    /* Restore host tpidr_el0. */
+    /* x0 becomes the GuestContext pointer, the first argument of SaveGuestContext. */
+    ldr     x0, [x8, #(TpidrEl0NativeContext)]
+    ldr     x3, [x0, #(GuestContextHostContext + HostContextTpidrEl0)]
+    msr     tpidr_el0, x3
+
+    /* Tail call the restorer. */
+    mov     x1, x2
+    b       _ZN4Core7ARM_NCE16SaveGuestContextEPNS_12GuestContextEPv
+
+    /* Returning from here will enter host code. */
+
+1:
+    /* Incorrect TLS magic, so this is a spurious signal. */
+    ret
+
+
+/* static void Core::ARM_NCE::GuestFaultSignalHandler(int sig, void* info, void* raw_context) */
+/* On entry: x0 = sig, x1 = info, x2 = raw_context. */
+/* Faults taken while tpidr_el0 does not point at a block carrying TlsMagic are forwarded to */
+/* HandleHostFault with the arguments untouched. Guest faults go to HandleGuestFault, whose */
+/* result decides which tpidr_el0 is live when the signal returns. */
+.section    .text._ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_, "ax", %progbits
+.global     _ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_
+.type       _ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_, %function
+_ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_:
+    /* Check to see if we have the correct TLS magic. */
+    mrs     x8, tpidr_el0
+    ldr     w9, [x8, #(TpidrEl0TlsMagic)]
+
+    LOAD_IMMEDIATE_32(w10, TlsMagic)
+
+    cmp     w9, w10
+    b.eq    1f
+
+    /* Incorrect TLS magic, so this is a host fault. */
+    /* Tail call the handler. */
+    b       _ZN4Core7ARM_NCE15HandleHostFaultEiPvS1_
+
+1:
+    /* Correct TLS magic, so this is a guest fault. */
+    stp     x29, x30, [sp, #-0x20]!
+    str     x19, [sp, #0x10]
+    mov     x29, sp
+
+    /* Save the old tpidr_el0. */
+    mov     x19, x8
+
+    /* Restore host tpidr_el0. */
+    /* x0 becomes the GuestContext pointer; x1 (info) and x2 (raw_context) are passed through. */
+    ldr     x0, [x8, #(TpidrEl0NativeContext)]
+    ldr     x3, [x0, #(GuestContextHostContext + HostContextTpidrEl0)]
+    msr     tpidr_el0, x3
+
+    /* Call the handler. */
+    bl       _ZN4Core7ARM_NCE16HandleGuestFaultEPNS_12GuestContextEPvS3_
+
+    /* If the handler returned false, we want to preserve the host tpidr_el0. */
+    /* The result is a C++ bool: only the low byte of w0 is defined and it holds 0 or 1, */
+    /* so test bit 0 rather than comparing the whole of x0 against zero. */
+    tbz     w0, #0, 2f
+
+    /* Otherwise, restore guest tpidr_el0. */
+    msr     tpidr_el0, x19
+
+2:
+    ldr     x19, [sp, #0x10]
+    ldp     x29, x30, [sp], #0x20
+    ret
+
+
+/* static void Core::ARM_NCE::LockThreadParameters(void* tpidr) */
+/* On entry: x0 = tpidr. Spins until the lock word at tpidr + TpidrEl0Lock has been changed */
+/* from unlocked to SpinLockLocked (0) by this caller. */
+.section    .text._ZN4Core7ARM_NCE20LockThreadParametersEPv, "ax", %progbits
+.global     _ZN4Core7ARM_NCE20LockThreadParametersEPv
+.type       _ZN4Core7ARM_NCE20LockThreadParametersEPv, %function
+_ZN4Core7ARM_NCE20LockThreadParametersEPv:
+    /* Offset to lock member. */
+    add     x0, x0, #(TpidrEl0Lock)
+
+1:
+    /* Clear the monitor. */
+    clrex
+
+2:
+    /* Load-linked with acquire ordering. */
+    ldaxr   w1, [x0]
+
+    /* If the value was SpinLockLocked, clear monitor and retry. */
+    cbz     w1, 1b
+
+    /* Store-conditional SpinLockLocked with relaxed ordering. */
+    /* wzr supplies the value 0; w1 receives the store status. */
+    stxr    w1, wzr, [x0]
+
+    /* If we failed to store, retry. */
+    cbnz    w1, 2b
+
+    ret
+
+
+/* static void Core::ARM_NCE::UnlockThreadParameters(void* tpidr) */
+/* On entry: x0 = tpidr. Writes SpinLockUnlocked to the lock word at tpidr + TpidrEl0Lock. */
+/* The current value is not checked first. */
+.section    .text._ZN4Core7ARM_NCE22UnlockThreadParametersEPv, "ax", %progbits
+.global     _ZN4Core7ARM_NCE22UnlockThreadParametersEPv
+.type       _ZN4Core7ARM_NCE22UnlockThreadParametersEPv, %function
+_ZN4Core7ARM_NCE22UnlockThreadParametersEPv:
+    /* Offset to lock member. */
+    add     x0, x0, #(TpidrEl0Lock)
+
+    /* Load SpinLockUnlocked. */
+    mov     w1, #(SpinLockUnlocked)
+
+    /* Store value with release ordering. */
+    stlr    w1, [x0]
+
+    ret
--- a/src/core/arm/nce/arm_nce_asm_definitions.h
+++ b/src/core/arm/nce/arm_nce_asm_definitions.h
@ -0,0 +1,29 @@
+/* SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#pragma once
+
+/* This header is included both by arm_nce.s and by C++ code (guest_context.h). */
+/* NOTE(review): __ASSEMBLY__ is presumably defined so the kernel headers below expose only */
+/* preprocessor constants; it stays defined for the rest of any C++ translation unit that */
+/* includes this file - confirm that is intended. */
+#define __ASSEMBLY__
+
+#include <asm-generic/signal.h>
+#include <asm-generic/unistd.h>
+
+/* Signals used by NCE; the handlers are installed in ARM_NCE::Initialize. */
+#define ReturnToRunCodeByExceptionLevelChangeSignal SIGUSR2
+#define BreakFromRunCodeSignal SIGURG
+#define GuestFaultSignal SIGSEGV
+
+/* Byte offsets into Core::GuestContext, checked by static_asserts in guest_context.h. */
+#define GuestContextSp 0xF8
+#define GuestContextHostContext 0x320
+
+/* Byte offsets into Core::HostContext, checked by static_asserts in guest_context.h. */
+/* HostContextSpTpidrEl0 is the start of the adjacent host_sp / host_tpidr_el0 pair. */
+#define HostContextSpTpidrEl0 0xE0
+#define HostContextTpidrEl0 0xE8
+#define HostContextRegs 0x0
+#define HostContextVregs 0x60
+
+/* Byte offsets into NativeExecutionParameters, checked by static_asserts in arm_nce.cpp. */
+#define TpidrEl0NativeContext 0x10
+#define TpidrEl0Lock 0x18
+#define TpidrEl0TlsMagic 0x20
+/* Value the signal handlers compare against the magic field to recognise guest execution. */
+#define TlsMagic 0x555a5559
+
+/* Values of the lock word manipulated by Lock/UnlockThreadParameters. */
+#define SpinLockLocked 0
+#define SpinLockUnlocked 1
--- a/src/core/arm/nce/guest_context.h
+++ b/src/core/arm/nce/guest_context.h
@ -0,0 +1,50 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "core/arm/arm_interface.h"
+#include "core/arm/nce/arm_nce_asm_definitions.h"
+
+namespace Core {
+
+class ARM_NCE;
+class System;
+
+// Host state preserved while guest code runs.
+// The member offsets are hard-coded in arm_nce_asm_definitions.h and verified by the
+// static_asserts further down, so members must not be reordered or resized.
+struct HostContext {
+    // x19..x30, in that order.
+    alignas(16) std::array<u64, 12> host_saved_regs{};
+    // q8..q15, in that order.
+    alignas(16) std::array<u128, 8> host_saved_vregs{};
+    // host_sp and host_tpidr_el0 are written together with a single stp by the assembly.
+    u64 host_sp{};
+    void* host_tpidr_el0{};
+};
+
+// Guest register state plus the bookkeeping needed to enter and leave guest code.
+// The offsets of `sp` and `host_ctx` are hard-coded in arm_nce_asm_definitions.h and verified
+// by the static_asserts below, so the leading members must not be reordered or resized.
+struct GuestContext {
+    std::array<u64, 31> cpu_registers{};
+    u64 sp{};
+    u64 pc{};
+    u32 fpcr{};
+    u32 fpsr{};
+    std::array<u128, 32> vector_registers{};
+    u32 pstate{};
+    alignas(16) HostContext host_ctx{};
+    u64 tpidrro_el0{};
+    u64 tpidr_el0{};
+    // Pending HaltReason bits; writers OR bits in and readers exchange the value with 0.
+    std::atomic<u64> esr_el1{};
+    u32 nzcv{};
+    // Returned by ARM_NCE::GetSvcNumber().
+    u32 svc_swi{};
+    // Set by the ARM_NCE constructor.
+    System* system{};
+    // Set by ARM_NCE::RunJit before entering the guest.
+    ARM_NCE* parent{};
+};
+
+// Verify assembly offsets.
+static_assert(offsetof(GuestContext, sp) == GuestContextSp);
+static_assert(offsetof(GuestContext, host_ctx) == GuestContextHostContext);
+static_assert(offsetof(HostContext, host_sp) == HostContextSpTpidrEl0);
+static_assert(offsetof(HostContext, host_tpidr_el0) - 8 == HostContextSpTpidrEl0);
+static_assert(offsetof(HostContext, host_tpidr_el0) == HostContextTpidrEl0);
+static_assert(offsetof(HostContext, host_saved_regs) == HostContextRegs);
+static_assert(offsetof(HostContext, host_saved_vregs) == HostContextVregs);
+
+} // namespace Core
--- a/src/core/arm/nce/instructions.h
+++ b/src/core/arm/nce/instructions.h
@ -0,0 +1,147 @@
+// SPDX-FileCopyrightText: Copyright © 2020 Skyline Team and Contributors
+// SPDX-License-Identifier: MPL-2.0
+
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Core::NCE {
+
+// System register identifiers in the form returned by MRS::GetSystemReg and
+// MSR::GetSystemReg, i.e. the 15-bit field taken from bits 5-19 of the instruction.
+enum SystemRegister : u32 {
+    TpidrEl0 = 0x5E82,
+    TpidrroEl0 = 0x5E83,
+    CntfrqEl0 = 0x5F00,
+    CntpctEl0 = 0x5F01,
+};
+
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SVC--Supervisor-Call-
+union SVC {
+    constexpr explicit SVC(u32 raw_) : raw{raw_} {}
+
+    // Fixed opcode bits 0-4.
+    constexpr u32 GetSig0() {
+        return decltype(sig0)::ExtractValue(raw);
+    }
+
+    // Fixed opcode bits 21-31.
+    constexpr u32 GetSig1() {
+        return decltype(sig1)::ExtractValue(raw);
+    }
+
+    // The 16-bit immediate carried by the instruction.
+    constexpr u32 GetValue() {
+        return decltype(value)::ExtractValue(raw);
+    }
+
+    // True when both fixed opcode fields match the SVC encoding.
+    constexpr bool Verify() {
+        const bool low_bits_match = this->GetSig0() == 0x1;
+        const bool high_bits_match = this->GetSig1() == 0x6A0;
+        return low_bits_match && high_bits_match;
+    }
+
+    u32 raw;
+
+private:
+    BitField<0, 5, u32> sig0;   // 0x1
+    BitField<5, 16, u32> value; // 16-bit immediate
+    BitField<21, 11, u32> sig1; // 0x6A0
+};
+static_assert(sizeof(SVC) == sizeof(u32));
+static_assert(SVC(0xD40000C1).Verify());
+static_assert(SVC(0xD40000C1).GetValue() == 0x6);
+
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MRS--Move-System-Register-
+union MRS {
+    constexpr explicit MRS(u32 raw_) : raw{raw_} {}
+
+    // Fixed opcode bits 20-31.
+    constexpr u32 GetSig() {
+        return decltype(sig)::ExtractValue(raw);
+    }
+
+    // Source system register, bits 5-19.
+    constexpr u32 GetSystemReg() {
+        return decltype(system_reg)::ExtractValue(raw);
+    }
+
+    // Destination general-purpose register, bits 0-4.
+    constexpr u32 GetRt() {
+        return decltype(rt)::ExtractValue(raw);
+    }
+
+    // True when the fixed opcode bits match the MRS encoding.
+    constexpr bool Verify() {
+        const u32 opcode_bits = this->GetSig();
+        return opcode_bits == 0xD53;
+    }
+
+    u32 raw;
+
+private:
+    BitField<0, 5, u32> rt;          // destination register
+    BitField<5, 15, u32> system_reg; // source system register
+    BitField<20, 12, u32> sig;       // 0xD53
+};
+static_assert(sizeof(MRS) == sizeof(u32));
+static_assert(MRS(0xD53BE020).Verify());
+static_assert(MRS(0xD53BE020).GetSystemReg() == CntpctEl0);
+static_assert(MRS(0xD53BE020).GetRt() == 0x0);
+
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MSR--register---Move-general-purpose-register-to-System-Register-
+union MSR {
+    constexpr explicit MSR(u32 raw_) : raw{raw_} {}
+
+    // Fixed opcode bits 20-31.
+    constexpr u32 GetSig() {
+        return decltype(sig)::ExtractValue(raw);
+    }
+
+    // Destination system register, bits 5-19.
+    constexpr u32 GetSystemReg() {
+        return decltype(system_reg)::ExtractValue(raw);
+    }
+
+    // Source general-purpose register, bits 0-4.
+    constexpr u32 GetRt() {
+        return decltype(rt)::ExtractValue(raw);
+    }
+
+    // True when the fixed opcode bits match the MSR (register) encoding.
+    constexpr bool Verify() {
+        const u32 opcode_bits = this->GetSig();
+        return opcode_bits == 0xD51;
+    }
+
+    u32 raw;
+
+private:
+    BitField<0, 5, u32> rt;          // source register
+    BitField<5, 15, u32> system_reg; // destination system register
+    BitField<20, 12, u32> sig;       // 0xD51
+};
+static_assert(sizeof(MSR) == sizeof(u32));
+static_assert(MSR(0xD51BD040).Verify());
+static_assert(MSR(0xD51BD040).GetSystemReg() == TpidrEl0);
+static_assert(MSR(0xD51BD040).GetRt() == 0x0);
+
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXR--Load-Exclusive-Register-
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXP--Load-Exclusive-Pair-of-Registers-
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXR--Store-Exclusive-Register-
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXP--Store-Exclusive-Pair-of-registers-
+union Exclusive {
+    constexpr explicit Exclusive(u32 raw_) : raw{raw_} {}
+
+    // True when the fixed opcode bits (23-29) match the load/store exclusive encodings.
+    constexpr bool Verify() {
+        return this->GetSig() == 0x10;
+    }
+
+    constexpr u32 GetSig() {
+        return decltype(sig)::ExtractValue(raw);
+    }
+
+    // Returns the instruction word with the o0 (ordered) bit set; all other bits are unchanged.
+    constexpr u32 AsOrdered() {
+        return raw | decltype(o0)::FormatValue(1);
+    }
+
+    u32 raw;
+
+private:
+    BitField<0, 5, u32> rt;    // register operand 1 (Rt)
+    BitField<5, 5, u32> rn;    // memory operand (base register Rn)
+    BitField<10, 5, u32> rt2;  // register operand 2 (Rt2)
+    BitField<15, 1, u32> o0;   // ordered
+    BitField<16, 5, u32> rs;   // status register
+    BitField<21, 2, u32> l;    // operation type
+    BitField<23, 7, u32> sig;  // 0x10
+    BitField<30, 2, u32> size; // size
+};
+static_assert(sizeof(Exclusive) == sizeof(u32));
+static_assert(Exclusive(0xC85FFC00).Verify());
+static_assert(Exclusive(0xC85FFC00).AsOrdered() == 0xC85FFC00);
+static_assert(Exclusive(0xC85F7C00).AsOrdered() == 0xC85FFC00);
+static_assert(Exclusive(0xC8200440).AsOrdered() == 0xC8208440);
+
+} // namespace Core::NCE
--- a/src/core/arm/nce/patch.cpp
+++ b/src/core/arm/nce/patch.cpp
@ -0,0 +1,472 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "common/arm64/native_clock.h"
+#include "common/bit_cast.h"
+#include "common/literals.h"
+#include "core/arm/nce/arm_nce.h"
+#include "core/arm/nce/guest_context.h"
+#include "core/arm/nce/instructions.h"
+#include "core/arm/nce/patch.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/hle/kernel/svc.h"
+
+namespace Core::NCE {
+
+using namespace Common::Literals;
+using namespace oaknut::util;
+
+using NativeExecutionParameters = Kernel::KThread::NativeExecutionParameters;
+
+constexpr size_t MaxRelativeBranch = 128_MiB;
+constexpr u32 ModuleCodeIndex = 0x24 / sizeof(u32);
+
+Patcher::Patcher() : c(m_patch_instructions) {} // code generator is bound to m_patch_instructions
+
+Patcher::~Patcher() = default;
+
+void Patcher::PatchText(const Kernel::PhysicalMemory& program_image,
+                        const Kernel::CodeSet::Segment& code) {
+    // Emits the context helpers, then one patch handler per SVC/MRS/MSR that has to be emulated.
+    // Write save context helper function.
+    c.l(m_save_context);
+    WriteSaveContext();
+
+    // Write load context helper function.
+    c.l(m_load_context);
+    WriteLoadContext();
+
+    // Retrieve text segment data.
+    const auto text = std::span{program_image}.subspan(code.offset, code.size);
+    const auto text_words =
+        std::span<const u32>{reinterpret_cast<const u32*>(text.data()), text.size() / sizeof(u32)};
+
+    // Loop through instructions, patching as needed. The first ModuleCodeIndex words are skipped.
+    for (u32 i = ModuleCodeIndex; i < static_cast<u32>(text_words.size()); i++) {
+        const u32 inst = text_words[i];
+        // Records the branch from this instruction into the patch; returns the offset to resume at.
+        const auto AddRelocations = [&] {
+            const uintptr_t this_offset = i * sizeof(u32);
+            const uintptr_t next_offset = this_offset + sizeof(u32);
+
+            // Relocate from here to patch.
+            this->BranchToPatch(this_offset);
+
+            // Relocate from patch to next instruction.
+            return next_offset;
+        };
+
+        // SVC
+        if (auto svc = SVC{inst}; svc.Verify()) {
+            WriteSvcTrampoline(AddRelocations(), svc.GetValue());
+            continue;
+        }
+
+        // MRS Xn, TPIDR_EL0
+        // MRS Xn, TPIDRRO_EL0
+        if (auto mrs = MRS{inst};
+            mrs.Verify() && (mrs.GetSystemReg() == TpidrroEl0 || mrs.GetSystemReg() == TpidrEl0)) {
+            const auto src_reg = mrs.GetSystemReg() == TpidrroEl0 ? oaknut::SystemReg::TPIDRRO_EL0
+                                                                  : oaknut::SystemReg::TPIDR_EL0;
+            const auto dest_reg = oaknut::XReg{static_cast<int>(mrs.GetRt())};
+            WriteMrsHandler(AddRelocations(), dest_reg, src_reg);
+            continue;
+        }
+
+        // MRS Xn, CNTPCT_EL0
+        if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntpctEl0) {
+            WriteCntpctHandler(AddRelocations(), oaknut::XReg{static_cast<int>(mrs.GetRt())});
+            continue;
+        }
+
+        // MRS Xn, CNTFRQ_EL0
+        if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntfrqEl0) {
+            UNREACHABLE(); // no handler is generated for CNTFRQ_EL0 reads
+        }
+
+        // MSR TPIDR_EL0, Xn
+        if (auto msr = MSR{inst}; msr.Verify() && msr.GetSystemReg() == TpidrEl0) {
+            WriteMsrHandler(AddRelocations(), oaknut::XReg{static_cast<int>(msr.GetRt())});
+            continue;
+        }
+    }
+
+    // Determine patching mode for the final relocation step (PreText above MaxRelativeBranch).
+    const size_t image_size = program_image.size();
+    this->mode = image_size > MaxRelativeBranch ? PatchMode::PreText : PatchMode::PostData;
+}
+
+void Patcher::RelocateAndCopy(Common::ProcessAddress load_base,
+                              const Kernel::CodeSet::Segment& code,
+                              Kernel::PhysicalMemory& program_image,
+                              EntryTrampolines* out_trampolines) {
+    const size_t patch_size = GetSectionSize();
+    const size_t image_size = program_image.size();
+    // Overview: resolve every relocation recorded by PatchText, then copy the patch into the image.
+    // Retrieve text segment data.
+    const auto text = std::span{program_image}.subspan(code.offset, code.size);
+    const auto text_words =
+        std::span<u32>{reinterpret_cast<u32*>(text.data()), text.size() / sizeof(u32)};
+    // Overwrites a module instruction with a branch to its handler in the patch section.
+    const auto ApplyBranchToPatchRelocation = [&](u32* target, const Relocation& rel) {
+        oaknut::CodeGenerator rc{target};
+        if (mode == PatchMode::PreText) {
+            rc.B(rel.patch_offset - patch_size - rel.module_offset);
+        } else {
+            rc.B(image_size - rel.module_offset + rel.patch_offset);
+        }
+    };
+    // Fills a placeholder word inside the patch with a branch back into the module.
+    const auto ApplyBranchToModuleRelocation = [&](u32* target, const Relocation& rel) {
+        oaknut::CodeGenerator rc{target};
+        if (mode == PatchMode::PreText) {
+            rc.B(patch_size - rel.patch_offset + rel.module_offset);
+        } else {
+            rc.B(rel.module_offset - image_size - rel.patch_offset);
+        }
+    };
+    // Address of a patch offset relative to load_base for the selected patch placement.
+    const auto RebasePatch = [&](ptrdiff_t patch_offset) {
+        if (mode == PatchMode::PreText) {
+            return GetInteger(load_base) + patch_offset;
+        } else {
+            return GetInteger(load_base) + image_size + patch_offset;
+        }
+    };
+    // Address of a module offset relative to load_base for the selected patch placement.
+    const auto RebasePc = [&](uintptr_t module_offset) {
+        if (mode == PatchMode::PreText) {
+            return GetInteger(load_base) + patch_size + module_offset;
+        } else {
+            return GetInteger(load_base) + module_offset;
+        }
+    };
+
+    // We are now ready to relocate!
+    for (const Relocation& rel : m_branch_to_patch_relocations) {
+        ApplyBranchToPatchRelocation(text_words.data() + rel.module_offset / sizeof(u32), rel);
+    }
+    for (const Relocation& rel : m_branch_to_module_relocations) {
+        ApplyBranchToModuleRelocation(m_patch_instructions.data() + rel.patch_offset / sizeof(u32),
+                                      rel);
+    }
+
+    // Rewrite PC constants and record post trampolines
+    for (const Relocation& rel : m_write_module_pc_relocations) {
+        oaknut::CodeGenerator rc{m_patch_instructions.data() + rel.patch_offset / sizeof(u32)};
+        rc.dx(RebasePc(rel.module_offset));
+    }
+    for (const Trampoline& rel : m_trampolines) {
+        out_trampolines->insert({RebasePc(rel.module_offset), RebasePatch(rel.patch_offset)});
+    }
+
+    // Cortex-A57 seems to treat all exclusives as ordered, but newer processors do not.
+    // Convert to ordered to preserve this assumption.
+    for (u32 i = ModuleCodeIndex; i < static_cast<u32>(text_words.size()); i++) {
+        const u32 inst = text_words[i];
+        if (auto exclusive = Exclusive{inst}; exclusive.Verify()) {
+            text_words[i] = exclusive.AsOrdered();
+        }
+    }
+
+    // Copy to program image
+    if (this->mode == PatchMode::PreText) { // patch is written over the start of the image
+        std::memcpy(program_image.data(), m_patch_instructions.data(),
+                    m_patch_instructions.size() * sizeof(u32));
+    } else { // image grows by patch_size and the patch is appended
+        program_image.resize(image_size + patch_size);
+        std::memcpy(program_image.data() + image_size, m_patch_instructions.data(),
+                    m_patch_instructions.size() * sizeof(u32));
+    }
+}
+
+size_t Patcher::GetSectionSize() const noexcept { // patch bytes, aligned up to YUZU_PAGESIZE
+    return Common::AlignUp(m_patch_instructions.size() * sizeof(u32), Core::Memory::YUZU_PAGESIZE);
+}
+
+void Patcher::WriteLoadContext() {
+    // Emits the helper that reloads guest registers from GuestContext; patch code reaches it by BL.
+    // BL clobbers X30, so X30 doubles as scratch. The caller pushed its X30 into a freshly
+    // allocated 16-byte stack slot, which leaves SP + 8 free for our return address.
+    c.STR(X30, SP, 8);
+    c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context)); // X30 = context pointer
+
+    // Load system registers.
+    c.LDR(W0, X30, offsetof(GuestContext, fpsr));
+    c.MSR(oaknut::SystemReg::FPSR, X0);
+    c.LDR(W0, X30, offsetof(GuestContext, fpcr));
+    c.MSR(oaknut::SystemReg::FPCR, X0);
+    c.LDR(W0, X30, offsetof(GuestContext, nzcv));
+    c.MSR(oaknut::SystemReg::NZCV, X0);
+
+    // Load all vector registers.
+    static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
+    for (int i = 0; i <= 30; i += 2) {
+        c.LDP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
+    }
+
+    // Load all general-purpose registers except X30.
+    for (int i = 0; i <= 28; i += 2) {
+        c.LDP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i); // assumes cpu_registers at offset 0
+    }
+
+    // Reload our return X30 from the stack and return.
+    // The patch code will reload the guest X30 for us.
+    c.LDR(X30, SP, 8);
+    c.RET();
+}
+
+void Patcher::WriteSaveContext() {
+    // Emits the helper that stores guest registers into GuestContext; patch code reaches it by BL.
+    // BL clobbers X30, so X30 doubles as scratch. [SP] holds the guest X30 pushed by the caller
+    // into a 16-byte slot, which leaves SP + 8 free for our return address.
+    c.STR(X30, SP, 8);
+    c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context)); // X30 = context pointer
+
+    // Store all general-purpose registers except X30.
+    for (int i = 0; i <= 28; i += 2) {
+        c.STP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i); // assumes cpu_registers at offset 0
+    }
+
+    // Store all vector registers.
+    static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
+    for (int i = 0; i <= 30; i += 2) {
+        c.STP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
+    }
+
+    // Store guest system registers, X30 and SP, using X0 as a scratch register.
+    c.STR(X0, SP, PRE_INDEXED, -16);
+    c.LDR(X0, SP, 16); // guest X30 that the caller pushed before the BL
+    c.STR(X0, X30, 8 * 30);
+    c.ADD(X0, SP, 32); // skip our 16-byte push and the caller's to recover the guest SP
+    c.STR(X0, X30, offsetof(GuestContext, sp));
+    c.MRS(X0, oaknut::SystemReg::FPSR);
+    c.STR(W0, X30, offsetof(GuestContext, fpsr));
+    c.MRS(X0, oaknut::SystemReg::FPCR);
+    c.STR(W0, X30, offsetof(GuestContext, fpcr));
+    c.MRS(X0, oaknut::SystemReg::NZCV);
+    c.STR(W0, X30, offsetof(GuestContext, nzcv));
+    c.LDR(X0, SP, POST_INDEXED, 16);
+
+    // Reload our return X30 from the stack, and return.
+    c.LDR(X30, SP, 8);
+    c.RET();
+}
+
+void Patcher::WriteSvcTrampoline(ModuleDestLabel module_dest, u32 svc_id) { // SVC exit + re-entry
+    // We are about to start saving state, so we need to lock the context.
+    this->LockContext();
+
+    // Store guest X30 to the stack. Then, save the context and restore the stack.
+    // This will save all registers except PC, but we know PC at patch time.
+    c.STR(X30, SP, PRE_INDEXED, -16);
+    c.BL(m_save_context);
+    c.LDR(X30, SP, POST_INDEXED, 16);
+
+    // Now that we've saved all registers, we can use any registers as scratch.
+    // Store PC + 4 to arm interface, since we know the instruction offset from the entry point.
+    oaknut::Label pc_after_svc;
+    c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
+    c.LDR(X2, pc_after_svc); // literal emitted at the end of this stub by WriteModulePc
+    c.STR(X2, X1, offsetof(GuestContext, pc));
+
+    // Store SVC number to execute when we return
+    c.MOV(X2, svc_id);
+    c.STR(W2, X1, offsetof(GuestContext, svc_swi));
+
+    // We are calling a SVC. Exchange esr_el1 with zero; the previous value is left in X0.
+    static_assert(std::is_same_v<std::underlying_type_t<HaltReason>, u64>);
+    oaknut::Label retry;
+    c.ADD(X2, X1, offsetof(GuestContext, esr_el1));
+    c.l(retry);
+    c.LDAXR(X0, X2);
+    c.STLXR(W3, XZR, X2);
+    c.CBNZ(W3, retry);
+
+    // Add "calling SVC" flag. Since this is X0, this is now our return value.
+    c.ORR(X0, X0, static_cast<u64>(HaltReason::SupervisorCall));
+
+    // Offset the GuestContext pointer to the HostContext member.
+    // STP has limited range of [-512, 504] which we can't reach otherwise
+    // NB: Due to this all offsets below are from the start of HostContext.
+    c.ADD(X1, X1, offsetof(GuestContext, host_ctx));
+
+    // Reload host TPIDR_EL0 and SP.
+    static_assert(offsetof(HostContext, host_sp) + 8 == offsetof(HostContext, host_tpidr_el0));
+    c.LDP(X2, X3, X1, offsetof(HostContext, host_sp));
+    c.MOV(SP, X2);
+    c.MSR(oaknut::SystemReg::TPIDR_EL0, X3);
+
+    // Load callee-saved host registers and return to host.
+    static constexpr size_t HOST_REGS_OFF = offsetof(HostContext, host_saved_regs);
+    static constexpr size_t HOST_VREGS_OFF = offsetof(HostContext, host_saved_vregs);
+    c.LDP(X19, X20, X1, HOST_REGS_OFF);
+    c.LDP(X21, X22, X1, HOST_REGS_OFF + 2 * sizeof(u64));
+    c.LDP(X23, X24, X1, HOST_REGS_OFF + 4 * sizeof(u64));
+    c.LDP(X25, X26, X1, HOST_REGS_OFF + 6 * sizeof(u64));
+    c.LDP(X27, X28, X1, HOST_REGS_OFF + 8 * sizeof(u64));
+    c.LDP(X29, X30, X1, HOST_REGS_OFF + 10 * sizeof(u64));
+    c.LDP(Q8, Q9, X1, HOST_VREGS_OFF);
+    c.LDP(Q10, Q11, X1, HOST_VREGS_OFF + 2 * sizeof(u128));
+    c.LDP(Q12, Q13, X1, HOST_VREGS_OFF + 4 * sizeof(u128));
+    c.LDP(Q14, Q15, X1, HOST_VREGS_OFF + 6 * sizeof(u128));
+    c.RET();
+
+    // Write the post-SVC trampoline address, which will jump back to the guest after restoring its
+    // state.
+    m_trampolines.push_back({c.offset(), module_dest});
+
+    // Host called this location. Save the return address in the host X30 slot (index 11) so we
+    // can unwind the stack properly when jumping back.
+    c.MRS(X2, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X2, X2, offsetof(NativeExecutionParameters, native_context));
+    c.ADD(X0, X2, offsetof(GuestContext, host_ctx));
+    c.STR(X30, X0, offsetof(HostContext, host_saved_regs) + 11 * sizeof(u64));
+
+    // Reload all guest registers except X30 and PC.
+    // The function also expects 16 bytes of stack already allocated.
+    c.STR(X30, SP, PRE_INDEXED, -16);
+    c.BL(m_load_context);
+    c.LDR(X30, SP, POST_INDEXED, 16);
+
+    // Use X1 as a scratch register to restore X30.
+    c.STR(X1, SP, PRE_INDEXED, -16);
+    c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
+    c.LDR(X30, X1, offsetof(GuestContext, cpu_registers) + sizeof(u64) * 30);
+    c.LDR(X1, SP, POST_INDEXED, 16);
+
+    // Unlock the context.
+    this->UnlockContext();
+
+    // Jump back to the instruction after the emulated SVC.
+    this->BranchToModule(module_dest);
+
+    // Store PC after call.
+    c.l(pc_after_svc);
+    this->WriteModulePc(module_dest);
+}
+
+void Patcher::WriteMrsHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg,
+                              oaknut::SystemReg src_reg) {
+    // Retrieve emulated TLS register from the NativeExecutionParameters that TPIDR_EL0 points to.
+    c.MRS(dest_reg, oaknut::SystemReg::TPIDR_EL0);
+    if (src_reg == oaknut::SystemReg::TPIDRRO_EL0) {
+        c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidrro_el0));
+    } else {
+        c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidr_el0));
+    }
+
+    // Jump back to the instruction after the emulated MRS.
+    this->BranchToModule(module_dest);
+}
+
+void Patcher::WriteMsrHandler(ModuleDestLabel module_dest, oaknut::XReg src_reg) {
+    const auto scratch_reg = src_reg.index() == 0 ? X1 : X0; // must differ from src_reg
+    c.STR(scratch_reg, SP, PRE_INDEXED, -16);
+
+    // Save guest value to NativeExecutionParameters::tpidr_el0.
+    c.MRS(scratch_reg, oaknut::SystemReg::TPIDR_EL0);
+    c.STR(src_reg, scratch_reg, offsetof(NativeExecutionParameters, tpidr_el0));
+
+    // Restore scratch register.
+    c.LDR(scratch_reg, SP, POST_INDEXED, 16);
+
+    // Jump back to the instruction after the emulated MSR.
+    this->BranchToModule(module_dest);
+}
+
+void Patcher::WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg) {
+    static Common::Arm64::NativeClock clock{};
+    const auto factor = clock.GetGuestCNTFRQFactor();
+    const auto raw_factor = Common::BitCast<std::array<u64, 2>>(factor);
+    // Emulated read: dest_reg = CNTVCT_EL0 scaled by the two-word factor captured above.
+    const auto use_x2_x3 = dest_reg.index() == 0 || dest_reg.index() == 1;
+    oaknut::XReg scratch0 = use_x2_x3 ? X2 : X0;
+    oaknut::XReg scratch1 = use_x2_x3 ? X3 : X1;
+    // X2/X3 are picked only when dest_reg is X0 or X1, so the scratch pair never aliases dest_reg.
+    oaknut::Label factorlo;
+    oaknut::Label factorhi;
+
+    // Save scratches.
+    c.STP(scratch0, scratch1, SP, PRE_INDEXED, -16);
+
+    // Load counter value.
+    c.MRS(dest_reg, oaknut::SystemReg::CNTVCT_EL0);
+
+    // Load scaling factor.
+    c.LDR(scratch0, factorlo);
+    c.LDR(scratch1, factorhi);
+
+    // Multiply by the low factor word and keep the upper 64 bits of the product.
+    c.UMULH(scratch0, dest_reg, scratch0);
+
+    // Multiply by the high factor word and add the carried-over low-word result.
+    c.MADD(dest_reg, dest_reg, scratch1, scratch0);
+
+    // Reload scratches.
+    c.LDP(scratch0, scratch1, SP, POST_INDEXED, 16);
+
+    // Jump back to the instruction after the emulated MRS.
+    this->BranchToModule(module_dest);
+
+    // Scaling factor constant values.
+    c.l(factorlo);
+    c.dx(raw_factor[0]);
+    c.l(factorhi);
+    c.dx(raw_factor[1]);
+}
+
+void Patcher::LockContext() {
+    oaknut::Label retry;
+    // Emits a spin loop that acquires NativeExecutionParameters::lock; X0/X1 are preserved.
+    // Save scratches.
+    c.STP(X0, X1, SP, PRE_INDEXED, -16);
+
+    // Reload lock pointer.
+    c.l(retry);
+    c.CLREX();
+    c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
+    c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));
+
+    static_assert(SpinLockLocked == 0);
+
+    // Load-linked with acquire ordering.
+    c.LDAXR(W1, X0);
+
+    // If the value was SpinLockLocked, clear monitor and retry.
+    c.CBZ(W1, retry);
+
+    // Store-conditional SpinLockLocked with relaxed ordering.
+    c.STXR(W1, WZR, X0);
+
+    // If we failed to store, retry.
+    c.CBNZ(W1, retry);
+
+    // We succeeded! Reload scratches.
+    c.LDP(X0, X1, SP, POST_INDEXED, 16);
+}
+
+void Patcher::UnlockContext() { // emits the release store that frees the lock; X0/X1 preserved
+    // Save scratches.
+    c.STP(X0, X1, SP, PRE_INDEXED, -16);
+
+    // Load lock pointer.
+    c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
+    c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));
+
+    // Load SpinLockUnlocked.
+    c.MOV(W1, SpinLockUnlocked);
+
+    // Store value with release ordering.
+    c.STLR(W1, X0);
+
+    // Load scratches.
+    c.LDP(X0, X1, SP, POST_INDEXED, 16);
+}
+
+} // namespace Core::NCE
--- a/src/core/arm/nce/patch.h
+++ b/src/core/arm/nce/patch.h
@ -0,0 +1,101 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <span>
+#include <unordered_map>
+#include <vector>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wshorten-64-to-32"
+#include <oaknut/code_block.hpp>
+#include <oaknut/oaknut.hpp>
+#pragma GCC diagnostic pop
+
+#include "common/common_types.h"
+#include "core/hle/kernel/code_set.h"
+#include "core/hle/kernel/k_typed_address.h"
+#include "core/hle/kernel/physical_memory.h"
+
+namespace Core::NCE {
+
+enum class PatchMode : u32 {
+    None,     ///< Initial value of Patcher::mode, before PatchText selects a placement
+    PreText,  ///< Patch section is inserted before .text
+    PostData, ///< Patch section is inserted after .data
+};
+
+using ModuleTextAddress = u64;
+using PatchTextAddress = u64;
+using EntryTrampolines = std::unordered_map<ModuleTextAddress, PatchTextAddress>;
+
+class Patcher { // Generates the patch section for one module and relocates it into the image.
+public:
+    explicit Patcher();
+    ~Patcher();
+
+    void PatchText(const Kernel::PhysicalMemory& program_image,
+                   const Kernel::CodeSet::Segment& code);
+    void RelocateAndCopy(Common::ProcessAddress load_base, const Kernel::CodeSet::Segment& code,
+                         Kernel::PhysicalMemory& program_image, EntryTrampolines* out_trampolines);
+    size_t GetSectionSize() const noexcept; // generated patch size in bytes, page aligned
+
+    [[nodiscard]] PatchMode GetPatchMode() const noexcept {
+        return mode;
+    }
+
+private:
+    using ModuleDestLabel = uintptr_t; // byte offset into the module text to branch back to
+
+    struct Trampoline { // post-SVC re-entry point: patch offset paired with its module offset
+        ptrdiff_t patch_offset;
+        uintptr_t module_offset;
+    };
+
+    void WriteLoadContext();
+    void WriteSaveContext();
+    void LockContext();
+    void UnlockContext();
+    void WriteSvcTrampoline(ModuleDestLabel module_dest, u32 svc_id);
+    void WriteMrsHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg,
+                         oaknut::SystemReg src_reg);
+    void WriteMsrHandler(ModuleDestLabel module_dest, oaknut::XReg src_reg);
+    void WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg);
+
+private:
+    void BranchToPatch(uintptr_t module_dest) { // records only; the branch is written on relocate
+        m_branch_to_patch_relocations.push_back({c.offset(), module_dest});
+    }
+
+    void BranchToModule(uintptr_t module_dest) {
+        m_branch_to_module_relocations.push_back({c.offset(), module_dest});
+        c.dw(0); // placeholder word, overwritten with the branch in RelocateAndCopy
+    }
+
+    void WriteModulePc(uintptr_t module_dest) {
+        m_write_module_pc_relocations.push_back({c.offset(), module_dest});
+        c.dx(0); // placeholder constant, overwritten with the rebased PC in RelocateAndCopy
+    }
+
+private:
+    // List of patch instructions we have generated.
+    std::vector<u32> m_patch_instructions{};
+
+    // Pairs a patch-section offset with a module offset; shared by the relocation lists below.
+    struct Relocation {
+        ptrdiff_t patch_offset;  ///< Offset in bytes from the start of the patch section.
+        uintptr_t module_offset; ///< Offset in bytes from the start of the text section.
+    };
+
+    oaknut::VectorCodeGenerator c; // constructed from m_patch_instructions, so declared after it
+    std::vector<Trampoline> m_trampolines;
+    std::vector<Relocation> m_branch_to_patch_relocations{};
+    std::vector<Relocation> m_branch_to_module_relocations{};
+    std::vector<Relocation> m_write_module_pc_relocations{};
+    oaknut::Label m_save_context{};
+    oaknut::Label m_load_context{};
+    PatchMode mode{PatchMode::None};
+};
+
+} // namespace Core::NCE
--- a/src/core/cpu_manager.cpp
+++ b/src/core/cpu_manager.cpp
@ -211,6 +211,8 @@ void CpuManager::RunThread(std::stop_token token, std::size_t core) {
        system.GPU().ObtainContext();
    }

+    system.ArmInterface(core).Initialize();
+
    auto& kernel = system.Kernel();
    auto& scheduler = *kernel.CurrentScheduler();
    auto* thread = scheduler.GetSchedulerCurrentThread();
--- a/src/core/device_memory.cpp
+++ b/src/core/device_memory.cpp
@ -6,7 +6,7 @@

 namespace Core {

-#ifdef ANDROID
+#ifdef HAS_NCE
 constexpr size_t VirtualReserveSize = 1ULL << 38;
 #else
 constexpr size_t VirtualReserveSize = 1ULL << 39;
@ -15,6 +15,7 @@ constexpr size_t VirtualReserveSize = 1ULL << 39;
 DeviceMemory::DeviceMemory()
    : buffer{Kernel::Board::Nintendo::Nx::KSystemControl::Init::GetIntendedMemorySize(),
             VirtualReserveSize} {}
+
 DeviceMemory::~DeviceMemory() = default;

 } // namespace Core
--- a/src/core/hle/kernel/code_set.h
+++ b/src/core/hle/kernel/code_set.h
@ -75,12 +75,26 @@ struct CodeSet final {
        return segments[2];
    }

+#ifdef HAS_NCE
+    Segment& PatchSegment() {
+        return patch_segment;
+    }
+
+    const Segment& PatchSegment() const {
+        return patch_segment;
+    }
+#endif
+
    /// The overall data that backs this code set.
    Kernel::PhysicalMemory memory;

    /// The segments that comprise this code set.
    std::array<Segment, 3> segments;

+#ifdef HAS_NCE
+    Segment patch_segment;
+#endif
+
    /// The entry point address for this code set.
    KProcessAddress entrypoint = 0;
 };
--- a/src/core/hle/kernel/k_address_space_info.cpp
+++ b/src/core/hle/kernel/k_address_space_info.cpp
@ -25,8 +25,8 @@ constexpr std::array<KAddressSpaceInfo, 13> AddressSpaceInfos{{
   { .bit_width = 36, .address = 2_GiB       , .size = 64_GiB  - 2_GiB  , .type = KAddressSpaceInfo::Type::MapLarge, },
   { .bit_width = 36, .address = Size_Invalid, .size = 8_GiB            , .type = KAddressSpaceInfo::Type::Heap,     },
   { .bit_width = 36, .address = Size_Invalid, .size = 6_GiB            , .type = KAddressSpaceInfo::Type::Alias,    },
-#ifdef ANDROID
-   // With Android, we use a 38-bit address space due to memory limitations. This should (safely) truncate ASLR region.
+#ifdef HAS_NCE
+   // With NCE, we use a 38-bit address space due to memory limitations. This should (safely) truncate ASLR region.
   { .bit_width = 39, .address = 128_MiB     , .size = 256_GiB - 128_MiB, .type = KAddressSpaceInfo::Type::Map39Bit, },
 #else
   { .bit_width = 39, .address = 128_MiB     , .size = 512_GiB - 128_MiB, .type = KAddressSpaceInfo::Type::Map39Bit, },
--- a/src/core/hle/kernel/k_page_table_base.cpp
+++ b/src/core/hle/kernel/k_page_table_base.cpp
@ -88,6 +88,22 @@ Result FlushDataCache(AddressType addr, u64 size) {
    R_SUCCEED();
 }

+constexpr Common::MemoryPermission ConvertToMemoryPermission(KMemoryPermission perm) {
+    Common::MemoryPermission perms{}; // starts with no permission bits set
+    if (True(perm & KMemoryPermission::UserRead)) {
+        perms |= Common::MemoryPermission::Read;
+    }
+    if (True(perm & KMemoryPermission::UserWrite)) {
+        perms |= Common::MemoryPermission::Write;
+    }
+#ifdef HAS_NCE // the Execute bit is only forwarded when NCE support is compiled in
+    if (True(perm & KMemoryPermission::UserExecute)) {
+        perms |= Common::MemoryPermission::Execute;
+    }
+#endif
+    return perms;
+}
+
 } // namespace

 void KPageTableBase::MemoryRange::Open() {
@ -170,7 +186,8 @@ Result KPageTableBase::InitializeForProcess(Svc::CreateProcessFlag as_type, bool
                                            KMemoryManager::Pool pool, KProcessAddress code_address,
                                            size_t code_size, KSystemResource* system_resource,
                                            KResourceLimit* resource_limit,
-                                            Core::Memory::Memory& memory) {
+                                            Core::Memory::Memory& memory,
+                                            KProcessAddress aslr_space_start) {
    // Calculate region extents.
    const size_t as_width = GetAddressSpaceWidth(as_type);
    const KProcessAddress start = 0;
@ -211,7 +228,8 @@ Result KPageTableBase::InitializeForProcess(Svc::CreateProcessFlag as_type, bool
        heap_region_size = GetSpaceSize(KAddressSpaceInfo::Type::Heap);
        stack_region_size = GetSpaceSize(KAddressSpaceInfo::Type::Stack);
        kernel_map_region_size = GetSpaceSize(KAddressSpaceInfo::Type::MapSmall);
-        m_code_region_start = GetSpaceStart(KAddressSpaceInfo::Type::Map39Bit);
+        m_code_region_start = m_address_space_start + aslr_space_start +
+                              GetSpaceStart(KAddressSpaceInfo::Type::Map39Bit);
        m_code_region_end = m_code_region_start + GetSpaceSize(KAddressSpaceInfo::Type::Map39Bit);
        m_alias_code_region_start = m_code_region_start;
        m_alias_code_region_end = m_code_region_end;
@ -5643,7 +5661,8 @@ Result KPageTableBase::Operate(PageLinkedList* page_list, KProcessAddress virt_a
    case OperationType::Map: {
        ASSERT(virt_addr != 0);
        ASSERT(Common::IsAligned(GetInteger(virt_addr), PageSize));
-        m_memory->MapMemoryRegion(*m_impl, virt_addr, num_pages * PageSize, phys_addr);
+        m_memory->MapMemoryRegion(*m_impl, virt_addr, num_pages * PageSize, phys_addr,
+                                  ConvertToMemoryPermission(properties.perm));

        // Open references to pages, if we should.
        if (this->IsHeapPhysicalAddress(phys_addr)) {
@ -5658,8 +5677,18 @@ Result KPageTableBase::Operate(PageLinkedList* page_list, KProcessAddress virt_a
    }
    case OperationType::ChangePermissions:
    case OperationType::ChangePermissionsAndRefresh:
-    case OperationType::ChangePermissionsAndRefreshAndFlush:
+    case OperationType::ChangePermissionsAndRefreshAndFlush: {
+        const bool read = True(properties.perm & Kernel::KMemoryPermission::UserRead);
+        const bool write = True(properties.perm & Kernel::KMemoryPermission::UserWrite);
+        // TODO: this doesn't really belong here and should go into m_memory to handle rasterizer access.
+        // TODO: ignore exec on non-direct-mapped case.
+        const bool exec = True(properties.perm & Kernel::KMemoryPermission::UserExecute);
+        if (Settings::IsFastmemEnabled()) {
+            m_system.DeviceMemory().buffer.Protect(GetInteger(virt_addr), num_pages * PageSize,
+                                                   read, write, exec);
+        }
        R_SUCCEED();
+    }
    default:
        UNREACHABLE();
    }
@ -5687,7 +5716,8 @@ Result KPageTableBase::Operate(PageLinkedList* page_list, KProcessAddress virt_a
            const size_t size{node.GetNumPages() * PageSize};

            // Map the pages.
-            m_memory->MapMemoryRegion(*m_impl, virt_addr, size, node.GetAddress());
+            m_memory->MapMemoryRegion(*m_impl, virt_addr, size, node.GetAddress(),
+                                      ConvertToMemoryPermission(properties.perm));

            virt_addr += size;
        }
--- a/src/core/hle/kernel/k_page_table_base.h
+++ b/src/core/hle/kernel/k_page_table_base.h
@ -235,7 +235,8 @@ public:
                                bool enable_device_address_space_merge, bool from_back,
                                KMemoryManager::Pool pool, KProcessAddress code_address,
                                size_t code_size, KSystemResource* system_resource,
-                                KResourceLimit* resource_limit, Core::Memory::Memory& memory);
+                                KResourceLimit* resource_limit, Core::Memory::Memory& memory,
+                                KProcessAddress aslr_space_start);

    void Finalize();

--- a/src/core/hle/kernel/k_process.cpp
+++ b/src/core/hle/kernel/k_process.cpp
@ -300,7 +300,7 @@ Result KProcess::Initialize(const Svc::CreateProcessParameter& params, const KPa
            False(params.flags & Svc::CreateProcessFlag::DisableDeviceAddressSpaceMerge);
        R_TRY(m_page_table.Initialize(as_type, enable_aslr, enable_das_merge, !enable_aslr, pool,
                                      params.code_address, params.code_num_pages * PageSize,
-                                      m_system_resource, res_limit, this->GetMemory()));
+                                      m_system_resource, res_limit, this->GetMemory(), 0));
    }
    ON_RESULT_FAILURE_2 {
        m_page_table.Finalize();
@ -332,7 +332,7 @@ Result KProcess::Initialize(const Svc::CreateProcessParameter& params, const KPa

 Result KProcess::Initialize(const Svc::CreateProcessParameter& params,
                            std::span<const u32> user_caps, KResourceLimit* res_limit,
-                            KMemoryManager::Pool pool) {
+                            KMemoryManager::Pool pool, KProcessAddress aslr_space_start) {
    ASSERT(res_limit != nullptr);

    // Set members.
@ -393,7 +393,7 @@ Result KProcess::Initialize(const Svc::CreateProcessParameter& params,
            False(params.flags & Svc::CreateProcessFlag::DisableDeviceAddressSpaceMerge);
        R_TRY(m_page_table.Initialize(as_type, enable_aslr, enable_das_merge, !enable_aslr, pool,
                                      params.code_address, code_size, m_system_resource, res_limit,
-                                      this->GetMemory()));
+                                      this->GetMemory(), aslr_space_start));
    }
    ON_RESULT_FAILURE_2 {
        m_page_table.Finalize();
@ -1128,7 +1128,7 @@ KProcess::KProcess(KernelCore& kernel)
 KProcess::~KProcess() = default;

 Result KProcess::LoadFromMetadata(const FileSys::ProgramMetadata& metadata, std::size_t code_size,
-                                  bool is_hbl) {
+                                  KProcessAddress aslr_space_start, bool is_hbl) {
    // Create a resource limit for the process.
    const auto physical_memory_size =
        m_kernel.MemoryManager().GetSize(Kernel::KMemoryManager::Pool::Application);
@ -1179,7 +1179,7 @@ Result KProcess::LoadFromMetadata(const FileSys::ProgramMetadata& metadata, std:
        .name = {},
        .version = {},
        .program_id = metadata.GetTitleID(),
-        .code_address = code_address,
+        .code_address = code_address + GetInteger(aslr_space_start),
        .code_num_pages = static_cast<s32>(code_size / PageSize),
        .flags = flag,
        .reslimit = Svc::InvalidHandle,
@ -1193,7 +1193,7 @@ Result KProcess::LoadFromMetadata(const FileSys::ProgramMetadata& metadata, std:

    // Initialize for application process.
    R_TRY(this->Initialize(params, metadata.GetKernelCapabilities(), res_limit,
-                           KMemoryManager::Pool::Application));
+                           KMemoryManager::Pool::Application, aslr_space_start));

    // Assign remaining properties.
    m_is_hbl = is_hbl;
@ -1214,6 +1214,22 @ void KProcess::LoadModule(CodeSet code_set, KProcessAddress base_addr) {
    ReprotectSegment(code_set.CodeSegment(), Svc::MemoryPermission::ReadExecute);
    ReprotectSegment(code_set.RODataSegment(), Svc::MemoryPermission::Read);
    ReprotectSegment(code_set.DataSegment(), Svc::MemoryPermission::ReadWrite);
+
+    // NCE-only setup. Guarded by HAS_NCE (not ARCHITECTURE_arm64) so this block is compiled
+    // under the same condition as the rest of the NCE code (m_post_handlers, NCE includes).
+#ifdef HAS_NCE
+    if (Settings::IsNceEnabled()) {
+        auto& buffer = m_kernel.System().DeviceMemory().buffer;
+        const auto& code = code_set.CodeSegment();
+        const auto& patch = code_set.PatchSegment();
+        // NOTE(review): the three flags are presumably read/write/execute - confirm against
+        // the host buffer's Protect() declaration.
+        buffer.Protect(GetInteger(base_addr + code.addr), code.size, true, true, true);
+        buffer.Protect(GetInteger(base_addr + patch.addr), patch.size, true, true, true);
+        // Unlike the code/rodata/data segments above, the patch segment gets permission None.
+        ReprotectSegment(code_set.PatchSegment(), Svc::MemoryPermission::None);
+    }
+#endif
 }

 bool KProcess::InsertWatchpoint(KProcessAddress addr, u64 size, DebugWatchpointType type) {
--- a/src/core/hle/kernel/k_process.h
+++ b/src/core/hle/kernel/k_process.h
@ -120,6 +120,9 @@ private:
    std::atomic<s64> m_num_ipc_messages{};
    std::atomic<s64> m_num_ipc_replies{};
    std::atomic<s64> m_num_ipc_receives{};
+#ifdef HAS_NCE
+    std::unordered_map<u64, u64> m_post_handlers{};
+#endif

 private:
    Result StartTermination();
@ -150,7 +153,8 @@ public:
                      std::span<const u32> caps, KResourceLimit* res_limit,
                      KMemoryManager::Pool pool, bool immortal);
    Result Initialize(const Svc::CreateProcessParameter& params, std::span<const u32> user_caps,
-                      KResourceLimit* res_limit, KMemoryManager::Pool pool);
+                      KResourceLimit* res_limit, KMemoryManager::Pool pool,
+                      KProcessAddress aslr_space_start);
    void Exit();

    const char* GetName() const {
@ -466,6 +470,12 @@ public:

    static void Switch(KProcess* cur_process, KProcess* next_process);

+#ifdef HAS_NCE
+    std::unordered_map<u64, u64>& GetPostHandlers() noexcept {
+        return m_post_handlers; // Mutable on purpose: loaders hand this map to RelocateAndCopy.
+    }
+#endif // HAS_NCE
+
 public:
    // Attempts to insert a watchpoint into a free slot. Returns false if none are available.
    bool InsertWatchpoint(KProcessAddress addr, u64 size, DebugWatchpointType type);
@ -479,7 +489,7 @@ public:

 public:
    Result LoadFromMetadata(const FileSys::ProgramMetadata& metadata, std::size_t code_size,
-                            bool is_hbl);
+                            KProcessAddress aslr_space_start, bool is_hbl);

    void LoadModule(CodeSet code_set, KProcessAddress base_addr);

--- a/src/core/hle/kernel/k_process_page_table.h
+++ b/src/core/hle/kernel/k_process_page_table.h
@ -23,10 +23,11 @@ public:
    Result Initialize(Svc::CreateProcessFlag as_type, bool enable_aslr, bool enable_das_merge,
                      bool from_back, KMemoryManager::Pool pool, KProcessAddress code_address,
                      size_t code_size, KSystemResource* system_resource,
-                      KResourceLimit* resource_limit, Core::Memory::Memory& memory) {
-        R_RETURN(m_page_table.InitializeForProcess(as_type, enable_aslr, enable_das_merge,
-                                                   from_back, pool, code_address, code_size,
-                                                   system_resource, resource_limit, memory));
+                      KResourceLimit* resource_limit, Core::Memory::Memory& memory,
+                      KProcessAddress aslr_space_start) {
+        R_RETURN(m_page_table.InitializeForProcess(
+            as_type, enable_aslr, enable_das_merge, from_back, pool, code_address, code_size,
+            system_resource, resource_limit, memory, aslr_space_start));
    }

    void Finalize() {
--- a/src/core/hle/kernel/k_thread.h
+++ b/src/core/hle/kernel/k_thread.h
@ -655,6 +655,21 @@ public:
        return m_stack_top;
    }

+public:
+    // TODO: This shouldn't be defined in kernel namespace
+    struct NativeExecutionParameters { // Stored per thread as m_native_execution_parameters.
+        u64 tpidr_el0{};   // NOTE(review): presumably the guest TPIDR_EL0 value - confirm.
+        u64 tpidrro_el0{}; // NOTE(review): presumably the guest TPIDRRO_EL0 value - confirm.
+        void* native_context{};
+        std::atomic<u32> lock{1}; // Starts at 1, not 0.
+        bool is_running{};
+        u32 magic{Common::MakeMagic('Y', 'U', 'Z', 'U')}; // Fixed tag built from 'Y','U','Z','U'.
+    };
+
+    NativeExecutionParameters& GetNativeExecutionParameters() { // Returns a mutable reference.
+        return m_native_execution_parameters;
+    }
+
 private:
    KThread* RemoveWaiterByKey(bool* out_has_waiters, KProcessAddress key,
                               bool is_kernel_address_key);
@ -914,6 +929,7 @@ private:
    ThreadWaitReasonForDebugging m_wait_reason_for_debugging{};
    uintptr_t m_argument{};
    KProcessAddress m_stack_top{};
+    NativeExecutionParameters m_native_execution_parameters{};

 public:
    using ConditionVariableThreadTreeType = ConditionVariableThreadTree;
--- a/src/core/hle/kernel/physical_core.cpp
+++ b/src/core/hle/kernel/physical_core.cpp
@ -1,8 +1,12 @@
 // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

+#include "common/settings.h"
 #include "core/arm/dynarmic/arm_dynarmic_32.h"
 #include "core/arm/dynarmic/arm_dynarmic_64.h"
+#ifdef HAS_NCE
+#include "core/arm/nce/arm_nce.h"
+#endif
 #include "core/core.h"
 #include "core/hle/kernel/k_scheduler.h"
 #include "core/hle/kernel/kernel.h"
@ -14,7 +18,8 @@ PhysicalCore::PhysicalCore(std::size_t core_index, Core::System& system, KSchedu
    : m_core_index{core_index}, m_system{system}, m_scheduler{scheduler} {
 #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
    // TODO(bunnei): Initialization relies on a core being available. We may later replace this with
-    // a 32-bit instance of Dynarmic. This should be abstracted out to a CPU manager.
+    // an NCE interface or a 32-bit instance of Dynarmic. This should be abstracted out to a CPU
+    // manager.
    auto& kernel = system.Kernel();
    m_arm_interface = std::make_unique<Core::ARM_Dynarmic_64>(
        system, kernel.IsMulticore(),
@ -28,6 +33,13 @@ PhysicalCore::PhysicalCore(std::size_t core_index, Core::System& system, KSchedu
 PhysicalCore::~PhysicalCore() = default;

 void PhysicalCore::Initialize(bool is_64_bit) {
+#if defined(HAS_NCE)
+    if (Settings::IsNceEnabled()) {
+        m_arm_interface = std::make_unique<Core::ARM_NCE>(m_system, m_system.Kernel().IsMulticore(),
+                                                          m_core_index);
+        return;
+    }
+#endif
 #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
    auto& kernel = m_system.Kernel();
    if (!is_64_bit) {
--- a/src/core/hle/service/friend/friend.cpp
+++ b/src/core/hle/service/friend/friend.cpp
@ -32,7 +32,7 @@ public:
            {10200, nullptr, "SendFriendRequestForApplication"},
            {10211, nullptr, "AddFacedFriendRequestForApplication"},
            {10400, &IFriendService::GetBlockedUserListIds, "GetBlockedUserListIds"},
-            {10420, nullptr, "IsBlockedUserListCacheAvailable"},
+            {10420, &IFriendService::CheckBlockedUserListAvailability, "CheckBlockedUserListAvailability"},
            {10421, nullptr, "EnsureBlockedUserListAvailable"},
            {10500, nullptr, "GetProfileList"},
            {10600, nullptr, "DeclareOpenOnlinePlaySession"},
@ -206,6 +206,17 @@ private:
        rb.Push(true);
    }

+    void CheckBlockedUserListAvailability(HLERequestContext& ctx) {
+        IPC::RequestParser rp{ctx}; // Stub for command 10420: read UUID, log, reply success.
+        const auto uuid{rp.PopRaw<Common::UUID>()}; // Only used for the log line below.
+
+        LOG_WARNING(Service_Friend, "(STUBBED) called, uuid=0x{}", uuid.RawString());
+
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(ResultSuccess);
+        rb.Push(true); // Always pushes true; no actual availability check is performed.
+    }
+
    KernelHelpers::ServiceContext service_context;

    Kernel::KEvent* completion_event;
--- a/src/core/loader/deconstructed_rom_directory.cpp
+++ b/src/core/loader/deconstructed_rom_directory.cpp
@ -3,6 +3,7 @@

 #include <cstring>
 #include "common/logging/log.h"
+#include "common/settings.h"
 #include "core/core.h"
 #include "core/file_sys/content_archive.h"
 #include "core/file_sys/control_metadata.h"
@ -14,6 +15,10 @@
 #include "core/loader/deconstructed_rom_directory.h"
 #include "core/loader/nso.h"

+#ifdef HAS_NCE
+#include "core/arm/nce/patch.h"
+#endif
+
 namespace Loader {

 AppLoader_DeconstructedRomDirectory::AppLoader_DeconstructedRomDirectory(FileSys::VirtualFile file_,
@ -124,21 +129,43 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect
    }
    metadata.Print();

-    const auto static_modules = {"rtld",    "main",    "subsdk0", "subsdk1", "subsdk2",
+    // Enable NCE only for programs with 39-bit address space.
+    const bool is_39bit =
+        metadata.GetAddressSpaceType() == FileSys::ProgramAddressSpaceType::Is39Bit;
+    Settings::SetNceEnabled(is_39bit);
+
+    const std::array static_modules = {"rtld",    "main",    "subsdk0", "subsdk1", "subsdk2",
                                       "subsdk3", "subsdk4", "subsdk5", "subsdk6", "subsdk7",
                                       "subsdk8", "subsdk9", "sdk"};

-    // Use the NSO module loader to figure out the code layout
    std::size_t code_size{};
-    for (const auto& module : static_modules) {
+
+    // Define an nce patch context for each potential module (sized from static_modules).
+#ifdef HAS_NCE
+    std::array<Core::NCE::Patcher, static_modules.size()> module_patchers;
+#endif
+
+    const auto GetPatcher = [&](size_t i) -> Core::NCE::Patcher* {
+#ifdef HAS_NCE
+        if (Settings::IsNceEnabled()) {
+            return &module_patchers[i];
+        }
+#endif
+        return nullptr;
+    };
+
+    // Use the NSO module loader to figure out the code layout
+    for (size_t i = 0; i < static_modules.size(); i++) {
+        const auto& module = static_modules[i];
        const FileSys::VirtualFile module_file{dir->GetFile(module)};
        if (!module_file) {
            continue;
        }

        const bool should_pass_arguments = std::strcmp(module, "rtld") == 0;
-        const auto tentative_next_load_addr = AppLoader_NSO::LoadModule(
-            process, system, *module_file, code_size, should_pass_arguments, false);
+        const auto tentative_next_load_addr =
+            AppLoader_NSO::LoadModule(process, system, *module_file, code_size,
+                                      should_pass_arguments, false, {}, GetPatcher(i));
        if (!tentative_next_load_addr) {
            return {ResultStatus::ErrorLoadingNSO, {}};
        }
@ -146,8 +173,18 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect
        code_size = *tentative_next_load_addr;
    }

+    // Enable direct memory mapping in case of NCE; yields the host base pointer, else 0.
+    const u64 fastmem_base = [&]() -> u64 {
+        if (Settings::IsNceEnabled()) {
+            auto& buffer = system.DeviceMemory().buffer;
+            buffer.EnableDirectMappedAddress();
+            return reinterpret_cast<u64>(buffer.VirtualBasePointer());
+        }
+        return 0;
+    }();
+
    // Setup the process code layout
-    if (process.LoadFromMetadata(metadata, code_size, is_hbl).IsError()) {
+    if (process.LoadFromMetadata(metadata, code_size, fastmem_base, is_hbl).IsError()) {
        return {ResultStatus::ErrorUnableToParseKernelMetadata, {}};
    }

@ -157,7 +194,8 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect
    VAddr next_load_addr{base_address};
    const FileSys::PatchManager pm{metadata.GetTitleID(), system.GetFileSystemController(),
                                   system.GetContentProvider()};
-    for (const auto& module : static_modules) {
+    for (size_t i = 0; i < static_modules.size(); i++) {
+        const auto& module = static_modules[i];
        const FileSys::VirtualFile module_file{dir->GetFile(module)};
        if (!module_file) {
            continue;
@ -165,15 +203,16 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect

        const VAddr load_addr{next_load_addr};
        const bool should_pass_arguments = std::strcmp(module, "rtld") == 0;
-        const auto tentative_next_load_addr = AppLoader_NSO::LoadModule(
-            process, system, *module_file, load_addr, should_pass_arguments, true, pm);
+        const auto tentative_next_load_addr =
+            AppLoader_NSO::LoadModule(process, system, *module_file, load_addr,
+                                      should_pass_arguments, true, pm, GetPatcher(i));
        if (!tentative_next_load_addr) {
            return {ResultStatus::ErrorLoadingNSO, {}};
        }

        next_load_addr = *tentative_next_load_addr;
        modules.insert_or_assign(load_addr, module);
-        LOG_DEBUG(Loader, "loaded module {} @ 0x{:X}", module, load_addr);
+        LOG_DEBUG(Loader, "loaded module {} @ {:#X}", module, load_addr);
    }

    // Find the RomFS by searching for a ".romfs" file in this directory
--- a/src/core/loader/kip.cpp
+++ b/src/core/loader/kip.cpp
@ -91,7 +91,8 @@ AppLoader::LoadResult AppLoader_KIP::Load(Kernel::KProcess& process,

    // Setup the process code layout
    if (process
-            .LoadFromMetadata(FileSys::ProgramMetadata::GetDefault(), program_image.size(), false)
+            .LoadFromMetadata(FileSys::ProgramMetadata::GetDefault(), program_image.size(), 0,
+                              false)
            .IsError()) {
        return {ResultStatus::ErrorNotInitialized, {}};
    }
--- a/src/core/loader/nro.cpp
+++ b/src/core/loader/nro.cpp
@ -22,6 +22,10 @@
 #include "core/loader/nso.h"
 #include "core/memory.h"

+#ifdef HAS_NCE
+#include "core/arm/nce/patch.h"
+#endif
+
 namespace Loader {

 struct NroSegmentHeader {
@ -139,7 +143,8 @@ static constexpr u32 PageAlignSize(u32 size) {
    return static_cast<u32>((size + Core::Memory::YUZU_PAGEMASK) & ~Core::Memory::YUZU_PAGEMASK);
 }

-static bool LoadNroImpl(Kernel::KProcess& process, const std::vector<u8>& data) {
+static bool LoadNroImpl(Core::System& system, Kernel::KProcess& process,
+                        const std::vector<u8>& data) {
    if (data.size() < sizeof(NroHeader)) {
        return {};
    }
@ -194,14 +199,61 @@ static bool LoadNroImpl(Kernel::KProcess& process, const std::vector<u8>& data)

    codeset.DataSegment().size += bss_size;
    program_image.resize(static_cast<u32>(program_image.size()) + bss_size);
+    size_t image_size = program_image.size();
+
+#ifdef HAS_NCE
+    const auto& code = codeset.CodeSegment();
+
+    // NROs always have a 39-bit address space.
+    Settings::SetNceEnabled(true);
+
+    // Create NCE patcher
+    Core::NCE::Patcher patch{};
+
+    if (Settings::IsNceEnabled()) {
+        // Patch SVCs and MRS calls in the guest code
+        patch.PatchText(program_image, code);
+
+        // We only support PostData patching for NROs.
+        ASSERT(patch.GetPatchMode() == Core::NCE::PatchMode::PostData);
+
+        // Update patch section.
+        auto& patch_segment = codeset.PatchSegment();
+        patch_segment.addr = image_size;
+        patch_segment.size = static_cast<u32>(patch.GetSectionSize());
+
+        // Add patch section size to the module size.
+        image_size += patch_segment.size;
+    }
+#endif
+
+    // Enable direct memory mapping in case of NCE; yields the host base pointer, else 0.
+    const u64 fastmem_base = [&]() -> u64 {
+        if (Settings::IsNceEnabled()) {
+            auto& buffer = system.DeviceMemory().buffer;
+            buffer.EnableDirectMappedAddress();
+            return reinterpret_cast<u64>(buffer.VirtualBasePointer());
+        }
+        return 0;
+    }();

    // Setup the process code layout
    if (process
-            .LoadFromMetadata(FileSys::ProgramMetadata::GetDefault(), program_image.size(), false)
+            .LoadFromMetadata(FileSys::ProgramMetadata::GetDefault(), image_size, fastmem_base,
+                              false)
            .IsError()) {
        return false;
    }

+    // Relocate code patch and copy to the program_image if running under NCE.
+    // This needs to be after LoadFromMetadata so we can use the process entry point.
+#ifdef HAS_NCE
+    if (Settings::IsNceEnabled()) {
+        patch.RelocateAndCopy(process.GetEntryPoint(), code, program_image,
+                              &process.GetPostHandlers());
+    }
+#endif
+
    // Load codeset for current process
    codeset.memory = std::move(program_image);
    process.LoadModule(std::move(codeset), process.GetEntryPoint());
@ -209,8 +261,9 @@ static bool LoadNroImpl(Kernel::KProcess& process, const std::vector<u8>& data)
    return true;
 }

-bool AppLoader_NRO::LoadNro(Kernel::KProcess& process, const FileSys::VfsFile& nro_file) {
-    return LoadNroImpl(process, nro_file.ReadAllBytes());
+bool AppLoader_NRO::LoadNro(Core::System& system, Kernel::KProcess& process,
+                            const FileSys::VfsFile& nro_file) {
+    return LoadNroImpl(system, process, nro_file.ReadAllBytes());
 }

 AppLoader_NRO::LoadResult AppLoader_NRO::Load(Kernel::KProcess& process, Core::System& system) {
@ -218,7 +271,7 @@ AppLoader_NRO::LoadResult AppLoader_NRO::Load(Kernel::KProcess& process, Core::S
        return {ResultStatus::ErrorAlreadyLoaded, {}};
    }

-    if (!LoadNro(process, *file)) {
+    if (!LoadNro(system, process, *file)) {
        return {ResultStatus::ErrorLoadingNRO, {}};
    }

--- a/src/core/loader/nro.h
+++ b/src/core/loader/nro.h
@ -54,7 +54,7 @@ public:
    bool IsRomFSUpdatable() const override;

 private:
-    bool LoadNro(Kernel::KProcess& process, const FileSys::VfsFile& nro_file);
+    bool LoadNro(Core::System& system, Kernel::KProcess& process, const FileSys::VfsFile& nro_file);

    std::vector<u8> icon_data;
    std::unique_ptr<FileSys::NACP> nacp;
--- a/src/core/loader/nso.cpp
+++ b/src/core/loader/nso.cpp
@ -20,6 +20,10 @@
 #include "core/loader/nso.h"
 #include "core/memory.h"

+#ifdef HAS_NCE
+#include "core/arm/nce/patch.h"
+#endif
+
 namespace Loader {
 namespace {
 struct MODHeader {
@ -72,7 +76,8 @@ FileType AppLoader_NSO::IdentifyType(const FileSys::VirtualFile& in_file) {
 std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::System& system,
                                               const FileSys::VfsFile& nso_file, VAddr load_base,
                                               bool should_pass_arguments, bool load_into_process,
-                                               std::optional<FileSys::PatchManager> pm) {
+                                               std::optional<FileSys::PatchManager> pm,
+                                               Core::NCE::Patcher* patch) {
    if (nso_file.GetSize() < sizeof(NSOHeader)) {
        return std::nullopt;
    }
@ -86,6 +91,16 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::
        return std::nullopt;
    }

+    // Allocate some space at the beginning if we are patching in PreText mode.
+    const size_t module_start = [&]() -> size_t {
+#ifdef HAS_NCE
+        if (patch && patch->GetPatchMode() == Core::NCE::PatchMode::PreText) {
+            return patch->GetSectionSize();
+        }
+#endif
+        return 0;
+    }();
+
    // Build program image
    Kernel::CodeSet codeset;
    Kernel::PhysicalMemory program_image;
@ -95,11 +110,12 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::
        if (nso_header.IsSegmentCompressed(i)) {
            data = DecompressSegment(data, nso_header.segments[i]);
        }
-        program_image.resize(nso_header.segments[i].location + static_cast<u32>(data.size()));
-        std::memcpy(program_image.data() + nso_header.segments[i].location, data.data(),
-                    data.size());
-        codeset.segments[i].addr = nso_header.segments[i].location;
-        codeset.segments[i].offset = nso_header.segments[i].location;
+        program_image.resize(module_start + nso_header.segments[i].location +
+                             static_cast<u32>(data.size()));
+        std::memcpy(program_image.data() + module_start + nso_header.segments[i].location,
+                    data.data(), data.size());
+        codeset.segments[i].addr = module_start + nso_header.segments[i].location;
+        codeset.segments[i].offset = module_start + nso_header.segments[i].location;
        codeset.segments[i].size = nso_header.segments[i].size;
    }

@ -118,7 +134,7 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::
    }

    codeset.DataSegment().size += nso_header.segments[2].bss_size;
-    const u32 image_size{
+    u32 image_size{
        PageAlignSize(static_cast<u32>(program_image.size()) + nso_header.segments[2].bss_size)};
    program_image.resize(image_size);

@ -129,16 +145,45 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::
    // Apply patches if necessary
    const auto name = nso_file.GetName();
    if (pm && (pm->HasNSOPatch(nso_header.build_id, name) || Settings::values.dump_nso)) {
-        std::vector<u8> pi_header(sizeof(NSOHeader) + program_image.size());
+        std::span<u8> patchable_section(program_image.data() + module_start,
+                                        program_image.size() - module_start);
+        std::vector<u8> pi_header(sizeof(NSOHeader) + patchable_section.size());
        std::memcpy(pi_header.data(), &nso_header, sizeof(NSOHeader));
-        std::memcpy(pi_header.data() + sizeof(NSOHeader), program_image.data(),
-                    program_image.size());
+        std::memcpy(pi_header.data() + sizeof(NSOHeader), patchable_section.data(),
+                    patchable_section.size());

        pi_header = pm->PatchNSO(pi_header, name);

-        std::copy(pi_header.begin() + sizeof(NSOHeader), pi_header.end(), program_image.data());
+        std::copy(pi_header.begin() + sizeof(NSOHeader), pi_header.end(), patchable_section.data());
    }

+#ifdef HAS_NCE
+    // If we are computing the process code layout and using nce backend, patch.
+    const auto& code = codeset.CodeSegment();
+    if (patch && patch->GetPatchMode() == Core::NCE::PatchMode::None) {
+        // Patch SVCs and MRS calls in the guest code
+        patch->PatchText(program_image, code);
+
+        // Add patch section size to the module size.
+        image_size += patch->GetSectionSize();
+    } else if (patch) {
+        // Relocate code patch and copy to the program_image.
+        patch->RelocateAndCopy(load_base, code, program_image, &process.GetPostHandlers());
+
+        // Update patch section.
+        auto& patch_segment = codeset.PatchSegment();
+        patch_segment.addr =
+            patch->GetPatchMode() == Core::NCE::PatchMode::PreText ? 0 : image_size;
+        patch_segment.size = static_cast<u32>(patch->GetSectionSize());
+
+        // Add patch section size to the module size. In PreText mode image_size
+        // already contains the patch segment as part of module_start.
+        if (patch->GetPatchMode() == Core::NCE::PatchMode::PostData) {
+            image_size += patch_segment.size;
+        }
+    }
+#endif
+
    // If we aren't actually loading (i.e. just computing the process code layout), we are done
    if (!load_into_process) {
        return load_base + image_size;
--- a/src/core/loader/nso.h
+++ b/src/core/loader/nso.h
@ -15,6 +15,10 @@ namespace Core {
 class System;
 }

+namespace Core::NCE {
+class Patcher;
+}
+
 namespace Kernel {
 class KProcess;
 }
@ -88,7 +92,8 @@ public:
    static std::optional<VAddr> LoadModule(Kernel::KProcess& process, Core::System& system,
                                           const FileSys::VfsFile& nso_file, VAddr load_base,
                                           bool should_pass_arguments, bool load_into_process,
-                                           std::optional<FileSys::PatchManager> pm = {});
+                                           std::optional<FileSys::PatchManager> pm = {},
+                                           Core::NCE::Patcher* patch = nullptr);

    LoadResult Load(Kernel::KProcess& process, Core::System& system) override;

--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@ -53,7 +53,7 @@ struct Memory::Impl {
    }

    void MapMemoryRegion(Common::PageTable& page_table, Common::ProcessAddress base, u64 size,
-                         Common::PhysicalAddress target) {
+                         Common::PhysicalAddress target, Common::MemoryPermission perms) {
        ASSERT_MSG((size & YUZU_PAGEMASK) == 0, "non-page aligned size: {:016X}", size);
        ASSERT_MSG((base & YUZU_PAGEMASK) == 0, "non-page aligned base: {:016X}", GetInteger(base));
        ASSERT_MSG(target >= DramMemoryMap::Base, "Out of bounds target: {:016X}",
@ -63,7 +63,7 @@ struct Memory::Impl {

        if (Settings::IsFastmemEnabled()) {
            system.DeviceMemory().buffer.Map(GetInteger(base),
-                                             GetInteger(target) - DramMemoryMap::Base, size);
+                                             GetInteger(target) - DramMemoryMap::Base, size, perms);
        }
    }

@ -831,8 +831,8 @@ void Memory::SetCurrentPageTable(Kernel::KProcess& process, u32 core_id) {
 }

 void Memory::MapMemoryRegion(Common::PageTable& page_table, Common::ProcessAddress base, u64 size,
-                             Common::PhysicalAddress target) {
-    impl->MapMemoryRegion(page_table, base, size, target);
+                             Common::PhysicalAddress target, Common::MemoryPermission perms) {
+    impl->MapMemoryRegion(page_table, base, size, target, perms);
 }

 void Memory::UnmapRegion(Common::PageTable& page_table, Common::ProcessAddress base, u64 size) {
@ -1001,4 +1001,17 @@ void Memory::FlushRegion(Common::ProcessAddress dest_addr, size_t size) {
    impl->FlushRegion(dest_addr, size);
 }

+bool Memory::InvalidateNCE(Common::ProcessAddress vaddr, size_t size) {
+    bool mapped = true; // Cleared by the first callback below.
+    u8* const ptr = impl->GetPointerImpl(
+        GetInteger(vaddr),
+        [&] { // First callback: logs the access as unmapped and records the failure.
+            LOG_ERROR(HW_Memory, "Unmapped InvalidateNCE for {} bytes @ {:#x}", size,
+                      GetInteger(vaddr));
+            mapped = false;
+        },
+        [&] { impl->system.GPU().InvalidateRegion(GetInteger(vaddr), size); });
+    return mapped && ptr != nullptr; // True only if nothing was flagged and a pointer came back.
+}
+
 } // namespace Core::Memory
--- a/src/core/memory.h
+++ b/src/core/memory.h
@ -15,8 +15,9 @@
 #include "core/hle/result.h"

 namespace Common {
+enum class MemoryPermission : u32;
 struct PageTable;
-}
+} // namespace Common

 namespace Core {
 class System;
@ -82,9 +83,10 @@ public:
     * @param size       The amount of bytes to map. Must be page-aligned.
     * @param target     Buffer with the memory backing the mapping. Must be of length at least
     *                   `size`.
+     * @param perms      The permissions to map the memory with.
     */
    void MapMemoryRegion(Common::PageTable& page_table, Common::ProcessAddress base, u64 size,
-                         Common::PhysicalAddress target);
+                         Common::PhysicalAddress target, Common::MemoryPermission perms);

    /**
     * Unmaps a region of the emulated process address space.
@ -472,6 +474,7 @@ public:

    void SetGPUDirtyManagers(std::span<Core::GPUDirtyMemoryManager> managers);
    void InvalidateRegion(Common::ProcessAddress dest_addr, size_t size);
+    bool InvalidateNCE(Common::ProcessAddress vaddr, size_t size);
    void FlushRegion(Common::ProcessAddress dest_addr, size_t size);

 private:
--- a/src/tests/common/host_memory.cpp
+++ b/src/tests/common/host_memory.cpp
@ -11,6 +11,7 @@ using namespace Common::Literals;

 static constexpr size_t VIRTUAL_SIZE = 1ULL << 39;
 static constexpr size_t BACKING_SIZE = 4_GiB;
+static constexpr auto PERMS = Common::MemoryPermission::ReadWrite;

 TEST_CASE("HostMemory: Initialize and deinitialize", "[common]") {
    { HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE); }
@ -19,7 +20,7 @@ TEST_CASE("HostMemory: Initialize and deinitialize", "[common]") {

 TEST_CASE("HostMemory: Simple map", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x5000, 0x8000, 0x1000);
+    mem.Map(0x5000, 0x8000, 0x1000, PERMS);

    volatile u8* const data = mem.VirtualBasePointer() + 0x5000;
    data[0] = 50;
@ -28,8 +29,8 @@ TEST_CASE("HostMemory: Simple map", "[common]") {

 TEST_CASE("HostMemory: Simple mirror map", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x5000, 0x3000, 0x2000);
-    mem.Map(0x8000, 0x4000, 0x1000);
+    mem.Map(0x5000, 0x3000, 0x2000, PERMS);
+    mem.Map(0x8000, 0x4000, 0x1000, PERMS);

    volatile u8* const mirror_a = mem.VirtualBasePointer() + 0x5000;
    volatile u8* const mirror_b = mem.VirtualBasePointer() + 0x8000;
@ -39,7 +40,7 @@ TEST_CASE("HostMemory: Simple mirror map", "[common]") {

 TEST_CASE("HostMemory: Simple unmap", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x5000, 0x3000, 0x2000);
+    mem.Map(0x5000, 0x3000, 0x2000, PERMS);

    volatile u8* const data = mem.VirtualBasePointer() + 0x5000;
    data[75] = 50;
@ -50,7 +51,7 @@ TEST_CASE("HostMemory: Simple unmap", "[common]") {

 TEST_CASE("HostMemory: Simple unmap and remap", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x5000, 0x3000, 0x2000);
+    mem.Map(0x5000, 0x3000, 0x2000, PERMS);

    volatile u8* const data = mem.VirtualBasePointer() + 0x5000;
    data[0] = 50;
@ -58,79 +59,79 @@ TEST_CASE("HostMemory: Simple unmap and remap", "[common]") {

    mem.Unmap(0x5000, 0x2000);

-    mem.Map(0x5000, 0x3000, 0x2000);
+    mem.Map(0x5000, 0x3000, 0x2000, PERMS);
    REQUIRE(data[0] == 50);

-    mem.Map(0x7000, 0x2000, 0x5000);
+    mem.Map(0x7000, 0x2000, 0x5000, PERMS);
    REQUIRE(data[0x3000] == 50);
 }

 TEST_CASE("HostMemory: Nieche allocation", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x0000, 0, 0x20000);
+    mem.Map(0x0000, 0, 0x20000, PERMS);
    mem.Unmap(0x0000, 0x4000);
-    mem.Map(0x1000, 0, 0x2000);
-    mem.Map(0x3000, 0, 0x1000);
-    mem.Map(0, 0, 0x1000);
+    mem.Map(0x1000, 0, 0x2000, PERMS);
+    mem.Map(0x3000, 0, 0x1000, PERMS);
+    mem.Map(0, 0, 0x1000, PERMS);
 }

 TEST_CASE("HostMemory: Full unmap", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x8000, 0, 0x4000);
+    mem.Map(0x8000, 0, 0x4000, PERMS);
    mem.Unmap(0x8000, 0x4000);
-    mem.Map(0x6000, 0, 0x16000);
+    mem.Map(0x6000, 0, 0x16000, PERMS);
 }

 TEST_CASE("HostMemory: Right out of bounds unmap", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x0000, 0, 0x4000);
+    mem.Map(0x0000, 0, 0x4000, PERMS);
    mem.Unmap(0x2000, 0x4000);
-    mem.Map(0x2000, 0x80000, 0x4000);
+    mem.Map(0x2000, 0x80000, 0x4000, PERMS);
 }

 TEST_CASE("HostMemory: Left out of bounds unmap", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x8000, 0, 0x4000);
+    mem.Map(0x8000, 0, 0x4000, PERMS);
    mem.Unmap(0x6000, 0x4000);
-    mem.Map(0x8000, 0, 0x2000);
+    mem.Map(0x8000, 0, 0x2000, PERMS);
 }

 TEST_CASE("HostMemory: Multiple placeholder unmap", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x0000, 0, 0x4000);
-    mem.Map(0x4000, 0, 0x1b000);
+    mem.Map(0x0000, 0, 0x4000, PERMS);
+    mem.Map(0x4000, 0, 0x1b000, PERMS);
    mem.Unmap(0x3000, 0x1c000);
-    mem.Map(0x3000, 0, 0x20000);
+    mem.Map(0x3000, 0, 0x20000, PERMS);
 }

 TEST_CASE("HostMemory: Unmap between placeholders", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x0000, 0, 0x4000);
-    mem.Map(0x4000, 0, 0x4000);
+    mem.Map(0x0000, 0, 0x4000, PERMS);
+    mem.Map(0x4000, 0, 0x4000, PERMS);
    mem.Unmap(0x2000, 0x4000);
-    mem.Map(0x2000, 0, 0x4000);
+    mem.Map(0x2000, 0, 0x4000, PERMS);
 }

 TEST_CASE("HostMemory: Unmap to origin", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x4000, 0, 0x4000);
-    mem.Map(0x8000, 0, 0x4000);
+    mem.Map(0x4000, 0, 0x4000, PERMS);
+    mem.Map(0x8000, 0, 0x4000, PERMS);
    mem.Unmap(0x4000, 0x4000);
-    mem.Map(0, 0, 0x4000);
-    mem.Map(0x4000, 0, 0x4000);
+    mem.Map(0, 0, 0x4000, PERMS);
+    mem.Map(0x4000, 0, 0x4000, PERMS);
 }

 TEST_CASE("HostMemory: Unmap to right", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x4000, 0, 0x4000);
-    mem.Map(0x8000, 0, 0x4000);
+    mem.Map(0x4000, 0, 0x4000, PERMS);
+    mem.Map(0x8000, 0, 0x4000, PERMS);
    mem.Unmap(0x8000, 0x4000);
-    mem.Map(0x8000, 0, 0x4000);
+    mem.Map(0x8000, 0, 0x4000, PERMS);
 }

 TEST_CASE("HostMemory: Partial right unmap check bindings", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x4000, 0x10000, 0x4000);
+    mem.Map(0x4000, 0x10000, 0x4000, PERMS);

    volatile u8* const ptr = mem.VirtualBasePointer() + 0x4000;
    ptr[0x1000] = 17;
@ -142,7 +143,7 @@ TEST_CASE("HostMemory: Partial right unmap check bindings", "[common]") {

 TEST_CASE("HostMemory: Partial left unmap check bindings", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x4000, 0x10000, 0x4000);
+    mem.Map(0x4000, 0x10000, 0x4000, PERMS);

    volatile u8* const ptr = mem.VirtualBasePointer() + 0x4000;
    ptr[0x3000] = 19;
@ -156,7 +157,7 @@ TEST_CASE("HostMemory: Partial left unmap check bindings", "[common]") {

 TEST_CASE("HostMemory: Partial middle unmap check bindings", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x4000, 0x10000, 0x4000);
+    mem.Map(0x4000, 0x10000, 0x4000, PERMS);

    volatile u8* const ptr = mem.VirtualBasePointer() + 0x4000;
    ptr[0x0000] = 19;
@ -170,8 +171,8 @@ TEST_CASE("HostMemory: Partial middle unmap check bindings", "[common]") {

 TEST_CASE("HostMemory: Partial sparse middle unmap and check bindings", "[common]") {
    HostMemory mem(BACKING_SIZE, VIRTUAL_SIZE);
-    mem.Map(0x4000, 0x10000, 0x2000);
-    mem.Map(0x6000, 0x20000, 0x2000);
+    mem.Map(0x4000, 0x10000, 0x2000, PERMS);
+    mem.Map(0x6000, 0x20000, 0x2000, PERMS);

    volatile u8* const ptr = mem.VirtualBasePointer() + 0x4000;
    ptr[0x0000] = 19;
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -4,7 +4,7 @@
 add_subdirectory(host_shaders)

 if(LIBVA_FOUND)
-    set_source_files_properties(host1x/codecs/codec.cpp
+    set_source_files_properties(host1x/ffmpeg/ffmpeg.cpp
        PROPERTIES COMPILE_DEFINITIONS LIBVA_FOUND=1)
    list(APPEND FFmpeg_LIBRARIES ${LIBVA_LIBRARIES})
 endif()
@ -15,6 +15,7 @@ add_library(video_core STATIC
    buffer_cache/buffer_cache.cpp
    buffer_cache/buffer_cache.h
    buffer_cache/memory_tracker_base.h
+    buffer_cache/usage_tracker.h
    buffer_cache/word_manager.h
    cache_types.h
    cdma_pusher.cpp
@ -66,6 +67,8 @@ add_library(video_core STATIC
    host1x/codecs/vp9.cpp
    host1x/codecs/vp9.h
    host1x/codecs/vp9_types.h
+    host1x/ffmpeg/ffmpeg.cpp
+    host1x/ffmpeg/ffmpeg.h
    host1x/control.cpp
    host1x/control.h
    host1x/host1x.cpp
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@ -67,6 +67,7 @@ void BufferCache<P>::TickFrame() {
    if (!channel_state) {
        return;
    }
+    runtime.TickFrame(slot_buffers);

    // Calculate hits and shots and move hit bits to the right
    const u32 hits = std::reduce(channel_state->uniform_cache_hits.begin(),
@ -230,7 +231,10 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
    for (const IntervalType& add_interval : tmp_intervals) {
        common_ranges.add(add_interval);
    }
-    runtime.CopyBuffer(dest_buffer, src_buffer, copies);
+    const auto& copy = copies[0];
+    src_buffer.MarkUsage(copy.src_offset, copy.size);
+    dest_buffer.MarkUsage(copy.dst_offset, copy.size);
+    runtime.CopyBuffer(dest_buffer, src_buffer, copies, true);
    if (has_new_downloads) {
        memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount);
    }
@ -258,9 +262,10 @@ bool BufferCache<P>::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) {
    common_ranges.subtract(subtract_interval);

    const BufferId buffer = FindBuffer(*cpu_dst_address, static_cast<u32>(size));
-    auto& dest_buffer = slot_buffers[buffer];
+    Buffer& dest_buffer = slot_buffers[buffer];
    const u32 offset = dest_buffer.Offset(*cpu_dst_address);
    runtime.ClearBuffer(dest_buffer, offset, size, value);
+    dest_buffer.MarkUsage(offset, size);
    return true;
 }

@ -603,6 +608,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
            VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset);
            const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size};
            async_downloads += std::make_pair(base_interval, 1);
+            buffer.MarkUsage(copy.src_offset, copy.size);
            runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
            normalized_copies.push_back(second_copy);
        }
@ -621,8 +627,9 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
                    // Have in mind the staging buffer offset for the copy
                    copy.dst_offset += download_staging.offset;
                    const std::array copies{copy};
-                    runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies,
-                                       false);
+                    Buffer& buffer = slot_buffers[buffer_id];
+                    buffer.MarkUsage(copy.src_offset, copy.size);
+                    runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
                }
                runtime.PostCopyBarrier();
                runtime.Finish();
@ -742,7 +749,7 @@ void BufferCache<P>::BindHostIndexBuffer() {
                {BufferCopy{.src_offset = upload_staging.offset, .dst_offset = 0, .size = size}}};
            std::memcpy(upload_staging.mapped_span.data(),
                        draw_state.inline_index_draw_indexes.data(), size);
-            runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
+            runtime.CopyBuffer(buffer, upload_staging.buffer, copies, true);
        } else {
            buffer.ImmediateUpload(0, draw_state.inline_index_draw_indexes);
        }
@ -754,6 +761,7 @@ void BufferCache<P>::BindHostIndexBuffer() {
            offset + draw_state.index_buffer.first * draw_state.index_buffer.FormatSizeInBytes();
        runtime.BindIndexBuffer(buffer, new_offset, size);
    } else {
+        buffer.MarkUsage(offset, size);
        runtime.BindIndexBuffer(draw_state.topology, draw_state.index_buffer.format,
                                draw_state.index_buffer.first, draw_state.index_buffer.count,
                                buffer, offset, size);
@ -790,6 +798,7 @@ void BufferCache<P>::BindHostVertexBuffers() {

            const u32 stride = maxwell3d->regs.vertex_streams[index].stride;
            const u32 offset = buffer.Offset(binding.cpu_addr);
+            buffer.MarkUsage(offset, binding.size);

            host_bindings.buffers.push_back(&buffer);
            host_bindings.offsets.push_back(offset);
@ -895,6 +904,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
        channel_state->uniform_buffer_binding_sizes[stage][binding_index] = size;
    }
+    buffer.MarkUsage(offset, size);
    if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
        runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
    } else {
@ -913,6 +923,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
        SynchronizeBuffer(buffer, binding.cpu_addr, size);

        const u32 offset = buffer.Offset(binding.cpu_addr);
+        buffer.MarkUsage(offset, size);
        const bool is_written = ((channel_state->written_storage_buffers[stage] >> index) & 1) != 0;

        if (is_written) {
@ -943,6 +954,7 @@ void BufferCache<P>::BindHostGraphicsTextureBuffers(size_t stage) {

        const u32 offset = buffer.Offset(binding.cpu_addr);
        const PixelFormat format = binding.format;
+        buffer.MarkUsage(offset, size);
        if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) {
            if (((channel_state->image_texture_buffers[stage] >> index) & 1) != 0) {
                runtime.BindImageBuffer(buffer, offset, size, format);
@ -975,9 +987,10 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() {
        MarkWrittenBuffer(binding.buffer_id, binding.cpu_addr, size);

        const u32 offset = buffer.Offset(binding.cpu_addr);
+        buffer.MarkUsage(offset, size);
        host_bindings.buffers.push_back(&buffer);
        host_bindings.offsets.push_back(offset);
-        host_bindings.sizes.push_back(binding.size);
+        host_bindings.sizes.push_back(size);
    }
    if (host_bindings.buffers.size() > 0) {
        runtime.BindTransformFeedbackBuffers(host_bindings);
@ -1001,6 +1014,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
        SynchronizeBuffer(buffer, binding.cpu_addr, size);

        const u32 offset = buffer.Offset(binding.cpu_addr);
+        buffer.MarkUsage(offset, size);
        if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
            runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size);
            ++binding_index;
@ -1021,6 +1035,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() {
        SynchronizeBuffer(buffer, binding.cpu_addr, size);

        const u32 offset = buffer.Offset(binding.cpu_addr);
+        buffer.MarkUsage(offset, size);
        const bool is_written =
            ((channel_state->written_compute_storage_buffers >> index) & 1) != 0;

@ -1053,6 +1068,7 @@ void BufferCache<P>::BindHostComputeTextureBuffers() {

        const u32 offset = buffer.Offset(binding.cpu_addr);
        const PixelFormat format = binding.format;
+        buffer.MarkUsage(offset, size);
        if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) {
            if (((channel_state->image_compute_texture_buffers >> index) & 1) != 0) {
                runtime.BindImageBuffer(buffer, offset, size, format);
@ -1172,10 +1188,11 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) {
    if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) {
        size = static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, size));
    }
+    const BufferId buffer_id = FindBuffer(*cpu_addr, size);
    channel_state->vertex_buffers[index] = Binding{
        .cpu_addr = *cpu_addr,
        .size = size,
-        .buffer_id = FindBuffer(*cpu_addr, size),
+        .buffer_id = buffer_id,
    };
 }

@ -1401,7 +1418,8 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
        .dst_offset = dst_base_offset,
        .size = overlap.SizeBytes(),
    });
-    runtime.CopyBuffer(new_buffer, overlap, copies);
+    new_buffer.MarkUsage(copies[0].dst_offset, copies[0].size);
+    runtime.CopyBuffer(new_buffer, overlap, copies, true);
    DeleteBuffer(overlap_id, true);
 }

@ -1414,7 +1432,9 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
    const u32 size = static_cast<u32>(overlap.end - overlap.begin);
    const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
    auto& new_buffer = slot_buffers[new_buffer_id];
-    runtime.ClearBuffer(new_buffer, 0, new_buffer.SizeBytes(), 0);
+    const size_t size_bytes = new_buffer.SizeBytes();
+    runtime.ClearBuffer(new_buffer, 0, size_bytes, 0);
+    new_buffer.MarkUsage(0, size_bytes);
    for (const BufferId overlap_id : overlap.ids) {
        JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
    }
@ -1467,11 +1487,6 @@ void BufferCache<P>::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept {

 template <class P>
 bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
-    return SynchronizeBufferImpl(buffer, cpu_addr, size);
-}
-
-template <class P>
-bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) {
    boost::container::small_vector<BufferCopy, 4> copies;
    u64 total_size_bytes = 0;
    u64 largest_copy = 0;
@ -1493,51 +1508,6 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s
    return false;
 }

-template <class P>
-bool BufferCache<P>::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) {
-    boost::container::small_vector<BufferCopy, 4> copies;
-    u64 total_size_bytes = 0;
-    u64 largest_copy = 0;
-    IntervalSet found_sets{};
-    auto make_copies = [&] {
-        for (auto& interval : found_sets) {
-            const std::size_t sub_size = interval.upper() - interval.lower();
-            const VAddr cpu_addr_ = interval.lower();
-            copies.push_back(BufferCopy{
-                .src_offset = total_size_bytes,
-                .dst_offset = cpu_addr_ - buffer.CpuAddr(),
-                .size = sub_size,
-            });
-            total_size_bytes += sub_size;
-            largest_copy = std::max<u64>(largest_copy, sub_size);
-        }
-        const std::span<BufferCopy> copies_span(copies.data(), copies.size());
-        UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
-    };
-    memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) {
-        const VAddr base_adr = cpu_addr_out;
-        const VAddr end_adr = base_adr + range_size;
-        const IntervalType add_interval{base_adr, end_adr};
-        found_sets.add(add_interval);
-    });
-    if (found_sets.empty()) {
-        return true;
-    }
-    const IntervalType search_interval{cpu_addr, cpu_addr + size};
-    auto it = common_ranges.lower_bound(search_interval);
-    auto it_end = common_ranges.upper_bound(search_interval);
-    if (it == common_ranges.end()) {
-        make_copies();
-        return false;
-    }
-    while (it != it_end) {
-        found_sets.subtract(*it);
-        it++;
-    }
-    make_copies();
-    return false;
-}
-
 template <class P>
 void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
                                  std::span<BufferCopy> copies) {
@ -1586,7 +1556,8 @@ void BufferCache<P>::MappedUploadMemory([[maybe_unused]] Buffer& buffer,
            // Apply the staging offset
            copy.src_offset += upload_staging.offset;
        }
-        runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
+        const bool can_reorder = runtime.CanReorderUpload(buffer, copies);
+        runtime.CopyBuffer(buffer, upload_staging.buffer, copies, true, can_reorder);
    }
 }

@ -1628,7 +1599,8 @@ void BufferCache<P>::InlineMemoryImplementation(VAddr dest_address, size_t copy_
        }};
        u8* const src_pointer = upload_staging.mapped_span.data();
        std::memcpy(src_pointer, inlined_buffer.data(), copy_size);
-        runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
+        const bool can_reorder = runtime.CanReorderUpload(buffer, copies);
+        runtime.CopyBuffer(buffer, upload_staging.buffer, copies, true, can_reorder);
    } else {
        buffer.ImmediateUpload(buffer.Offset(dest_address), inlined_buffer.first(copy_size));
    }
@ -1681,8 +1653,9 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
        for (BufferCopy& copy : copies) {
            // Modify copies to have the staging offset in mind
            copy.dst_offset += download_staging.offset;
+            buffer.MarkUsage(copy.src_offset, copy.size);
        }
-        runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
+        runtime.CopyBuffer(download_staging.buffer, buffer, copies_span, true);
        runtime.Finish();
        for (const BufferCopy& copy : copies) {
            const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@ -529,10 +529,6 @@ private:

    bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);

-    bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
-
-    bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size);
-
    void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
                      std::span<BufferCopy> copies);

--- a/src/video_core/buffer_cache/usage_tracker.h
+++ b/src/video_core/buffer_cache/usage_tracker.h
@ -0,0 +1,79 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+#include "common/alignment.h"
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+/// Tracks which byte ranges of a buffer have been used, with one bit per 64 bytes.
+///
+/// The buffer is split into pages of PAGE_BYTES bytes; each page is a single u64 whose bit `i`
+/// covers bytes [i * 64, (i + 1) * 64) of that page. Tracking is conservative: a 64-byte chunk
+/// that is only partially covered by a range is treated as fully covered.
+class UsageTracker {
+    static constexpr size_t BYTES_PER_BIT_SHIFT = 6;
+    static constexpr size_t PAGE_SHIFT = 6 + BYTES_PER_BIT_SHIFT;
+    static constexpr size_t PAGE_BYTES = size_t{1} << PAGE_SHIFT;
+
+public:
+    /// Allocates (size >> PAGE_SHIFT) + 1 zeroed pages, so a trailing partial page is covered.
+    /// @param size Size in bytes of the buffer to track.
+    explicit UsageTracker(size_t size) {
+        const size_t num_pages = (size >> PAGE_SHIFT) + 1;
+        pages.resize(num_pages, 0ULL);
+    }
+
+    /// Marks every tracked byte as unused.
+    void Reset() noexcept {
+        std::ranges::fill(pages, 0ULL);
+    }
+
+    /// Marks the byte range [offset, offset + size) as used.
+    /// Empty ranges are ignored; the part of a range that lies past the tracked pages is dropped.
+    void Track(u64 offset, u64 size) noexcept {
+        PageSpan span;
+        if (!MakeSpan(offset, size, span)) {
+            return;
+        }
+        for (size_t page = span.first_page; page <= span.last_page; ++page) {
+            pages[page] |= PageMask(page, span.first_byte, span.last_byte);
+        }
+    }
+
+    /// Returns true when any 64-byte chunk overlapping [offset, offset + size) has been tracked.
+    /// Empty ranges and ranges entirely past the tracked pages report false.
+    [[nodiscard]] bool IsUsed(u64 offset, u64 size) const noexcept {
+        PageSpan span;
+        if (!MakeSpan(offset, size, span)) {
+            return false;
+        }
+        for (size_t page = span.first_page; page <= span.last_page; ++page) {
+            if ((pages[page] & PageMask(page, span.first_byte, span.last_byte)) != 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+private:
+    /// Inclusive byte range together with the inclusive range of pages it touches.
+    struct PageSpan {
+        u64 first_byte{};
+        u64 last_byte{};
+        size_t first_page{};
+        size_t last_page{};
+    };
+
+    /// Converts (offset, size) into a span clamped to the tracked pages.
+    /// Returns false when there is nothing to visit (empty range or offset past the last page).
+    bool MakeSpan(u64 offset, u64 size, PageSpan& span) const noexcept {
+        const u64 tracked_bytes = static_cast<u64>(pages.size()) << PAGE_SHIFT;
+        if (size == 0 || offset >= tracked_bytes) {
+            return false;
+        }
+        // Clamping against the remaining tracked bytes also prevents offset + size from wrapping.
+        const u64 remaining = tracked_bytes - offset;
+        const u64 clamped_size = size < remaining ? size : remaining;
+        span.first_byte = offset;
+        span.last_byte = offset + clamped_size - 1;
+        span.first_page = static_cast<size_t>(span.first_byte >> PAGE_SHIFT);
+        span.last_page = static_cast<size_t>(span.last_byte >> PAGE_SHIFT);
+        return true;
+    }
+
+    /// Builds the bit mask of `page` covered by the inclusive byte range [first_byte, last_byte].
+    /// The caller guarantees that the range overlaps the page.
+    static constexpr u64 PageMask(size_t page, u64 first_byte, u64 last_byte) noexcept {
+        const u64 page_begin = static_cast<u64>(page) << PAGE_SHIFT;
+        const u64 page_last = page_begin + (PAGE_BYTES - 1);
+        const u64 lo = first_byte > page_begin ? first_byte : page_begin;
+        const u64 hi = last_byte < page_last ? last_byte : page_last;
+        const u64 first_bit = (lo - page_begin) >> BYTES_PER_BIT_SHIFT;
+        const u64 last_bit = (hi - page_begin) >> BYTES_PER_BIT_SHIFT;
+        // num_bits is always in [1, 64], so the shift below is in [0, 63] and never reaches the
+        // full width of u64 (which would be undefined behaviour).
+        const u64 num_bits = last_bit - first_bit + 1;
+        return (~u64{0} >> (64 - num_bits)) << first_bit;
+    }
+
+private:
+    std::vector<u64> pages;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/host1x/codecs/codec.cpp
+++ b/src/video_core/host1x/codecs/codec.cpp
@ -1,11 +1,7 @@
 // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

-#include <algorithm>
-#include <fstream>
-#include <vector>
 #include "common/assert.h"
-#include "common/scope_exit.h"
 #include "common/settings.h"
 #include "video_core/host1x/codecs/codec.h"
 #include "video_core/host1x/codecs/h264.h"
@ -14,242 +10,17 @@
 #include "video_core/host1x/host1x.h"
 #include "video_core/memory_manager.h"

-extern "C" {
-#include <libavfilter/buffersink.h>
-#include <libavfilter/buffersrc.h>
-#include <libavutil/opt.h>
-#ifdef LIBVA_FOUND
-// for querying VAAPI driver information
-#include <libavutil/hwcontext_vaapi.h>
-#endif
-}
-
 namespace Tegra {
-namespace {
-constexpr AVPixelFormat PREFERRED_GPU_FMT = AV_PIX_FMT_NV12;
-constexpr AVPixelFormat PREFERRED_CPU_FMT = AV_PIX_FMT_YUV420P;
-constexpr std::array PREFERRED_GPU_DECODERS = {
-    AV_HWDEVICE_TYPE_CUDA,
-#ifdef _WIN32
-    AV_HWDEVICE_TYPE_D3D11VA,
-    AV_HWDEVICE_TYPE_DXVA2,
-#elif defined(__unix__)
-    AV_HWDEVICE_TYPE_VAAPI,
-    AV_HWDEVICE_TYPE_VDPAU,
-#endif
-    // last resort for Linux Flatpak (w/ NVIDIA)
-    AV_HWDEVICE_TYPE_VULKAN,
-};
-
-void AVPacketDeleter(AVPacket* ptr) {
-    av_packet_free(&ptr);
-}
-
-using AVPacketPtr = std::unique_ptr<AVPacket, decltype(&AVPacketDeleter)>;
-
-AVPixelFormat GetGpuFormat(AVCodecContext* av_codec_ctx, const AVPixelFormat* pix_fmts) {
-    for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) {
-        if (*p == av_codec_ctx->pix_fmt) {
-            return av_codec_ctx->pix_fmt;
-        }
-    }
-    LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU");
-    av_buffer_unref(&av_codec_ctx->hw_device_ctx);
-    av_codec_ctx->pix_fmt = PREFERRED_CPU_FMT;
-    return PREFERRED_CPU_FMT;
-}
-
-// List all the currently available hwcontext in ffmpeg
-std::vector<AVHWDeviceType> ListSupportedContexts() {
-    std::vector<AVHWDeviceType> contexts{};
-    AVHWDeviceType current_device_type = AV_HWDEVICE_TYPE_NONE;
-    do {
-        current_device_type = av_hwdevice_iterate_types(current_device_type);
-        contexts.push_back(current_device_type);
-    } while (current_device_type != AV_HWDEVICE_TYPE_NONE);
-    return contexts;
-}
-
-} // namespace
-
-void AVFrameDeleter(AVFrame* ptr) {
-    av_frame_free(&ptr);
-}

+// Stores the Host1x and NVDEC register references and creates one Decoder instance each for
+// H264, VP8 and VP9.
 Codec::Codec(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs)
    : host1x(host1x_), state{regs}, h264_decoder(std::make_unique<Decoder::H264>(host1x)),
      vp8_decoder(std::make_unique<Decoder::VP8>(host1x)),
      vp9_decoder(std::make_unique<Decoder::VP9>(host1x)) {}

-Codec::~Codec() {
-    if (!initialized) {
-        return;
-    }
-    // Free libav memory
-    avcodec_free_context(&av_codec_ctx);
-    av_buffer_unref(&av_gpu_decoder);
-
-    if (filters_initialized) {
-        avfilter_graph_free(&av_filter_graph);
-    }
-}
-
-bool Codec::CreateGpuAvDevice() {
-    static constexpr auto HW_CONFIG_METHOD = AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX;
-    static const auto supported_contexts = ListSupportedContexts();
-    for (const auto& type : PREFERRED_GPU_DECODERS) {
-        if (std::none_of(supported_contexts.begin(), supported_contexts.end(),
-                         [&type](const auto& context) { return context == type; })) {
-            LOG_DEBUG(Service_NVDRV, "{} explicitly unsupported", av_hwdevice_get_type_name(type));
-            continue;
-        }
-        // Avoid memory leak from not cleaning up after av_hwdevice_ctx_create
-        av_buffer_unref(&av_gpu_decoder);
-        const int hwdevice_res = av_hwdevice_ctx_create(&av_gpu_decoder, type, nullptr, nullptr, 0);
-        if (hwdevice_res < 0) {
-            LOG_DEBUG(Service_NVDRV, "{} av_hwdevice_ctx_create failed {}",
-                      av_hwdevice_get_type_name(type), hwdevice_res);
-            continue;
-        }
-#ifdef LIBVA_FOUND
-        if (type == AV_HWDEVICE_TYPE_VAAPI) {
-            // we need to determine if this is an impersonated VAAPI driver
-            AVHWDeviceContext* hwctx =
-                static_cast<AVHWDeviceContext*>(static_cast<void*>(av_gpu_decoder->data));
-            AVVAAPIDeviceContext* vactx = static_cast<AVVAAPIDeviceContext*>(hwctx->hwctx);
-            const char* vendor_name = vaQueryVendorString(vactx->display);
-            if (strstr(vendor_name, "VDPAU backend")) {
-                // VDPAU impersonated VAAPI impl's are super buggy, we need to skip them
-                LOG_DEBUG(Service_NVDRV, "Skipping vdapu impersonated VAAPI driver");
-                continue;
-            } else {
-                // according to some user testing, certain vaapi driver (Intel?) could be buggy
-                // so let's log the driver name which may help the developers/supporters
-                LOG_DEBUG(Service_NVDRV, "Using VAAPI driver: {}", vendor_name);
-            }
-        }
-#endif
-        for (int i = 0;; i++) {
-            const AVCodecHWConfig* config = avcodec_get_hw_config(av_codec, i);
-            if (!config) {
-                LOG_DEBUG(Service_NVDRV, "{} decoder does not support device type {}.",
-                          av_codec->name, av_hwdevice_get_type_name(type));
-                break;
-            }
-            if ((config->methods & HW_CONFIG_METHOD) != 0 && config->device_type == type) {
-                LOG_INFO(Service_NVDRV, "Using {} GPU decoder", av_hwdevice_get_type_name(type));
-                av_codec_ctx->pix_fmt = config->pix_fmt;
-                return true;
-            }
-        }
-    }
-    return false;
-}
-
-void Codec::InitializeAvCodecContext() {
-    av_codec_ctx = avcodec_alloc_context3(av_codec);
-    av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
-    av_codec_ctx->thread_count = 0;
-    av_codec_ctx->thread_type &= ~FF_THREAD_FRAME;
-}
-
-void Codec::InitializeGpuDecoder() {
-    if (!CreateGpuAvDevice()) {
-        av_buffer_unref(&av_gpu_decoder);
-        return;
-    }
-    auto* hw_device_ctx = av_buffer_ref(av_gpu_decoder);
-    ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed");
-    av_codec_ctx->hw_device_ctx = hw_device_ctx;
-    av_codec_ctx->get_format = GetGpuFormat;
-}
-
-void Codec::InitializeAvFilters(AVFrame* frame) {
-    const AVFilter* buffer_src = avfilter_get_by_name("buffer");
-    const AVFilter* buffer_sink = avfilter_get_by_name("buffersink");
-    AVFilterInOut* inputs = avfilter_inout_alloc();
-    AVFilterInOut* outputs = avfilter_inout_alloc();
-    SCOPE_EXIT({
-        avfilter_inout_free(&inputs);
-        avfilter_inout_free(&outputs);
-    });
-
-    // Don't know how to get the accurate time_base but it doesn't matter for yadif filter
-    // so just use 1/1 to make buffer filter happy
-    std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame->width,
-                                   frame->height, frame->format);
-
-    av_filter_graph = avfilter_graph_alloc();
-    int ret = avfilter_graph_create_filter(&av_filter_src_ctx, buffer_src, "in", args.c_str(),
-                                           nullptr, av_filter_graph);
-    if (ret < 0) {
-        LOG_ERROR(Service_NVDRV, "avfilter_graph_create_filter source error: {}", ret);
-        return;
-    }
-
-    ret = avfilter_graph_create_filter(&av_filter_sink_ctx, buffer_sink, "out", nullptr, nullptr,
-                                       av_filter_graph);
-    if (ret < 0) {
-        LOG_ERROR(Service_NVDRV, "avfilter_graph_create_filter sink error: {}", ret);
-        return;
-    }
-
-    inputs->name = av_strdup("out");
-    inputs->filter_ctx = av_filter_sink_ctx;
-    inputs->pad_idx = 0;
-    inputs->next = nullptr;
-
-    outputs->name = av_strdup("in");
-    outputs->filter_ctx = av_filter_src_ctx;
-    outputs->pad_idx = 0;
-    outputs->next = nullptr;
-
-    const char* description = "yadif=1:-1:0";
-    ret = avfilter_graph_parse_ptr(av_filter_graph, description, &inputs, &outputs, nullptr);
-    if (ret < 0) {
-        LOG_ERROR(Service_NVDRV, "avfilter_graph_parse_ptr error: {}", ret);
-        return;
-    }
-
-    ret = avfilter_graph_config(av_filter_graph, nullptr);
-    if (ret < 0) {
-        LOG_ERROR(Service_NVDRV, "avfilter_graph_config error: {}", ret);
-        return;
-    }
-
-    filters_initialized = true;
-}
+Codec::~Codec() = default;

 void Codec::Initialize() {
-    const AVCodecID codec = [&] {
-        switch (current_codec) {
-        case Host1x::NvdecCommon::VideoCodec::H264:
-            return AV_CODEC_ID_H264;
-        case Host1x::NvdecCommon::VideoCodec::VP8:
-            return AV_CODEC_ID_VP8;
-        case Host1x::NvdecCommon::VideoCodec::VP9:
-            return AV_CODEC_ID_VP9;
-        default:
-            UNIMPLEMENTED_MSG("Unknown codec {}", current_codec);
-            return AV_CODEC_ID_NONE;
-        }
-    }();
-    av_codec = avcodec_find_decoder(codec);
-
-    InitializeAvCodecContext();
-    if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Gpu) {
-        InitializeGpuDecoder();
-    }
-    if (const int res = avcodec_open2(av_codec_ctx, av_codec, nullptr); res < 0) {
-        LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed with result {}", res);
-        avcodec_free_context(&av_codec_ctx);
-        av_buffer_unref(&av_gpu_decoder);
-        return;
-    }
-    if (!av_codec_ctx->hw_device_ctx) {
-        LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding");
-    }
-    initialized = true;
+    // Decoder setup is delegated to FFmpeg::DecodeApi for the currently selected codec; the
+    // result is stored so Decode() can return early when initialization did not succeed.
+    initialized = decode_api.Initialize(current_codec);
 }

 void Codec::SetTargetCodec(Host1x::NvdecCommon::VideoCodec codec) {
@ -264,14 +35,18 @@ void Codec::Decode() {
    if (is_first_frame) {
        Initialize();
    }
+
    if (!initialized) {
        return;
    }
+
+    // Assemble bitstream.
    bool vp9_hidden_frame = false;
-    const auto& frame_data = [&]() {
+    size_t configuration_size = 0;
+    const auto packet_data = [&]() {
        switch (current_codec) {
        case Tegra::Host1x::NvdecCommon::VideoCodec::H264:
-            return h264_decoder->ComposeFrame(state, is_first_frame);
+            return h264_decoder->ComposeFrame(state, &configuration_size, is_first_frame);
        case Tegra::Host1x::NvdecCommon::VideoCodec::VP8:
            return vp8_decoder->ComposeFrame(state);
        case Tegra::Host1x::NvdecCommon::VideoCodec::VP9:
@ -283,89 +58,35 @@ void Codec::Decode() {
            return std::span<const u8>{};
        }
    }();
-    AVPacketPtr packet{av_packet_alloc(), AVPacketDeleter};
-    if (!packet) {
-        LOG_ERROR(Service_NVDRV, "av_packet_alloc failed");
+
+    // Send assembled bitstream to decoder.
+    if (!decode_api.SendPacket(packet_data, configuration_size)) {
        return;
    }
-    packet->data = const_cast<u8*>(frame_data.data());
-    packet->size = static_cast<s32>(frame_data.size());
-    if (const int res = avcodec_send_packet(av_codec_ctx, packet.get()); res != 0) {
-        LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", res);
-        return;
-    }
-    // Only receive/store visible frames
+
+    // Only receive/store visible frames.
    if (vp9_hidden_frame) {
        return;
    }
-    AVFramePtr initial_frame{av_frame_alloc(), AVFrameDeleter};
-    AVFramePtr final_frame{nullptr, AVFrameDeleter};
-    ASSERT_MSG(initial_frame, "av_frame_alloc initial_frame failed");
-    if (const int ret = avcodec_receive_frame(av_codec_ctx, initial_frame.get()); ret) {
-        LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret);
-        return;
-    }
-    if (initial_frame->width == 0 || initial_frame->height == 0) {
-        LOG_WARNING(Service_NVDRV, "Zero width or height in frame");
-        return;
-    }
-    bool is_interlaced = initial_frame->interlaced_frame != 0;
-    if (av_codec_ctx->hw_device_ctx) {
-        final_frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};
-        ASSERT_MSG(final_frame, "av_frame_alloc final_frame failed");
-        // Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp
-        // because Intel drivers crash unless using AV_PIX_FMT_NV12
-        final_frame->format = PREFERRED_GPU_FMT;
-        const int ret = av_hwframe_transfer_data(final_frame.get(), initial_frame.get(), 0);
-        ASSERT_MSG(!ret, "av_hwframe_transfer_data error {}", ret);
-    } else {
-        final_frame = std::move(initial_frame);
-    }
-    if (final_frame->format != PREFERRED_CPU_FMT && final_frame->format != PREFERRED_GPU_FMT) {
-        UNIMPLEMENTED_MSG("Unexpected video format: {}", final_frame->format);
-        return;
-    }
-    if (!is_interlaced) {
-        av_frames.push(std::move(final_frame));
-    } else {
-        if (!filters_initialized) {
-            InitializeAvFilters(final_frame.get());
-        }
-        if (const int ret = av_buffersrc_add_frame_flags(av_filter_src_ctx, final_frame.get(),
-                                                         AV_BUFFERSRC_FLAG_KEEP_REF);
-            ret) {
-            LOG_DEBUG(Service_NVDRV, "av_buffersrc_add_frame_flags error {}", ret);
-            return;
-        }
-        while (true) {
-            auto filter_frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};

-            int ret = av_buffersink_get_frame(av_filter_sink_ctx, filter_frame.get());
+    // Receive output frames from decoder.
+    decode_api.ReceiveFrames(frames);

-            if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF))
-                break;
-            if (ret < 0) {
-                LOG_DEBUG(Service_NVDRV, "av_buffersink_get_frame error {}", ret);
-                return;
-            }
-
-            av_frames.push(std::move(filter_frame));
-        }
-    }
-    while (av_frames.size() > 10) {
-        LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame");
-        av_frames.pop();
+    while (frames.size() > 10) {
+        LOG_DEBUG(HW_GPU, "ReceiveFrames overflow, dropped frame");
+        frames.pop();
    }
 }

-AVFramePtr Codec::GetCurrentFrame() {
+std::unique_ptr<FFmpeg::Frame> Codec::GetCurrentFrame() {
    // Sometimes VIC will request more frames than have been decoded.
-    // in this case, return a nullptr and don't overwrite previous frame data
-    if (av_frames.empty()) {
-        return AVFramePtr{nullptr, AVFrameDeleter};
+    // in this case, return an empty (null) pointer and don't overwrite previous data.
+    if (frames.empty()) {
+        return {};
    }
-    AVFramePtr frame = std::move(av_frames.front());
-    av_frames.pop();
+
+    // Hand the oldest queued frame to the caller and remove it from the queue.
+    auto frame = std::move(frames.front());
+    frames.pop();
    return frame;
 }

--- a/src/video_core/host1x/codecs/codec.h
+++ b/src/video_core/host1x/codecs/codec.h
@ -4,28 +4,15 @@
 #pragma once

 #include <memory>
+#include <optional>
 #include <string_view>
 #include <queue>
 #include "common/common_types.h"
+#include "video_core/host1x/ffmpeg/ffmpeg.h"
 #include "video_core/host1x/nvdec_common.h"

-extern "C" {
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wconversion"
-#endif
-#include <libavcodec/avcodec.h>
-#include <libavfilter/avfilter.h>
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
 namespace Tegra {

-void AVFrameDeleter(AVFrame* ptr);
-using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>;
-
 namespace Decoder {
 class H264;
 class VP8;
@ -51,7 +38,7 @@ public:
    void Decode();

    /// Returns next decoded frame
-    [[nodiscard]] AVFramePtr GetCurrentFrame();
+    [[nodiscard]] std::unique_ptr<FFmpeg::Frame> GetCurrentFrame();

    /// Returns the value of current_codec
    [[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const;
@ -60,25 +47,9 @@ public:
    [[nodiscard]] std::string_view GetCurrentCodecName() const;

 private:
-    void InitializeAvCodecContext();
-
-    void InitializeAvFilters(AVFrame* frame);
-
-    void InitializeGpuDecoder();
-
-    bool CreateGpuAvDevice();
-
    bool initialized{};
-    bool filters_initialized{};
    Host1x::NvdecCommon::VideoCodec current_codec{Host1x::NvdecCommon::VideoCodec::None};
-
-    const AVCodec* av_codec{nullptr};
-    AVCodecContext* av_codec_ctx{nullptr};
-    AVBufferRef* av_gpu_decoder{nullptr};
-
-    AVFilterContext* av_filter_src_ctx{nullptr};
-    AVFilterContext* av_filter_sink_ctx{nullptr};
-    AVFilterGraph* av_filter_graph{nullptr};
+    FFmpeg::DecodeApi decode_api;

    Host1x::Host1x& host1x;
    const Host1x::NvdecCommon::NvdecRegisters& state;
@ -86,7 +57,7 @@ private:
    std::unique_ptr<Decoder::VP8> vp8_decoder;
    std::unique_ptr<Decoder::VP9> vp9_decoder;

-    std::queue<AVFramePtr> av_frames{};
+    std::queue<std::unique_ptr<FFmpeg::Frame>> frames{};
 };

 } // namespace Tegra
--- a/src/video_core/host1x/codecs/h264.cpp
+++ b/src/video_core/host1x/codecs/h264.cpp
@ -30,7 +30,7 @@ H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {}
 H264::~H264() = default;

 std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
-                                       bool is_first_frame) {
+                                       size_t* out_configuration_size, bool is_first_frame) {
    H264DecoderContext context;
    host1x.MemoryManager().ReadBlock(state.picture_info_offset, &context,
                                     sizeof(H264DecoderContext));
@ -39,6 +39,7 @@ std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters
    if (!is_first_frame && frame_number != 0) {
        frame.resize_destructive(context.stream_len);
        host1x.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
+        *out_configuration_size = 0;
        return frame;
    }

@ -157,6 +158,7 @@ std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters
    frame.resize(encoded_header.size() + context.stream_len);
    std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());

+    *out_configuration_size = encoded_header.size();
    host1x.MemoryManager().ReadBlock(state.frame_bitstream_offset,
                                     frame.data() + encoded_header.size(), context.stream_len);

--- a/src/video_core/host1x/codecs/h264.h
+++ b/src/video_core/host1x/codecs/h264.h
@ -67,6 +67,7 @@ public:

    /// Compose the H264 frame for FFmpeg decoding
    [[nodiscard]] std::span<const u8> ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
+                                                   size_t* out_configuration_size,
                                                   bool is_first_frame = false);

 private:
--- a/src/video_core/host1x/ffmpeg/ffmpeg.cpp
+++ b/src/video_core/host1x/ffmpeg/ffmpeg.cpp
@ -0,0 +1,419 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/scope_exit.h"
+#include "common/settings.h"
+#include "video_core/host1x/ffmpeg/ffmpeg.h"
+
+extern "C" {
+#ifdef LIBVA_FOUND
+// for querying VAAPI driver information
+#include <libavutil/hwcontext_vaapi.h>
+#endif
+}
+
+namespace FFmpeg {
+
+namespace {
+
+// Format set on the destination frame before a hardware-decoded frame is copied back
+// (see DecoderContext::ReceiveFrame).
+constexpr AVPixelFormat PreferredGpuFormat = AV_PIX_FMT_NV12;
+// Format the codec context is switched to when GetGpuFormat gives up on hardware decoding.
+constexpr AVPixelFormat PreferredCpuFormat = AV_PIX_FMT_YUV420P;
+// Hardware device types to try, in this order; HardwareContext::InitializeForDecoder stops at
+// the first one that can be created and that the decoder supports.
+constexpr std::array PreferredGpuDecoders = {
+    AV_HWDEVICE_TYPE_CUDA,
+#ifdef _WIN32
+    AV_HWDEVICE_TYPE_D3D11VA,
+    AV_HWDEVICE_TYPE_DXVA2,
+#elif defined(__unix__)
+    AV_HWDEVICE_TYPE_VAAPI,
+    AV_HWDEVICE_TYPE_VDPAU,
+#endif
+    // last resort for Linux Flatpak (w/ NVIDIA)
+    AV_HWDEVICE_TYPE_VULKAN,
+};
+
+// Format-selection callback installed on codec contexts that use a hardware device.
+// Keeps the context's configured pixel format when it appears in the AV_PIX_FMT_NONE-terminated
+// list pix_fmts; otherwise drops the hardware device reference and switches to the CPU format.
+AVPixelFormat GetGpuFormat(AVCodecContext* codec_context, const AVPixelFormat* pix_fmts) {
+    const AVPixelFormat wanted = codec_context->pix_fmt;
+    const AVPixelFormat* candidate = pix_fmts;
+    while (*candidate != AV_PIX_FMT_NONE) {
+        if (*candidate == wanted) {
+            return wanted;
+        }
+        ++candidate;
+    }
+
+    // Nothing matched: release the device and continue on the CPU path.
+    LOG_INFO(HW_GPU, "Could not find compatible GPU AV format, falling back to CPU");
+    av_buffer_unref(&codec_context->hw_device_ctx);
+
+    codec_context->pix_fmt = PreferredCpuFormat;
+    return PreferredCpuFormat;
+}
+
+// Turns an FFmpeg error code into its textual description for logging.
+std::string AVError(int errnum) {
+    // Zero-initialised, and one byte is held back, so the buffer is always terminated.
+    std::array<char, AV_ERROR_MAX_STRING_SIZE> buffer{};
+    av_make_error_string(buffer.data(), buffer.size() - 1, errnum);
+    return std::string{buffer.data()};
+}
+
+} // namespace
+
+// Allocates an AVPacket that points at the caller's buffer. The bytes are not copied here, so
+// `data` has to remain valid for as long as the packet is in use.
+Packet::Packet(std::span<const u8> data) : m_packet{av_packet_alloc()} {
+    m_packet->size = static_cast<s32>(data.size());
+    // AVPacket::data is a mutable pointer, hence the const_cast.
+    m_packet->data = const_cast<u8*>(data.data());
+}
+
+// Frees the AVPacket allocated in the constructor.
+Packet::~Packet() {
+    av_packet_free(&m_packet);
+}
+
+// Allocates the wrapped AVFrame.
+Frame::Frame() : m_frame{av_frame_alloc()} {}
+
+// Frees the AVFrame allocated in the constructor.
+Frame::~Frame() {
+    av_frame_free(&m_frame);
+}
+
+// Maps the guest codec selection to an FFmpeg codec ID and looks up the matching decoder.
+// Codecs other than H264, VP8 and VP9 are reported as unimplemented and looked up as
+// AV_CODEC_ID_NONE.
+Decoder::Decoder(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
+    using Tegra::Host1x::NvdecCommon::VideoCodec;
+
+    AVCodecID codec_id = AV_CODEC_ID_NONE;
+    switch (codec) {
+    case VideoCodec::H264:
+        codec_id = AV_CODEC_ID_H264;
+        break;
+    case VideoCodec::VP8:
+        codec_id = AV_CODEC_ID_VP8;
+        break;
+    case VideoCodec::VP9:
+        codec_id = AV_CODEC_ID_VP9;
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unknown codec {}", codec);
+        break;
+    }
+
+    m_codec = avcodec_find_decoder(codec_id);
+}
+
+// Reports whether this codec can decode through a hardware device context of the given type.
+// On success the pixel format of the matching hardware configuration is written to
+// out_pix_fmt; on failure out_pix_fmt is left untouched.
+bool Decoder::SupportsDecodingOnDevice(AVPixelFormat* out_pix_fmt, AVHWDeviceType type) const {
+    // The constructor stores whatever avcodec_find_decoder() returned, which is null when no
+    // decoder exists for the codec. The loop below hands m_codec to FFmpeg and reads
+    // m_codec->name, so bail out before touching it.
+    if (!m_codec) {
+        LOG_DEBUG(HW_GPU, "No decoder available, cannot use device type {}",
+                  av_hwdevice_get_type_name(type));
+        return false;
+    }
+
+    // Walk the codec's hardware configurations until FFmpeg reports there are no more.
+    for (int i = 0;; i++) {
+        const AVCodecHWConfig* config = avcodec_get_hw_config(m_codec, i);
+        if (!config) {
+            LOG_DEBUG(HW_GPU, "{} decoder does not support device type {}", m_codec->name,
+                      av_hwdevice_get_type_name(type));
+            break;
+        }
+        // Only configurations driven through hw_device_ctx are usable by DecoderContext.
+        if ((config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) != 0 &&
+            config->device_type == type) {
+            LOG_INFO(HW_GPU, "Using {} GPU decoder", av_hwdevice_get_type_name(type));
+            *out_pix_fmt = config->pix_fmt;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Collects every hardware device type that av_hwdevice_iterate_types() enumerates.
+std::vector<AVHWDeviceType> HardwareContext::GetSupportedDeviceTypes() {
+    std::vector<AVHWDeviceType> types;
+
+    // Iteration starts from, and is terminated by, AV_HWDEVICE_TYPE_NONE.
+    for (AVHWDeviceType type = av_hwdevice_iterate_types(AV_HWDEVICE_TYPE_NONE);
+         type != AV_HWDEVICE_TYPE_NONE; type = av_hwdevice_iterate_types(type)) {
+        types.push_back(type);
+    }
+
+    return types;
+}
+
+// Drops this object's reference to the hardware device buffer, if it holds one.
+HardwareContext::~HardwareContext() {
+    av_buffer_unref(&m_gpu_decoder);
+}
+
+// Tries each entry of PreferredGpuDecoders in order and attaches the first device that is
+// enumerated by FFmpeg, can be created, and is supported by `decoder` to `decoder_context`.
+// Returns false when no preferred device type qualifies.
+bool HardwareContext::InitializeForDecoder(DecoderContext& decoder_context,
+                                           const Decoder& decoder) {
+    const std::vector<AVHWDeviceType> available = GetSupportedDeviceTypes();
+
+    for (const AVHWDeviceType candidate : PreferredGpuDecoders) {
+        const bool is_available = std::ranges::find(available, candidate) != available.end();
+        if (!is_available) {
+            LOG_DEBUG(HW_GPU, "{} explicitly unsupported", av_hwdevice_get_type_name(candidate));
+            continue;
+        }
+
+        // The device is created first; the decoder is only queried once that succeeded.
+        AVPixelFormat device_format;
+        if (InitializeWithType(candidate) &&
+            decoder.SupportsDecodingOnDevice(&device_format, candidate)) {
+            decoder_context.InitializeHardwareDecoder(*this, device_format);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Creates a hardware device context of the given type, replacing any device held so far.
+// Returns false when the device cannot be created or, on builds with LIBVA_FOUND, when the
+// VAAPI driver turns out to be a VDPAU backend.
+bool HardwareContext::InitializeWithType(AVHWDeviceType type) {
+    av_buffer_unref(&m_gpu_decoder);
+
+    if (const int ret = av_hwdevice_ctx_create(&m_gpu_decoder, type, nullptr, nullptr, 0);
+        ret < 0) {
+        LOG_DEBUG(HW_GPU, "av_hwdevice_ctx_create({}) failed: {}", av_hwdevice_get_type_name(type),
+                  AVError(ret));
+        return false;
+    }
+
+#ifdef LIBVA_FOUND
+    if (type == AV_HWDEVICE_TYPE_VAAPI) {
+        // We need to determine if this is an impersonated VAAPI driver.
+        auto* hwctx = reinterpret_cast<AVHWDeviceContext*>(m_gpu_decoder->data);
+        auto* vactx = static_cast<AVVAAPIDeviceContext*>(hwctx->hwctx);
+        const char* vendor_name = vaQueryVendorString(vactx->display);
+        if (!vendor_name) {
+            // Passing a null pointer to strstr() or to the logger is undefined behaviour.
+            // Without a vendor string the driver cannot be identified, so it is kept.
+            LOG_DEBUG(HW_GPU, "VAAPI driver did not report a vendor string");
+        } else if (strstr(vendor_name, "VDPAU backend")) {
+            // VDPAU impersonated VAAPI impls are super buggy, we need to skip them.
+            LOG_DEBUG(HW_GPU, "Skipping VDPAU impersonated VAAPI driver");
+            // Release the rejected device so GetBufferRef() cannot hand it out afterwards.
+            av_buffer_unref(&m_gpu_decoder);
+            return false;
+        } else {
+            // According to some user testing, certain VAAPI drivers (Intel?) could be buggy.
+            // Log the driver name just in case.
+            LOG_DEBUG(HW_GPU, "Using VAAPI driver: {}", vendor_name);
+        }
+    }
+#endif
+
+    return true;
+}
+
+// Allocates a codec context for `decoder` and applies the decoding options used by yuzu.
+DecoderContext::DecoderContext(const Decoder& decoder)
+    : m_codec_context{avcodec_alloc_context3(decoder.GetCodec())} {
+    av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0);
+    // Clear the FF_THREAD_FRAME bit from the allowed threading methods.
+    m_codec_context->thread_type = m_codec_context->thread_type & ~FF_THREAD_FRAME;
+    m_codec_context->thread_count = 0;
+}
+
+// Drops the context's hardware device reference (if any), then frees the codec context.
+DecoderContext::~DecoderContext() {
+    av_buffer_unref(&m_codec_context->hw_device_ctx);
+    avcodec_free_context(&m_codec_context);
+}
+
+// Configures the codec context for hardware decoding: selects `hw_pix_fmt`, installs the
+// GetGpuFormat callback and takes a new reference to the device buffer held by `context`.
+void DecoderContext::InitializeHardwareDecoder(const HardwareContext& context,
+                                               AVPixelFormat hw_pix_fmt) {
+    AVBufferRef* const device_ref = av_buffer_ref(context.GetBufferRef());
+
+    m_codec_context->pix_fmt = hw_pix_fmt;
+    m_codec_context->get_format = GetGpuFormat;
+    m_codec_context->hw_device_ctx = device_ref;
+}
+
+// Opens the codec context with `decoder`'s codec. Returns false, after logging, on failure.
+bool DecoderContext::OpenContext(const Decoder& decoder) {
+    const int result = avcodec_open2(m_codec_context, decoder.GetCodec(), nullptr);
+    if (result < 0) {
+        LOG_ERROR(HW_GPU, "avcodec_open2 error: {}", AVError(result));
+        return false;
+    }
+
+    // No hardware device attached means decoding runs in software.
+    const bool uses_hardware = m_codec_context->hw_device_ctx != nullptr;
+    if (!uses_hardware) {
+        LOG_INFO(HW_GPU, "Using FFmpeg software decoding");
+    }
+
+    return true;
+}
+
+// Submits one packet of compressed data to the decoder. Returns false, after logging, when
+// FFmpeg rejects it.
+bool DecoderContext::SendPacket(const Packet& packet) {
+    const int result = avcodec_send_packet(m_codec_context, packet.GetPacket());
+    if (result >= 0) {
+        return true;
+    }
+
+    LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(result));
+    return false;
+}
+
+// Fetches the next decoded frame from the codec context.
+// Returns an empty pointer when no frame could be received or transferred. On success
+// *out_is_interlaced reports whether the decoded frame is interlaced; it is not written when
+// receiving fails.
+std::unique_ptr<Frame> DecoderContext::ReceiveFrame(bool* out_is_interlaced) {
+    auto output = std::make_unique<Frame>();
+
+    // Receives into `target` and records the interlacing flag of what was received.
+    const auto receive_into = [&](AVFrame* target) -> bool {
+        const int result = avcodec_receive_frame(m_codec_context, target);
+        if (result < 0) {
+            LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(result));
+            return false;
+        }
+
+        *out_is_interlaced = target->interlaced_frame != 0;
+        return true;
+    };
+
+    // Without a hardware device the decoder writes straight into the output frame.
+    if (!m_codec_context->hw_device_ctx) {
+        if (!receive_into(output->GetFrame())) {
+            return {};
+        }
+        return output;
+    }
+
+    // With a hardware device, receive into a temporary frame first and then transfer its
+    // contents into the output frame.
+    Frame hardware_frame;
+    if (!receive_into(hardware_frame.GetFrame())) {
+        return {};
+    }
+
+    output->SetFormat(PreferredGpuFormat);
+    const int result =
+        av_hwframe_transfer_data(output->GetFrame(), hardware_frame.GetFrame(), 0);
+    if (result < 0) {
+        LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(result));
+        return {};
+    }
+
+    return output;
+}
+
+// Builds a filter graph of the form buffer source -> yadif -> buffer sink, configured with the
+// dimensions and pixel format of `frame`. m_initialized becomes true only when every step
+// succeeds; any failure is logged and leaves the object uninitialized.
+DeinterlaceFilter::DeinterlaceFilter(const Frame& frame) {
+    const AVFilter* buffer_src = avfilter_get_by_name("buffer");
+    const AVFilter* buffer_sink = avfilter_get_by_name("buffersink");
+    AVFilterInOut* inputs = avfilter_inout_alloc();
+    AVFilterInOut* outputs = avfilter_inout_alloc();
+    // Free the in/out descriptors on every exit path, including the early returns below.
+    SCOPE_EXIT({
+        avfilter_inout_free(&inputs);
+        avfilter_inout_free(&outputs);
+    });
+
+    // Don't know how to get the accurate time_base but it doesn't matter for yadif filter
+    // so just use 1/1 to make buffer filter happy
+    std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame.GetWidth(),
+                                   frame.GetHeight(), static_cast<int>(frame.GetPixelFormat()));
+
+    // Create the source ("in") and sink ("out") endpoints inside a fresh graph.
+    m_filter_graph = avfilter_graph_alloc();
+    int ret = avfilter_graph_create_filter(&m_source_context, buffer_src, "in", args.c_str(),
+                                           nullptr, m_filter_graph);
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "avfilter_graph_create_filter source error: {}", AVError(ret));
+        return;
+    }
+
+    ret = avfilter_graph_create_filter(&m_sink_context, buffer_sink, "out", nullptr, nullptr,
+                                       m_filter_graph);
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "avfilter_graph_create_filter sink error: {}", AVError(ret));
+        return;
+    }
+
+    // `inputs` is bound to the sink and labelled "out"; `outputs` is bound to the source and
+    // labelled "in". Both use pad 0 and are single-entry lists.
+    inputs->name = av_strdup("out");
+    inputs->filter_ctx = m_sink_context;
+    inputs->pad_idx = 0;
+    inputs->next = nullptr;
+
+    outputs->name = av_strdup("in");
+    outputs->filter_ctx = m_source_context;
+    outputs->pad_idx = 0;
+    outputs->next = nullptr;
+
+    // Parse the yadif description into the graph between the two endpoints.
+    const char* description = "yadif=1:-1:0";
+    ret = avfilter_graph_parse_ptr(m_filter_graph, description, &inputs, &outputs, nullptr);
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "avfilter_graph_parse_ptr error: {}", AVError(ret));
+        return;
+    }
+
+    ret = avfilter_graph_config(m_filter_graph, nullptr);
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "avfilter_graph_config error: {}", AVError(ret));
+        return;
+    }
+
+    m_initialized = true;
+}
+
+// Feeds one decoded frame into the filter graph's source. AV_BUFFERSRC_FLAG_KEEP_REF is
+// passed so the caller's frame is left intact. Returns false when the graph is unusable or
+// FFmpeg rejects the frame.
+bool DeinterlaceFilter::AddSourceFrame(const Frame& frame) {
+    // The constructor returns early on any setup failure, which can leave m_source_context
+    // null or the graph unconfigured. Refuse the frame instead of handing that to FFmpeg.
+    if (!m_initialized) {
+        return false;
+    }
+
+    if (const int ret = av_buffersrc_add_frame_flags(m_source_context, frame.GetFrame(),
+                                                     AV_BUFFERSRC_FLAG_KEEP_REF);
+        ret < 0) {
+        LOG_ERROR(HW_GPU, "av_buffersrc_add_frame_flags error: {}", AVError(ret));
+        return false;
+    }
+
+    return true;
+}
+
+// Pulls the next filtered frame from the graph's sink.
+// Returns an empty pointer when the graph is unusable, when no frame is available right now
+// (EAGAIN), at end of stream, or on error; only real errors are logged.
+std::unique_ptr<Frame> DeinterlaceFilter::DrainSinkFrame() {
+    // A failed constructor can leave m_sink_context null or the graph unconfigured.
+    if (!m_initialized) {
+        return {};
+    }
+
+    auto dst_frame = std::make_unique<Frame>();
+    const int ret = av_buffersink_get_frame(m_sink_context, dst_frame->GetFrame());
+
+    // AVERROR_EOF is already a negative error code. Wrapping it in AVERROR() flips its sign,
+    // so the comparison could never match and end of stream was logged as an error.
+    if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
+        return {};
+    }
+
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "av_buffersink_get_frame error: {}", AVError(ret));
+        return {};
+    }
+
+    return dst_frame;
+}
+
+// Frees the filter graph created in the constructor.
+DeinterlaceFilter::~DeinterlaceFilter() {
+    avfilter_graph_free(&m_filter_graph);
+}
+
+// Destroys all decoding state, returning the object to its default-constructed condition.
+// The decoder context is constructed from *m_decoder in Initialize, and it is released here
+// before the decoder itself.
+void DecodeApi::Reset() {
+    m_deinterlace_filter.reset();
+    m_hardware_context.reset();
+    m_decoder_context.reset();
+    m_decoder.reset();
+}
+
+// Discards any previous state and sets up decoding for `codec`.
+// When the nvdec_emulation setting selects GPU decoding, a hardware device is attached if one
+// can be found; the result of that attempt is not checked, and the context is opened either
+// way. Returns false, leaving the object reset, when the context cannot be opened.
+bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
+    Reset();
+    m_decoder.emplace(codec);
+    m_decoder_context.emplace(*m_decoder);
+
+    const bool wants_gpu =
+        Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Gpu;
+    if (wants_gpu) {
+        m_hardware_context.emplace();
+        m_hardware_context->InitializeForDecoder(*m_decoder_context, *m_decoder);
+    }
+
+    if (m_decoder_context->OpenContext(*m_decoder)) {
+        return true;
+    }
+
+    Reset();
+    return false;
+}
+
+// Wraps `packet_data` in a Packet and submits it to the decoder context.
+// `configuration_size` is not used by this implementation. m_decoder_context is dereferenced
+// unconditionally, so Initialize must have succeeded beforehand.
+bool DecodeApi::SendPacket(std::span<const u8> packet_data, size_t configuration_size) {
+    FFmpeg::Packet packet(packet_data);
+    return m_decoder_context->SendPacket(packet);
+}
+
+// Receives one frame from the decoder and appends the resulting output to `frame_queue`.
+// Progressive frames are queued directly; interlaced frames go through the deinterlace filter
+// and every frame the filter yields is queued. Nothing is queued when no frame was received.
+void DecodeApi::ReceiveFrames(std::queue<std::unique_ptr<Frame>>& frame_queue) {
+    bool is_interlaced;
+    auto decoded = m_decoder_context->ReceiveFrame(&is_interlaced);
+    if (!decoded) {
+        return;
+    }
+
+    // Progressive content needs no further processing.
+    if (!is_interlaced) {
+        frame_queue.push(std::move(decoded));
+        return;
+    }
+
+    // The filter is created lazily from the first interlaced frame seen.
+    if (!m_deinterlace_filter) {
+        m_deinterlace_filter.emplace(*decoded);
+    }
+
+    if (!m_deinterlace_filter->AddSourceFrame(*decoded)) {
+        return;
+    }
+
+    // Drain the sink until it has nothing more to give.
+    for (auto filtered = m_deinterlace_filter->DrainSinkFrame(); filtered;
+         filtered = m_deinterlace_filter->DrainSinkFrame()) {
+        frame_queue.push(std::move(filtered));
+    }
+}
+
+} // namespace FFmpeg
--- a/src/video_core/host1x/ffmpeg/ffmpeg.h
+++ b/src/video_core/host1x/ffmpeg/ffmpeg.h
@ -0,0 +1,213 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <span>
+#include <vector>
+#include <queue>
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/host1x/nvdec_common.h"
+
+extern "C" {
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+
+#include <libavcodec/avcodec.h>
+#include <libavfilter/avfilter.h>
+#include <libavfilter/buffersink.h>
+#include <libavfilter/buffersrc.h>
+#include <libavutil/avutil.h>
+#include <libavutil/opt.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+namespace FFmpeg {
+
+class Packet;
+class Frame;
+class Decoder;
+class HardwareContext;
+class DecoderContext;
+class DeinterlaceFilter;
+
+// Wraps an AVPacket, a container for compressed bitstream data.
+// The packet is allocated in the constructor and freed in the destructor; the wrapper can be
+// neither copied nor moved.
+class Packet {
+public:
+    YUZU_NON_COPYABLE(Packet);
+    YUZU_NON_MOVEABLE(Packet);
+
+    // Creates a packet whose data pointer and size refer to `data`.
+    explicit Packet(std::span<const u8> data);
+    ~Packet();
+
+    // Returns the underlying AVPacket; ownership stays with this object.
+    AVPacket* GetPacket() const {
+        return m_packet;
+    }
+
+private:
+    AVPacket* m_packet{};
+};
+
+// Wraps an AVFrame, a container for audio and video stream data.
+// The frame is allocated in the constructor and freed in the destructor; the wrapper can be
+// neither copied nor moved. All accessors read directly from the underlying AVFrame.
+class Frame {
+public:
+    YUZU_NON_COPYABLE(Frame);
+    YUZU_NON_MOVEABLE(Frame);
+
+    explicit Frame();
+    ~Frame();
+
+    // Frame width as stored in the AVFrame.
+    int GetWidth() const {
+        return m_frame->width;
+    }
+
+    // Frame height as stored in the AVFrame.
+    int GetHeight() const {
+        return m_frame->height;
+    }
+
+    // The AVFrame's integer `format` field, cast to AVPixelFormat.
+    AVPixelFormat GetPixelFormat() const {
+        return static_cast<AVPixelFormat>(m_frame->format);
+    }
+
+    // Line size of the given plane; `plane` is not range-checked.
+    int GetStride(int plane) const {
+        return m_frame->linesize[plane];
+    }
+
+    // Pointer to the AVFrame's line size array.
+    int* GetStrides() const {
+        return m_frame->linesize;
+    }
+
+    // Data pointer of the given plane; `plane` is not range-checked.
+    u8* GetData(int plane) const {
+        return m_frame->data[plane];
+    }
+
+    // Pointer to the AVFrame's array of plane data pointers.
+    u8** GetPlanes() const {
+        return m_frame->data;
+    }
+
+    // Overwrites the AVFrame's `format` field.
+    void SetFormat(int format) {
+        m_frame->format = format;
+    }
+
+    // Returns the underlying AVFrame; ownership stays with this object.
+    AVFrame* GetFrame() const {
+        return m_frame;
+    }
+
+private:
+    AVFrame* m_frame{};
+};
+
+// Wraps an AVCodec, a type containing information about a codec.
+// The codec pointer comes from avcodec_find_decoder() and is not owned by this object.
+class Decoder {
+public:
+    YUZU_NON_COPYABLE(Decoder);
+    YUZU_NON_MOVEABLE(Decoder);
+
+    // Looks up the FFmpeg decoder that corresponds to the guest codec selection.
+    explicit Decoder(Tegra::Host1x::NvdecCommon::VideoCodec codec);
+    ~Decoder() = default;
+
+    // Returns true and writes the hardware pixel format to out_pix_fmt when the codec has a
+    // hw_device_ctx-based configuration for the given device type.
+    bool SupportsDecodingOnDevice(AVPixelFormat* out_pix_fmt, AVHWDeviceType type) const;
+
+    // Returns the codec found by the constructor.
+    const AVCodec* GetCodec() const {
+        return m_codec;
+    }
+
+private:
+    const AVCodec* m_codec{};
+};
+
+// Wraps AVBufferRef for an accelerated decoder.
+// Holds at most one hardware device reference, which is dropped in the destructor.
+class HardwareContext {
+public:
+    YUZU_NON_COPYABLE(HardwareContext);
+    YUZU_NON_MOVEABLE(HardwareContext);
+
+    // Lists the hardware device types enumerated by av_hwdevice_iterate_types().
+    static std::vector<AVHWDeviceType> GetSupportedDeviceTypes();
+
+    explicit HardwareContext() = default;
+    ~HardwareContext();
+
+    // Attaches the first usable preferred device to decoder_context; false if none is usable.
+    bool InitializeForDecoder(DecoderContext& decoder_context, const Decoder& decoder);
+
+    // Returns the held device reference, which is null until a device has been created.
+    AVBufferRef* GetBufferRef() const {
+        return m_gpu_decoder;
+    }
+
+private:
+    // Creates a device of the given type, replacing any device held before.
+    bool InitializeWithType(AVHWDeviceType type);
+
+    AVBufferRef* m_gpu_decoder{};
+};
+
+// Wraps an AVCodecContext.
+// The context is allocated in the constructor and freed in the destructor.
+class DecoderContext {
+public:
+    YUZU_NON_COPYABLE(DecoderContext);
+    YUZU_NON_MOVEABLE(DecoderContext);
+
+    explicit DecoderContext(const Decoder& decoder);
+    ~DecoderContext();
+
+    // Attaches the hardware device held by `context` and selects hw_pix_fmt.
+    void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt);
+    // Opens the context for decoding; returns false on failure.
+    bool OpenContext(const Decoder& decoder);
+    // Submits compressed data to the decoder; returns false on failure.
+    bool SendPacket(const Packet& packet);
+    // Returns the next decoded frame, or an empty pointer if none could be obtained.
+    // *out_is_interlaced is written only when a frame was received.
+    std::unique_ptr<Frame> ReceiveFrame(bool* out_is_interlaced);
+
+    // Returns the underlying AVCodecContext; ownership stays with this object.
+    AVCodecContext* GetCodecContext() const {
+        return m_codec_context;
+    }
+
+private:
+    AVCodecContext* m_codec_context{};
+};
+
+// Wraps an AVFilterGraph.
+// The graph is built in the constructor for the geometry of the frame passed in and is freed
+// in the destructor.
+class DeinterlaceFilter {
+public:
+    YUZU_NON_COPYABLE(DeinterlaceFilter);
+    YUZU_NON_MOVEABLE(DeinterlaceFilter);
+
+    explicit DeinterlaceFilter(const Frame& frame);
+    ~DeinterlaceFilter();
+
+    // Pushes a frame into the graph's source; returns false on failure.
+    bool AddSourceFrame(const Frame& frame);
+    // Pulls the next frame from the graph's sink; empty pointer when none is available.
+    std::unique_ptr<Frame> DrainSinkFrame();
+
+private:
+    AVFilterGraph* m_filter_graph{};
+    AVFilterContext* m_source_context{};
+    AVFilterContext* m_sink_context{};
+    // Set to true only when the constructor completed every setup step.
+    bool m_initialized{};
+};
+
+// Combines the wrappers above into a single decoding interface: a decoder, its context, an
+// optional hardware device and a lazily created deinterlace filter.
+class DecodeApi {
+public:
+    YUZU_NON_COPYABLE(DecodeApi);
+    YUZU_NON_MOVEABLE(DecodeApi);
+
+    DecodeApi() = default;
+    ~DecodeApi() = default;
+
+    // Sets up decoding for `codec`, discarding previous state; returns false on failure.
+    bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec);
+    // Destroys all decoding state.
+    void Reset();
+
+    // Submits compressed data to the decoder; returns false on failure.
+    bool SendPacket(std::span<const u8> packet_data, size_t configuration_size);
+    // Appends any frames produced by the decoder to frame_queue.
+    void ReceiveFrames(std::queue<std::unique_ptr<Frame>>& frame_queue);
+
+private:
+    // Empty until Initialize is called and again after Reset. Declaration order also fixes
+    // the order in which the implicit destructor tears these down (last declared first).
+    std::optional<FFmpeg::Decoder> m_decoder;
+    std::optional<FFmpeg::DecoderContext> m_decoder_context;
+    std::optional<FFmpeg::HardwareContext> m_hardware_context;
+    std::optional<FFmpeg::DeinterlaceFilter> m_deinterlace_filter;
+};
+
+} // namespace FFmpeg
--- a/src/video_core/host1x/nvdec.cpp
+++ b/src/video_core/host1x/nvdec.cpp
@ -28,7 +28,7 @@ void Nvdec::ProcessMethod(u32 method, u32 argument) {
    }
 }

-AVFramePtr Nvdec::GetFrame() {
+std::unique_ptr<FFmpeg::Frame> Nvdec::GetFrame() {
    return codec->GetCurrentFrame();
 }

--- a/src/video_core/host1x/nvdec.h
+++ b/src/video_core/host1x/nvdec.h
@ -23,7 +23,7 @@ public:
    void ProcessMethod(u32 method, u32 argument);

    /// Return most recently decoded frame
-    [[nodiscard]] AVFramePtr GetFrame();
+    [[nodiscard]] std::unique_ptr<FFmpeg::Frame> GetFrame();

 private:
    /// Invoke codec to decode a frame
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@ -82,27 +82,26 @@ void Vic::Execute() {
        return;
    }
    const VicConfig config{host1x.MemoryManager().Read<u64>(config_struct_address + 0x20)};
-    const AVFramePtr frame_ptr = nvdec_processor->GetFrame();
-    const auto* frame = frame_ptr.get();
+    auto frame = nvdec_processor->GetFrame();
    if (!frame) {
        return;
    }
    const u64 surface_width = config.surface_width_minus1 + 1;
    const u64 surface_height = config.surface_height_minus1 + 1;
-    if (static_cast<u64>(frame->width) != surface_width ||
-        static_cast<u64>(frame->height) != surface_height) {
+    if (static_cast<u64>(frame->GetWidth()) != surface_width ||
+        static_cast<u64>(frame->GetHeight()) != surface_height) {
        // TODO: Properly support multiple video streams with differing frame dimensions
        LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}",
-                    frame->width, frame->height, surface_width, surface_height);
+                    frame->GetWidth(), frame->GetHeight(), surface_width, surface_height);
    }
    switch (config.pixel_format) {
    case VideoPixelFormat::RGBA8:
    case VideoPixelFormat::BGRA8:
    case VideoPixelFormat::RGBX8:
-        WriteRGBFrame(frame, config);
+        WriteRGBFrame(std::move(frame), config);
        break;
    case VideoPixelFormat::YUV420:
-        WriteYUVFrame(frame, config);
+        WriteYUVFrame(std::move(frame), config);
        break;
    default:
        UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value());
@ -110,10 +109,14 @@ void Vic::Execute() {
    }
 }

-void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {
+void Vic::WriteRGBFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config) {
    LOG_TRACE(Service_NVDRV, "Writing RGB Frame");

-    if (!scaler_ctx || frame->width != scaler_width || frame->height != scaler_height) {
+    const auto frame_width = frame->GetWidth();
+    const auto frame_height = frame->GetHeight();
+    const auto frame_format = frame->GetPixelFormat();
+
+    if (!scaler_ctx || frame_width != scaler_width || frame_height != scaler_height) {
        const AVPixelFormat target_format = [pixel_format = config.pixel_format]() {
            switch (pixel_format) {
            case VideoPixelFormat::RGBA8:
@ -129,27 +132,26 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {

        sws_freeContext(scaler_ctx);
        // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format
-        scaler_ctx = sws_getContext(frame->width, frame->height,
-                                    static_cast<AVPixelFormat>(frame->format), frame->width,
-                                    frame->height, target_format, 0, nullptr, nullptr, nullptr);
-        scaler_width = frame->width;
-        scaler_height = frame->height;
+        scaler_ctx = sws_getContext(frame_width, frame_height, frame_format, frame_width,
+                                    frame_height, target_format, 0, nullptr, nullptr, nullptr);
+        scaler_width = frame_width;
+        scaler_height = frame_height;
        converted_frame_buffer.reset();
    }
    if (!converted_frame_buffer) {
-        const size_t frame_size = frame->width * frame->height * 4;
+        const size_t frame_size = frame_width * frame_height * 4;
        converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(frame_size)), av_free};
    }
-    const std::array<int, 4> converted_stride{frame->width * 4, frame->height * 4, 0, 0};
+    const std::array<int, 4> converted_stride{frame_width * 4, frame_height * 4, 0, 0};
    u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
-    sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, &converted_frame_buf_addr,
-              converted_stride.data());
+    sws_scale(scaler_ctx, frame->GetPlanes(), frame->GetStrides(), 0, frame_height,
+              &converted_frame_buf_addr, converted_stride.data());

    // Use the minimum of surface/frame dimensions to avoid buffer overflow.
    const u32 surface_width = static_cast<u32>(config.surface_width_minus1) + 1;
    const u32 surface_height = static_cast<u32>(config.surface_height_minus1) + 1;
-    const u32 width = std::min(surface_width, static_cast<u32>(frame->width));
-    const u32 height = std::min(surface_height, static_cast<u32>(frame->height));
+    const u32 width = std::min(surface_width, static_cast<u32>(frame_width));
+    const u32 height = std::min(surface_height, static_cast<u32>(frame_height));
    const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
    if (blk_kind != 0) {
        // swizzle pitch linear to block linear
@ -169,23 +171,23 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {
    }
 }

-void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) {
+void Vic::WriteYUVFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config) {
    LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");

    const std::size_t surface_width = config.surface_width_minus1 + 1;
    const std::size_t surface_height = config.surface_height_minus1 + 1;
    const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL;
    // Use the minimum of surface/frame dimensions to avoid buffer overflow.
-    const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width));
-    const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height));
+    const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->GetWidth()));
+    const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->GetHeight()));

-    const auto stride = static_cast<size_t>(frame->linesize[0]);
+    const auto stride = static_cast<size_t>(frame->GetStride(0));

    luma_buffer.resize_destructive(aligned_width * surface_height);
    chroma_buffer.resize_destructive(aligned_width * surface_height / 2);

    // Populate luma buffer
-    const u8* luma_src = frame->data[0];
+    const u8* luma_src = frame->GetData(0);
    for (std::size_t y = 0; y < frame_height; ++y) {
        const std::size_t src = y * stride;
        const std::size_t dst = y * aligned_width;
@ -196,16 +198,16 @@ void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) {

    // Chroma
    const std::size_t half_height = frame_height / 2;
-    const auto half_stride = static_cast<size_t>(frame->linesize[1]);
+    const auto half_stride = static_cast<size_t>(frame->GetStride(1));

-    switch (frame->format) {
+    switch (frame->GetPixelFormat()) {
    case AV_PIX_FMT_YUV420P: {
        // Frame from FFmpeg software
        // Populate chroma buffer from both channels with interleaving.
        const std::size_t half_width = frame_width / 2;
        u8* chroma_buffer_data = chroma_buffer.data();
-        const u8* chroma_b_src = frame->data[1];
-        const u8* chroma_r_src = frame->data[2];
+        const u8* chroma_b_src = frame->GetData(1);
+        const u8* chroma_r_src = frame->GetData(2);
        for (std::size_t y = 0; y < half_height; ++y) {
            const std::size_t src = y * half_stride;
            const std::size_t dst = y * aligned_width;
@ -219,7 +221,7 @@ void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) {
    case AV_PIX_FMT_NV12: {
        // Frame from VA-API hardware
        // This is already interleaved so just copy
-        const u8* chroma_src = frame->data[1];
+        const u8* chroma_src = frame->GetData(1);
        for (std::size_t y = 0; y < half_height; ++y) {
            const std::size_t src = y * stride;
            const std::size_t dst = y * aligned_width;
--- a/src/video_core/host1x/vic.h
+++ b/src/video_core/host1x/vic.h
@ -39,9 +39,9 @@ public:
 private:
    void Execute();

-    void WriteRGBFrame(const AVFrame* frame, const VicConfig& config);
+    void WriteRGBFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config);

-    void WriteYUVFrame(const AVFrame* frame, const VicConfig& config);
+    void WriteYUVFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config);

    Host1x& host1x;
    std::shared_ptr<Tegra::Host1x::Nvdec> nvdec_processor;
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@ -178,13 +178,14 @@ void BufferCacheRuntime::CopyBuffer(GLuint dst_buffer, Buffer& src_buffer,
 }

 void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, GLuint src_buffer,
-                                    std::span<const VideoCommon::BufferCopy> copies, bool barrier) {
+                                    std::span<const VideoCommon::BufferCopy> copies, bool barrier,
+                                    bool) {
    CopyBuffer(dst_buffer.Handle(), src_buffer, copies, barrier);
 }

 void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
-                                    std::span<const VideoCommon::BufferCopy> copies) {
-    CopyBuffer(dst_buffer.Handle(), src_buffer.Handle(), copies);
+                                    std::span<const VideoCommon::BufferCopy> copies, bool) {
+    CopyBuffer(dst_buffer.Handle(), src_buffer.Handle(), copies, true);
 }

 void BufferCacheRuntime::PreCopyBarrier() {
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@ -30,6 +30,8 @@ public:

    void MakeResident(GLenum access) noexcept;

+    void MarkUsage(u64 offset, u64 size) {}
+
    [[nodiscard]] GLuint View(u32 offset, u32 size, VideoCore::Surface::PixelFormat format);

    [[nodiscard]] GLuint64EXT HostGpuAddr() const noexcept {
@ -66,22 +68,29 @@ public:

    [[nodiscard]] StagingBufferMap DownloadStagingBuffer(size_t size);

+    bool CanReorderUpload(const Buffer&, std::span<const VideoCommon::BufferCopy>) {
+        return false;
+    }
+
    void CopyBuffer(GLuint dst_buffer, GLuint src_buffer,
-                    std::span<const VideoCommon::BufferCopy> copies, bool barrier = true);
+                    std::span<const VideoCommon::BufferCopy> copies, bool barrier);

    void CopyBuffer(GLuint dst_buffer, Buffer& src_buffer,
-                    std::span<const VideoCommon::BufferCopy> copies, bool barrier = true);
+                    std::span<const VideoCommon::BufferCopy> copies, bool barrier);

    void CopyBuffer(Buffer& dst_buffer, GLuint src_buffer,
-                    std::span<const VideoCommon::BufferCopy> copies, bool barrier = true);
+                    std::span<const VideoCommon::BufferCopy> copies, bool barrier,
+                    bool can_reorder_upload = false);

    void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
-                    std::span<const VideoCommon::BufferCopy> copies);
+                    std::span<const VideoCommon::BufferCopy> copies, bool);

    void PreCopyBarrier();
    void PostCopyBarrier();
    void Finish();

+    void TickFrame(VideoCommon::SlotVector<Buffer>&) noexcept {}
+
    void ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value);

    void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size);
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@ -79,13 +79,13 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo
 } // Anonymous namespace

 Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params)
-    : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {}
+    : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params), tracker{4096} {}

 Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
               VAddr cpu_addr_, u64 size_bytes_)
    : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_),
-      device{&runtime.device}, buffer{
-                                   CreateBuffer(*device, runtime.memory_allocator, SizeBytes())} {
+      device{&runtime.device}, buffer{CreateBuffer(*device, runtime.memory_allocator, SizeBytes())},
+      tracker{SizeBytes()} {
    if (runtime.device.HasDebuggingToolAttached()) {
        buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str());
    }
@ -355,12 +355,31 @@ bool BufferCacheRuntime::CanReportMemoryUsage() const {
    return device.CanReportMemoryUsage();
 }

+void BufferCacheRuntime::TickFrame(VideoCommon::SlotVector<Buffer>& slot_buffers) noexcept {
+    for (auto it = slot_buffers.begin(); it != slot_buffers.end(); it++) {
+        it->ResetUsageTracking();
+    }
+}
+
 void BufferCacheRuntime::Finish() {
    scheduler.Finish();
 }

+bool BufferCacheRuntime::CanReorderUpload(const Buffer& buffer,
+                                          std::span<const VideoCommon::BufferCopy> copies) {
+    // Refused when disable_buffer_reorder is set or any copy targets a range reported as used.
+    if (Settings::values.disable_buffer_reorder) {
+        return false;
+    }
+    const auto hits_used_region = [&buffer](const VideoCommon::BufferCopy& copy) {
+        return buffer.IsRegionUsed(copy.dst_offset, copy.size);
+    };
+    return std::ranges::none_of(copies, hits_used_region);
+}
+
 void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
-                                    std::span<const VideoCommon::BufferCopy> copies, bool barrier) {
+                                    std::span<const VideoCommon::BufferCopy> copies, bool barrier,
+                                    bool can_reorder_upload) {
    if (dst_buffer == VK_NULL_HANDLE || src_buffer == VK_NULL_HANDLE) {
        return;
    }
@ -376,9 +395,18 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
        .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
        .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
    };
+
    // Measuring a popular game, this number never exceeds the specified size once data is warmed up
    boost::container::small_vector<VkBufferCopy, 8> vk_copies(copies.size());
    std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy);
+    if (src_buffer == staging_pool.StreamBuf() && can_reorder_upload) {
+        scheduler.RecordWithUploadBuffer([src_buffer, dst_buffer, vk_copies](
+                                             vk::CommandBuffer, vk::CommandBuffer upload_cmdbuf) {
+            upload_cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies);
+        });
+        return;
+    }
+
    scheduler.RequestOutsideRenderPassOperationContext();
    scheduler.Record([src_buffer, dst_buffer, vk_copies, barrier](vk::CommandBuffer cmdbuf) {
        if (barrier) {
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@ -5,6 +5,7 @@

 #include "video_core/buffer_cache/buffer_cache_base.h"
 #include "video_core/buffer_cache/memory_tracker_base.h"
+#include "video_core/buffer_cache/usage_tracker.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_vulkan/vk_compute_pass.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
@ -34,6 +35,18 @@ public:
        return *buffer;
    }

+    [[nodiscard]] bool IsRegionUsed(u64 offset, u64 size) const noexcept {
+        return tracker.IsUsed(offset, size);
+    }
+
+    void MarkUsage(u64 offset, u64 size) noexcept {
+        tracker.Track(offset, size);
+    }
+
+    void ResetUsageTracking() noexcept {
+        tracker.Reset();
+    }
+
    operator VkBuffer() const noexcept {
        return *buffer;
    }
@ -49,6 +62,7 @@ private:
    const Device* device{};
    vk::Buffer buffer;
    std::vector<BufferView> views;
+    VideoCommon::UsageTracker tracker;
 };

 class QuadArrayIndexBuffer;
@ -67,6 +81,8 @@ public:
                                ComputePassDescriptorQueue& compute_pass_descriptor_queue,
                                DescriptorPool& descriptor_pool);

+    void TickFrame(VideoCommon::SlotVector<Buffer>& slot_buffers) noexcept;
+
    void Finish();

    u64 GetDeviceLocalMemory() const;
@ -79,12 +95,15 @@ public:

    [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false);

+    bool CanReorderUpload(const Buffer& buffer, std::span<const VideoCommon::BufferCopy> copies);
+
    void FreeDeferredStagingBuffer(StagingBufferRef& ref);

    void PreCopyBarrier();

    void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer,
-                    std::span<const VideoCommon::BufferCopy> copies, bool barrier = true);
+                    std::span<const VideoCommon::BufferCopy> copies, bool barrier,
+                    bool can_reorder_upload = false);

    void PostCopyBarrier();

--- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp
+++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp
@ -100,12 +100,14 @@ void MasterSemaphore::Wait(u64 tick) {
    Refresh();
 }

-VkResult MasterSemaphore::SubmitQueue(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
-                                      VkSemaphore wait_semaphore, u64 host_tick) {
+VkResult MasterSemaphore::SubmitQueue(vk::CommandBuffer& cmdbuf, vk::CommandBuffer& upload_cmdbuf,
+                                      VkSemaphore signal_semaphore, VkSemaphore wait_semaphore,
+                                      u64 host_tick) {
    if (semaphore) {
-        return SubmitQueueTimeline(cmdbuf, signal_semaphore, wait_semaphore, host_tick);
+        return SubmitQueueTimeline(cmdbuf, upload_cmdbuf, signal_semaphore, wait_semaphore,
+                                   host_tick);
    } else {
-        return SubmitQueueFence(cmdbuf, signal_semaphore, wait_semaphore, host_tick);
+        return SubmitQueueFence(cmdbuf, upload_cmdbuf, signal_semaphore, wait_semaphore, host_tick);
    }
 }

@ -115,6 +117,7 @@ static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{
 };

 VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf,
+                                              vk::CommandBuffer& upload_cmdbuf,
                                              VkSemaphore signal_semaphore,
                                              VkSemaphore wait_semaphore, u64 host_tick) {
    const VkSemaphore timeline_semaphore = *semaphore;
@ -123,6 +126,8 @@ VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf,
    const std::array signal_values{host_tick, u64(0)};
    const std::array signal_semaphores{timeline_semaphore, signal_semaphore};

+    const std::array cmdbuffers{*upload_cmdbuf, *cmdbuf};
+
    const u32 num_wait_semaphores = wait_semaphore ? 1 : 0;
    const VkTimelineSemaphoreSubmitInfo timeline_si{
        .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
@ -138,8 +143,8 @@ VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf,
        .waitSemaphoreCount = num_wait_semaphores,
        .pWaitSemaphores = &wait_semaphore,
        .pWaitDstStageMask = wait_stage_masks.data(),
-        .commandBufferCount = 1,
-        .pCommandBuffers = cmdbuf.address(),
+        .commandBufferCount = static_cast<u32>(cmdbuffers.size()),
+        .pCommandBuffers = cmdbuffers.data(),
        .signalSemaphoreCount = num_signal_semaphores,
        .pSignalSemaphores = signal_semaphores.data(),
    };
@ -147,19 +152,23 @@ VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf,
    return device.GetGraphicsQueue().Submit(submit_info);
 }

-VkResult MasterSemaphore::SubmitQueueFence(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
-                                           VkSemaphore wait_semaphore, u64 host_tick) {
+VkResult MasterSemaphore::SubmitQueueFence(vk::CommandBuffer& cmdbuf,
+                                           vk::CommandBuffer& upload_cmdbuf,
+                                           VkSemaphore signal_semaphore, VkSemaphore wait_semaphore,
+                                           u64 host_tick) {
    const u32 num_signal_semaphores = signal_semaphore ? 1 : 0;
    const u32 num_wait_semaphores = wait_semaphore ? 1 : 0;

+    const std::array cmdbuffers{*upload_cmdbuf, *cmdbuf};
+
    const VkSubmitInfo submit_info{
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
        .pNext = nullptr,
        .waitSemaphoreCount = num_wait_semaphores,
        .pWaitSemaphores = &wait_semaphore,
        .pWaitDstStageMask = wait_stage_masks.data(),
-        .commandBufferCount = 1,
-        .pCommandBuffers = cmdbuf.address(),
+        .commandBufferCount = static_cast<u32>(cmdbuffers.size()),
+        .pCommandBuffers = cmdbuffers.data(),
        .signalSemaphoreCount = num_signal_semaphores,
        .pSignalSemaphores = &signal_semaphore,
    };
--- a/src/video_core/renderer_vulkan/vk_master_semaphore.h
+++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h
@ -52,14 +52,16 @@ public:
    void Wait(u64 tick);

    /// Submits the device graphics queue, updating the tick as necessary
-    VkResult SubmitQueue(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
-                         VkSemaphore wait_semaphore, u64 host_tick);
+    VkResult SubmitQueue(vk::CommandBuffer& cmdbuf, vk::CommandBuffer& upload_cmdbuf,
+                         VkSemaphore signal_semaphore, VkSemaphore wait_semaphore, u64 host_tick);

 private:
-    VkResult SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
-                                 VkSemaphore wait_semaphore, u64 host_tick);
-    VkResult SubmitQueueFence(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
-                              VkSemaphore wait_semaphore, u64 host_tick);
+    VkResult SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, vk::CommandBuffer& upload_cmdbuf,
+                                 VkSemaphore signal_semaphore, VkSemaphore wait_semaphore,
+                                 u64 host_tick);
+    VkResult SubmitQueueFence(vk::CommandBuffer& cmdbuf, vk::CommandBuffer& upload_cmdbuf,
+                              VkSemaphore signal_semaphore, VkSemaphore wait_semaphore,
+                              u64 host_tick);

    void WaitThread(std::stop_token token);

--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@ -22,11 +22,12 @@ namespace Vulkan {

 MICROPROFILE_DECLARE(Vulkan_WaitForWorker);

-void Scheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf) {
+void Scheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf,
+                                         vk::CommandBuffer upload_cmdbuf) {
    auto command = first;
    while (command != nullptr) {
        auto next = command->GetNext();
-        command->Execute(cmdbuf);
+        command->Execute(cmdbuf, upload_cmdbuf);
        command->~Command();
        command = next;
    }
@ -180,7 +181,7 @@ void Scheduler::WorkerThread(std::stop_token stop_token) {
            // Perform the work, tracking whether the chunk was a submission
            // before executing.
            const bool has_submit = work->HasSubmit();
-            work->ExecuteAll(current_cmdbuf);
+            work->ExecuteAll(current_cmdbuf, current_upload_cmdbuf);

            // If the chunk was a submission, reallocate the command buffer.
            if (has_submit) {
@ -205,6 +206,13 @@ void Scheduler::AllocateWorkerCommandBuffer() {
        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
        .pInheritanceInfo = nullptr,
    });
+    current_upload_cmdbuf = vk::CommandBuffer(command_pool->Commit(), device.GetDispatchLoader());
+    current_upload_cmdbuf.Begin({
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+        .pNext = nullptr,
+        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+        .pInheritanceInfo = nullptr,
+    });
 }

 u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
@ -212,7 +220,17 @@ u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_se
    InvalidateState();

    const u64 signal_value = master_semaphore->NextTick();
-    Record([signal_semaphore, wait_semaphore, signal_value, this](vk::CommandBuffer cmdbuf) {
+    RecordWithUploadBuffer([signal_semaphore, wait_semaphore, signal_value,
+                            this](vk::CommandBuffer cmdbuf, vk::CommandBuffer upload_cmdbuf) {
+        static constexpr VkMemoryBarrier WRITE_BARRIER{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
+        };
+        upload_cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
+                                      VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER);
+        upload_cmdbuf.End();
        cmdbuf.End();

        if (on_submit) {
@ -221,7 +239,7 @@ u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_se

        std::scoped_lock lock{submit_mutex};
        switch (const VkResult result = master_semaphore->SubmitQueue(
-                    cmdbuf, signal_semaphore, wait_semaphore, signal_value)) {
+                    cmdbuf, upload_cmdbuf, signal_semaphore, wait_semaphore, signal_value)) {
        case VK_SUCCESS:
            break;
        case VK_ERROR_DEVICE_LOST:
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@ -80,7 +80,8 @@ public:

    /// Send work to a separate thread.
    template <typename T>
-    void Record(T&& command) {
+        requires std::is_invocable_v<T, vk::CommandBuffer, vk::CommandBuffer>
+    void RecordWithUploadBuffer(T&& command) {
        if (chunk->Record(command)) {
            return;
        }
@ -88,6 +89,15 @@ public:
        (void)chunk->Record(command);
    }

+    template <typename T>
+        requires std::is_invocable_v<T, vk::CommandBuffer>
+    void Record(T&& c) { // Main-cmdbuf-only adapter; forwards so lvalue callables get copied.
+        this->RecordWithUploadBuffer(
+            [command = std::forward<T>(c)](vk::CommandBuffer cmdbuf, vk::CommandBuffer) {
+                command(cmdbuf);
+            });
+    }
+
    /// Returns the current command buffer tick.
    [[nodiscard]] u64 CurrentTick() const noexcept {
        return master_semaphore->CurrentTick();
@ -119,7 +129,7 @@ private:
    public:
        virtual ~Command() = default;

-        virtual void Execute(vk::CommandBuffer cmdbuf) const = 0;
+        virtual void Execute(vk::CommandBuffer cmdbuf, vk::CommandBuffer upload_cmdbuf) const = 0;

        Command* GetNext() const {
            return next;
@ -142,8 +152,8 @@ private:
        TypedCommand(TypedCommand&&) = delete;
        TypedCommand& operator=(TypedCommand&&) = delete;

-        void Execute(vk::CommandBuffer cmdbuf) const override {
-            command(cmdbuf);
+        void Execute(vk::CommandBuffer cmdbuf, vk::CommandBuffer upload_cmdbuf) const override {
+            command(cmdbuf, upload_cmdbuf);
        }

    private:
@ -152,7 +162,7 @@ private:

    class CommandChunk final {
    public:
-        void ExecuteAll(vk::CommandBuffer cmdbuf);
+        void ExecuteAll(vk::CommandBuffer cmdbuf, vk::CommandBuffer upload_cmdbuf);

        template <typename T>
        bool Record(T& command) {
@ -228,6 +238,7 @@ private:
    VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr;

    vk::CommandBuffer current_cmdbuf;
+    vk::CommandBuffer current_upload_cmdbuf;

    std::unique_ptr<CommandChunk> chunk;
    std::function<void()> on_submit;
--- a/src/video_core/renderer_vulkan/vk_smaa.cpp
+++ b/src/video_core/renderer_vulkan/vk_smaa.cpp
@ -672,7 +672,7 @@ void SMAA::UploadImages(Scheduler& scheduler) {
    UploadImage(m_device, m_allocator, scheduler, m_static_images[Search], search_extent,
                VK_FORMAT_R8_UNORM, ARRAY_TO_SPAN(searchTexBytes));

-    scheduler.Record([&](vk::CommandBuffer& cmdbuf) {
+    scheduler.Record([&](vk::CommandBuffer cmdbuf) {
        for (auto& images : m_dynamic_images) {
            for (size_t i = 0; i < MaxDynamicImage; i++) {
                ClearColorImage(cmdbuf, *images.images[i]);
@ -707,7 +707,7 @@ VkImageView SMAA::Draw(Scheduler& scheduler, size_t image_index, VkImage source_
    UpdateDescriptorSets(source_image_view, image_index);

    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([=, this](vk::CommandBuffer& cmdbuf) {
+    scheduler.Record([=, this](vk::CommandBuffer cmdbuf) {
        TransitionImageLayout(cmdbuf, source_image, VK_IMAGE_LAYOUT_GENERAL);
        TransitionImageLayout(cmdbuf, edges_image, VK_IMAGE_LAYOUT_GENERAL);
        BeginRenderPass(cmdbuf, m_renderpasses[EdgeDetection], edge_detection_framebuffer,
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
@ -36,6 +36,10 @@ public:
    StagingBufferRef Request(size_t size, MemoryUsage usage, bool deferred = false);
    void FreeDeferred(StagingBufferRef& ref);

+    [[nodiscard]] VkBuffer StreamBuf() const noexcept {
+        return *stream_buffer;
+    }
+
    void TickFrame();

 private:
--- a/src/video_core/texture_cache/slot_vector.h
+++ b/src/video_core/texture_cache/slot_vector.h
@ -138,6 +138,10 @@ public:
        return Iterator(this, SlotId{SlotId::INVALID_INDEX});
    }

+    [[nodiscard]] size_t size() const noexcept {
+        return values_capacity - free_list.size();
+    }
+
 private:
    struct NonTrivialDummy {
        NonTrivialDummy() noexcept {}
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@ -1101,6 +1101,10 @@ public:
        return &handle;
    }

+    VkCommandBuffer operator*() const noexcept {
+        return handle;
+    }
+
    void Begin(const VkCommandBufferBeginInfo& begin_info) const {
        Check(dld->vkBeginCommandBuffer(handle, &begin_info));
    }
--- a/src/yuzu/CMakeLists.txt
+++ b/src/yuzu/CMakeLists.txt
@ -252,6 +252,7 @@ file(GLOB_RECURSE THEMES ${PROJECT_SOURCE_DIR}/dist/qt_themes/*)
 if (ENABLE_QT_TRANSLATION)
    set(YUZU_QT_LANGUAGES "${PROJECT_SOURCE_DIR}/dist/languages" CACHE PATH "Path to the translation bundle for the Qt frontend")
    option(GENERATE_QT_TRANSLATION "Generate en.ts as the translation source file" OFF)
+    option(WORKAROUND_BROKEN_LUPDATE "Run lupdate directly through CMake if Qt's convenience wrappers don't work" OFF)

    # Update source TS file if enabled
    if (GENERATE_QT_TRANSLATION)
@ -259,6 +260,21 @@ if (ENABLE_QT_TRANSLATION)
        # these calls to qt_create_translation also creates a rule to generate en.qm which conflicts with providing english plurals
        # so we have to set a OUTPUT_LOCATION so that we don't have multiple rules to generate en.qm
        set_source_files_properties(${YUZU_QT_LANGUAGES}/en.ts PROPERTIES OUTPUT_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/translations")
+        if (WORKAROUND_BROKEN_LUPDATE)
+            add_custom_command(OUTPUT ${YUZU_QT_LANGUAGES}/en.ts
+                COMMAND lupdate
+                    -source-language en_US
+                    -target-language en_US
+                    ${SRCS}
+                    ${UIS}
+                    -ts ${YUZU_QT_LANGUAGES}/en.ts
+                DEPENDS
+                    ${SRCS}
+                    ${UIS}
+                WORKING_DIRECTORY
+                    ${CMAKE_CURRENT_SOURCE_DIR}
+            )
+        else()
            qt_create_translation(QM_FILES
                ${SRCS}
                ${UIS}
@ -267,11 +283,28 @@ if (ENABLE_QT_TRANSLATION)
                -source-language en_US
                -target-language en_US
            )
+        endif()

        # Generate plurals into dist/english_plurals/generated_en.ts so it can be used to revise dist/english_plurals/en.ts
        set(GENERATED_PLURALS_FILE ${PROJECT_SOURCE_DIR}/dist/english_plurals/generated_en.ts)
        set_source_files_properties(${GENERATED_PLURALS_FILE} PROPERTIES OUTPUT_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/plurals")
+        if (WORKAROUND_BROKEN_LUPDATE)
+            add_custom_command(OUTPUT ${GENERATED_PLURALS_FILE}
+                COMMAND lupdate
+                    -pluralonly # same filter the qt_create_translation() fallback passes
+                    -source-language en_US -target-language en_US
+                    ${SRCS}
+                    ${UIS}
+                    -ts ${GENERATED_PLURALS_FILE}
+                DEPENDS
+                    ${SRCS}
+                    ${UIS}
+                WORKING_DIRECTORY
+                    ${CMAKE_CURRENT_SOURCE_DIR}
+            )
+        else()
            qt_create_translation(QM_FILES ${SRCS} ${UIS} ${GENERATED_PLURALS_FILE} OPTIONS -pluralonly -source-language en_US -target-language en_US)
+        endif()

        add_custom_target(translation ALL DEPENDS ${YUZU_QT_LANGUAGES}/en.ts ${GENERATED_PLURALS_FILE})
    endif()
--- a/src/yuzu/configuration/configure_debug.cpp
+++ b/src/yuzu/configuration/configure_debug.cpp
@ -51,6 +51,8 @@ void ConfigureDebug::SetConfiguration() {
    ui->enable_all_controllers->setChecked(Settings::values.enable_all_controllers.GetValue());
    ui->enable_renderdoc_hotkey->setEnabled(runtime_lock);
    ui->enable_renderdoc_hotkey->setChecked(Settings::values.enable_renderdoc_hotkey.GetValue());
+    ui->disable_buffer_reorder->setEnabled(runtime_lock);
+    ui->disable_buffer_reorder->setChecked(Settings::values.disable_buffer_reorder.GetValue());
    ui->enable_graphics_debugging->setEnabled(runtime_lock);
    ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug.GetValue());
    ui->enable_shader_feedback->setEnabled(runtime_lock);
@ -96,6 +98,7 @@ void ConfigureDebug::ApplyConfiguration() {
    Settings::values.enable_all_controllers = ui->enable_all_controllers->isChecked();
    Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked();
    Settings::values.enable_renderdoc_hotkey = ui->enable_renderdoc_hotkey->isChecked();
+    Settings::values.disable_buffer_reorder = ui->disable_buffer_reorder->isChecked();
    Settings::values.renderer_shader_feedback = ui->enable_shader_feedback->isChecked();
    Settings::values.cpu_debug_mode = ui->enable_cpu_debugging->isChecked();
    Settings::values.enable_nsight_aftermath = ui->enable_nsight_aftermath->isChecked();
--- a/src/yuzu/configuration/configure_debug.ui
+++ b/src/yuzu/configuration/configure_debug.ui
@ -271,19 +271,6 @@
          </widget>
         </item>
         <item row="8" column="0">
-          <widget class="QCheckBox" name="disable_macro_hle">
-           <property name="enabled">
-            <bool>true</bool>
-           </property>
-           <property name="toolTip">
-            <string>When checked, it disables the macro HLE functions. Enabling this makes games run slower</string>
-           </property>
-           <property name="text">
-            <string>Disable Macro HLE</string>
-           </property>
-          </widget>
-         </item>
-         <item row="7" column="0">
          <widget class="QCheckBox" name="dump_macros">
           <property name="enabled">
            <bool>true</bool>
@ -306,59 +293,7 @@
           </property>
          </widget>
         </item>
-         <item row="2" column="0">
-          <widget class="QCheckBox" name="enable_shader_feedback">
-           <property name="toolTip">
-            <string>When checked, yuzu will log statistics about the compiled pipeline cache</string>
-           </property>
-           <property name="text">
-            <string>Enable Shader Feedback</string>
-           </property>
-          </widget>
-         </item>
         <item row="6" column="0">
-          <widget class="QCheckBox" name="disable_macro_jit">
-           <property name="enabled">
-            <bool>true</bool>
-           </property>
-           <property name="toolTip">
-            <string>When checked, it disables the macro Just In Time compiler. Enabling this makes games run slower</string>
-           </property>
-           <property name="text">
-            <string>Disable Macro JIT</string>
-           </property>
-          </widget>
-         </item>
-         <item row="9" column="0">
-          <spacer name="verticalSpacer_5">
-           <property name="orientation">
-            <enum>Qt::Vertical</enum>
-           </property>
-           <property name="sizeType">
-            <enum>QSizePolicy::Preferred</enum>
-           </property>
-           <property name="sizeHint" stdset="0">
-            <size>
-             <width>20</width>
-             <height>0</height>
-            </size>
-           </property>
-          </spacer>
-         </item>
-         <item row="0" column="0">
-          <widget class="QCheckBox" name="enable_graphics_debugging">
-           <property name="enabled">
-            <bool>true</bool>
-           </property>
-           <property name="toolTip">
-            <string>When checked, the graphics API enters a slower debugging mode</string>
-           </property>
-           <property name="text">
-            <string>Enable Graphics Debugging</string>
-           </property>
-          </widget>
-         </item>
-         <item row="5" column="0">
          <widget class="QCheckBox" name="dump_shaders">
           <property name="enabled">
            <bool>true</bool>
@ -378,6 +313,81 @@
           </property>
          </widget>
         </item>
+         <item row="7" column="0">
+          <widget class="QCheckBox" name="disable_macro_jit">
+           <property name="enabled">
+            <bool>true</bool>
+           </property>
+           <property name="toolTip">
+            <string>When checked, it disables the macro Just In Time compiler. Enabling this makes games run slower</string>
+           </property>
+           <property name="text">
+            <string>Disable Macro JIT</string>
+           </property>
+          </widget>
+         </item>
+         <item row="9" column="0">
+          <widget class="QCheckBox" name="disable_macro_hle">
+           <property name="enabled">
+            <bool>true</bool>
+           </property>
+           <property name="toolTip">
+            <string>When checked, it disables the macro HLE functions. Enabling this makes games run slower</string>
+           </property>
+           <property name="text">
+            <string>Disable Macro HLE</string>
+           </property>
+          </widget>
+         </item>
+         <item row="0" column="0">
+          <widget class="QCheckBox" name="enable_graphics_debugging">
+           <property name="enabled">
+            <bool>true</bool>
+           </property>
+           <property name="toolTip">
+            <string>When checked, the graphics API enters a slower debugging mode</string>
+           </property>
+           <property name="text">
+            <string>Enable Graphics Debugging</string>
+           </property>
+          </widget>
+         </item>
+         <item row="10" column="0">
+          <spacer name="verticalSpacer_5">
+           <property name="orientation">
+            <enum>Qt::Vertical</enum>
+           </property>
+           <property name="sizeType">
+            <enum>QSizePolicy::Preferred</enum>
+           </property>
+           <property name="sizeHint" stdset="0">
+            <size>
+             <width>20</width>
+             <height>0</height>
+            </size>
+           </property>
+          </spacer>
+         </item>
+         <item row="2" column="0">
+          <widget class="QCheckBox" name="enable_shader_feedback">
+           <property name="toolTip">
+            <string>When checked, yuzu will log statistics about the compiled pipeline cache</string>
+           </property>
+           <property name="text">
+            <string>Enable Shader Feedback</string>
+           </property>
+          </widget>
+         </item>
+         <item row="5" column="0">
+          <widget class="QCheckBox" name="disable_buffer_reorder">
+           <property name="toolTip">
+            <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;When checked, disables reordering of mapped memory uploads, which allows uploads to be associated with specific draws. May reduce performance in some cases.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
+           </property>
+           <property name="text">
+            <string>Disable Buffer Reorder</string>
+           </property>
+          </widget>
+         </item>
        </layout>
       </widget>
      </item>
--- a/src/yuzu/configuration/shared_translation.cpp
+++ b/src/yuzu/configuration/shared_translation.cpp
@ -1,17 +1,18 @@
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

-#include "common/time_zone.h"
 #include "yuzu/configuration/shared_translation.h"

 #include <map>
 #include <memory>
 #include <tuple>
 #include <utility>
+#include <QCoreApplication>
 #include <QWidget>
 #include "common/settings.h"
 #include "common/settings_enums.h"
 #include "common/settings_setting.h"
+#include "common/time_zone.h"
 #include "yuzu/uisettings.h"

 namespace ConfigurationShared {
@ -21,123 +22,135 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QWidget* parent) {
    const auto& tr = [parent](const char* text) -> QString { return parent->tr(text); };

 #define INSERT(SETTINGS, ID, NAME, TOOLTIP)                                                        \
-    translations->insert(std::pair{SETTINGS::values.ID.Id(), std::pair{tr((NAME)), tr((TOOLTIP))}})
+    translations->insert(std::pair{SETTINGS::values.ID.Id(), std::pair{(NAME), (TOOLTIP)}})

    // A setting can be ignored by giving it a blank name

    // Audio
-    INSERT(Settings, sink_id, "Output Engine:", "");
-    INSERT(Settings, audio_output_device_id, "Output Device:", "");
-    INSERT(Settings, audio_input_device_id, "Input Device:", "");
-    INSERT(Settings, audio_muted, "Mute audio", "");
-    INSERT(Settings, volume, "Volume:", "");
-    INSERT(Settings, dump_audio_commands, "", "");
-    INSERT(UISettings, mute_when_in_background, "Mute audio when in background", "");
+    INSERT(Settings, sink_id, tr("Output Engine:"), QStringLiteral());
+    INSERT(Settings, audio_output_device_id, tr("Output Device:"), QStringLiteral());
+    INSERT(Settings, audio_input_device_id, tr("Input Device:"), QStringLiteral());
+    INSERT(Settings, audio_muted, tr("Mute audio"), QStringLiteral());
+    INSERT(Settings, volume, tr("Volume:"), QStringLiteral());
+    INSERT(Settings, dump_audio_commands, QStringLiteral(), QStringLiteral());
+    INSERT(UISettings, mute_when_in_background, tr("Mute audio when in background"),
+           QStringLiteral());

    // Core
-    INSERT(Settings, use_multi_core, "Multicore CPU Emulation", "");
-    INSERT(Settings, memory_layout_mode, "Memory Layout", "");
-    INSERT(Settings, use_speed_limit, "", "");
-    INSERT(Settings, speed_limit, "Limit Speed Percent", "");
+    INSERT(Settings, use_multi_core, tr("Multicore CPU Emulation"), QStringLiteral());
+    INSERT(Settings, memory_layout_mode, tr("Memory Layout"), QStringLiteral());
+    INSERT(Settings, use_speed_limit, QStringLiteral(), QStringLiteral());
+    INSERT(Settings, speed_limit, tr("Limit Speed Percent"), QStringLiteral());

    // Cpu
-    INSERT(Settings, cpu_accuracy, "Accuracy:", "");
+    INSERT(Settings, cpu_accuracy, tr("Accuracy:"), QStringLiteral());

    // Cpu Debug

    // Cpu Unsafe
-    INSERT(Settings, cpuopt_unsafe_unfuse_fma,
-           "Unfuse FMA (improve performance on CPUs without FMA)",
-           "This option improves speed by reducing accuracy of fused-multiply-add instructions on "
-           "CPUs without native FMA support.");
-    INSERT(Settings, cpuopt_unsafe_reduce_fp_error, "Faster FRSQRTE and FRECPE",
-           "This option improves the speed of some approximate floating-point functions by using "
-           "less accurate native approximations.");
-    INSERT(Settings, cpuopt_unsafe_ignore_standard_fpcr, "Faster ASIMD instructions (32 bits only)",
-           "This option improves the speed of 32 bits ASIMD floating-point functions by running "
-           "with incorrect rounding modes.");
-    INSERT(Settings, cpuopt_unsafe_inaccurate_nan, "Inaccurate NaN handling",
-           "This option improves speed by removing NaN checking. Please note this also reduces "
-           "accuracy of certain floating-point instructions.");
    INSERT(
-        Settings, cpuopt_unsafe_fastmem_check, "Disable address space checks",
-        "This option improves speed by eliminating a safety check before every memory read/write "
-        "in guest. Disabling it may allow a game to read/write the emulator's memory.");
-    INSERT(Settings, cpuopt_unsafe_ignore_global_monitor, "Ignore global monitor",
-           "This option improves speed by relying only on the semantics of cmpxchg to ensure "
+        Settings, cpuopt_unsafe_unfuse_fma,
+        tr("Unfuse FMA (improve performance on CPUs without FMA)"),
+        tr("This option improves speed by reducing accuracy of fused-multiply-add instructions on "
+           "CPUs without native FMA support."));
+    INSERT(
+        Settings, cpuopt_unsafe_reduce_fp_error, tr("Faster FRSQRTE and FRECPE"),
+        tr("This option improves the speed of some approximate floating-point functions by using "
+           "less accurate native approximations."));
+    INSERT(Settings, cpuopt_unsafe_ignore_standard_fpcr,
+           tr("Faster ASIMD instructions (32 bits only)"),
+           tr("This option improves the speed of 32 bits ASIMD floating-point functions by running "
+              "with incorrect rounding modes."));
+    INSERT(Settings, cpuopt_unsafe_inaccurate_nan, tr("Inaccurate NaN handling"),
+           tr("This option improves speed by removing NaN checking. Please note this also reduces "
+              "accuracy of certain floating-point instructions."));
+    INSERT(Settings, cpuopt_unsafe_fastmem_check, tr("Disable address space checks"),
+           tr("This option improves speed by eliminating a safety check before every memory "
+              "read/write "
+              "in guest. Disabling it may allow a game to read/write the emulator's memory."));
+    INSERT(
+        Settings, cpuopt_unsafe_ignore_global_monitor, tr("Ignore global monitor"),
+        tr("This option improves speed by relying only on the semantics of cmpxchg to ensure "
           "safety of exclusive access instructions. Please note this may result in deadlocks and "
-           "other race conditions.");
+           "other race conditions."));

    // Renderer
-    INSERT(Settings, renderer_backend, "API:", "");
-    INSERT(Settings, vulkan_device, "Device:", "");
-    INSERT(Settings, shader_backend, "Shader Backend:", "");
-    INSERT(Settings, resolution_setup, "Resolution:", "");
-    INSERT(Settings, scaling_filter, "Window Adapting Filter:", "");
-    INSERT(Settings, fsr_sharpening_slider, "FSR Sharpness:", "");
-    INSERT(Settings, anti_aliasing, "Anti-Aliasing Method:", "");
-    INSERT(Settings, fullscreen_mode, "Fullscreen Mode:", "");
-    INSERT(Settings, aspect_ratio, "Aspect Ratio:", "");
-    INSERT(Settings, use_disk_shader_cache, "Use disk pipeline cache", "");
-    INSERT(Settings, use_asynchronous_gpu_emulation, "Use asynchronous GPU emulation", "");
-    INSERT(Settings, nvdec_emulation, "NVDEC emulation:", "");
-    INSERT(Settings, accelerate_astc, "ASTC Decoding Method:", "");
-    INSERT(Settings, astc_recompression, "ASTC Recompression Method:", "");
-    INSERT(Settings, vsync_mode, "VSync Mode:",
-           "FIFO (VSync) does not drop frames or exhibit tearing but is limited by the screen "
+    INSERT(Settings, renderer_backend, tr("API:"), QStringLiteral());
+    INSERT(Settings, vulkan_device, tr("Device:"), QStringLiteral());
+    INSERT(Settings, shader_backend, tr("Shader Backend:"), QStringLiteral());
+    INSERT(Settings, resolution_setup, tr("Resolution:"), QStringLiteral());
+    INSERT(Settings, scaling_filter, tr("Window Adapting Filter:"), QStringLiteral());
+    INSERT(Settings, fsr_sharpening_slider, tr("FSR Sharpness:"), QStringLiteral());
+    INSERT(Settings, anti_aliasing, tr("Anti-Aliasing Method:"), QStringLiteral());
+    INSERT(Settings, fullscreen_mode, tr("Fullscreen Mode:"), QStringLiteral());
+    INSERT(Settings, aspect_ratio, tr("Aspect Ratio:"), QStringLiteral());
+    INSERT(Settings, use_disk_shader_cache, tr("Use disk pipeline cache"), QStringLiteral());
+    INSERT(Settings, use_asynchronous_gpu_emulation, tr("Use asynchronous GPU emulation"),
+           QStringLiteral());
+    INSERT(Settings, nvdec_emulation, tr("NVDEC emulation:"), QStringLiteral());
+    INSERT(Settings, accelerate_astc, tr("ASTC Decoding Method:"), QStringLiteral());
+    INSERT(Settings, astc_recompression, tr("ASTC Recompression Method:"), QStringLiteral());
+    INSERT(
+        Settings, vsync_mode, tr("VSync Mode:"),
+        tr("FIFO (VSync) does not drop frames or exhibit tearing but is limited by the screen "
           "refresh rate.\nFIFO Relaxed is similar to FIFO but allows tearing as it recovers from "
           "a slow down.\nMailbox can have lower latency than FIFO and does not tear but may drop "
           "frames.\nImmediate (no synchronization) just presents whatever is available and can "
-           "exhibit tearing.");
-    INSERT(Settings, bg_red, "", "");
-    INSERT(Settings, bg_green, "", "");
-    INSERT(Settings, bg_blue, "", "");
+           "exhibit tearing."));
+    INSERT(Settings, bg_red, QStringLiteral(), QStringLiteral());
+    INSERT(Settings, bg_green, QStringLiteral(), QStringLiteral());
+    INSERT(Settings, bg_blue, QStringLiteral(), QStringLiteral());

    // Renderer (Advanced Graphics)
-    INSERT(Settings, async_presentation, "Enable asynchronous presentation (Vulkan only)", "");
-    INSERT(Settings, renderer_force_max_clock, "Force maximum clocks (Vulkan only)",
-           "Runs work in the background while waiting for graphics commands to keep the GPU from "
-           "lowering its clock speed.");
-    INSERT(Settings, max_anisotropy, "Anisotropic Filtering:", "");
-    INSERT(Settings, gpu_accuracy, "Accuracy Level:", "");
-    INSERT(Settings, use_asynchronous_shaders, "Use asynchronous shader building (Hack)",
-           "Enables asynchronous shader compilation, which may reduce shader stutter. This feature "
-           "is experimental.");
-    INSERT(Settings, use_fast_gpu_time, "Use Fast GPU Time (Hack)",
-           "Enables Fast GPU Time. This option will force most games to run at their highest "
-           "native resolution.");
-    INSERT(Settings, use_vulkan_driver_pipeline_cache, "Use Vulkan pipeline cache",
-           "Enables GPU vendor-specific pipeline cache. This option can improve shader loading "
+    INSERT(Settings, async_presentation, tr("Enable asynchronous presentation (Vulkan only)"),
+           QStringLiteral());
+    INSERT(
+        Settings, renderer_force_max_clock, tr("Force maximum clocks (Vulkan only)"),
+        tr("Runs work in the background while waiting for graphics commands to keep the GPU from "
+           "lowering its clock speed."));
+    INSERT(Settings, max_anisotropy, tr("Anisotropic Filtering:"), QStringLiteral());
+    INSERT(Settings, gpu_accuracy, tr("Accuracy Level:"), QStringLiteral());
+    INSERT(
+        Settings, use_asynchronous_shaders, tr("Use asynchronous shader building (Hack)"),
+        tr("Enables asynchronous shader compilation, which may reduce shader stutter. This feature "
+           "is experimental."));
+    INSERT(Settings, use_fast_gpu_time, tr("Use Fast GPU Time (Hack)"),
+           tr("Enables Fast GPU Time. This option will force most games to run at their highest "
+              "native resolution."));
+    INSERT(Settings, use_vulkan_driver_pipeline_cache, tr("Use Vulkan pipeline cache"),
+           tr("Enables GPU vendor-specific pipeline cache. This option can improve shader loading "
              "time significantly in cases where the Vulkan driver does not store pipeline cache "
-           "files internally.");
-    INSERT(Settings, enable_compute_pipelines, "Enable Compute Pipelines (Intel Vulkan Only)",
-           "Enable compute pipelines, required by some games.\nThis setting only exists for Intel "
+              "files internally."));
+    INSERT(
+        Settings, enable_compute_pipelines, tr("Enable Compute Pipelines (Intel Vulkan Only)"),
+        tr("Enable compute pipelines, required by some games.\nThis setting only exists for Intel "
           "proprietary drivers, and may crash if enabled.\nCompute pipelines are always enabled "
-           "on all other drivers.");
-    INSERT(Settings, use_reactive_flushing, "Enable Reactive Flushing",
-           "Uses reactive flushing instead of predictive flushing, allowing more accurate memory "
-           "syncing.");
-    INSERT(Settings, use_video_framerate, "Sync to framerate of video playback",
-           "Run the game at normal speed during video playback, even when the framerate is "
-           "unlocked.");
-    INSERT(Settings, barrier_feedback_loops, "Barrier feedback loops",
-           "Improves rendering of transparency effects in specific games.");
+           "on all other drivers."));
+    INSERT(
+        Settings, use_reactive_flushing, tr("Enable Reactive Flushing"),
+        tr("Uses reactive flushing instead of predictive flushing, allowing more accurate memory "
+           "syncing."));
+    INSERT(Settings, use_video_framerate, tr("Sync to framerate of video playback"),
+           tr("Run the game at normal speed during video playback, even when the framerate is "
+              "unlocked."));
+    INSERT(Settings, barrier_feedback_loops, tr("Barrier feedback loops"),
+           tr("Improves rendering of transparency effects in specific games."));

    // Renderer (Debug)

    // System
-    INSERT(Settings, rng_seed, "RNG Seed", "");
-    INSERT(Settings, rng_seed_enabled, "", "");
-    INSERT(Settings, device_name, "Device Name", "");
-    INSERT(Settings, custom_rtc, "Custom RTC", "");
-    INSERT(Settings, custom_rtc_enabled, "", "");
-    INSERT(Settings, language_index,
-           "Language:", "Note: this can be overridden when region setting is auto-select");
-    INSERT(Settings, region_index, "Region:", "");
-    INSERT(Settings, time_zone_index, "Time Zone:", "");
-    INSERT(Settings, sound_index, "Sound Output Mode:", "");
-    INSERT(Settings, use_docked_mode, "Console Mode:", "");
-    INSERT(Settings, current_user, "", "");
+    INSERT(Settings, rng_seed, tr("RNG Seed"), QStringLiteral());
+    INSERT(Settings, rng_seed_enabled, QStringLiteral(), QStringLiteral());
+    INSERT(Settings, device_name, tr("Device Name"), QStringLiteral());
+    INSERT(Settings, custom_rtc, tr("Custom RTC"), QStringLiteral());
+    INSERT(Settings, custom_rtc_enabled, QStringLiteral(), QStringLiteral());
+    INSERT(Settings, language_index, tr("Language:"),
+           tr("Note: this can be overridden when region setting is auto-select"));
+    INSERT(Settings, region_index, tr("Region:"), QStringLiteral());
+    INSERT(Settings, time_zone_index, tr("Time Zone:"), QStringLiteral());
+    INSERT(Settings, sound_index, tr("Sound Output Mode:"), QStringLiteral());
+    INSERT(Settings, use_docked_mode, tr("Console Mode:"), QStringLiteral());
+    INSERT(Settings, current_user, QStringLiteral(), QStringLiteral());

    // Controls

@ -154,11 +167,14 @@ std::unique_ptr<TranslationMap> InitializeTranslations(QWidget* parent) {
    // Ui

    // Ui General
-    INSERT(UISettings, select_user_on_boot, "Prompt for user on game boot", "");
-    INSERT(UISettings, pause_when_in_background, "Pause emulation when in background", "");
-    INSERT(UISettings, confirm_before_stopping, "Confirm before stopping emulation", "");
-    INSERT(UISettings, hide_mouse, "Hide mouse on inactivity", "");
-    INSERT(UISettings, controller_applet_disabled, "Disable controller applet", "");
+    INSERT(UISettings, select_user_on_boot, tr("Prompt for user on game boot"), QStringLiteral());
+    INSERT(UISettings, pause_when_in_background, tr("Pause emulation when in background"),
+           QStringLiteral());
+    INSERT(UISettings, confirm_before_stopping, tr("Confirm before stopping emulation"),
+           QStringLiteral());
+    INSERT(UISettings, hide_mouse, tr("Hide mouse on inactivity"), QStringLiteral());
+    INSERT(UISettings, controller_applet_disabled, tr("Disable controller applet"),
+           QStringLiteral());

    // Ui Debugging

@ -178,140 +194,141 @@ std::unique_ptr<ComboboxTranslationMap> ComboboxEnumeration(QWidget* parent) {
        return parent->tr(text, context);
    };

-#define PAIR(ENUM, VALUE, TRANSLATION)                                                             \
-    { static_cast<u32>(Settings::ENUM::VALUE), tr(TRANSLATION) }
-#define CTX_PAIR(ENUM, VALUE, TRANSLATION, CONTEXT)                                                \
-    { static_cast<u32>(Settings::ENUM::VALUE), tr(TRANSLATION, CONTEXT) }
+#define PAIR(ENUM, VALUE, TRANSLATION) {static_cast<u32>(Settings::ENUM::VALUE), (TRANSLATION)}

    // Intentionally skipping VSyncMode to let the UI fill that one out

    translations->insert({Settings::EnumMetadata<Settings::AstcDecodeMode>::Index(),
                          {
-                              PAIR(AstcDecodeMode, Cpu, "CPU"),
-                              PAIR(AstcDecodeMode, Gpu, "GPU"),
-                              PAIR(AstcDecodeMode, CpuAsynchronous, "CPU Asynchronous"),
+                              PAIR(AstcDecodeMode, Cpu, tr("CPU")),
+                              PAIR(AstcDecodeMode, Gpu, tr("GPU")),
+                              PAIR(AstcDecodeMode, CpuAsynchronous, tr("CPU Asynchronous")),
                          }});
-    translations->insert({Settings::EnumMetadata<Settings::AstcRecompression>::Index(),
+    translations->insert(
+        {Settings::EnumMetadata<Settings::AstcRecompression>::Index(),
         {
-                              PAIR(AstcRecompression, Uncompressed, "Uncompressed (Best quality)"),
-                              PAIR(AstcRecompression, Bc1, "BC1 (Low quality)"),
-                              PAIR(AstcRecompression, Bc3, "BC3 (Medium quality)"),
+             PAIR(AstcRecompression, Uncompressed, tr("Uncompressed (Best quality)")),
+             PAIR(AstcRecompression, Bc1, tr("BC1 (Low quality)")),
+             PAIR(AstcRecompression, Bc3, tr("BC3 (Medium quality)")),
         }});
    translations->insert({Settings::EnumMetadata<Settings::RendererBackend>::Index(),
                          {
 #ifdef HAS_OPENGL
-                              PAIR(RendererBackend, OpenGL, "OpenGL"),
+                              PAIR(RendererBackend, OpenGL, tr("OpenGL")),
 #endif
-                              PAIR(RendererBackend, Vulkan, "Vulkan"),
-                              PAIR(RendererBackend, Null, "Null"),
+                              PAIR(RendererBackend, Vulkan, tr("Vulkan")),
+                              PAIR(RendererBackend, Null, tr("Null")),
                          }});
-    translations->insert({Settings::EnumMetadata<Settings::ShaderBackend>::Index(),
+    translations->insert(
+        {Settings::EnumMetadata<Settings::ShaderBackend>::Index(),
         {
-                              PAIR(ShaderBackend, Glsl, "GLSL"),
-                              PAIR(ShaderBackend, Glasm, "GLASM (Assembly Shaders, NVIDIA Only)"),
-                              PAIR(ShaderBackend, SpirV, "SPIR-V (Experimental, Mesa Only)"),
+             PAIR(ShaderBackend, Glsl, tr("GLSL")),
+             PAIR(ShaderBackend, Glasm, tr("GLASM (Assembly Shaders, NVIDIA Only)")),
+             PAIR(ShaderBackend, SpirV, tr("SPIR-V (Experimental, Mesa Only)")),
         }});
    translations->insert({Settings::EnumMetadata<Settings::GpuAccuracy>::Index(),
                          {
-                              PAIR(GpuAccuracy, Normal, "Normal"),
-                              PAIR(GpuAccuracy, High, "High"),
-                              PAIR(GpuAccuracy, Extreme, "Extreme"),
+                              PAIR(GpuAccuracy, Normal, tr("Normal")),
+                              PAIR(GpuAccuracy, High, tr("High")),
+                              PAIR(GpuAccuracy, Extreme, tr("Extreme")),
                          }});
-    translations->insert({Settings::EnumMetadata<Settings::CpuAccuracy>::Index(),
+    translations->insert(
+        {Settings::EnumMetadata<Settings::CpuAccuracy>::Index(),
         {
-                              PAIR(CpuAccuracy, Auto, "Auto"),
-                              PAIR(CpuAccuracy, Accurate, "Accurate"),
-                              PAIR(CpuAccuracy, Unsafe, "Unsafe"),
-                              PAIR(CpuAccuracy, Paranoid, "Paranoid (disables most optimizations)"),
+             PAIR(CpuAccuracy, Auto, tr("Auto")),
+             PAIR(CpuAccuracy, Accurate, tr("Accurate")),
+             PAIR(CpuAccuracy, Unsafe, tr("Unsafe")),
+             PAIR(CpuAccuracy, Paranoid, tr("Paranoid (disables most optimizations)")),
         }});
    translations->insert({Settings::EnumMetadata<Settings::FullscreenMode>::Index(),
                          {
-                              PAIR(FullscreenMode, Borderless, "Borderless Windowed"),
-                              PAIR(FullscreenMode, Exclusive, "Exclusive Fullscreen"),
+                              PAIR(FullscreenMode, Borderless, tr("Borderless Windowed")),
+                              PAIR(FullscreenMode, Exclusive, tr("Exclusive Fullscreen")),
                          }});
    translations->insert({Settings::EnumMetadata<Settings::NvdecEmulation>::Index(),
                          {
-                              PAIR(NvdecEmulation, Off, "No Video Output"),
-                              PAIR(NvdecEmulation, Cpu, "CPU Video Decoding"),
-                              PAIR(NvdecEmulation, Gpu, "GPU Video Decoding (Default)"),
+                              PAIR(NvdecEmulation, Off, tr("No Video Output")),
+                              PAIR(NvdecEmulation, Cpu, tr("CPU Video Decoding")),
+                              PAIR(NvdecEmulation, Gpu, tr("GPU Video Decoding (Default)")),
                          }});
-    translations->insert({Settings::EnumMetadata<Settings::ResolutionSetup>::Index(),
+    translations->insert(
+        {Settings::EnumMetadata<Settings::ResolutionSetup>::Index(),
         {
-                              PAIR(ResolutionSetup, Res1_2X, "0.5X (360p/540p) [EXPERIMENTAL]"),
-                              PAIR(ResolutionSetup, Res3_4X, "0.75X (540p/810p) [EXPERIMENTAL]"),
-                              PAIR(ResolutionSetup, Res1X, "1X (720p/1080p)"),
-                              PAIR(ResolutionSetup, Res3_2X, "1.5X (1080p/1620p) [EXPERIMENTAL]"),
-                              PAIR(ResolutionSetup, Res2X, "2X (1440p/2160p)"),
-                              PAIR(ResolutionSetup, Res3X, "3X (2160p/3240p)"),
-                              PAIR(ResolutionSetup, Res4X, "4X (2880p/4320p)"),
-                              PAIR(ResolutionSetup, Res5X, "5X (3600p/5400p)"),
-                              PAIR(ResolutionSetup, Res6X, "6X (4320p/6480p)"),
-                              PAIR(ResolutionSetup, Res7X, "7X (5040p/7560p)"),
-                              PAIR(ResolutionSetup, Res8X, "8X (5760p/8640p)"),
+             PAIR(ResolutionSetup, Res1_2X, tr("0.5X (360p/540p) [EXPERIMENTAL]")),
+             PAIR(ResolutionSetup, Res3_4X, tr("0.75X (540p/810p) [EXPERIMENTAL]")),
+             PAIR(ResolutionSetup, Res1X, tr("1X (720p/1080p)")),
+             PAIR(ResolutionSetup, Res3_2X, tr("1.5X (1080p/1620p) [EXPERIMENTAL]")),
+             PAIR(ResolutionSetup, Res2X, tr("2X (1440p/2160p)")),
+             PAIR(ResolutionSetup, Res3X, tr("3X (2160p/3240p)")),
+             PAIR(ResolutionSetup, Res4X, tr("4X (2880p/4320p)")),
+             PAIR(ResolutionSetup, Res5X, tr("5X (3600p/5400p)")),
+             PAIR(ResolutionSetup, Res6X, tr("6X (4320p/6480p)")),
+             PAIR(ResolutionSetup, Res7X, tr("7X (5040p/7560p)")),
+             PAIR(ResolutionSetup, Res8X, tr("8X (5760p/8640p)")),
         }});
    translations->insert({Settings::EnumMetadata<Settings::ScalingFilter>::Index(),
                          {
-                              PAIR(ScalingFilter, NearestNeighbor, "Nearest Neighbor"),
-                              PAIR(ScalingFilter, Bilinear, "Bilinear"),
-                              PAIR(ScalingFilter, Bicubic, "Bicubic"),
-                              PAIR(ScalingFilter, Gaussian, "Gaussian"),
-                              PAIR(ScalingFilter, ScaleForce, "ScaleForce"),
-                              PAIR(ScalingFilter, Fsr, "AMD FidelityFX™️ Super Resolution"),
+                              PAIR(ScalingFilter, NearestNeighbor, tr("Nearest Neighbor")),
+                              PAIR(ScalingFilter, Bilinear, tr("Bilinear")),
+                              PAIR(ScalingFilter, Bicubic, tr("Bicubic")),
+                              PAIR(ScalingFilter, Gaussian, tr("Gaussian")),
+                              PAIR(ScalingFilter, ScaleForce, tr("ScaleForce")),
+                              PAIR(ScalingFilter, Fsr, tr("AMD FidelityFX™️ Super Resolution")),
                          }});
    translations->insert({Settings::EnumMetadata<Settings::AntiAliasing>::Index(),
                          {
-                              PAIR(AntiAliasing, None, "None"),
-                              PAIR(AntiAliasing, Fxaa, "FXAA"),
-                              PAIR(AntiAliasing, Smaa, "SMAA"),
+                              PAIR(AntiAliasing, None, tr("None")),
+                              PAIR(AntiAliasing, Fxaa, tr("FXAA")),
+                              PAIR(AntiAliasing, Smaa, tr("SMAA")),
                          }});
    translations->insert({Settings::EnumMetadata<Settings::AspectRatio>::Index(),
                          {
-                              PAIR(AspectRatio, R16_9, "Default (16:9)"),
-                              PAIR(AspectRatio, R4_3, "Force 4:3"),
-                              PAIR(AspectRatio, R21_9, "Force 21:9"),
-                              PAIR(AspectRatio, R16_10, "Force 16:10"),
-                              PAIR(AspectRatio, Stretch, "Stretch to Window"),
+                              PAIR(AspectRatio, R16_9, tr("Default (16:9)")),
+                              PAIR(AspectRatio, R4_3, tr("Force 4:3")),
+                              PAIR(AspectRatio, R21_9, tr("Force 21:9")),
+                              PAIR(AspectRatio, R16_10, tr("Force 16:10")),
+                              PAIR(AspectRatio, Stretch, tr("Stretch to Window")),
                          }});
    translations->insert({Settings::EnumMetadata<Settings::AnisotropyMode>::Index(),
                          {
-                              PAIR(AnisotropyMode, Automatic, "Automatic"),
-                              PAIR(AnisotropyMode, Default, "Default"),
-                              PAIR(AnisotropyMode, X2, "2x"),
-                              PAIR(AnisotropyMode, X4, "4x"),
-                              PAIR(AnisotropyMode, X8, "8x"),
-                              PAIR(AnisotropyMode, X16, "16x"),
+                              PAIR(AnisotropyMode, Automatic, tr("Automatic")),
+                              PAIR(AnisotropyMode, Default, tr("Default")),
+                              PAIR(AnisotropyMode, X2, tr("2x")),
+                              PAIR(AnisotropyMode, X4, tr("4x")),
+                              PAIR(AnisotropyMode, X8, tr("8x")),
+                              PAIR(AnisotropyMode, X16, tr("16x")),
                          }});
    translations->insert(
        {Settings::EnumMetadata<Settings::Language>::Index(),
         {
-             PAIR(Language, Japanese, "Japanese (日本語)"),
-             PAIR(Language, EnglishAmerican, "American English"),
-             PAIR(Language, French, "French (français)"),
-             PAIR(Language, German, "German (Deutsch)"),
-             PAIR(Language, Italian, "Italian (italiano)"),
-             PAIR(Language, Spanish, "Spanish (español)"),
-             PAIR(Language, Chinese, "Chinese"),
-             PAIR(Language, Korean, "Korean (한국어)"),
-             PAIR(Language, Dutch, "Dutch (Nederlands)"),
-             PAIR(Language, Portuguese, "Portuguese (português)"),
-             PAIR(Language, Russian, "Russian (Русский)"),
-             PAIR(Language, Taiwanese, "Taiwanese"),
-             PAIR(Language, EnglishBritish, "British English"),
-             PAIR(Language, FrenchCanadian, "Canadian French"),
-             PAIR(Language, SpanishLatin, "Latin American Spanish"),
-             PAIR(Language, ChineseSimplified, "Simplified Chinese"),
-             PAIR(Language, ChineseTraditional, "Traditional Chinese (正體中文)"),
-             PAIR(Language, PortugueseBrazilian, "Brazilian Portuguese (português do Brasil)"),
+             PAIR(Language, Japanese, tr("Japanese (日本語)")),
+             PAIR(Language, EnglishAmerican, tr("American English")),
+             PAIR(Language, French, tr("French (français)")),
+             PAIR(Language, German, tr("German (Deutsch)")),
+             PAIR(Language, Italian, tr("Italian (italiano)")),
+             PAIR(Language, Spanish, tr("Spanish (español)")),
+             PAIR(Language, Chinese, tr("Chinese")),
+             PAIR(Language, Korean, tr("Korean (한국어)")),
+             PAIR(Language, Dutch, tr("Dutch (Nederlands)")),
+             PAIR(Language, Portuguese, tr("Portuguese (português)")),
+             PAIR(Language, Russian, tr("Russian (Русский)")),
+             PAIR(Language, Taiwanese, tr("Taiwanese")),
+             PAIR(Language, EnglishBritish, tr("British English")),
+             PAIR(Language, FrenchCanadian, tr("Canadian French")),
+             PAIR(Language, SpanishLatin, tr("Latin American Spanish")),
+             PAIR(Language, ChineseSimplified, tr("Simplified Chinese")),
+             PAIR(Language, ChineseTraditional, tr("Traditional Chinese (正體中文)")),
+             PAIR(Language, PortugueseBrazilian, tr("Brazilian Portuguese (português do Brasil)")),
         }});
    translations->insert({Settings::EnumMetadata<Settings::Region>::Index(),
                          {
-                              PAIR(Region, Japan, "Japan"),
-                              PAIR(Region, Usa, "USA"),
-                              PAIR(Region, Europe, "Europe"),
-                              PAIR(Region, Australia, "Australia"),
-                              PAIR(Region, China, "China"),
-                              PAIR(Region, Korea, "Korea"),
-                              PAIR(Region, Taiwan, "Taiwan"),
+                              PAIR(Region, Japan, tr("Japan")),
+                              PAIR(Region, Usa, tr("USA")),
+                              PAIR(Region, Europe, tr("Europe")),
+                              PAIR(Region, Australia, tr("Australia")),
+                              PAIR(Region, China, tr("China")),
+                              PAIR(Region, Korea, tr("Korea")),
+                              PAIR(Region, Taiwan, tr("Taiwan")),
                          }});
    translations->insert(
        {Settings::EnumMetadata<Settings::TimeZone>::Index(),
@ -323,72 +340,74 @@ std::unique_ptr<ComboboxTranslationMap> ComboboxEnumeration(QWidget* parent) {
             {static_cast<u32>(Settings::TimeZone::Default),
              tr("Default (%1)", "Default time zone")
                  .arg(QString::fromStdString(Common::TimeZone::GetDefaultTimeZone()))},
-             PAIR(TimeZone, Cet, "CET"),
-             PAIR(TimeZone, Cst6Cdt, "CST6CDT"),
-             PAIR(TimeZone, Cuba, "Cuba"),
-             PAIR(TimeZone, Eet, "EET"),
-             PAIR(TimeZone, Egypt, "Egypt"),
-             PAIR(TimeZone, Eire, "Eire"),
-             PAIR(TimeZone, Est, "EST"),
-             PAIR(TimeZone, Est5Edt, "EST5EDT"),
-             PAIR(TimeZone, Gb, "GB"),
-             PAIR(TimeZone, GbEire, "GB-Eire"),
-             PAIR(TimeZone, Gmt, "GMT"),
-             PAIR(TimeZone, GmtPlusZero, "GMT+0"),
-             PAIR(TimeZone, GmtMinusZero, "GMT-0"),
-             PAIR(TimeZone, GmtZero, "GMT0"),
-             PAIR(TimeZone, Greenwich, "Greenwich"),
-             PAIR(TimeZone, Hongkong, "Hongkong"),
-             PAIR(TimeZone, Hst, "HST"),
-             PAIR(TimeZone, Iceland, "Iceland"),
-             PAIR(TimeZone, Iran, "Iran"),
-             PAIR(TimeZone, Israel, "Israel"),
-             PAIR(TimeZone, Jamaica, "Jamaica"),
-             PAIR(TimeZone, Japan, "Japan"),
-             PAIR(TimeZone, Kwajalein, "Kwajalein"),
-             PAIR(TimeZone, Libya, "Libya"),
-             PAIR(TimeZone, Met, "MET"),
-             PAIR(TimeZone, Mst, "MST"),
-             PAIR(TimeZone, Mst7Mdt, "MST7MDT"),
-             PAIR(TimeZone, Navajo, "Navajo"),
-             PAIR(TimeZone, Nz, "NZ"),
-             PAIR(TimeZone, NzChat, "NZ-CHAT"),
-             PAIR(TimeZone, Poland, "Poland"),
-             PAIR(TimeZone, Portugal, "Portugal"),
-             PAIR(TimeZone, Prc, "PRC"),
-             PAIR(TimeZone, Pst8Pdt, "PST8PDT"),
-             PAIR(TimeZone, Roc, "ROC"),
-             PAIR(TimeZone, Rok, "ROK"),
-             PAIR(TimeZone, Singapore, "Singapore"),
-             PAIR(TimeZone, Turkey, "Turkey"),
-             PAIR(TimeZone, Uct, "UCT"),
-             PAIR(TimeZone, Universal, "Universal"),
-             PAIR(TimeZone, Utc, "UTC"),
-             PAIR(TimeZone, WSu, "W-SU"),
-             PAIR(TimeZone, Wet, "WET"),
-             PAIR(TimeZone, Zulu, "Zulu"),
+             PAIR(TimeZone, Cet, tr("CET")),
+             PAIR(TimeZone, Cst6Cdt, tr("CST6CDT")),
+             PAIR(TimeZone, Cuba, tr("Cuba")),
+             PAIR(TimeZone, Eet, tr("EET")),
+             PAIR(TimeZone, Egypt, tr("Egypt")),
+             PAIR(TimeZone, Eire, tr("Eire")),
+             PAIR(TimeZone, Est, tr("EST")),
+             PAIR(TimeZone, Est5Edt, tr("EST5EDT")),
+             PAIR(TimeZone, Gb, tr("GB")),
+             PAIR(TimeZone, GbEire, tr("GB-Eire")),
+             PAIR(TimeZone, Gmt, tr("GMT")),
+             PAIR(TimeZone, GmtPlusZero, tr("GMT+0")),
+             PAIR(TimeZone, GmtMinusZero, tr("GMT-0")),
+             PAIR(TimeZone, GmtZero, tr("GMT0")),
+             PAIR(TimeZone, Greenwich, tr("Greenwich")),
+             PAIR(TimeZone, Hongkong, tr("Hongkong")),
+             PAIR(TimeZone, Hst, tr("HST")),
+             PAIR(TimeZone, Iceland, tr("Iceland")),
+             PAIR(TimeZone, Iran, tr("Iran")),
+             PAIR(TimeZone, Israel, tr("Israel")),
+             PAIR(TimeZone, Jamaica, tr("Jamaica")),
+             PAIR(TimeZone, Japan, tr("Japan")),
+             PAIR(TimeZone, Kwajalein, tr("Kwajalein")),
+             PAIR(TimeZone, Libya, tr("Libya")),
+             PAIR(TimeZone, Met, tr("MET")),
+             PAIR(TimeZone, Mst, tr("MST")),
+             PAIR(TimeZone, Mst7Mdt, tr("MST7MDT")),
+             PAIR(TimeZone, Navajo, tr("Navajo")),
+             PAIR(TimeZone, Nz, tr("NZ")),
+             PAIR(TimeZone, NzChat, tr("NZ-CHAT")),
+             PAIR(TimeZone, Poland, tr("Poland")),
+             PAIR(TimeZone, Portugal, tr("Portugal")),
+             PAIR(TimeZone, Prc, tr("PRC")),
+             PAIR(TimeZone, Pst8Pdt, tr("PST8PDT")),
+             PAIR(TimeZone, Roc, tr("ROC")),
+             PAIR(TimeZone, Rok, tr("ROK")),
+             PAIR(TimeZone, Singapore, tr("Singapore")),
+             PAIR(TimeZone, Turkey, tr("Turkey")),
+             PAIR(TimeZone, Uct, tr("UCT")),
+             PAIR(TimeZone, Universal, tr("Universal")),
+             PAIR(TimeZone, Utc, tr("UTC")),
+             PAIR(TimeZone, WSu, tr("W-SU")),
+             PAIR(TimeZone, Wet, tr("WET")),
+             PAIR(TimeZone, Zulu, tr("Zulu")),
         }});
    translations->insert({Settings::EnumMetadata<Settings::AudioMode>::Index(),
                          {
-                              PAIR(AudioMode, Mono, "Mono"),
-                              PAIR(AudioMode, Stereo, "Stereo"),
-                              PAIR(AudioMode, Surround, "Surround"),
+                              PAIR(AudioMode, Mono, tr("Mono")),
+                              PAIR(AudioMode, Stereo, tr("Stereo")),
+                              PAIR(AudioMode, Surround, tr("Surround")),
                          }});
    translations->insert({Settings::EnumMetadata<Settings::MemoryLayout>::Index(),
                          {
-                              PAIR(MemoryLayout, Memory_4Gb, "4GB DRAM (Default)"),
-                              PAIR(MemoryLayout, Memory_6Gb, "6GB DRAM (Unsafe)"),
-                              PAIR(MemoryLayout, Memory_8Gb, "8GB DRAM (Unsafe)"),
+                              PAIR(MemoryLayout, Memory_4Gb, tr("4GB DRAM (Default)")),
+                              PAIR(MemoryLayout, Memory_6Gb, tr("6GB DRAM (Unsafe)")),
+                              PAIR(MemoryLayout, Memory_8Gb, tr("8GB DRAM (Unsafe)")),
+                          }});
+    translations->insert({Settings::EnumMetadata<Settings::ConsoleMode>::Index(),
+                          {
+                              PAIR(ConsoleMode, Docked, tr("Docked")),
+                              PAIR(ConsoleMode, Handheld, tr("Handheld")),
                          }});
-    translations->insert(
-        {Settings::EnumMetadata<Settings::ConsoleMode>::Index(),
-         {PAIR(ConsoleMode, Docked, "Docked"), PAIR(ConsoleMode, Handheld, "Handheld")}});
    translations->insert(
        {Settings::EnumMetadata<Settings::ConfirmStop>::Index(),
         {
-             PAIR(ConfirmStop, Ask_Always, "Always ask (Default)"),
-             PAIR(ConfirmStop, Ask_Based_On_Game, "Only if game specifies not to stop"),
-             PAIR(ConfirmStop, Ask_Never, "Never ask"),
+             PAIR(ConfirmStop, Ask_Always, tr("Always ask (Default)")),
+             PAIR(ConfirmStop, Ask_Based_On_Game, tr("Only if game specifies not to stop")),
+             PAIR(ConfirmStop, Ask_Never, tr("Never ask")),
         }});

 #undef PAIR
--- a/src/yuzu/configuration/shared_widget.cpp
+++ b/src/yuzu/configuration/shared_widget.cpp
@ -194,7 +194,7 @@ QWidget* Widget::CreateRadioGroup(std::function<std::string()>& serializer,
        return group;
    }

-    const auto get_selected = [=]() -> int {
+    const auto get_selected = [this]() -> int {
        for (const auto& [id, button] : radio_buttons) {
            if (button->isChecked()) {
                return id;
@ -203,7 +203,7 @@ QWidget* Widget::CreateRadioGroup(std::function<std::string()>& serializer,
        return -1;
    };

-    const auto set_index = [=](u32 value) {
+    const auto set_index = [this](u32 value) {
        for (const auto& [id, button] : radio_buttons) {
            button->setChecked(id == value);
        }
--- a/src/yuzu/game_list_worker.cpp
+++ b/src/yuzu/game_list_worker.cpp
@ -479,6 +479,6 @@ void GameListWorker::run() {
        }
    }

-    RecordEvent([=](GameList* game_list) { game_list->DonePopulating(watch_list); });
+    RecordEvent([this](GameList* game_list) { game_list->DonePopulating(watch_list); });
    processing_completed.Set();
 }
Author	SHA1	Message	Date
yuzubot	e62da0afe1	Android #139	2023-11-23 00:57:23 +00:00
yuzubot	4047e9071f	Merge PR 12074	2023-11-23 00:57:23 +00:00
yuzubot	d8f125ab10	Merge PR 11535	2023-11-23 00:57:23 +00:00
liamwhite	91c12db070	Merge pull request #12123 from merryhime/explicit-this Explicit this	2023-11-21 22:17:42 -05:00
Merry	b088a448cd	game_list_worker: Explicit capture of 'this'	2023-11-21 22:57:47 +00:00
Merry	c4f6c3b00b	shared_widget: Explicit capture of 'this'	2023-11-21 22:57:09 +00:00
liamwhite	cddb28cf26	Merge pull request #12107 from daisymlleung/patch-1 Stub CheckBlockedUserListAvailability for Super Bomberman R 2	2023-11-21 09:19:41 -05:00
liamwhite	538e137bca	Merge pull request #12100 from t895/android-translations translations: Add android translations to transifex config	2023-11-21 09:19:34 -05:00
liamwhite	e69118042f	Merge pull request #12045 from liamwhite/codec-refactor video_core: refactor video frame and packet parsing	2023-11-21 09:19:26 -05:00
liamwhite	a6b8d85b34	Merge pull request #11984 from lat9nq/lupdate shared_translation: Call tr more directly	2023-11-21 09:19:13 -05:00
daisymlleung	0216aa55b4	Stub CheckBlockedUserListAvailability	2023-11-21 01:57:58 +08:00
t895	0298c2cc5d	translations: Add android translations to transifex config	2023-11-20 10:37:09 -05:00
Liam	4055a476aa	video_core: refactor video frame and packet parsing	2023-11-16 17:01:38 -05:00
lat9nq	cb3559539a	CMakeLists: Add option to call lupdate directly. qt_create_translation silently fails to run at all on my system. Since there is no error, I was unable to determine a fix. This sidesteps the convenience function by setting up the rules ourselves. This is left as an option since this path likely does not work on Windows.	2023-11-08 11:54:05 -05:00
lat9nq	71cdfa6ad5	shared_translation: Call tr for each string. Qt can't parse tr called within a macro, so we must call it on each string. shared_translation: Remove redundant include.	2023-11-08 11:54:01 -05:00