Merge pull request #5121 from bunnei/optimize-core-timing
core: Optimize core timing utility functions to avoid unnecessary math
This commit is contained in:
		| @@ -168,7 +168,6 @@ add_library(common STATIC | ||||
|     time_zone.cpp | ||||
|     time_zone.h | ||||
|     tree.h | ||||
|     uint128.cpp | ||||
|     uint128.h | ||||
|     uuid.cpp | ||||
|     uuid.h | ||||
|   | ||||
| @@ -1,71 +0,0 @@ | ||||
| // Copyright 2019 yuzu Emulator Project | ||||
| // Licensed under GPLv2 or any later version | ||||
| // Refer to the license.txt file included. | ||||
|  | ||||
| #ifdef _MSC_VER | ||||
| #include <intrin.h> | ||||
|  | ||||
| #pragma intrinsic(_umul128) | ||||
| #pragma intrinsic(_udiv128) | ||||
| #endif | ||||
| #include <cstring> | ||||
| #include "common/uint128.h" | ||||
|  | ||||
| namespace Common { | ||||
|  | ||||
| #ifdef _MSC_VER | ||||
|  | ||||
| u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) { | ||||
|     u128 r{}; | ||||
|     r[0] = _umul128(a, b, &r[1]); | ||||
|     u64 remainder; | ||||
| #if _MSC_VER < 1923 | ||||
|     return udiv128(r[1], r[0], d, &remainder); | ||||
| #else | ||||
|     return _udiv128(r[1], r[0], d, &remainder); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| #else | ||||
|  | ||||
| u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) { | ||||
|     const u64 diva = a / d; | ||||
|     const u64 moda = a % d; | ||||
|     const u64 divb = b / d; | ||||
|     const u64 modb = b % d; | ||||
|     return diva * b + moda * divb + moda * modb / d; | ||||
| } | ||||
|  | ||||
| #endif | ||||
|  | ||||
| u128 Multiply64Into128(u64 a, u64 b) { | ||||
|     u128 result; | ||||
| #ifdef _MSC_VER | ||||
|     result[0] = _umul128(a, b, &result[1]); | ||||
| #else | ||||
|     unsigned __int128 tmp = a; | ||||
|     tmp *= b; | ||||
|     std::memcpy(&result, &tmp, sizeof(u128)); | ||||
| #endif | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| std::pair<u64, u64> Divide128On32(u128 dividend, u32 divisor) { | ||||
|     u64 remainder = dividend[0] % divisor; | ||||
|     u64 accum = dividend[0] / divisor; | ||||
|     if (dividend[1] == 0) | ||||
|         return {accum, remainder}; | ||||
|     // We ignore dividend[1] / divisor as that overflows | ||||
|     const u64 first_segment = (dividend[1] % divisor) << 32; | ||||
|     accum += (first_segment / divisor) << 32; | ||||
|     const u64 second_segment = (first_segment % divisor) << 32; | ||||
|     accum += (second_segment / divisor); | ||||
|     remainder += second_segment % divisor; | ||||
|     if (remainder >= divisor) { | ||||
|         accum++; | ||||
|         remainder -= divisor; | ||||
|     } | ||||
|     return {accum, remainder}; | ||||
| } | ||||
|  | ||||
| } // namespace Common | ||||
| @@ -4,19 +4,98 @@ | ||||
|  | ||||
| #pragma once | ||||
|  | ||||
| #include <cstring> | ||||
| #include <utility> | ||||
|  | ||||
| #ifdef _MSC_VER | ||||
| #include <intrin.h> | ||||
| #pragma intrinsic(__umulh) | ||||
| #pragma intrinsic(_umul128) | ||||
| #pragma intrinsic(_udiv128) | ||||
| #else | ||||
| #include <x86intrin.h> | ||||
| #endif | ||||
|  | ||||
| #include "common/common_types.h" | ||||
|  | ||||
| namespace Common { | ||||
|  | ||||
| // This function multiplies two u64 values and divides the product by a u64 value. | ||||
| [[nodiscard]] u64 MultiplyAndDivide64(u64 a, u64 b, u64 d); | ||||
| [[nodiscard]] static inline u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) { | ||||
| #ifdef _MSC_VER | ||||
|     u128 r{}; | ||||
|     r[0] = _umul128(a, b, &r[1]); | ||||
|     u64 remainder; | ||||
| #if _MSC_VER < 1923 | ||||
|     return udiv128(r[1], r[0], d, &remainder); | ||||
| #else | ||||
|     return _udiv128(r[1], r[0], d, &remainder); | ||||
| #endif | ||||
| #else | ||||
|     const u64 diva = a / d; | ||||
|     const u64 moda = a % d; | ||||
|     const u64 divb = b / d; | ||||
|     const u64 modb = b % d; | ||||
|     return diva * b + moda * divb + moda * modb / d; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| // This function multiplies two u64 values and produces a u128 value. | ||||
| [[nodiscard]] u128 Multiply64Into128(u64 a, u64 b); | ||||
| [[nodiscard]] static inline u128 Multiply64Into128(u64 a, u64 b) { | ||||
|     u128 result; | ||||
| #ifdef _MSC_VER | ||||
|     result[0] = _umul128(a, b, &result[1]); | ||||
| #else | ||||
|     unsigned __int128 tmp = a; | ||||
|     tmp *= b; | ||||
|     std::memcpy(&result, &tmp, sizeof(u128)); | ||||
| #endif | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| // This function divides a u128 by a u32 value and produces two u64 values: | ||||
| // the result of division and the remainder | ||||
| [[nodiscard]] std::pair<u64, u64> Divide128On32(u128 dividend, u32 divisor); | ||||
| [[nodiscard]] static inline u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) { | ||||
| #ifdef __SIZEOF_INT128__ | ||||
|     const auto base = static_cast<unsigned __int128>(numerator) << 64ULL; | ||||
|     return static_cast<u64>(base / divisor); | ||||
| #elif defined(_M_X64) || defined(_M_ARM64) | ||||
|     std::array<u64, 2> r = {0, numerator}; | ||||
|     u64 remainder; | ||||
| #if _MSC_VER < 1923 | ||||
|     return udiv128(r[1], r[0], divisor, &remainder); | ||||
| #else | ||||
|     return _udiv128(r[1], r[0], divisor, &remainder); | ||||
| #endif | ||||
| #else | ||||
|     // This one is bit more inaccurate. | ||||
|     return MultiplyAndDivide64(std::numeric_limits<u64>::max(), numerator, divisor); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| [[nodiscard]] static inline u64 MultiplyHigh(u64 a, u64 b) { | ||||
| #ifdef __SIZEOF_INT128__ | ||||
|     return (static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b)) >> 64; | ||||
| #elif defined(_M_X64) || defined(_M_ARM64) | ||||
|     return __umulh(a, b); // MSVC | ||||
| #else | ||||
|     // Generic fallback | ||||
|     const u64 a_lo = u32(a); | ||||
|     const u64 a_hi = a >> 32; | ||||
|     const u64 b_lo = u32(b); | ||||
|     const u64 b_hi = b >> 32; | ||||
|  | ||||
|     const u64 a_x_b_hi = a_hi * b_hi; | ||||
|     const u64 a_x_b_mid = a_hi * b_lo; | ||||
|     const u64 b_x_a_mid = b_hi * a_lo; | ||||
|     const u64 a_x_b_lo = a_lo * b_lo; | ||||
|  | ||||
|     const u64 carry_bit = (static_cast<u64>(static_cast<u32>(a_x_b_mid)) + | ||||
|                            static_cast<u64>(static_cast<u32>(b_x_a_mid)) + (a_x_b_lo >> 32)) >> | ||||
|                           32; | ||||
|  | ||||
|     const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; | ||||
|  | ||||
|     return multhi; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| } // namespace Common | ||||
|   | ||||
| @@ -2,6 +2,8 @@ | ||||
| // Licensed under GPLv2 or any later version | ||||
| // Refer to the license.txt file included. | ||||
|  | ||||
| #include <cstdint> | ||||
|  | ||||
| #include "common/uint128.h" | ||||
| #include "common/wall_clock.h" | ||||
|  | ||||
| @@ -18,7 +20,9 @@ using base_time_point = std::chrono::time_point<base_timer>; | ||||
| class StandardWallClock final : public WallClock { | ||||
| public: | ||||
|     explicit StandardWallClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_) | ||||
|         : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, false) { | ||||
|         : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, false), | ||||
|           emulated_clock_factor{GetFixedPoint64Factor(emulated_clock_frequency, 1000000000)}, | ||||
|           emulated_cpu_factor{GetFixedPoint64Factor(emulated_cpu_frequency, 1000000000)} { | ||||
|         start_time = base_timer::now(); | ||||
|     } | ||||
|  | ||||
| @@ -41,16 +45,11 @@ public: | ||||
|     } | ||||
|  | ||||
|     u64 GetClockCycles() override { | ||||
|         std::chrono::nanoseconds time_now = GetTimeNS(); | ||||
|         const u128 temporary = | ||||
|             Common::Multiply64Into128(time_now.count(), emulated_clock_frequency); | ||||
|         return Common::Divide128On32(temporary, 1000000000).first; | ||||
|         return MultiplyHigh(GetTimeNS().count(), emulated_clock_factor); | ||||
|     } | ||||
|  | ||||
|     u64 GetCPUCycles() override { | ||||
|         std::chrono::nanoseconds time_now = GetTimeNS(); | ||||
|         const u128 temporary = Common::Multiply64Into128(time_now.count(), emulated_cpu_frequency); | ||||
|         return Common::Divide128On32(temporary, 1000000000).first; | ||||
|         return MultiplyHigh(GetTimeNS().count(), emulated_cpu_factor); | ||||
|     } | ||||
|  | ||||
|     void Pause([[maybe_unused]] bool is_paused) override { | ||||
| @@ -59,6 +58,8 @@ public: | ||||
|  | ||||
| private: | ||||
|     base_time_point start_time; | ||||
|     const u64 emulated_clock_factor; | ||||
|     const u64 emulated_cpu_factor; | ||||
| }; | ||||
|  | ||||
| #ifdef ARCHITECTURE_x86_64 | ||||
|   | ||||
| @@ -8,68 +8,10 @@ | ||||
| #include <mutex> | ||||
| #include <thread> | ||||
|  | ||||
| #ifdef _MSC_VER | ||||
| #include <intrin.h> | ||||
|  | ||||
| #pragma intrinsic(__umulh) | ||||
| #pragma intrinsic(_udiv128) | ||||
| #else | ||||
| #include <x86intrin.h> | ||||
| #endif | ||||
|  | ||||
| #include "common/atomic_ops.h" | ||||
| #include "common/uint128.h" | ||||
| #include "common/x64/native_clock.h" | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| [[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) { | ||||
| #ifdef __SIZEOF_INT128__ | ||||
|     const auto base = static_cast<unsigned __int128>(numerator) << 64ULL; | ||||
|     return static_cast<u64>(base / divisor); | ||||
| #elif defined(_M_X64) || defined(_M_ARM64) | ||||
|     std::array<u64, 2> r = {0, numerator}; | ||||
|     u64 remainder; | ||||
| #if _MSC_VER < 1923 | ||||
|     return udiv128(r[1], r[0], divisor, &remainder); | ||||
| #else | ||||
|     return _udiv128(r[1], r[0], divisor, &remainder); | ||||
| #endif | ||||
| #else | ||||
|     // This one is bit more inaccurate. | ||||
|     return MultiplyAndDivide64(std::numeric_limits<u64>::max(), numerator, divisor); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| [[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) { | ||||
| #ifdef __SIZEOF_INT128__ | ||||
|     return (static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b)) >> 64; | ||||
| #elif defined(_M_X64) || defined(_M_ARM64) | ||||
|     return __umulh(a, b); // MSVC | ||||
| #else | ||||
|     // Generic fallback | ||||
|     const u64 a_lo = u32(a); | ||||
|     const u64 a_hi = a >> 32; | ||||
|     const u64 b_lo = u32(b); | ||||
|     const u64 b_hi = b >> 32; | ||||
|  | ||||
|     const u64 a_x_b_hi = a_hi * b_hi; | ||||
|     const u64 a_x_b_mid = a_hi * b_lo; | ||||
|     const u64 b_x_a_mid = b_hi * a_lo; | ||||
|     const u64 a_x_b_lo = a_lo * b_lo; | ||||
|  | ||||
|     const u64 carry_bit = (static_cast<u64>(static_cast<u32>(a_x_b_mid)) + | ||||
|                            static_cast<u64>(static_cast<u32>(b_x_a_mid)) + (a_x_b_lo >> 32)) >> | ||||
|                           32; | ||||
|  | ||||
|     const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; | ||||
|  | ||||
|     return multhi; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| namespace Common { | ||||
|  | ||||
| u64 EstimateRDTSCFrequency() { | ||||
|   | ||||
| @@ -19,7 +19,6 @@ add_library(core STATIC | ||||
|     core.h | ||||
|     core_timing.cpp | ||||
|     core_timing.h | ||||
|     core_timing_util.cpp | ||||
|     core_timing_util.h | ||||
|     cpu_manager.cpp | ||||
|     cpu_manager.h | ||||
|   | ||||
| @@ -1,84 +0,0 @@ | ||||
| // Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project | ||||
| // Licensed under GPLv2+ | ||||
| // Refer to the license.txt file included. | ||||
|  | ||||
| #include "core/core_timing_util.h" | ||||
|  | ||||
| #include <cinttypes> | ||||
| #include <limits> | ||||
| #include "common/logging/log.h" | ||||
| #include "common/uint128.h" | ||||
| #include "core/hardware_properties.h" | ||||
|  | ||||
| namespace Core::Timing { | ||||
|  | ||||
| constexpr u64 MAX_VALUE_TO_MULTIPLY = std::numeric_limits<s64>::max() / Hardware::BASE_CLOCK_RATE; | ||||
|  | ||||
| s64 msToCycles(std::chrono::milliseconds ms) { | ||||
|     if (static_cast<u64>(ms.count() / 1000) > MAX_VALUE_TO_MULTIPLY) { | ||||
|         LOG_ERROR(Core_Timing, "Integer overflow, use max value"); | ||||
|         return std::numeric_limits<s64>::max(); | ||||
|     } | ||||
|     if (static_cast<u64>(ms.count()) > MAX_VALUE_TO_MULTIPLY) { | ||||
|         LOG_DEBUG(Core_Timing, "Time very big, do rounding"); | ||||
|         return Hardware::BASE_CLOCK_RATE * (ms.count() / 1000); | ||||
|     } | ||||
|     return (Hardware::BASE_CLOCK_RATE * ms.count()) / 1000; | ||||
| } | ||||
|  | ||||
| s64 usToCycles(std::chrono::microseconds us) { | ||||
|     if (static_cast<u64>(us.count() / 1000000) > MAX_VALUE_TO_MULTIPLY) { | ||||
|         LOG_ERROR(Core_Timing, "Integer overflow, use max value"); | ||||
|         return std::numeric_limits<s64>::max(); | ||||
|     } | ||||
|     if (static_cast<u64>(us.count()) > MAX_VALUE_TO_MULTIPLY) { | ||||
|         LOG_DEBUG(Core_Timing, "Time very big, do rounding"); | ||||
|         return Hardware::BASE_CLOCK_RATE * (us.count() / 1000000); | ||||
|     } | ||||
|     return (Hardware::BASE_CLOCK_RATE * us.count()) / 1000000; | ||||
| } | ||||
|  | ||||
| s64 nsToCycles(std::chrono::nanoseconds ns) { | ||||
|     const u128 temporal = Common::Multiply64Into128(ns.count(), Hardware::BASE_CLOCK_RATE); | ||||
|     return Common::Divide128On32(temporal, static_cast<u32>(1000000000)).first; | ||||
| } | ||||
|  | ||||
| u64 msToClockCycles(std::chrono::milliseconds ns) { | ||||
|     const u128 temp = Common::Multiply64Into128(ns.count(), Hardware::CNTFREQ); | ||||
|     return Common::Divide128On32(temp, 1000).first; | ||||
| } | ||||
|  | ||||
| u64 usToClockCycles(std::chrono::microseconds ns) { | ||||
|     const u128 temp = Common::Multiply64Into128(ns.count(), Hardware::CNTFREQ); | ||||
|     return Common::Divide128On32(temp, 1000000).first; | ||||
| } | ||||
|  | ||||
| u64 nsToClockCycles(std::chrono::nanoseconds ns) { | ||||
|     const u128 temp = Common::Multiply64Into128(ns.count(), Hardware::CNTFREQ); | ||||
|     return Common::Divide128On32(temp, 1000000000).first; | ||||
| } | ||||
|  | ||||
| u64 CpuCyclesToClockCycles(u64 ticks) { | ||||
|     const u128 temporal = Common::Multiply64Into128(ticks, Hardware::CNTFREQ); | ||||
|     return Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first; | ||||
| } | ||||
|  | ||||
| std::chrono::milliseconds CyclesToMs(s64 cycles) { | ||||
|     const u128 temporal = Common::Multiply64Into128(cycles, 1000); | ||||
|     u64 ms = Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first; | ||||
|     return std::chrono::milliseconds(ms); | ||||
| } | ||||
|  | ||||
| std::chrono::nanoseconds CyclesToNs(s64 cycles) { | ||||
|     const u128 temporal = Common::Multiply64Into128(cycles, 1000000000); | ||||
|     u64 ns = Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first; | ||||
|     return std::chrono::nanoseconds(ns); | ||||
| } | ||||
|  | ||||
| std::chrono::microseconds CyclesToUs(s64 cycles) { | ||||
|     const u128 temporal = Common::Multiply64Into128(cycles, 1000000); | ||||
|     u64 us = Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first; | ||||
|     return std::chrono::microseconds(us); | ||||
| } | ||||
|  | ||||
| } // namespace Core::Timing | ||||
| @@ -1,24 +1,59 @@ | ||||
| // Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project | ||||
| // Licensed under GPLv2+ | ||||
| // Copyright 2020 yuzu Emulator Project | ||||
| // Licensed under GPLv2 or any later version | ||||
| // Refer to the license.txt file included. | ||||
|  | ||||
| #pragma once | ||||
|  | ||||
| #include <chrono> | ||||
|  | ||||
| #include "common/common_types.h" | ||||
| #include "core/hardware_properties.h" | ||||
|  | ||||
| namespace Core::Timing { | ||||
|  | ||||
| s64 msToCycles(std::chrono::milliseconds ms); | ||||
| s64 usToCycles(std::chrono::microseconds us); | ||||
| s64 nsToCycles(std::chrono::nanoseconds ns); | ||||
| u64 msToClockCycles(std::chrono::milliseconds ns); | ||||
| u64 usToClockCycles(std::chrono::microseconds ns); | ||||
| u64 nsToClockCycles(std::chrono::nanoseconds ns); | ||||
| std::chrono::milliseconds CyclesToMs(s64 cycles); | ||||
| std::chrono::nanoseconds CyclesToNs(s64 cycles); | ||||
| std::chrono::microseconds CyclesToUs(s64 cycles); | ||||
| namespace detail { | ||||
| constexpr u64 CNTFREQ_ADJUSTED = Hardware::CNTFREQ / 1000; | ||||
| constexpr u64 BASE_CLOCK_RATE_ADJUSTED = Hardware::BASE_CLOCK_RATE / 1000; | ||||
| } // namespace detail | ||||
|  | ||||
| u64 CpuCyclesToClockCycles(u64 ticks); | ||||
| [[nodiscard]] constexpr s64 msToCycles(std::chrono::milliseconds ms) { | ||||
|     return ms.count() * detail::BASE_CLOCK_RATE_ADJUSTED; | ||||
| } | ||||
|  | ||||
| [[nodiscard]] constexpr s64 usToCycles(std::chrono::microseconds us) { | ||||
|     return us.count() * detail::BASE_CLOCK_RATE_ADJUSTED / 1000; | ||||
| } | ||||
|  | ||||
| [[nodiscard]] constexpr s64 nsToCycles(std::chrono::nanoseconds ns) { | ||||
|     return ns.count() * detail::BASE_CLOCK_RATE_ADJUSTED / 1000000; | ||||
| } | ||||
|  | ||||
| [[nodiscard]] constexpr u64 msToClockCycles(std::chrono::milliseconds ms) { | ||||
|     return static_cast<u64>(ms.count()) * detail::CNTFREQ_ADJUSTED; | ||||
| } | ||||
|  | ||||
| [[nodiscard]] constexpr u64 usToClockCycles(std::chrono::microseconds us) { | ||||
|     return us.count() * detail::CNTFREQ_ADJUSTED / 1000; | ||||
| } | ||||
|  | ||||
| [[nodiscard]] constexpr u64 nsToClockCycles(std::chrono::nanoseconds ns) { | ||||
|     return ns.count() * detail::CNTFREQ_ADJUSTED / 1000000; | ||||
| } | ||||
|  | ||||
| [[nodiscard]] constexpr u64 CpuCyclesToClockCycles(u64 ticks) { | ||||
|     return ticks * detail::CNTFREQ_ADJUSTED / detail::BASE_CLOCK_RATE_ADJUSTED; | ||||
| } | ||||
|  | ||||
| [[nodiscard]] constexpr std::chrono::milliseconds CyclesToMs(s64 cycles) { | ||||
|     return std::chrono::milliseconds(cycles / detail::BASE_CLOCK_RATE_ADJUSTED); | ||||
| } | ||||
|  | ||||
| [[nodiscard]] constexpr std::chrono::nanoseconds CyclesToNs(s64 cycles) { | ||||
|     return std::chrono::nanoseconds(cycles * 1000000 / detail::BASE_CLOCK_RATE_ADJUSTED); | ||||
| } | ||||
|  | ||||
| [[nodiscard]] constexpr std::chrono::microseconds CyclesToUs(s64 cycles) { | ||||
|     return std::chrono::microseconds(cycles * 1000 / detail::BASE_CLOCK_RATE_ADJUSTED); | ||||
| } | ||||
|  | ||||
| } // namespace Core::Timing | ||||
|   | ||||
		Reference in New Issue
	
	Block a user