Merge pull request #10935 from Morph1984/mwaitx
x64: Make use of monitorx instructions for power efficient sleeps (AMD)
This commit is contained in:
		@@ -93,6 +93,7 @@ void AppendCPUInfo(FieldCollection& fc) {
 | 
			
		||||
    add_field("CPU_Extension_x64_GFNI", caps.gfni);
 | 
			
		||||
    add_field("CPU_Extension_x64_INVARIANT_TSC", caps.invariant_tsc);
 | 
			
		||||
    add_field("CPU_Extension_x64_LZCNT", caps.lzcnt);
 | 
			
		||||
    add_field("CPU_Extension_x64_MONITORX", caps.monitorx);
 | 
			
		||||
    add_field("CPU_Extension_x64_MOVBE", caps.movbe);
 | 
			
		||||
    add_field("CPU_Extension_x64_PCLMULQDQ", caps.pclmulqdq);
 | 
			
		||||
    add_field("CPU_Extension_x64_POPCNT", caps.popcnt);
 | 
			
		||||
 
 | 
			
		||||
@@ -168,6 +168,7 @@ static CPUCaps Detect() {
 | 
			
		||||
        __cpuid(cpu_id, 0x80000001);
 | 
			
		||||
        caps.lzcnt = Common::Bit<5>(cpu_id[2]);
 | 
			
		||||
        caps.fma4 = Common::Bit<16>(cpu_id[2]);
 | 
			
		||||
        caps.monitorx = Common::Bit<29>(cpu_id[2]);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (max_ex_fn >= 0x80000007) {
 | 
			
		||||
 
 | 
			
		||||
@@ -63,6 +63,7 @@ struct CPUCaps {
 | 
			
		||||
    bool gfni : 1;
 | 
			
		||||
    bool invariant_tsc : 1;
 | 
			
		||||
    bool lzcnt : 1;
 | 
			
		||||
    bool monitorx : 1;
 | 
			
		||||
    bool movbe : 1;
 | 
			
		||||
    bool pclmulqdq : 1;
 | 
			
		||||
    bool popcnt : 1;
 | 
			
		||||
 
 | 
			
		||||
@@ -13,36 +13,60 @@
 | 
			
		||||
 | 
			
		||||
namespace Common::X64 {
 | 
			
		||||
 | 
			
		||||
namespace {

// Number of cycles a single timed pause is allowed to last.
// 100,000 cycles is a reasonable amount of time to wait to save on CPU resources.
// For reference:
// At 1 GHz, 100K cycles is 100us
// At 2 GHz, 100K cycles is 50us
// At 4 GHz, 100K cycles is 25us
constexpr auto PauseCycles = 100'000U;

} // Anonymous namespace
 | 
			
		||||
 | 
			
		||||
#ifdef _MSC_VER
 | 
			
		||||
/// Pauses the calling thread with TPAUSE until the TSC reaches "now + PauseCycles".
/// The pause length comes from the file-level PauseCycles constant; exactly one
/// TPAUSE is issued per call.
__forceinline static void TPAUSE() {
    // Control value handed to _tpause; 0 is used here to request the C0.2 state.
    static constexpr auto RequestC02State = 0U;
    // The deadline is computed from a fresh fenced TSC read plus the pause budget.
    _tpause(RequestC02State, FencedRDTSC() + PauseCycles);
}
 | 
			
		||||
 | 
			
		||||
/// Arms MONITORX on a local variable and then waits with MWAITX, passing
/// PauseCycles as the wait-time value. Nothing in this function writes to the
/// monitored variable after it is armed.
__forceinline static void MWAITX() {
    // Extensions argument for _mm_mwaitx: bit 1 set, which enables the wait timer.
    static constexpr auto EnableWaitTimeFlag = 1U << 1;
    // Hints argument for _mm_mwaitx: 0 is used here to request the C1 state.
    static constexpr auto RequestC1State = 0U;

    // monitor_var should be aligned to a cache line.
    alignas(64) u64 monitor_var{};
    _mm_monitorx(&monitor_var, 0, 0);
    _mm_mwaitx(EnableWaitTimeFlag, RequestC1State, PauseCycles);
}
 | 
			
		||||
#else
 | 
			
		||||
static void TPAUSE() {
 | 
			
		||||
    // 100,000 cycles is a reasonable amount of time to wait to save on CPU resources.
 | 
			
		||||
    // For reference:
 | 
			
		||||
    // At 1 GHz, 100K cycles is 100us
 | 
			
		||||
    // At 2 GHz, 100K cycles is 50us
 | 
			
		||||
    // At 4 GHz, 100K cycles is 25us
 | 
			
		||||
    static constexpr auto PauseCycles = 100'000;
 | 
			
		||||
    static constexpr auto RequestC02State = 0U;
 | 
			
		||||
    const auto tsc = FencedRDTSC() + PauseCycles;
 | 
			
		||||
    const auto eax = static_cast<u32>(tsc & 0xFFFFFFFF);
 | 
			
		||||
    const auto edx = static_cast<u32>(tsc >> 32);
 | 
			
		||||
    asm volatile("tpause %0" : : "r"(0), "d"(edx), "a"(eax));
 | 
			
		||||
    asm volatile("tpause %0" : : "r"(RequestC02State), "d"(edx), "a"(eax));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void MWAITX() {
 | 
			
		||||
    static constexpr auto EnableWaitTimeFlag = 1U << 1;
 | 
			
		||||
    static constexpr auto RequestC1State = 0U;
 | 
			
		||||
 | 
			
		||||
    // monitor_var should be aligned to a cache line.
 | 
			
		||||
    alignas(64) u64 monitor_var{};
 | 
			
		||||
    asm volatile("monitorx" : : "a"(&monitor_var), "c"(0), "d"(0));
 | 
			
		||||
    asm volatile("mwaitx" : : "a"(RequestC1State), "b"(PauseCycles), "c"(EnableWaitTimeFlag));
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
void MicroSleep() {
 | 
			
		||||
    static const bool has_waitpkg = GetCPUCaps().waitpkg;
 | 
			
		||||
    static const bool has_monitorx = GetCPUCaps().monitorx;
 | 
			
		||||
 | 
			
		||||
    if (has_waitpkg) {
 | 
			
		||||
        TPAUSE();
 | 
			
		||||
    } else if (has_monitorx) {
 | 
			
		||||
        MWAITX();
 | 
			
		||||
    } else {
 | 
			
		||||
        std::this_thread::yield();
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user