Merge pull request #5275 from FernandoS27/fast-native-clock

X86/NativeClock: Improve performance of clock calculations on hot path.
This commit is contained in:
bunnei 2021-01-15 23:01:42 -08:00 committed by GitHub
commit a7fd61fcce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 174 additions and 104 deletions

View File

@ -98,7 +98,6 @@ add_library(common STATIC
algorithm.h
alignment.h
assert.h
atomic_ops.cpp
atomic_ops.h
detached_tasks.cpp
detached_tasks.h

View File

@ -1,75 +0,0 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include "common/atomic_ops.h"
#if _MSC_VER
#include <intrin.h>
#endif
namespace Common {
#if _MSC_VER
bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
const u8 result =
_InterlockedCompareExchange8(reinterpret_cast<volatile char*>(pointer), value, expected);
return result == expected;
}
bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
const u16 result =
_InterlockedCompareExchange16(reinterpret_cast<volatile short*>(pointer), value, expected);
return result == expected;
}
bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
const u32 result =
_InterlockedCompareExchange(reinterpret_cast<volatile long*>(pointer), value, expected);
return result == expected;
}
bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
const u64 result = _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*>(pointer),
value, expected);
return result == expected;
}
bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
return _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), value[1],
value[0],
reinterpret_cast<__int64*>(expected.data())) != 0;
}
#else
bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
return __sync_bool_compare_and_swap(pointer, expected, value);
}
bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
return __sync_bool_compare_and_swap(pointer, expected, value);
}
bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
return __sync_bool_compare_and_swap(pointer, expected, value);
}
bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
return __sync_bool_compare_and_swap(pointer, expected, value);
}
bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
unsigned __int128 value_a;
unsigned __int128 expected_a;
std::memcpy(&value_a, value.data(), sizeof(u128));
std::memcpy(&expected_a, expected.data(), sizeof(u128));
return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
}
#endif
} // namespace Common

View File

@ -4,14 +4,75 @@
#pragma once
#include <cstring>
#include <memory>
#include "common/common_types.h"
#if _MSC_VER
#include <intrin.h>
#endif
namespace Common {
[[nodiscard]] bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected);
[[nodiscard]] bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected);
[[nodiscard]] bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected);
[[nodiscard]] bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected);
[[nodiscard]] bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected);
#if _MSC_VER
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
const u8 result =
_InterlockedCompareExchange8(reinterpret_cast<volatile char*>(pointer), value, expected);
return result == expected;
}
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
const u16 result =
_InterlockedCompareExchange16(reinterpret_cast<volatile short*>(pointer), value, expected);
return result == expected;
}
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
const u32 result =
_InterlockedCompareExchange(reinterpret_cast<volatile long*>(pointer), value, expected);
return result == expected;
}
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
const u64 result = _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*>(pointer),
value, expected);
return result == expected;
}
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
return _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), value[1],
value[0],
reinterpret_cast<__int64*>(expected.data())) != 0;
}
#else
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
return __sync_bool_compare_and_swap(pointer, expected, value);
}
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
return __sync_bool_compare_and_swap(pointer, expected, value);
}
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
return __sync_bool_compare_and_swap(pointer, expected, value);
}
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
return __sync_bool_compare_and_swap(pointer, expected, value);
}
[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
unsigned __int128 value_a;
unsigned __int128 expected_a;
std::memcpy(&value_a, value.data(), sizeof(u128));
std::memcpy(&expected_a, expected.data(), sizeof(u128));
return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
}
#endif
} // namespace Common

View File

@ -2,19 +2,74 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include <chrono>
#include <limits>
#include <mutex>
#include <thread>
#ifdef _MSC_VER
#include <intrin.h>
#pragma intrinsic(__umulh)
#pragma intrinsic(_udiv128)
#else
#include <x86intrin.h>
#endif
#include "common/atomic_ops.h"
#include "common/uint128.h"
#include "common/x64/native_clock.h"
namespace {
[[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) {
#ifdef __SIZEOF_INT128__
const auto base = static_cast<unsigned __int128>(numerator) << 64ULL;
return static_cast<u64>(base / divisor);
#elif defined(_M_X64) || defined(_M_ARM64)
std::array<u64, 2> r = {0, numerator};
u64 remainder;
#if _MSC_VER < 1923
return udiv128(r[1], r[0], divisor, &remainder);
#else
return _udiv128(r[1], r[0], divisor, &remainder);
#endif
#else
// This one is bit more inaccurate.
return MultiplyAndDivide64(std::numeric_limits<u64>::max(), numerator, divisor);
#endif
}
[[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) {
#ifdef __SIZEOF_INT128__
return (static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b)) >> 64;
#elif defined(_M_X64) || defined(_M_ARM64)
return __umulh(a, b); // MSVC
#else
// Generic fallback
const u64 a_lo = u32(a);
const u64 a_hi = a >> 32;
const u64 b_lo = u32(b);
const u64 b_hi = b >> 32;
const u64 a_x_b_hi = a_hi * b_hi;
const u64 a_x_b_mid = a_hi * b_lo;
const u64 b_x_a_mid = b_hi * a_lo;
const u64 a_x_b_lo = a_lo * b_lo;
const u64 carry_bit = (static_cast<u64>(static_cast<u32>(a_x_b_mid)) +
static_cast<u64>(static_cast<u32>(b_x_a_mid)) + (a_x_b_lo >> 32)) >>
32;
const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit;
return multhi;
#endif
}
} // namespace
namespace Common {
u64 EstimateRDTSCFrequency() {
@ -48,54 +103,71 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
: WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
rtsc_frequency_} {
_mm_mfence();
last_measure = __rdtsc();
accumulated_ticks = 0U;
time_point.inner.last_measure = __rdtsc();
time_point.inner.accumulated_ticks = 0U;
ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency);
us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency);
ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency);
clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency);
cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency);
}
u64 NativeClock::GetRTSC() {
std::scoped_lock scope{rtsc_serialize};
_mm_mfence();
const u64 current_measure = __rdtsc();
u64 diff = current_measure - last_measure;
diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
if (current_measure > last_measure) {
last_measure = current_measure;
}
accumulated_ticks += diff;
TimePoint new_time_point{};
TimePoint current_time_point{};
do {
current_time_point.pack = time_point.pack;
_mm_mfence();
const u64 current_measure = __rdtsc();
u64 diff = current_measure - current_time_point.inner.last_measure;
diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
? current_measure
: current_time_point.inner.last_measure;
new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
} while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
current_time_point.pack));
/// The clock cannot be more precise than the guest timer, remove the lower bits
return accumulated_ticks & inaccuracy_mask;
return new_time_point.inner.accumulated_ticks & inaccuracy_mask;
}
void NativeClock::Pause(bool is_paused) {
if (!is_paused) {
_mm_mfence();
last_measure = __rdtsc();
TimePoint current_time_point{};
TimePoint new_time_point{};
do {
current_time_point.pack = time_point.pack;
new_time_point.pack = current_time_point.pack;
_mm_mfence();
new_time_point.inner.last_measure = __rdtsc();
} while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
current_time_point.pack));
}
}
std::chrono::nanoseconds NativeClock::GetTimeNS() {
const u64 rtsc_value = GetRTSC();
return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)};
return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)};
}
std::chrono::microseconds NativeClock::GetTimeUS() {
const u64 rtsc_value = GetRTSC();
return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)};
return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)};
}
std::chrono::milliseconds NativeClock::GetTimeMS() {
const u64 rtsc_value = GetRTSC();
return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)};
return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)};
}
u64 NativeClock::GetClockCycles() {
const u64 rtsc_value = GetRTSC();
return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency);
return MultiplyHigh(rtsc_value, clock_rtsc_factor);
}
u64 NativeClock::GetCPUCycles() {
const u64 rtsc_value = GetRTSC();
return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency);
return MultiplyHigh(rtsc_value, cpu_rtsc_factor);
}
} // namespace X64

View File

@ -6,7 +6,6 @@
#include <optional>
#include "common/spin_lock.h"
#include "common/wall_clock.h"
namespace Common {
@ -32,14 +31,28 @@ public:
private:
u64 GetRTSC();
union alignas(16) TimePoint {
TimePoint() : pack{} {}
u128 pack{};
struct Inner {
u64 last_measure{};
u64 accumulated_ticks{};
} inner;
};
/// value used to reduce the native clocks accuracy as some apss rely on
/// undefined behavior where the level of accuracy in the clock shouldn't
/// be higher.
static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1);
SpinLock rtsc_serialize{};
u64 last_measure{};
u64 accumulated_ticks{};
TimePoint time_point;
// factors
u64 clock_rtsc_factor{};
u64 cpu_rtsc_factor{};
u64 ns_rtsc_factor{};
u64 us_rtsc_factor{};
u64 ms_rtsc_factor{};
u64 rtsc_frequency;
};
} // namespace X64