From 4aec060f6de410698d5b0a5bffd42d4327b258e4 Mon Sep 17 00:00:00 2001 From: Markus Wick Date: Tue, 6 Apr 2021 20:30:22 +0200 Subject: [PATCH 1/4] common/threadsafe_queue: Provide Wait() method. It shall block until there is something to consume in the queue. And use it for the GPU emulation instead of the spin loop. This is only in booting the emulator, however in BOTW this is the case for about 1 second. --- src/common/threadsafe_queue.h | 10 +++++++++- src/video_core/gpu_thread.cpp | 3 +-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/common/threadsafe_queue.h b/src/common/threadsafe_queue.h index a4647314a7..ad04df8cab 100644 --- a/src/common/threadsafe_queue.h +++ b/src/common/threadsafe_queue.h @@ -83,11 +83,15 @@ public: return true; } - T PopWait() { + void Wait() { if (Empty()) { std::unique_lock lock{cv_mutex}; cv.wait(lock, [this]() { return !Empty(); }); } + } + + T PopWait() { + Wait(); T t; Pop(t); return t; @@ -156,6 +160,10 @@ public: return spsc_queue.Pop(t); } + void Wait() { + spsc_queue.Wait(); + } + T PopWait() { return spsc_queue.PopWait(); } diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 99353f15f5..cd59a7faf1 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -29,8 +29,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, system.RegisterHostThread(); // Wait for first GPU command before acquiring the window context - while (state.queue.Empty()) - ; + state.queue.Wait(); // If emulation was stopped during disk shader loading, abort before trying to acquire context if (!state.is_running) { From 5145133a604f626c05f832465ac22019b003c32a Mon Sep 17 00:00:00 2001 From: Markus Wick Date: Wed, 7 Apr 2021 08:42:54 +0200 Subject: [PATCH 2/4] video_core/gpu_thread: Implement a ShutDown method. This was implicitly done by `is_powered_on = false`, however the explicit method allows us to block until the GPU is actually gone. This should fix a race condition while removing the other subsystems while the GPU is still active. --- src/core/core.cpp | 2 +- src/video_core/gpu.cpp | 4 ++-- src/video_core/gpu.h | 4 ++-- src/video_core/gpu_thread.cpp | 26 ++++++++++++++++++-------- src/video_core/gpu_thread.h | 7 +++++-- 5 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/core/core.cpp b/src/core/core.cpp index 305f56ff1c..56b47e671c 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -296,7 +296,7 @@ struct System::Impl { exit_lock = false; if (gpu_core) { - gpu_core->WaitIdle(); + gpu_core->ShutDown(); } services.reset(); diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index c61f446196..009c6f5748 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -517,8 +517,8 @@ void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); } -void GPU::WaitIdle() const { - gpu_thread.WaitIdle(); +void GPU::ShutDown() { + gpu_thread.ShutDown(); } void GPU::OnCommandListEnd() { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index b2ee454964..ecab35d3bf 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -219,8 +219,8 @@ public: return *shader_notify; } - // Waits for the GPU to finish working - void WaitIdle() const; + // Stops the GPU execution and waits for the GPU to finish working + void ShutDown(); /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame. void WaitFence(u32 syncpoint_id, u32 value); diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index cd59a7faf1..6b8f06f780 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -68,13 +68,7 @@ ThreadManager::ThreadManager(Core::System& system_, bool is_async_) : system{system_}, is_async{is_async_} {} ThreadManager::~ThreadManager() { - if (!thread.joinable()) { - return; - } - - // Notify GPU thread that a shutdown is pending - PushCommand(EndProcessingCommand()); - thread.join(); + ShutDown(); } void ThreadManager::StartThread(VideoCore::RendererBase& renderer, @@ -132,10 +126,26 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { void ThreadManager::WaitIdle() const { while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) && - system.IsPoweredOn()) { + state.is_running) { } } +void ThreadManager::ShutDown() { + if (!state.is_running) { + return; + } + + state.is_running = false; + + if (!thread.joinable()) { + return; + } + + // Notify GPU thread that a shutdown is pending + PushCommand(EndProcessingCommand()); + thread.join(); +} + void ThreadManager::OnCommandListEnd() { PushCommand(OnCommandListEndCommand()); } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 18269e51c4..d384164de2 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -132,8 +132,8 @@ public: /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(VAddr addr, u64 size); - // Wait until the gpu thread is idle. - void WaitIdle() const; + // Stops the GPU execution and waits for the GPU to finish working + void ShutDown(); void OnCommandListEnd(); @@ -141,6 +141,9 @@ private: /// Pushes a command to be executed by the GPU thread u64 PushCommand(CommandData&& command_data); + // Wait until the gpu thread is idle. + void WaitIdle() const; + Core::System& system; const bool is_async; VideoCore::RasterizerInterface* rasterizer = nullptr; From e6fb49fa4bb2864702abcefc14f6bb62eaba7a7e Mon Sep 17 00:00:00 2001 From: Markus Wick Date: Wed, 7 Apr 2021 13:57:49 +0200 Subject: [PATCH 3/4] video_core/gpu_thread: Keep the write lock for allocating the fence. Else the fence might get submited out-of-order into the queue, which makes testing them pointless. Overhead should be tiny as the mutex is just moved from the queue to the writing code. --- src/video_core/gpu_thread.cpp | 2 ++ src/video_core/gpu_thread.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 6b8f06f780..9488bf5444 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -151,11 +151,13 @@ void ThreadManager::OnCommandListEnd() { } u64 ThreadManager::PushCommand(CommandData&& command_data) { + std::unique_lock lk(state.write_lock); const u64 fence{++state.last_fence}; state.queue.Push(CommandDataContainer(std::move(command_data), fence)); if (!is_async) { // In synchronous GPU mode, block the caller until the command has executed + lk.unlock(); WaitIdle(); } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index d384164de2..cb901c22a8 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -101,7 +101,8 @@ struct CommandDataContainer { struct SynchState final { std::atomic_bool is_running{true}; - using CommandQueue = Common::MPSCQueue; + using CommandQueue = Common::SPSCQueue; + std::mutex write_lock; CommandQueue queue; u64 last_fence{}; std::atomic signaled_fence{}; From e8bd9aed8bf0f60455d0ae6a8f6f3abf92dd8305 Mon Sep 17 00:00:00 2001 From: Markus Wick Date: Wed, 7 Apr 2021 11:41:31 +0200 Subject: [PATCH 4/4] video_core: Use a CV for blocking commands. There is no need for a busy loop here. Let's just use a condition variable to save some power. --- src/video_core/gpu_thread.cpp | 45 +++++++++++++++++++++-------------- src/video_core/gpu_thread.h | 11 ++++----- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 9488bf5444..7addfbc7bb 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -56,11 +56,17 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, } else if (const auto* invalidate = std::get_if(&next.data)) { rasterizer->OnCPUWrite(invalidate->addr, invalidate->size); } else if (std::holds_alternative(next.data)) { - return; + ASSERT(state.is_running == false); } else { UNREACHABLE(); } state.signaled_fence.store(next.fence); + if (next.block) { + // We have to lock the write_lock to ensure that the condition_variable wait not get a + // race between the check and the lock itself. + std::lock_guard lk(state.write_lock); + state.cv.notify_all(); + } } } @@ -105,9 +111,8 @@ void ThreadManager::FlushRegion(VAddr addr, u64 size) { case Settings::GPUAccuracy::Extreme: { auto& gpu = system.GPU(); u64 fence = gpu.RequestFlush(addr, size); - PushCommand(GPUTickCommand()); - while (fence > gpu.CurrentFlushRequestFence()) { - } + PushCommand(GPUTickCommand(), true); + ASSERT(fence <= gpu.CurrentFlushRequestFence()); break; } default: @@ -124,18 +129,16 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { rasterizer->OnCPUWrite(addr, size); } -void ThreadManager::WaitIdle() const { - while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) && - state.is_running) { - } -} - void ThreadManager::ShutDown() { if (!state.is_running) { return; } - state.is_running = false; + { + std::lock_guard lk(state.write_lock); + state.is_running = false; + state.cv.notify_all(); + } if (!thread.joinable()) { return; @@ -150,15 +153,21 @@ void ThreadManager::OnCommandListEnd() { PushCommand(OnCommandListEndCommand()); } -u64 ThreadManager::PushCommand(CommandData&& command_data) { - std::unique_lock lk(state.write_lock); - const u64 fence{++state.last_fence}; - state.queue.Push(CommandDataContainer(std::move(command_data), fence)); - +u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) { if (!is_async) { // In synchronous GPU mode, block the caller until the command has executed - lk.unlock(); - WaitIdle(); + block = true; + } + + std::unique_lock lk(state.write_lock); + const u64 fence{++state.last_fence}; + state.queue.Push(CommandDataContainer(std::move(command_data), fence, block)); + + if (block) { + state.cv.wait(lk, [this, fence] { + return fence <= state.signaled_fence.load(std::memory_order_relaxed) || + !state.is_running; + }); } return fence; diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index cb901c22a8..11a648f38b 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -90,11 +90,12 @@ using CommandData = struct CommandDataContainer { CommandDataContainer() = default; - explicit CommandDataContainer(CommandData&& data_, u64 next_fence_) - : data{std::move(data_)}, fence{next_fence_} {} + explicit CommandDataContainer(CommandData&& data_, u64 next_fence_, bool block_) + : data{std::move(data_)}, fence{next_fence_}, block(block_) {} CommandData data; u64 fence{}; + bool block{}; }; /// Struct used to synchronize the GPU thread @@ -106,6 +107,7 @@ struct SynchState final { CommandQueue queue; u64 last_fence{}; std::atomic signaled_fence{}; + std::condition_variable cv; }; /// Class used to manage the GPU thread @@ -140,10 +142,7 @@ public: private: /// Pushes a command to be executed by the GPU thread - u64 PushCommand(CommandData&& command_data); - - // Wait until the gpu thread is idle. - void WaitIdle() const; + u64 PushCommand(CommandData&& command_data, bool block = false); Core::System& system; const bool is_async;