From 6c7eb81f7d871f5c08a4844471633a67725aae73 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 4 Jan 2023 22:05:20 -0500 Subject: [PATCH 1/4] video_core: Cache GPU internal writes. --- src/video_core/CMakeLists.txt | 1 + src/video_core/engines/engine_upload.cpp | 2 +- src/video_core/engines/maxwell_3d.cpp | 7 +- src/video_core/engines/maxwell_dma.cpp | 17 ++-- src/video_core/invalidation_accumulator.h | 78 +++++++++++++++++++ src/video_core/memory_manager.cpp | 62 +++++++++++---- src/video_core/memory_manager.h | 17 +++- src/video_core/rasterizer_interface.h | 7 ++ .../renderer_vulkan/vk_rasterizer.cpp | 23 ++++++ .../renderer_vulkan/vk_rasterizer.h | 1 + 10 files changed, 185 insertions(+), 30 deletions(-) create mode 100644 src/video_core/invalidation_accumulator.h diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index aa271a3770..b7095ae133 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -85,6 +85,7 @@ add_library(video_core STATIC gpu.h gpu_thread.cpp gpu_thread.h + invalidation_accumulator.h memory_manager.cpp memory_manager.h precompiled_headers.h diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp index cea1dd8b0f..7f5a0c29d7 100644 --- a/src/video_core/engines/engine_upload.cpp +++ b/src/video_core/engines/engine_upload.cpp @@ -76,7 +76,7 @@ void State::ProcessData(std::span read_buffer) { regs.dest.height, regs.dest.depth, x_offset, regs.dest.y, x_elements, regs.line_count, regs.dest.BlockHeight(), regs.dest.BlockDepth(), regs.line_length_in); - memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); + memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size); } } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index fbfd1ddd24..97f5477897 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -485,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { } void Maxwell3D::ProcessQueryGet() { - // TODO(Subv): Support the other query units. - if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) { - LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented"); - } - switch (regs.report_semaphore.query.operation) { case Regs::ReportSemaphore::Operation::Release: if (regs.report_semaphore.query.short_query != 0) { @@ -649,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) { const GPUVAddr address{buffer_address + regs.const_buffer.offset}; const size_t copy_size = amount * sizeof(u32); - memory_manager.WriteBlock(address, start_base, copy_size); + memory_manager.WriteBlockCached(address, start_base, copy_size); // Increment the current buffer position. regs.const_buffer.offset += static_cast(copy_size); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 01f70ea9e5..7bf08e3e0d 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -69,7 +69,7 @@ void MaxwellDMA::Launch() { if (launch.multi_line_enable) { const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; - + memory_manager.FlushCaching(); if (!is_src_pitch && !is_dst_pitch) { // If both the source and the destination are in block layout, assert. CopyBlockLinearToBlockLinear(); @@ -104,6 +104,7 @@ void MaxwellDMA::Launch() { reinterpret_cast(tmp_buffer.data()), regs.line_length_in * sizeof(u32)); } else { + memory_manager.FlushCaching(); const auto convert_linear_2_blocklinear_addr = [](u64 address) { return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) | ((address & 0x180) >> 1) | ((address & 0x20) << 3); @@ -121,7 +122,7 @@ void MaxwellDMA::Launch() { memory_manager.ReadBlockUnsafe( convert_linear_2_blocklinear_addr(regs.offset_in + offset), tmp_buffer.data(), tmp_buffer.size()); - memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(), + memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(), tmp_buffer.size()); } } else if (is_src_pitch && !is_dst_pitch) { @@ -132,7 +133,7 @@ void MaxwellDMA::Launch() { for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(), tmp_buffer.size()); - memory_manager.WriteBlock( + memory_manager.WriteBlockCached( convert_linear_2_blocklinear_addr(regs.offset_out + offset), tmp_buffer.data(), tmp_buffer.size()); } @@ -141,7 +142,7 @@ void MaxwellDMA::Launch() { std::vector tmp_buffer(regs.line_length_in); memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in); - memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), + memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(), regs.line_length_in); } } @@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, regs.pitch_out); - memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::CopyPitchToBlockLinear() { @@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() { dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth, regs.pitch_in); - memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::FastCopyBlockLinearToPitch() { @@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() { regs.src_params.block_size.height, regs.src_params.block_size.depth, regs.pitch_out); - memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::CopyBlockLinearToBlockLinear() { @@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() { dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count, dst.block_size.height, dst.block_size.depth, pitch); - memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); + memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::ReleaseSemaphore() { diff --git a/src/video_core/invalidation_accumulator.h b/src/video_core/invalidation_accumulator.h new file mode 100644 index 0000000000..42420e31c8 --- /dev/null +++ b/src/video_core/invalidation_accumulator.h @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +#include "common/common_types.h" + +namespace VideoCommon { + +class InvalidationAccumulator { +public: + InvalidationAccumulator() = default; + ~InvalidationAccumulator() = default; + + void Add(GPUVAddr address, size_t size) { + const auto reset_values = [&]() { + if (has_collected) { + buffer.emplace_back(start_address, accumulated_size); + } + start_address = address; + accumulated_size = size; + last_collection = start_address + size; + }; + if (address >= start_address && address + size <= last_collection) [[likely]] { + return; + } + size = (address + size + atomicy_side_mask) & atomicy_mask - address; + address = address & atomicy_mask; + if (!has_collected) [[unlikely]] { + reset_values(); + has_collected = true; + return; + } + if (address != last_collection) [[unlikely]] { + reset_values(); + return; + } + accumulated_size += size; + last_collection += size; + } + + void Clear() { + buffer.clear(); + start_address = 0; + last_collection = 0; + has_collected = false; + } + + bool AnyAccumulated() const { + return has_collected; + } + + template + void Callback(Func&& func) { + if (!has_collected) { + return; + } + buffer.emplace_back(start_address, accumulated_size); + for (auto& [address, size] : buffer) { + func(address, size); + } + } + +private: + static constexpr size_t atomicy_bits = 5; + static constexpr size_t atomicy_size = 1ULL << atomicy_bits; + static constexpr size_t atomicy_side_mask = atomicy_size - 1; + static constexpr size_t atomicy_mask = ~atomicy_side_mask; + GPUVAddr start_address{}; + GPUVAddr last_collection{}; + size_t accumulated_size{}; + bool has_collected{}; + std::vector> buffer; +}; + +} // namespace VideoCommon diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 3a5cdeb39f..83924475b5 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -11,6 +11,7 @@ #include "core/hle/kernel/k_page_table.h" #include "core/hle/kernel/k_process.h" #include "core/memory.h" +#include "video_core/invalidation_accumulator.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" @@ -26,7 +27,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64 entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38, page_bits != big_page_bits ? page_bits : 0}, kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add( - 1, std::memory_order_acq_rel)} { + 1, std::memory_order_acq_rel)}, + accumulator{std::make_unique()} { address_space_size = 1ULL << address_space_bits; page_size = 1ULL << page_bits; page_mask = page_size - 1ULL; @@ -185,15 +187,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { if (size == 0) { return; } - const auto submapped_ranges = GetSubmappedRange(gpu_addr, size); + GetSubmappedRangeImpl(gpu_addr, size, page_stash); - for (const auto& [map_addr, map_size] : submapped_ranges) { - // Flush and invalidate through the GPU interface, to be asynchronous if possible. - const std::optional cpu_addr = GpuToCpuAddress(map_addr); - ASSERT(cpu_addr); - - rasterizer->UnmapMemory(*cpu_addr, map_size); + for (const auto& [map_addr, map_size] : page_stash) { + rasterizer->UnmapMemory(map_addr, map_size); } + page_stash.clear(); BigPageTableOp(gpu_addr, 0, size, PTEKind::INVALID); PageTableOp(gpu_addr, 0, size, PTEKind::INVALID); @@ -454,6 +453,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf WriteBlockImpl(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None); } +void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, + std::size_t size) { + WriteBlockImpl(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None); + accumulator->Add(gpu_dest_addr, size); +} + void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size, VideoCommon::CacheType which) const { auto do_nothing = [&]([[maybe_unused]] std::size_t page_index, @@ -663,7 +668,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons std::vector> MemoryManager::GetSubmappedRange( GPUVAddr gpu_addr, std::size_t size) const { std::vector> result{}; - std::optional> last_segment{}; + GetSubmappedRangeImpl(gpu_addr, size, result); + return result; +} + +template +void MemoryManager::GetSubmappedRangeImpl( + GPUVAddr gpu_addr, std::size_t size, + std::vector, std::size_t>>& + result) const { + std::optional, std::size_t>> + last_segment{}; std::optional old_page_addr{}; const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index, [[maybe_unused]] std::size_t offset, @@ -685,8 +700,12 @@ std::vector> MemoryManager::GetSubmappedRange( } old_page_addr = {cpu_addr_base + copy_amount}; if (!last_segment) { - const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset; - last_segment = {new_base_addr, copy_amount}; + if constexpr (is_gpu_address) { + const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset; + last_segment = {new_base_addr, copy_amount}; + } else { + last_segment = {cpu_addr_base, copy_amount}; + } } else { last_segment->second += copy_amount; } @@ -703,8 +722,12 @@ std::vector> MemoryManager::GetSubmappedRange( } old_page_addr = {cpu_addr_base + copy_amount}; if (!last_segment) { - const GPUVAddr new_base_addr = (page_index << page_bits) + offset; - last_segment = {new_base_addr, copy_amount}; + if constexpr (is_gpu_address) { + const GPUVAddr new_base_addr = (page_index << page_bits) + offset; + last_segment = {new_base_addr, copy_amount}; + } else { + last_segment = {cpu_addr_base, copy_amount}; + } } else { last_segment->second += copy_amount; } @@ -715,7 +738,18 @@ std::vector> MemoryManager::GetSubmappedRange( }; MemoryOperation(gpu_addr, size, extend_size_big, split, do_short_pages); split(0, 0, 0); - return result; +} + +void MemoryManager::FlushCaching() { + if (!accumulator->AnyAccumulated()) { + return; + } + accumulator->Callback([this](GPUVAddr addr, size_t size) { + GetSubmappedRangeImpl(addr, size, page_stash); + }); + rasterizer->InnerInvalidation(page_stash); + page_stash.clear(); + accumulator->Clear(); } } // namespace Tegra diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 828e13439e..e6de0d0cba 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -19,6 +19,10 @@ namespace VideoCore { class RasterizerInterface; } +namespace VideoCommon { +class InvalidationAccumulator; +} + namespace Core { class DeviceMemory; namespace Memory { @@ -80,6 +84,7 @@ public: */ void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); /** * Checks if a gpu region can be simply read with a pointer. @@ -102,7 +107,7 @@ public: * will be returned; */ std::vector> GetSubmappedRange(GPUVAddr gpu_addr, - std::size_t size) const; + std::size_t size) const; GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, PTEKind kind = PTEKind::INVALID, bool is_big_pages = true); @@ -129,6 +134,8 @@ public: size_t GetMemoryLayoutSize(GPUVAddr gpu_addr, size_t max_size = std::numeric_limits::max()) const; + void FlushCaching(); + private: template inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped, @@ -154,6 +161,12 @@ private: inline bool IsBigPageContinous(size_t big_page_index) const; inline void SetBigPageContinous(size_t big_page_index, bool value); + template + void GetSubmappedRangeImpl( + GPUVAddr gpu_addr, std::size_t size, + std::vector, std::size_t>>& + result) const; + Core::System& system; Core::Memory::Memory& memory; Core::DeviceMemory& device_memory; @@ -201,10 +214,12 @@ private: Common::VirtualBuffer big_page_table_cpu; std::vector big_page_continous; + std::vector> page_stash{}; constexpr static size_t continous_bits = 64; const size_t unique_identifier; + std::unique_ptr accumulator; static std::atomic unique_identifier_generator; }; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index f44c7df506..6b66ad7b60 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "common/common_types.h" #include "common/polyfill_thread.h" #include "video_core/cache_types.h" @@ -95,6 +96,12 @@ public: virtual void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0; + virtual void InnerInvalidation(std::span> sequences) { + for (const auto [cpu_addr, size] : sequences) { + InvalidateRegion(cpu_addr, size); + } + } + /// Notify rasterizer that any caches of the specified region are desync with guest virtual void OnCPUWrite(VAddr addr, u64 size) = 0; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 242bf9602a..6c4d745649 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); + gpu_memory->FlushCaching(); query_cache.UpdateCounters(); @@ -393,6 +394,7 @@ void RasterizerVulkan::Clear(u32 layer_count) { void RasterizerVulkan::DispatchCompute() { FlushWork(); + gpu_memory->FlushCaching(); ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()}; if (!pipeline) { @@ -481,6 +483,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache } } +void RasterizerVulkan::InnerInvalidation(std::span> sequences) { + { + std::scoped_lock lock{texture_cache.mutex}; + for (const auto [addr, size] : sequences) { + texture_cache.WriteMemory(addr, size); + } + } + { + std::scoped_lock lock{buffer_cache.mutex}; + for (const auto [addr, size] : sequences) { + buffer_cache.WriteMemory(addr, size); + } + } + { + for (const auto [addr, size] : sequences) { + query_cache.InvalidateRegion(addr, size); + pipeline_cache.InvalidateRegion(addr, size); + } + } +} + void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index c661e5b197..472cc64d91 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -79,6 +79,7 @@ public: VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; + void InnerInvalidation(std::span> sequences) override; void OnCPUWrite(VAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(VAddr addr, u64 size) override; From af5ecb0b15d4449f58434e70eed835cf71fc5527 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 5 Jan 2023 06:06:33 -0500 Subject: [PATCH 2/4] MemoryManager: use fastmem directly. --- src/core/memory.cpp | 2 +- src/video_core/memory_manager.cpp | 40 ++++++++++++++++++++++++------- src/video_core/memory_manager.h | 3 ++- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 26be74df4a..a1e41faffb 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -436,7 +436,7 @@ struct Memory::Impl { } if (Settings::IsFastmemEnabled()) { - const bool is_read_enable = Settings::IsGPULevelHigh() || !cached; + const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached; system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached); } diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 83924475b5..0a63900543 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -6,6 +6,7 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" +#include "common/settings.h" #include "core/core.h" #include "core/device_memory.h" #include "core/hle/kernel/k_page_table.h" @@ -45,6 +46,11 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64 big_page_table_cpu.resize(big_page_table_size); big_page_continous.resize(big_page_table_size / continous_bits, 0); entries.resize(page_table_size / 32, 0); + if (!Settings::IsGPULevelExtreme()) { + fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer(); + } else { + fastmem_arena = nullptr; + } } MemoryManager::~MemoryManager() = default; @@ -354,7 +360,7 @@ inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t si } } -template +template void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size, [[maybe_unused]] VideoCommon::CacheType which) const { auto set_to_zero = [&]([[maybe_unused]] std::size_t page_index, @@ -368,8 +374,12 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std: if constexpr (is_safe) { rasterizer->FlushRegion(cpu_addr_base, copy_amount, which); } - u8* physical = memory.GetPointer(cpu_addr_base); - std::memcpy(dest_buffer, physical, copy_amount); + if constexpr (use_fastmem) { + std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount); + } else { + u8* physical = memory.GetPointer(cpu_addr_base); + std::memcpy(dest_buffer, physical, copy_amount); + } dest_buffer = static_cast(dest_buffer) + copy_amount; }; auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) { @@ -378,11 +388,15 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std: if constexpr (is_safe) { rasterizer->FlushRegion(cpu_addr_base, copy_amount, which); } - if (!IsBigPageContinous(page_index)) [[unlikely]] { - memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount); + if constexpr (use_fastmem) { + std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount); } else { - u8* physical = memory.GetPointer(cpu_addr_base); - std::memcpy(dest_buffer, physical, copy_amount); + if (!IsBigPageContinous(page_index)) [[unlikely]] { + memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount); + } else { + u8* physical = memory.GetPointer(cpu_addr_base); + std::memcpy(dest_buffer, physical, copy_amount); + } } dest_buffer = static_cast(dest_buffer) + copy_amount; }; @@ -396,12 +410,20 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std: void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size, VideoCommon::CacheType which) const { - ReadBlockImpl(gpu_src_addr, dest_buffer, size, which); + if (fastmem_arena) [[likely]] { + ReadBlockImpl(gpu_src_addr, dest_buffer, size, which); + return; + } + ReadBlockImpl(gpu_src_addr, dest_buffer, size, which); } void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, const std::size_t size) const { - ReadBlockImpl(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None); + if (fastmem_arena) [[likely]] { + ReadBlockImpl(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None); + return; + } + ReadBlockImpl(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None); } template diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index e6de0d0cba..ff9e3c0b3a 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -141,7 +141,7 @@ private: inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped, FuncReserved&& func_reserved, FuncUnmapped&& func_unmapped) const; - template + template void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size, VideoCommon::CacheType which) const; @@ -215,6 +215,7 @@ private: std::vector big_page_continous; std::vector> page_stash{}; + u8* fastmem_arena{}; constexpr static size_t continous_bits = 64; From 2d0c4f2b1d8a069754725a2fb3597a7506e265a5 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 5 Jan 2023 06:43:28 -0500 Subject: [PATCH 3/4] Fermi2D: sync cache flushes --- src/video_core/engines/fermi_2d.cpp | 6 ++++-- src/video_core/engines/fermi_2d.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index e655e72543..a126c359cd 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -6,6 +6,7 @@ #include "common/microprofile.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/sw_blitter/blitter.h" +#include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/surface.h" #include "video_core/textures/decoders.h" @@ -20,8 +21,8 @@ namespace Tegra::Engines { using namespace Texture; -Fermi2D::Fermi2D(MemoryManager& memory_manager_) { - sw_blitter = std::make_unique(memory_manager_); +Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} { + sw_blitter = std::make_unique(memory_manager); // Nvidia's OpenGL driver seems to assume these values regs.src.depth = 1; regs.dst.depth = 1; @@ -104,6 +105,7 @@ void Fermi2D::Blit() { config.src_x0 = 0; } + memory_manager.FlushCaching(); if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) { sw_blitter->Blit(src, regs.dst, config); } diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 523fbdec25..705b323e15 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -305,6 +305,7 @@ public: private: VideoCore::RasterizerInterface* rasterizer = nullptr; std::unique_ptr sw_blitter; + MemoryManager& memory_manager; /// Performs the copy from the source surface to the destination surface as configured in the /// registers. From b56ad93bbc9ac38820c1e1cb4b03256dd50aa17a Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 5 Jan 2023 06:43:54 -0500 Subject: [PATCH 4/4] BufferBase: Don't ignore GPU pages. --- src/tests/video_core/buffer_base.cpp | 2 +- src/video_core/buffer_cache/buffer_base.h | 14 ++++++-------- src/video_core/engines/maxwell_dma.cpp | 4 ++-- src/video_core/invalidation_accumulator.h | 13 +++++++------ src/video_core/memory_manager.cpp | 2 +- src/video_core/memory_manager.h | 2 +- src/video_core/rasterizer_interface.h | 2 +- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 6 +++--- 8 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/tests/video_core/buffer_base.cpp b/src/tests/video_core/buffer_base.cpp index f7236afabf..5cd0628f22 100644 --- a/src/tests/video_core/buffer_base.cpp +++ b/src/tests/video_core/buffer_base.cpp @@ -538,7 +538,7 @@ TEST_CASE("BufferBase: Cached write downloads") { int num = 0; buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; }); buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); - REQUIRE(num == 0); + REQUIRE(num == 1); REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE)); buffer.FlushCachedWrites(); diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index 92d77eef2d..c47b7d8666 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -430,7 +430,7 @@ private: if (query_begin >= SizeBytes() || size < 0) { return; } - u64* const untracked_words = Array(); + [[maybe_unused]] u64* const untracked_words = Array(); u64* const state_words = Array(); const u64 query_end = query_begin + std::min(static_cast(size), SizeBytes()); u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; @@ -483,7 +483,7 @@ private: NotifyRasterizer(word_index, current_bits, ~u64{0}); } // Exclude CPU modified pages when visiting GPU pages - const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); + const u64 word = current_word; u64 page = page_begin; page_begin = 0; @@ -531,7 +531,7 @@ private: [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { static_assert(type != Type::Untracked); - const u64* const untracked_words = Array(); + [[maybe_unused]] const u64* const untracked_words = Array(); const u64* const state_words = Array(); const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 word_begin = offset / BYTES_PER_WORD; @@ -539,8 +539,7 @@ private: const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; + const u64 word = state_words[word_index]; if (word == 0) { continue; } @@ -564,7 +563,7 @@ private: [[nodiscard]] std::pair ModifiedRegion(u64 offset, u64 size) const noexcept { static_assert(type != Type::Untracked); - const u64* const untracked_words = Array(); + [[maybe_unused]] const u64* const untracked_words = Array(); const u64* const state_words = Array(); const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 word_begin = offset / BYTES_PER_WORD; @@ -574,8 +573,7 @@ private: u64 begin = std::numeric_limits::max(); u64 end = 0; for (u64 word_index = word_begin; word_index < word_end; ++word_index) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; + const u64 word = state_words[word_index]; if (word == 0) { continue; } diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 7bf08e3e0d..7762c7d96f 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -123,7 +123,7 @@ void MaxwellDMA::Launch() { convert_linear_2_blocklinear_addr(regs.offset_in + offset), tmp_buffer.data(), tmp_buffer.size()); memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(), - tmp_buffer.size()); + tmp_buffer.size()); } } else if (is_src_pitch && !is_dst_pitch) { UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); @@ -143,7 +143,7 @@ void MaxwellDMA::Launch() { memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in); memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(), - regs.line_length_in); + regs.line_length_in); } } } diff --git a/src/video_core/invalidation_accumulator.h b/src/video_core/invalidation_accumulator.h index 42420e31c8..2c2aaf7bb0 100644 --- a/src/video_core/invalidation_accumulator.h +++ b/src/video_core/invalidation_accumulator.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include "common/common_types.h" @@ -26,8 +27,8 @@ public: if (address >= start_address && address + size <= last_collection) [[likely]] { return; } - size = (address + size + atomicy_side_mask) & atomicy_mask - address; - address = address & atomicy_mask; + size = ((address + size + atomicity_size_mask) & atomicity_mask) - address; + address = address & atomicity_mask; if (!has_collected) [[unlikely]] { reset_values(); has_collected = true; @@ -64,10 +65,10 @@ public: } private: - static constexpr size_t atomicy_bits = 5; - static constexpr size_t atomicy_size = 1ULL << atomicy_bits; - static constexpr size_t atomicy_side_mask = atomicy_size - 1; - static constexpr size_t atomicy_mask = ~atomicy_side_mask; + static constexpr size_t atomicity_bits = 5; + static constexpr size_t atomicity_size = 1ULL << atomicity_bits; + static constexpr size_t atomicity_size_mask = atomicity_size - 1; + static constexpr size_t atomicity_mask = ~atomicity_size_mask; GPUVAddr start_address{}; GPUVAddr last_collection{}; size_t accumulated_size{}; diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 0a63900543..3bcae3503b 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -46,7 +46,7 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64 big_page_table_cpu.resize(big_page_table_size); big_page_continous.resize(big_page_table_size / continous_bits, 0); entries.resize(page_table_size / 32, 0); - if (!Settings::IsGPULevelExtreme()) { + if (!Settings::IsGPULevelExtreme() && Settings::IsFastmemEnabled()) { fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer(); } else { fastmem_arena = nullptr; diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index ff9e3c0b3a..2936364f09 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -107,7 +107,7 @@ public: * will be returned; */ std::vector> GetSubmappedRange(GPUVAddr gpu_addr, - std::size_t size) const; + std::size_t size) const; GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, PTEKind kind = PTEKind::INVALID, bool is_big_pages = true); diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 6b66ad7b60..1735b61645 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -97,7 +97,7 @@ public: VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0; virtual void InnerInvalidation(std::span> sequences) { - for (const auto [cpu_addr, size] : sequences) { + for (const auto& [cpu_addr, size] : sequences) { InvalidateRegion(cpu_addr, size); } } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 6c4d745649..ed4a721668 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -486,18 +486,18 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache void RasterizerVulkan::InnerInvalidation(std::span> sequences) { { std::scoped_lock lock{texture_cache.mutex}; - for (const auto [addr, size] : sequences) { + for (const auto& [addr, size] : sequences) { texture_cache.WriteMemory(addr, size); } } { std::scoped_lock lock{buffer_cache.mutex}; - for (const auto [addr, size] : sequences) { + for (const auto& [addr, size] : sequences) { buffer_cache.WriteMemory(addr, size); } } { - for (const auto [addr, size] : sequences) { + for (const auto& [addr, size] : sequences) { query_cache.InvalidateRegion(addr, size); pipeline_cache.InvalidateRegion(addr, size); }