Merge pull request #9559 from FernandoS27/cached-writes

VideoCore: Implement Cached Writes, use fastmem for reading GPU memory and eliminate old stuff
This commit is contained in:
Fernando S 2023-01-06 07:31:39 -05:00 committed by GitHub
commit 5bcbb8de45
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 233 additions and 53 deletions

View File

@ -436,7 +436,7 @@ struct Memory::Impl {
} }
if (Settings::IsFastmemEnabled()) { if (Settings::IsFastmemEnabled()) {
const bool is_read_enable = Settings::IsGPULevelHigh() || !cached; const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;
system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached); system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
} }

View File

@ -538,7 +538,7 @@ TEST_CASE("BufferBase: Cached write downloads") {
int num = 0; int num = 0;
buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; }); buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; });
buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
REQUIRE(num == 0); REQUIRE(num == 1);
REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE)); REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));
buffer.FlushCachedWrites(); buffer.FlushCachedWrites();

View File

@ -85,6 +85,7 @@ add_library(video_core STATIC
gpu.h gpu.h
gpu_thread.cpp gpu_thread.cpp
gpu_thread.h gpu_thread.h
invalidation_accumulator.h
memory_manager.cpp memory_manager.cpp
memory_manager.h memory_manager.h
precompiled_headers.h precompiled_headers.h

View File

@ -430,7 +430,7 @@ private:
if (query_begin >= SizeBytes() || size < 0) { if (query_begin >= SizeBytes() || size < 0) {
return; return;
} }
u64* const untracked_words = Array<Type::Untracked>(); [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();
u64* const state_words = Array<type>(); u64* const state_words = Array<type>();
const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
@ -483,7 +483,7 @@ private:
NotifyRasterizer<true>(word_index, current_bits, ~u64{0}); NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
} }
// Exclude CPU modified pages when visiting GPU pages // Exclude CPU modified pages when visiting GPU pages
const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); const u64 word = current_word;
u64 page = page_begin; u64 page = page_begin;
page_begin = 0; page_begin = 0;
@ -531,7 +531,7 @@ private:
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
static_assert(type != Type::Untracked); static_assert(type != Type::Untracked);
const u64* const untracked_words = Array<Type::Untracked>(); [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
const u64* const state_words = Array<type>(); const u64* const state_words = Array<type>();
const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 num_query_words = size / BYTES_PER_WORD + 1;
const u64 word_begin = offset / BYTES_PER_WORD; const u64 word_begin = offset / BYTES_PER_WORD;
@ -539,8 +539,7 @@ private:
const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; const u64 word = state_words[word_index];
const u64 word = state_words[word_index] & ~off_word;
if (word == 0) { if (word == 0) {
continue; continue;
} }
@ -564,7 +563,7 @@ private:
[[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
static_assert(type != Type::Untracked); static_assert(type != Type::Untracked);
const u64* const untracked_words = Array<Type::Untracked>(); [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
const u64* const state_words = Array<type>(); const u64* const state_words = Array<type>();
const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 num_query_words = size / BYTES_PER_WORD + 1;
const u64 word_begin = offset / BYTES_PER_WORD; const u64 word_begin = offset / BYTES_PER_WORD;
@ -574,8 +573,7 @@ private:
u64 begin = std::numeric_limits<u64>::max(); u64 begin = std::numeric_limits<u64>::max();
u64 end = 0; u64 end = 0;
for (u64 word_index = word_begin; word_index < word_end; ++word_index) { for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; const u64 word = state_words[word_index];
const u64 word = state_words[word_index] & ~off_word;
if (word == 0) { if (word == 0) {
continue; continue;
} }

View File

@ -76,7 +76,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
regs.dest.height, regs.dest.depth, x_offset, regs.dest.y, regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
x_elements, regs.line_count, regs.dest.BlockHeight(), x_elements, regs.line_count, regs.dest.BlockHeight(),
regs.dest.BlockDepth(), regs.line_length_in); regs.dest.BlockDepth(), regs.line_length_in);
memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
} }
} }

View File

@ -6,6 +6,7 @@
#include "common/microprofile.h" #include "common/microprofile.h"
#include "video_core/engines/fermi_2d.h" #include "video_core/engines/fermi_2d.h"
#include "video_core/engines/sw_blitter/blitter.h" #include "video_core/engines/sw_blitter/blitter.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/surface.h" #include "video_core/surface.h"
#include "video_core/textures/decoders.h" #include "video_core/textures/decoders.h"
@ -20,8 +21,8 @@ namespace Tegra::Engines {
using namespace Texture; using namespace Texture;
Fermi2D::Fermi2D(MemoryManager& memory_manager_) { Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} {
sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_); sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager);
// Nvidia's OpenGL driver seems to assume these values // Nvidia's OpenGL driver seems to assume these values
regs.src.depth = 1; regs.src.depth = 1;
regs.dst.depth = 1; regs.dst.depth = 1;
@ -104,6 +105,7 @@ void Fermi2D::Blit() {
config.src_x0 = 0; config.src_x0 = 0;
} }
memory_manager.FlushCaching();
if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) { if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
sw_blitter->Blit(src, regs.dst, config); sw_blitter->Blit(src, regs.dst, config);
} }

View File

@ -305,6 +305,7 @@ public:
private: private:
VideoCore::RasterizerInterface* rasterizer = nullptr; VideoCore::RasterizerInterface* rasterizer = nullptr;
std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter; std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;
MemoryManager& memory_manager;
/// Performs the copy from the source surface to the destination surface as configured in the /// Performs the copy from the source surface to the destination surface as configured in the
/// registers. /// registers.

View File

@ -485,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
} }
void Maxwell3D::ProcessQueryGet() { void Maxwell3D::ProcessQueryGet() {
// TODO(Subv): Support the other query units.
if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) {
LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented");
}
switch (regs.report_semaphore.query.operation) { switch (regs.report_semaphore.query.operation) {
case Regs::ReportSemaphore::Operation::Release: case Regs::ReportSemaphore::Operation::Release:
if (regs.report_semaphore.query.short_query != 0) { if (regs.report_semaphore.query.short_query != 0) {
@ -649,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {
const GPUVAddr address{buffer_address + regs.const_buffer.offset}; const GPUVAddr address{buffer_address + regs.const_buffer.offset};
const size_t copy_size = amount * sizeof(u32); const size_t copy_size = amount * sizeof(u32);
memory_manager.WriteBlock(address, start_base, copy_size); memory_manager.WriteBlockCached(address, start_base, copy_size);
// Increment the current buffer position. // Increment the current buffer position.
regs.const_buffer.offset += static_cast<u32>(copy_size); regs.const_buffer.offset += static_cast<u32>(copy_size);

View File

@ -69,7 +69,7 @@ void MaxwellDMA::Launch() {
if (launch.multi_line_enable) { if (launch.multi_line_enable) {
const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
memory_manager.FlushCaching();
if (!is_src_pitch && !is_dst_pitch) { if (!is_src_pitch && !is_dst_pitch) {
// If both the source and the destination are in block layout, assert. // If both the source and the destination are in block layout, assert.
CopyBlockLinearToBlockLinear(); CopyBlockLinearToBlockLinear();
@ -104,6 +104,7 @@ void MaxwellDMA::Launch() {
reinterpret_cast<u8*>(tmp_buffer.data()), reinterpret_cast<u8*>(tmp_buffer.data()),
regs.line_length_in * sizeof(u32)); regs.line_length_in * sizeof(u32));
} else { } else {
memory_manager.FlushCaching();
const auto convert_linear_2_blocklinear_addr = [](u64 address) { const auto convert_linear_2_blocklinear_addr = [](u64 address) {
return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) | return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
((address & 0x180) >> 1) | ((address & 0x20) << 3); ((address & 0x180) >> 1) | ((address & 0x20) << 3);
@ -121,8 +122,8 @@ void MaxwellDMA::Launch() {
memory_manager.ReadBlockUnsafe( memory_manager.ReadBlockUnsafe(
convert_linear_2_blocklinear_addr(regs.offset_in + offset), convert_linear_2_blocklinear_addr(regs.offset_in + offset),
tmp_buffer.data(), tmp_buffer.size()); tmp_buffer.data(), tmp_buffer.size());
memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(), memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
tmp_buffer.size()); tmp_buffer.size());
} }
} else if (is_src_pitch && !is_dst_pitch) { } else if (is_src_pitch && !is_dst_pitch) {
UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
@ -132,7 +133,7 @@ void MaxwellDMA::Launch() {
for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(), memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
tmp_buffer.size()); tmp_buffer.size());
memory_manager.WriteBlock( memory_manager.WriteBlockCached(
convert_linear_2_blocklinear_addr(regs.offset_out + offset), convert_linear_2_blocklinear_addr(regs.offset_out + offset),
tmp_buffer.data(), tmp_buffer.size()); tmp_buffer.data(), tmp_buffer.size());
} }
@ -141,8 +142,8 @@ void MaxwellDMA::Launch() {
std::vector<u8> tmp_buffer(regs.line_length_in); std::vector<u8> tmp_buffer(regs.line_length_in);
memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
regs.line_length_in); regs.line_length_in);
memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
regs.line_length_in); regs.line_length_in);
} }
} }
} }
@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
regs.pitch_out); regs.pitch_out);
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
} }
void MaxwellDMA::CopyPitchToBlockLinear() { void MaxwellDMA::CopyPitchToBlockLinear() {
@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth, dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
regs.pitch_in); regs.pitch_in);
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
} }
void MaxwellDMA::FastCopyBlockLinearToPitch() { void MaxwellDMA::FastCopyBlockLinearToPitch() {
@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
regs.src_params.block_size.height, regs.src_params.block_size.depth, regs.src_params.block_size.height, regs.src_params.block_size.depth,
regs.pitch_out); regs.pitch_out);
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
} }
void MaxwellDMA::CopyBlockLinearToBlockLinear() { void MaxwellDMA::CopyBlockLinearToBlockLinear() {
@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count, dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
dst.block_size.height, dst.block_size.depth, pitch); dst.block_size.height, dst.block_size.depth, pitch);
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
} }
void MaxwellDMA::ReleaseSemaphore() { void MaxwellDMA::ReleaseSemaphore() {

View File

@ -0,0 +1,79 @@
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <utility>
#include <vector>
#include "common/common_types.h"
namespace VideoCommon {
class InvalidationAccumulator {
public:
InvalidationAccumulator() = default;
~InvalidationAccumulator() = default;
void Add(GPUVAddr address, size_t size) {
const auto reset_values = [&]() {
if (has_collected) {
buffer.emplace_back(start_address, accumulated_size);
}
start_address = address;
accumulated_size = size;
last_collection = start_address + size;
};
if (address >= start_address && address + size <= last_collection) [[likely]] {
return;
}
size = ((address + size + atomicity_size_mask) & atomicity_mask) - address;
address = address & atomicity_mask;
if (!has_collected) [[unlikely]] {
reset_values();
has_collected = true;
return;
}
if (address != last_collection) [[unlikely]] {
reset_values();
return;
}
accumulated_size += size;
last_collection += size;
}
void Clear() {
buffer.clear();
start_address = 0;
last_collection = 0;
has_collected = false;
}
bool AnyAccumulated() const {
return has_collected;
}
template <typename Func>
void Callback(Func&& func) {
if (!has_collected) {
return;
}
buffer.emplace_back(start_address, accumulated_size);
for (auto& [address, size] : buffer) {
func(address, size);
}
}
private:
static constexpr size_t atomicity_bits = 5;
static constexpr size_t atomicity_size = 1ULL << atomicity_bits;
static constexpr size_t atomicity_size_mask = atomicity_size - 1;
static constexpr size_t atomicity_mask = ~atomicity_size_mask;
GPUVAddr start_address{};
GPUVAddr last_collection{};
size_t accumulated_size{};
bool has_collected{};
std::vector<std::pair<VAddr, size_t>> buffer;
};
} // namespace VideoCommon

View File

@ -6,11 +6,13 @@
#include "common/alignment.h" #include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/logging/log.h" #include "common/logging/log.h"
#include "common/settings.h"
#include "core/core.h" #include "core/core.h"
#include "core/device_memory.h" #include "core/device_memory.h"
#include "core/hle/kernel/k_page_table.h" #include "core/hle/kernel/k_page_table.h"
#include "core/hle/kernel/k_process.h" #include "core/hle/kernel/k_process.h"
#include "core/memory.h" #include "core/memory.h"
#include "video_core/invalidation_accumulator.h"
#include "video_core/memory_manager.h" #include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h" #include "video_core/renderer_base.h"
@ -26,7 +28,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38, entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38,
page_bits != big_page_bits ? page_bits : 0}, page_bits != big_page_bits ? page_bits : 0},
kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add( kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add(
1, std::memory_order_acq_rel)} { 1, std::memory_order_acq_rel)},
accumulator{std::make_unique<VideoCommon::InvalidationAccumulator>()} {
address_space_size = 1ULL << address_space_bits; address_space_size = 1ULL << address_space_bits;
page_size = 1ULL << page_bits; page_size = 1ULL << page_bits;
page_mask = page_size - 1ULL; page_mask = page_size - 1ULL;
@ -43,6 +46,11 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
big_page_table_cpu.resize(big_page_table_size); big_page_table_cpu.resize(big_page_table_size);
big_page_continous.resize(big_page_table_size / continous_bits, 0); big_page_continous.resize(big_page_table_size / continous_bits, 0);
entries.resize(page_table_size / 32, 0); entries.resize(page_table_size / 32, 0);
if (!Settings::IsGPULevelExtreme() && Settings::IsFastmemEnabled()) {
fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer();
} else {
fastmem_arena = nullptr;
}
} }
MemoryManager::~MemoryManager() = default; MemoryManager::~MemoryManager() = default;
@ -185,15 +193,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
if (size == 0) { if (size == 0) {
return; return;
} }
const auto submapped_ranges = GetSubmappedRange(gpu_addr, size); GetSubmappedRangeImpl<false>(gpu_addr, size, page_stash);
for (const auto& [map_addr, map_size] : submapped_ranges) { for (const auto& [map_addr, map_size] : page_stash) {
// Flush and invalidate through the GPU interface, to be asynchronous if possible. rasterizer->UnmapMemory(map_addr, map_size);
const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
ASSERT(cpu_addr);
rasterizer->UnmapMemory(*cpu_addr, map_size);
} }
page_stash.clear();
BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID); BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID); PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
@ -355,7 +360,7 @@ inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t si
} }
} }
template <bool is_safe> template <bool is_safe, bool use_fastmem>
void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size, void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
[[maybe_unused]] VideoCommon::CacheType which) const { [[maybe_unused]] VideoCommon::CacheType which) const {
auto set_to_zero = [&]([[maybe_unused]] std::size_t page_index, auto set_to_zero = [&]([[maybe_unused]] std::size_t page_index,
@ -369,8 +374,12 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
if constexpr (is_safe) { if constexpr (is_safe) {
rasterizer->FlushRegion(cpu_addr_base, copy_amount, which); rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
} }
u8* physical = memory.GetPointer(cpu_addr_base); if constexpr (use_fastmem) {
std::memcpy(dest_buffer, physical, copy_amount); std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
} else {
u8* physical = memory.GetPointer(cpu_addr_base);
std::memcpy(dest_buffer, physical, copy_amount);
}
dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount; dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
}; };
auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) { auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
@ -379,11 +388,15 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
if constexpr (is_safe) { if constexpr (is_safe) {
rasterizer->FlushRegion(cpu_addr_base, copy_amount, which); rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
} }
if (!IsBigPageContinous(page_index)) [[unlikely]] { if constexpr (use_fastmem) {
memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount); std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
} else { } else {
u8* physical = memory.GetPointer(cpu_addr_base); if (!IsBigPageContinous(page_index)) [[unlikely]] {
std::memcpy(dest_buffer, physical, copy_amount); memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
} else {
u8* physical = memory.GetPointer(cpu_addr_base);
std::memcpy(dest_buffer, physical, copy_amount);
}
} }
dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount; dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
}; };
@ -397,12 +410,20 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size, void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
VideoCommon::CacheType which) const { VideoCommon::CacheType which) const {
ReadBlockImpl<true>(gpu_src_addr, dest_buffer, size, which); if (fastmem_arena) [[likely]] {
ReadBlockImpl<true, true>(gpu_src_addr, dest_buffer, size, which);
return;
}
ReadBlockImpl<true, false>(gpu_src_addr, dest_buffer, size, which);
} }
void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
const std::size_t size) const { const std::size_t size) const {
ReadBlockImpl<false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None); if (fastmem_arena) [[likely]] {
ReadBlockImpl<false, true>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
return;
}
ReadBlockImpl<false, false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
} }
template <bool is_safe> template <bool is_safe>
@ -454,6 +475,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf
WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None); WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
} }
/// Writes `size` bytes from `src_buffer` to `gpu_dest_addr` through WriteBlockImpl<false> (the
/// same instantiation WriteBlockUnsafe uses) and records the written range in the invalidation
/// accumulator so it can be reported later by FlushCaching().
void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer,
                                     std::size_t size) {
    const auto cache_type = VideoCommon::CacheType::None;
    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, cache_type);
    // Remember the range; nothing is reported to the rasterizer at this point.
    accumulator->Add(gpu_dest_addr, size);
}
void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size, void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size,
VideoCommon::CacheType which) const { VideoCommon::CacheType which) const {
auto do_nothing = [&]([[maybe_unused]] std::size_t page_index, auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
@ -663,7 +690,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons
std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange( std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
GPUVAddr gpu_addr, std::size_t size) const { GPUVAddr gpu_addr, std::size_t size) const {
std::vector<std::pair<GPUVAddr, std::size_t>> result{}; std::vector<std::pair<GPUVAddr, std::size_t>> result{};
std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{}; GetSubmappedRangeImpl<true>(gpu_addr, size, result);
return result;
}
template <bool is_gpu_address>
void MemoryManager::GetSubmappedRangeImpl(
GPUVAddr gpu_addr, std::size_t size,
std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
result) const {
std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>
last_segment{};
std::optional<VAddr> old_page_addr{}; std::optional<VAddr> old_page_addr{};
const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index, const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index,
[[maybe_unused]] std::size_t offset, [[maybe_unused]] std::size_t offset,
@ -685,8 +722,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
} }
old_page_addr = {cpu_addr_base + copy_amount}; old_page_addr = {cpu_addr_base + copy_amount};
if (!last_segment) { if (!last_segment) {
const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset; if constexpr (is_gpu_address) {
last_segment = {new_base_addr, copy_amount}; const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
last_segment = {new_base_addr, copy_amount};
} else {
last_segment = {cpu_addr_base, copy_amount};
}
} else { } else {
last_segment->second += copy_amount; last_segment->second += copy_amount;
} }
@ -703,8 +744,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
} }
old_page_addr = {cpu_addr_base + copy_amount}; old_page_addr = {cpu_addr_base + copy_amount};
if (!last_segment) { if (!last_segment) {
const GPUVAddr new_base_addr = (page_index << page_bits) + offset; if constexpr (is_gpu_address) {
last_segment = {new_base_addr, copy_amount}; const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
last_segment = {new_base_addr, copy_amount};
} else {
last_segment = {cpu_addr_base, copy_amount};
}
} else { } else {
last_segment->second += copy_amount; last_segment->second += copy_amount;
} }
@ -715,7 +760,18 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
}; };
MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages); MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages);
split(0, 0, 0); split(0, 0, 0);
return result; }
// Reports every range recorded by WriteBlockCached() to the rasterizer and resets the
// accumulator.
void MemoryManager::FlushCaching() {
// Nothing was recorded since the last flush.
if (!accumulator->AnyAccumulated()) {
return;
}
// Split each accumulated GPU range into its mapped sub-ranges; with is_gpu_address = false the
// segments collected in page_stash start at the CPU address of each mapping.
accumulator->Callback([this](GPUVAddr addr, size_t size) {
GetSubmappedRangeImpl<false>(addr, size, page_stash);
});
// Hand all collected segments to the rasterizer in a single call.
rasterizer->InnerInvalidation(page_stash);
// Empty both the scratch list and the accumulator for the next batch of writes.
page_stash.clear();
accumulator->Clear();
} }
} // namespace Tegra } // namespace Tegra

View File

@ -19,6 +19,10 @@ namespace VideoCore {
class RasterizerInterface; class RasterizerInterface;
} }
namespace VideoCommon {
class InvalidationAccumulator;
}
namespace Core { namespace Core {
class DeviceMemory; class DeviceMemory;
namespace Memory { namespace Memory {
@ -80,6 +84,7 @@ public:
*/ */
void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
/** /**
* Checks if a gpu region can be simply read with a pointer. * Checks if a gpu region can be simply read with a pointer.
@ -129,12 +134,14 @@ public:
size_t GetMemoryLayoutSize(GPUVAddr gpu_addr, size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
size_t max_size = std::numeric_limits<size_t>::max()) const; size_t max_size = std::numeric_limits<size_t>::max()) const;
void FlushCaching();
private: private:
template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped> template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped, inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
FuncReserved&& func_reserved, FuncUnmapped&& func_unmapped) const; FuncReserved&& func_reserved, FuncUnmapped&& func_unmapped) const;
template <bool is_safe> template <bool is_safe, bool use_fastmem>
void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size, void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
VideoCommon::CacheType which) const; VideoCommon::CacheType which) const;
@ -154,6 +161,12 @@ private:
inline bool IsBigPageContinous(size_t big_page_index) const; inline bool IsBigPageContinous(size_t big_page_index) const;
inline void SetBigPageContinous(size_t big_page_index, bool value); inline void SetBigPageContinous(size_t big_page_index, bool value);
template <bool is_gpu_address>
void GetSubmappedRangeImpl(
GPUVAddr gpu_addr, std::size_t size,
std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
result) const;
Core::System& system; Core::System& system;
Core::Memory::Memory& memory; Core::Memory::Memory& memory;
Core::DeviceMemory& device_memory; Core::DeviceMemory& device_memory;
@ -201,10 +214,13 @@ private:
Common::VirtualBuffer<u32> big_page_table_cpu; Common::VirtualBuffer<u32> big_page_table_cpu;
std::vector<u64> big_page_continous; std::vector<u64> big_page_continous;
std::vector<std::pair<VAddr, std::size_t>> page_stash{};
u8* fastmem_arena{};
constexpr static size_t continous_bits = 64; constexpr static size_t continous_bits = 64;
const size_t unique_identifier; const size_t unique_identifier;
std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;
static std::atomic<size_t> unique_identifier_generator; static std::atomic<size_t> unique_identifier_generator;
}; };

View File

@ -6,6 +6,7 @@
#include <functional> #include <functional>
#include <optional> #include <optional>
#include <span> #include <span>
#include <utility>
#include "common/common_types.h" #include "common/common_types.h"
#include "common/polyfill_thread.h" #include "common/polyfill_thread.h"
#include "video_core/cache_types.h" #include "video_core/cache_types.h"
@ -95,6 +96,12 @@ public:
virtual void InvalidateRegion(VAddr addr, u64 size, virtual void InvalidateRegion(VAddr addr, u64 size,
VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0; VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;
/// Invalidates a batch of regions given as (address, size) pairs.
/// The default implementation forwards every pair to InvalidateRegion, which uses its default
/// cache type.
virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
    for (const auto& sequence : sequences) {
        InvalidateRegion(sequence.first, sequence.second);
    }
}
/// Notify rasterizer that any caches of the specified region are desync with guest /// Notify rasterizer that any caches of the specified region are desync with guest
virtual void OnCPUWrite(VAddr addr, u64 size) = 0; virtual void OnCPUWrite(VAddr addr, u64 size) = 0;

View File

@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
SCOPE_EXIT({ gpu.TickWork(); }); SCOPE_EXIT({ gpu.TickWork(); });
FlushWork(); FlushWork();
gpu_memory->FlushCaching();
query_cache.UpdateCounters(); query_cache.UpdateCounters();
@ -393,6 +394,7 @@ void RasterizerVulkan::Clear(u32 layer_count) {
void RasterizerVulkan::DispatchCompute() { void RasterizerVulkan::DispatchCompute() {
FlushWork(); FlushWork();
gpu_memory->FlushCaching();
ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()}; ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()};
if (!pipeline) { if (!pipeline) {
@ -481,6 +483,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache
} }
} }
/// Applies a batch of (address, size) ranges to the caches: the texture cache and the buffer
/// cache are each updated in one pass while holding that cache's mutex, then the query and
/// pipeline caches are invalidated range by range.
void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
    {
        // Take the texture cache lock once for the whole batch.
        std::scoped_lock texture_guard{texture_cache.mutex};
        for (const auto& range : sequences) {
            texture_cache.WriteMemory(range.first, range.second);
        }
    }
    {
        // Likewise for the buffer cache.
        std::scoped_lock buffer_guard{buffer_cache.mutex};
        for (const auto& range : sequences) {
            buffer_cache.WriteMemory(range.first, range.second);
        }
    }
    // No lock is taken in this function for the query and pipeline caches.
    for (const auto& range : sequences) {
        query_cache.InvalidateRegion(range.first, range.second);
        pipeline_cache.InvalidateRegion(range.first, range.second);
    }
}
void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
if (addr == 0 || size == 0) { if (addr == 0 || size == 0) {
return; return;

View File

@ -79,6 +79,7 @@ public:
VideoCommon::CacheType which = VideoCommon::CacheType::All) override; VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
void InvalidateRegion(VAddr addr, u64 size, void InvalidateRegion(VAddr addr, u64 size,
VideoCommon::CacheType which = VideoCommon::CacheType::All) override; VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override;
void OnCPUWrite(VAddr addr, u64 size) override; void OnCPUWrite(VAddr addr, u64 size) override;
void InvalidateGPUCache() override; void InvalidateGPUCache() override;
void UnmapMemory(VAddr addr, u64 size) override; void UnmapMemory(VAddr addr, u64 size) override;