DMA & InlineToMemory Engines Rework.

This commit is contained in:
bunnei 2022-08-14 02:36:36 -07:00 committed by Fernando Sahmkow
parent b2099fbdcc
commit f5fd6b5c86
21 changed files with 323 additions and 242 deletions

View File

@ -24,4 +24,12 @@ template <class ForwardIt, class T, class Compare = std::less<>>
return first != last && !comp(value, *first) ? first : last;
}
template <typename T, typename Func, typename... Args>
T FoldRight(T initial_value, Func&& func, Args&&... args) {
T value{initial_value};
const auto high_func = [&value, &func]<typename T>(T x) { value = func(value, x); };
(std::invoke(high_func, std::forward<Args>(args)), ...);
return value;
}
} // namespace Common

View File

@ -126,7 +126,7 @@ public:
void DownloadMemory(VAddr cpu_addr, u64 size);
bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<u8> inlined_buffer);
bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
@ -1685,7 +1685,7 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
template <class P>
bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
std::span<u8> inlined_buffer) {
std::span<const u8> inlined_buffer) {
const bool is_dirty = IsRegionRegistered(dest_address, copy_size);
if (!is_dirty) {
return false;

View File

@ -3,6 +3,7 @@
#include <cstring>
#include "common/algorithm.h"
#include "common/assert.h"
#include "video_core/engines/engine_upload.h"
#include "video_core/memory_manager.h"
@ -34,21 +35,48 @@ void State::ProcessData(const u32 data, const bool is_last_call) {
if (!is_last_call) {
return;
}
ProcessData(inner_buffer);
}
void State::ProcessData(const u32* data, size_t num_data) {
std::span<const u8> read_buffer(reinterpret_cast<const u8*>(data), num_data * sizeof(u32));
ProcessData(read_buffer);
}
void State::ProcessData(std::span<const u8> read_buffer) {
const GPUVAddr address{regs.dest.Address()};
if (is_linear) {
rasterizer->AccelerateInlineToMemory(address, copy_size, inner_buffer);
if (regs.line_count == 1) {
rasterizer->AccelerateInlineToMemory(address, copy_size, read_buffer);
} else {
for (u32 line = 0; line < regs.line_count; ++line) {
const GPUVAddr dest_line = address + static_cast<size_t>(line) * regs.dest.pitch;
memory_manager.WriteBlockUnsafe(
dest_line, read_buffer.data() + static_cast<size_t>(line) * regs.line_length_in,
regs.line_length_in);
}
memory_manager.InvalidateRegion(address, regs.dest.pitch * regs.line_count);
}
} else {
UNIMPLEMENTED_IF(regs.dest.z != 0);
UNIMPLEMENTED_IF(regs.dest.depth != 1);
UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0);
UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0);
u32 width = regs.dest.width;
u32 x_elements = regs.line_length_in;
u32 x_offset = regs.dest.x;
const u32 bpp_shift = Common::FoldRight(
4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
width, x_elements, x_offset, static_cast<u32>(address));
width >>= bpp_shift;
x_elements >>= bpp_shift;
x_offset >>= bpp_shift;
const u32 bytes_per_pixel = 1U << bpp_shift;
const std::size_t dst_size = Tegra::Texture::CalculateSize(
true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0);
true, bytes_per_pixel, width, regs.dest.height, regs.dest.depth,
regs.dest.BlockHeight(), regs.dest.BlockDepth());
tmp_buffer.resize(dst_size);
memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
regs.dest.BlockHeight(), copy_size, inner_buffer.data(),
tmp_buffer.data());
Tegra::Texture::SwizzleSubrect(tmp_buffer, read_buffer, bytes_per_pixel, width,
regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
x_elements, regs.line_count, regs.dest.BlockHeight(),
regs.dest.BlockDepth(), regs.line_length_in);
memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
}
}

View File

@ -3,6 +3,7 @@
#pragma once
#include <span>
#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
@ -33,7 +34,7 @@ struct Registers {
u32 width;
u32 height;
u32 depth;
u32 z;
u32 layer;
u32 x;
u32 y;
@ -62,11 +63,14 @@ public:
void ProcessExec(bool is_linear_);
void ProcessData(u32 data, bool is_last_call);
void ProcessData(const u32* data, size_t num_data);
/// Binds a rasterizer to this engine.
void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
private:
void ProcessData(std::span<const u8> read_buffer);
u32 write_offset = 0;
u32 copy_size = 0;
std::vector<u8> inner_buffer;

View File

@ -36,8 +36,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
}
case KEPLER_COMPUTE_REG_INDEX(data_upload): {
upload_state.ProcessData(method_argument, is_last_call);
if (is_last_call) {
}
break;
}
case KEPLER_COMPUTE_REG_INDEX(launch):
@ -50,8 +48,15 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) {
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
switch (method) {
case KEPLER_COMPUTE_REG_INDEX(data_upload):
upload_state.ProcessData(base_start, static_cast<size_t>(amount));
return;
default:
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
}
break;
}
}

View File

@ -33,8 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
}
case KEPLERMEMORY_REG_INDEX(data): {
upload_state.ProcessData(method_argument, is_last_call);
if (is_last_call) {
}
break;
}
}
@ -42,8 +40,15 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) {
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
switch (method) {
case KEPLERMEMORY_REG_INDEX(data):
upload_state.ProcessData(base_start, static_cast<size_t>(amount));
return;
default:
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
}
break;
}
}

View File

@ -239,8 +239,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
return upload_state.ProcessExec(regs.exec_upload.linear != 0);
case MAXWELL3D_REG_INDEX(data_upload):
upload_state.ProcessData(argument, is_last_call);
if (is_last_call) {
}
return;
case MAXWELL3D_REG_INDEX(fragment_barrier):
return rasterizer->FragmentBarrier();
@ -316,6 +314,9 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15:
ProcessCBMultiData(base_start, amount);
break;
case MAXWELL3D_REG_INDEX(data_upload):
upload_state.ProcessData(base_start, static_cast<size_t>(amount));
return;
default:
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);

View File

@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/algorithm.h"
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
@ -54,8 +55,6 @@ void MaxwellDMA::Launch() {
const LaunchDMA& launch = regs.launch_dma;
ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE);
ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
ASSERT(regs.dst_params.origin.x == 0);
ASSERT(regs.dst_params.origin.y == 0);
const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
@ -121,12 +120,13 @@ void MaxwellDMA::CopyPitchToPitch() {
void MaxwellDMA::CopyBlockLinearToPitch() {
UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
UNIMPLEMENTED_IF(regs.src_params.layer != 0);
const bool is_remapping = regs.launch_dma.remap_enable != 0;
// Optimized path for micro copies.
const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X &&
if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X &&
regs.src_params.height > GOB_SIZE_Y) {
FastCopyBlockLinearToPitch();
return;
@ -134,10 +134,27 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
// Deswizzle the input and copy it over.
UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
const u32 bytes_per_pixel =
regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1;
const Parameters& src_params = regs.src_params;
const u32 width = src_params.width;
const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
u32 width = src_params.width;
u32 x_elements = regs.line_length_in;
u32 x_offset = src_params.origin.x;
u32 bpp_shift = 0U;
if (!is_remapping) {
bpp_shift = Common::FoldRight(
4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
width, x_elements, x_offset, static_cast<u32>(regs.offset_in));
width >>= bpp_shift;
x_elements >>= bpp_shift;
x_offset >>= bpp_shift;
}
const u32 bytes_per_pixel = base_bpp << bpp_shift;
const u32 height = src_params.height;
const u32 depth = src_params.depth;
const u32 block_height = src_params.block_size.height;
@ -155,30 +172,46 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel,
block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(),
read_buffer.data());
UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
regs.pitch_out);
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::CopyPitchToBlockLinear() {
UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
UNIMPLEMENTED_IF(regs.dst_params.layer != 0);
UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
const bool is_remapping = regs.launch_dma.remap_enable != 0;
const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
const auto& dst_params = regs.dst_params;
const u32 bytes_per_pixel =
regs.launch_dma.remap_enable ? regs.pitch_in / regs.line_length_in : 1;
const u32 width = dst_params.width;
const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
u32 width = dst_params.width;
u32 x_elements = regs.line_length_in;
u32 x_offset = dst_params.origin.x;
u32 bpp_shift = 0U;
if (!is_remapping) {
bpp_shift = Common::FoldRight(
4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
width, x_elements, x_offset, static_cast<u32>(regs.offset_out));
width >>= bpp_shift;
x_elements >>= bpp_shift;
x_offset >>= bpp_shift;
}
const u32 bytes_per_pixel = base_bpp << bpp_shift;
const u32 height = dst_params.height;
const u32 depth = dst_params.depth;
const u32 block_height = dst_params.block_size.height;
const u32 block_depth = dst_params.block_size.depth;
const size_t dst_size =
CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
const size_t dst_layer_size =
CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth);
const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
if (read_buffer.size() < src_size) {
@ -188,32 +221,23 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
write_buffer.resize(dst_size);
}
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
if (Settings::IsGPULevelExtreme()) {
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
} else {
memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size);
memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
}
// If the input is linear and the output is tiled, swizzle the input and copy it over.
if (regs.dst_params.block_size.depth > 0) {
ASSERT(dst_params.layer == 0);
SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height,
bytes_per_pixel, block_height, block_depth, dst_params.origin.x,
dst_params.origin.y, write_buffer.data(), read_buffer.data());
} else {
SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel,
write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(),
block_height, dst_params.origin.x, dst_params.origin.y);
}
SwizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
regs.pitch_in);
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::FastCopyBlockLinearToPitch() {
const u32 bytes_per_pixel =
regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1;
const u32 bytes_per_pixel = 1U;
const size_t src_size = GOB_SIZE;
const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
u32 pos_x = regs.src_params.origin.x;
@ -239,9 +263,10 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
}
UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width,
bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y,
write_buffer.data(), read_buffer.data());
UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width,
regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count,
regs.src_params.block_size.height, regs.src_params.block_size.depth,
regs.pitch_out);
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
}

View File

@ -189,10 +189,16 @@ public:
BitField<4, 3, Swizzle> dst_y;
BitField<8, 3, Swizzle> dst_z;
BitField<12, 3, Swizzle> dst_w;
BitField<0, 12, u32> dst_components_raw;
BitField<16, 2, u32> component_size_minus_one;
BitField<20, 2, u32> num_src_components_minus_one;
BitField<24, 2, u32> num_dst_components_minus_one;
};
Swizzle GetComponent(size_t i) {
const u32 raw = dst_components_raw;
return static_cast<Swizzle>((raw >> (i * 3)) & 0x7);
}
};
static_assert(sizeof(RemapConst) == 12);

View File

@ -156,8 +156,9 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {
const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
luma_buffer.resize(size);
Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
converted_frame_buf_addr, block_height, 0, 0);
std::span<const u8> frame_buff(converted_frame_buf_addr, 4 * width * height);
Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1,
0, 0, width, height, block_height, 0, width * 4);
host1x.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
} else {

View File

@ -462,6 +462,97 @@ void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const {
MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, flush_short_pages);
}
bool MemoryManager::IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const {
bool result = false;
auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
[[maybe_unused]] std::size_t offset,
[[maybe_unused]] std::size_t copy_amount) { return false; };
auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
const VAddr cpu_addr_base =
(static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount);
return result;
};
auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
const VAddr cpu_addr_base =
(static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount);
return result;
};
auto check_short_pages = [&](std::size_t page_index, std::size_t offset,
std::size_t copy_amount) {
GPUVAddr base = (page_index << big_page_bits) + offset;
MemoryOperation<false>(base, copy_amount, mapped_normal, do_nothing, do_nothing);
return result;
};
MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, check_short_pages);
return result;
}
size_t MemoryManager::MaxContinousRange(GPUVAddr gpu_addr, size_t size) const {
std::optional<VAddr> old_page_addr{};
size_t range_so_far = 0;
bool result{false};
auto fail = [&]([[maybe_unused]] std::size_t page_index, [[maybe_unused]] std::size_t offset,
std::size_t copy_amount) {
result = true;
return true;
};
auto short_check = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
const VAddr cpu_addr_base =
(static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
if (old_page_addr && *old_page_addr != cpu_addr_base) {
result = true;
return true;
}
range_so_far += copy_amount;
old_page_addr = {cpu_addr_base + copy_amount};
return false;
};
auto big_check = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
const VAddr cpu_addr_base =
(static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
if (old_page_addr && *old_page_addr != cpu_addr_base) {
return true;
}
range_so_far += copy_amount;
old_page_addr = {cpu_addr_base + copy_amount};
return false;
};
auto check_short_pages = [&](std::size_t page_index, std::size_t offset,
std::size_t copy_amount) {
GPUVAddr base = (page_index << big_page_bits) + offset;
MemoryOperation<false>(base, copy_amount, short_check, fail, fail);
return result;
};
MemoryOperation<true>(gpu_addr, size, big_check, fail, check_short_pages);
return range_so_far;
}
void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size) const {
auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
[[maybe_unused]] std::size_t offset,
[[maybe_unused]] std::size_t copy_amount) {};
auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
const VAddr cpu_addr_base =
(static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
};
auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
const VAddr cpu_addr_base =
(static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
};
auto invalidate_short_pages = [&](std::size_t page_index, std::size_t offset,
std::size_t copy_amount) {
GPUVAddr base = (page_index << big_page_bits) + offset;
MemoryOperation<false>(base, copy_amount, mapped_normal, do_nothing, do_nothing);
};
MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, invalidate_short_pages);
}
void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) {
std::vector<u8> tmp_buffer(size);
ReadBlock(gpu_src_addr, tmp_buffer.data(), size);

View File

@ -104,6 +104,12 @@ public:
void FlushRegion(GPUVAddr gpu_addr, size_t size) const;
void InvalidateRegion(GPUVAddr gpu_addr, size_t size) const;
bool IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const;
size_t MaxContinousRange(GPUVAddr gpu_addr, size_t size) const;
private:
template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,

View File

@ -129,7 +129,7 @@ public:
[[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0;
virtual void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
std::span<u8> memory) = 0;
std::span<const u8> memory) = 0;
/// Attempt to use a faster method to display the framebuffer to screen
[[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,

View File

@ -476,7 +476,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA()
}
void RasterizerOpenGL::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
std::span<u8> memory) {
std::span<const u8> memory) {
auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
if (!cpu_addr) [[unlikely]] {
gpu_memory->WriteBlock(address, memory.data(), copy_size);

View File

@ -99,7 +99,7 @@ public:
const Tegra::Engines::Fermi2D::Config& copy_config) override;
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
std::span<u8> memory) override;
std::span<const u8> memory) override;
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,

View File

@ -26,8 +26,6 @@
namespace Vulkan {
using Tegra::Texture::SWIZZLE_TABLE;
namespace {
constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;

View File

@ -548,7 +548,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA()
}
void RasterizerVulkan::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
std::span<u8> memory) {
std::span<const u8> memory) {
auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
if (!cpu_addr) [[unlikely]] {
gpu_memory->WriteBlock(address, memory.data(), copy_size);

View File

@ -95,7 +95,7 @@ public:
const Tegra::Engines::Fermi2D::Config& copy_config) override;
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
std::span<u8> memory) override;
std::span<const u8> memory) override;
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,

View File

@ -517,7 +517,6 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
const u32 host_bytes_per_layer = num_blocks_per_layer * bytes_per_block;
UNIMPLEMENTED_IF(info.tile_width_spacing > 0);
UNIMPLEMENTED_IF(copy.image_offset.x != 0);
UNIMPLEMENTED_IF(copy.image_offset.y != 0);
UNIMPLEMENTED_IF(copy.image_offset.z != 0);

View File

@ -89,6 +89,69 @@ void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32
}
}
template <bool TO_LINEAR, u32 BYTES_PER_PIXEL>
void SwizzleSubrectImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height,
u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 num_lines,
u32 block_height, u32 block_depth, u32 pitch_linear) {
// The origin of the transformation can be configured here, leave it as zero as the current API
// doesn't expose it.
static constexpr u32 origin_z = 0;
// We can configure here a custom pitch
// As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch.
const u32 pitch = pitch_linear;
const u32 stride = Common::AlignUpLog2(width * BYTES_PER_PIXEL, GOB_SIZE_X_SHIFT);
const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
const u32 slice_size =
Common::DivCeilLog2(height, block_height + GOB_SIZE_Y_SHIFT) * block_size;
const u32 block_height_mask = (1U << block_height) - 1;
const u32 block_depth_mask = (1U << block_depth) - 1;
const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth;
u32 unprocessed_lines = num_lines;
u32 extent_y = std::min(num_lines, height - origin_y);
for (u32 slice = 0; slice < depth; ++slice) {
const u32 z = slice + origin_z;
const u32 offset_z = (z >> block_depth) * slice_size +
((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height));
const u32 lines_in_y = std::min(unprocessed_lines, extent_y);
for (u32 line = 0; line < lines_in_y; ++line) {
const u32 y = line + origin_y;
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(y);
const u32 block_y = y >> GOB_SIZE_Y_SHIFT;
const u32 offset_y = (block_y >> block_height) * block_size +
((block_y & block_height_mask) << GOB_SIZE_SHIFT);
u32 swizzled_x = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL);
for (u32 column = 0; column < extent_x;
++column, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
const u32 x = (column + origin_x) * BYTES_PER_PIXEL;
const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
const u32 swizzled_offset = base_swizzled_offset + (swizzled_x | swizzled_y);
const u32 unswizzled_offset =
slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL;
u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
std::memcpy(dst, src, BYTES_PER_PIXEL);
}
}
unprocessed_lines -= lines_in_y;
if (unprocessed_lines == 0) {
return;
}
}
}
template <bool TO_LINEAR>
void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
@ -111,97 +174,6 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
}
}
template <u32 BYTES_PER_PIXEL>
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit,
u32 offset_x, u32 offset_y) {
const u32 block_height = 1U << block_height_bit;
const u32 image_width_in_gobs =
(swizzled_width * BYTES_PER_PIXEL + (GOB_SIZE_X - 1)) / GOB_SIZE_X;
for (u32 line = 0; line < subrect_height; ++line) {
const u32 dst_y = line + offset_y;
const u32 gob_address_y =
(dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(dst_y);
u32 swizzled_x = pdep<SWIZZLE_X_BITS>(offset_x * BYTES_PER_PIXEL);
for (u32 x = 0; x < subrect_width;
++x, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
const u32 dst_x = x + offset_x;
const u32 gob_address =
gob_address_y + (dst_x * BYTES_PER_PIXEL / GOB_SIZE_X) * GOB_SIZE * block_height;
const u32 swizzled_offset = gob_address + (swizzled_x | swizzled_y);
const u32 unswizzled_offset = line * source_pitch + x * BYTES_PER_PIXEL;
const u8* const source_line = unswizzled_data + unswizzled_offset;
u8* const dest_addr = swizzled_data + swizzled_offset;
std::memcpy(dest_addr, source_line, BYTES_PER_PIXEL);
}
}
}
template <u32 BYTES_PER_PIXEL>
void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 block_height,
u32 origin_x, u32 origin_y, u8* output, const u8* input) {
const u32 stride = width * BYTES_PER_PIXEL;
const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height);
const u32 block_height_mask = (1U << block_height) - 1;
const u32 x_shift = GOB_SIZE_SHIFT + block_height;
for (u32 line = 0; line < line_count; ++line) {
const u32 src_y = line + origin_y;
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(src_y);
const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT;
const u32 src_offset_y = (block_y >> block_height) * block_size +
((block_y & block_height_mask) << GOB_SIZE_SHIFT);
u32 swizzled_x = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL);
for (u32 column = 0; column < line_length_in;
++column, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
const u32 src_x = (column + origin_x) * BYTES_PER_PIXEL;
const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift;
const u32 swizzled_offset = src_offset_y + src_offset_x + (swizzled_x | swizzled_y);
const u32 unswizzled_offset = line * pitch + column * BYTES_PER_PIXEL;
std::memcpy(output + unswizzled_offset, input + swizzled_offset, BYTES_PER_PIXEL);
}
}
}
template <u32 BYTES_PER_PIXEL>
void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
u32 block_height, u32 block_depth, u32 origin_x, u32 origin_y, u8* output,
const u8* input) {
UNIMPLEMENTED_IF(origin_x > 0);
UNIMPLEMENTED_IF(origin_y > 0);
const u32 stride = width * BYTES_PER_PIXEL;
const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
const u32 block_height_mask = (1U << block_height) - 1;
const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth;
for (u32 line = 0; line < line_count; ++line) {
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(line);
const u32 block_y = line / GOB_SIZE_Y;
const u32 dst_offset_y =
(block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE;
u32 swizzled_x = 0;
for (u32 x = 0; x < line_length_in; ++x, incrpdep<SWIZZLE_X_BITS, 1>(swizzled_x)) {
const u32 dst_offset =
((x / GOB_SIZE_X) << x_shift) + dst_offset_y + (swizzled_x | swizzled_y);
const u32 src_offset = x * BYTES_PER_PIXEL + line * pitch;
std::memcpy(output + dst_offset, input + src_offset, BYTES_PER_PIXEL);
}
}
}
} // Anonymous namespace
void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
@ -218,15 +190,15 @@ void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_p
stride_alignment);
}
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,
u32 block_height_bit, u32 offset_x, u32 offset_y) {
void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y,
u32 block_height, u32 block_depth, u32 pitch_linear) {
switch (bytes_per_pixel) {
#define BPP_CASE(x) \
case x: \
return SwizzleSubrect<x>(subrect_width, subrect_height, source_pitch, swizzled_width, \
swizzled_data, unswizzled_data, block_height_bit, offset_x, \
offset_y);
return SwizzleSubrectImpl<true, x>(output, input, width, height, depth, origin_x, \
origin_y, extent_x, extent_y, block_height, \
block_depth, pitch_linear);
BPP_CASE(1)
BPP_CASE(2)
BPP_CASE(3)
@ -241,13 +213,15 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32
}
}
void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel,
u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) {
void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x,
u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear) {
switch (bytes_per_pixel) {
#define BPP_CASE(x) \
case x: \
return UnswizzleSubrect<x>(line_length_in, line_count, pitch, width, block_height, \
origin_x, origin_y, output, input);
return SwizzleSubrectImpl<false, x>(output, input, width, height, depth, origin_x, \
origin_y, extent_x, extent_y, block_height, \
block_depth, pitch_linear);
BPP_CASE(1)
BPP_CASE(2)
BPP_CASE(3)
@ -262,55 +236,6 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width,
}
}
void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
u32 origin_y, u8* output, const u8* input) {
switch (bytes_per_pixel) {
#define BPP_CASE(x) \
case x: \
return SwizzleSliceToVoxel<x>(line_length_in, line_count, pitch, width, height, \
block_height, block_depth, origin_x, origin_y, output, \
input);
BPP_CASE(1)
BPP_CASE(2)
BPP_CASE(3)
BPP_CASE(4)
BPP_CASE(6)
BPP_CASE(8)
BPP_CASE(12)
BPP_CASE(16)
#undef BPP_CASE
default:
ASSERT_MSG(false, "Invalid bytes_per_pixel={}", bytes_per_pixel);
}
}
void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
const u32 block_height_bit, const std::size_t copy_size, const u8* source_data,
u8* swizzle_data) {
const u32 block_height = 1U << block_height_bit;
const u32 image_width_in_gobs{(width + GOB_SIZE_X - 1) / GOB_SIZE_X};
std::size_t count = 0;
for (std::size_t y = dst_y; y < height && count < copy_size; ++y) {
const std::size_t gob_address_y =
(y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(static_cast<u32>(y));
u32 swizzled_x = pdep<SWIZZLE_X_BITS>(dst_x);
for (std::size_t x = dst_x; x < width && count < copy_size;
++x, incrpdep<SWIZZLE_X_BITS, 1>(swizzled_x)) {
const std::size_t gob_address =
gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height;
const std::size_t swizzled_offset = gob_address + (swizzled_x | swizzled_y);
const u8* source_line = source_data + count;
u8* dest_addr = swizzle_data + swizzled_offset;
count++;
*dest_addr = *source_line;
}
}
}
std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
u32 block_height, u32 block_depth) {
if (tiled) {

View File

@ -40,7 +40,6 @@ constexpr SwizzleTable MakeSwizzleTable() {
}
return table;
}
constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTable();
/// Unswizzles a block linear texture into linear memory.
void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
@ -57,34 +56,14 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
u32 block_height, u32 block_depth);
/// Copies an untiled subrectangle into a tiled surface.
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,
u32 block_height_bit, u32 offset_x, u32 offset_y);
void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y,
u32 block_height, u32 block_depth, u32 pitch_linear);
/// Copies a tiled subrectangle into a linear surface.
void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel,
u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input);
/// @brief Swizzles a 2D array of pixels into a 3D texture
/// @param line_length_in Number of pixels per line
/// @param line_count Number of lines
/// @param pitch Number of bytes per line
/// @param width Width of the swizzled texture
/// @param height Height of the swizzled texture
/// @param bytes_per_pixel Number of bytes used per pixel
/// @param block_height Block height shift
/// @param block_depth Block depth shift
/// @param origin_x Column offset in pixels of the swizzled texture
/// @param origin_y Row offset in pixels of the swizzled texture
/// @param output Pointer to the pixels of the swizzled texture
/// @param input Pointer to the 2D array of pixels used as input
/// @pre input and output points to an array large enough to hold the number of bytes used
void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
u32 origin_y, u8* output, const u8* input);
void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
std::size_t copy_size, const u8* source_data, u8* swizzle_data);
void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x,
u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear);
/// Obtains the offset of the gob for positions 'dst_x' & 'dst_y'
u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,