From e1144e364e61d51b6ea8f950f6defd69d1b9f3fe Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 02:47:44 +0100
Subject: [PATCH 01/21] citra-qt: Always show pica framebuffers as RGBA8.

We don't actually know yet how the framebuffer format is encoded, so just use what works.
---
 src/citra_qt/debugger/graphics_framebuffer.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/citra_qt/debugger/graphics_framebuffer.cpp b/src/citra_qt/debugger/graphics_framebuffer.cpp
index dd41c3880c..28d5519fb0 100644
--- a/src/citra_qt/debugger/graphics_framebuffer.cpp
+++ b/src/citra_qt/debugger/graphics_framebuffer.cpp
@@ -202,7 +202,8 @@ void GraphicsFramebufferWidget::OnUpdate()
         framebuffer_address = framebuffer.GetColorBufferPhysicalAddress();
         framebuffer_width = framebuffer.GetWidth();
         framebuffer_height = framebuffer.GetHeight();
-        framebuffer_format = static_cast<Format>(framebuffer.color_format);
+        // TODO: It's unknown how this format is actually specified
+        framebuffer_format = Format::RGBA8;
 
         break;
     }

From b03a97e0b86bd49111fffee22c5ab28c73d6d7bf Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 02:48:15 +0100
Subject: [PATCH 02/21] citra-qt: Fix displaying RGBA5551 framebuffers.

(not that it matters at the moment, because this code is not used yet)
---
 src/citra_qt/debugger/graphics_framebuffer.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/citra_qt/debugger/graphics_framebuffer.cpp b/src/citra_qt/debugger/graphics_framebuffer.cpp
index 28d5519fb0..4a45027b87 100644
--- a/src/citra_qt/debugger/graphics_framebuffer.cpp
+++ b/src/citra_qt/debugger/graphics_framebuffer.cpp
@@ -263,6 +263,10 @@ void GraphicsFramebufferWidget::OnUpdate()
                 u8 g = (value >> 6) & 0x1F;
                 u8 b = (value >> 1) & 0x1F;
                 u8 a = value & 1;
+                r = (r << 3) | (r >> 2);
+                g = (g << 3) | (g >> 2);
+                b = (b << 3) | (b >> 2);
+                a *= 255;
 
                 decoded_image.setPixel(x, y, qRgba(r, g, b, 255/*a*/));
             }

From b7a48c422aa7293525909ac7b32575bce8575bde Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 02:49:45 +0100
Subject: [PATCH 03/21] Pica/CommandProcessor: Add support for integer
 uniforms.

---
 src/video_core/command_processor.cpp | 13 +++++++++++++
 src/video_core/pica.h                | 10 +++++++++-
 src/video_core/vertex_shader.cpp     |  7 +++++++
 src/video_core/vertex_shader.h       |  1 +
 4 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 9602779f4e..9e1975ddbb 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -173,6 +173,19 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             break;
 
+        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[0], 0x2b1):
+        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[1], 0x2b2):
+        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[2], 0x2b3):
+        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[3], 0x2b4):
+        {
+            int index = (id - PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[0], 0x2b1));
+            auto values = registers.vs_int_uniforms[index];
+            VertexShader::GetIntUniform(index) = Math::Vec4<u8>(values.x, values.y, values.z, values.w);
+            LOG_ERROR(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x",
+                      index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value());
+            break;
+        }
+
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[0], 0x2c1):
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[1], 0x2c2):
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[2], 0x2c3):
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 38bac748cd..f518cc98b2 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -495,8 +495,14 @@ struct Regs {
     INSERT_PADDING_WORDS(0x51);
 
     BitField<0, 16, u32> vs_bool_uniforms;
+    union {
+        BitField< 0, 8, u32> x;
+        BitField< 8, 8, u32> y;
+        BitField<16, 8, u32> z;
+        BitField<24, 8, u32> w;
+    } vs_int_uniforms[4];
 
-    INSERT_PADDING_WORDS(0x9);
+    INSERT_PADDING_WORDS(0x5);
 
     // Offset to shader program entry point (in words)
     BitField<0, 16, u32> vs_main_offset;
@@ -625,6 +631,7 @@ struct Regs {
         ADD_FIELD(trigger_draw_indexed);
         ADD_FIELD(triangle_topology);
         ADD_FIELD(vs_bool_uniforms);
+        ADD_FIELD(vs_int_uniforms);
         ADD_FIELD(vs_main_offset);
         ADD_FIELD(vs_input_register_map);
         ADD_FIELD(vs_uniform_setup);
@@ -696,6 +703,7 @@ ASSERT_REG_POSITION(trigger_draw, 0x22e);
 ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
 ASSERT_REG_POSITION(triangle_topology, 0x25e);
 ASSERT_REG_POSITION(vs_bool_uniforms, 0x2b0);
+ASSERT_REG_POSITION(vs_int_uniforms, 0x2b1);
 ASSERT_REG_POSITION(vs_main_offset, 0x2ba);
 ASSERT_REG_POSITION(vs_input_register_map, 0x2bb);
 ASSERT_REG_POSITION(vs_uniform_setup, 0x2c0);
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index bed5081a0e..090ffd420a 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -30,6 +30,8 @@ static struct {
     Math::Vec4<float24> f[96];
 
     std::array<bool,16> b;
+
+    std::array<Math::Vec4<u8>,4> i;
 } shader_uniforms;
 
 // TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to!
@@ -57,6 +59,11 @@ bool& GetBoolUniform(u32 index)
     return shader_uniforms.b[index];
 }
 
+Math::Vec4<u8>& GetIntUniform(u32 index)
+{
+    return shader_uniforms.i[index];
+}
+
 const std::array<u32, 1024>& GetShaderBinary()
 {
     return shader_memory;
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
index af3fb2a2f0..3a68a34092 100644
--- a/src/video_core/vertex_shader.h
+++ b/src/video_core/vertex_shader.h
@@ -73,6 +73,7 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes);
 
 Math::Vec4<float24>& GetFloatUniform(u32 index);
 bool& GetBoolUniform(u32 index);
+Math::Vec4<u8>& GetIntUniform(u32 index);
 
 const std::array<u32, 1024>& GetShaderBinary();
 const std::array<u32, 1024>& GetSwizzlePatterns();

From 632655e292cc317f8a985747dda8883d3f785431 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 02:51:48 +0100
Subject: [PATCH 04/21] Pica: Fix A4, IA4 and IA8 texture formats.

Both IA4 and IA8 had their component order mixed up. Additionally, IA4 used the wrong number of nibbles per texel. A4 skipped every second texel.
---
 src/video_core/debug_utils/debug_utils.cpp | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 5921185a65..9c0fbc4538 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -389,13 +389,11 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
     {
         const u8* source_ptr = source + offset * 2;
 
-        // TODO: component order not verified
-
         if (disable_alpha) {
             // Show intensity as red, alpha as green
-            return { source_ptr[0], source_ptr[1], 0, 255 };
+            return { source_ptr[1], source_ptr[0], 0, 255 };
         } else {
-            return { source_ptr[0], source_ptr[0], source_ptr[0], source_ptr[1]};
+            return { source_ptr[1], source_ptr[1], source_ptr[1], source_ptr[0]};
         }
     }
 
@@ -418,12 +416,10 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::IA4:
     {
-        const u8* source_ptr = source + offset / 2;
+        const u8* source_ptr = source + offset;
 
-        // TODO: component order not verified
-
-        u8 i = (*source_ptr) & 0xF;
-        u8 a = ((*source_ptr) & 0xF0) >> 4;
+        u8 i = ((*source_ptr) & 0xF0) >> 4;
+        u8 a = (*source_ptr) & 0xF;
         a |= a << 4;
         i |= i << 4;
 
@@ -439,15 +435,13 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
     {
         const u8* source_ptr = source + offset / 2;
 
-        // TODO: component order not verified
-
         u8 a = (coarse_x % 2) ? ((*source_ptr)&0xF) : (((*source_ptr) & 0xF0) >> 4);
         a |= a << 4;
 
         if (disable_alpha) {
-            return { *source_ptr, *source_ptr, *source_ptr, 255 };
+            return { a, a, a, 255 };
         } else {
-            return { 0, 0, 0, *source_ptr };
+            return { 0, 0, 0, a };
         }
     }
 

From 36291bc3f6e051f561b24408f7d3642235a749c8 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 02:55:51 +0100
Subject: [PATCH 05/21] Pica: Add output merger definitions.

---
 src/video_core/pica.h | 57 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index f518cc98b2..4afda7b4b6 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -289,7 +289,7 @@ struct Regs {
     TevStageConfig tev_stage4;
     INSERT_PADDING_WORDS(0x3);
     TevStageConfig tev_stage5;
-    INSERT_PADDING_WORDS(0x13);
+    INSERT_PADDING_WORDS(0x3);
 
     const std::array<Regs::TevStageConfig,6> GetTevStages() const {
         return { tev_stage0, tev_stage1,
@@ -297,6 +297,59 @@ struct Regs {
                  tev_stage4, tev_stage5 };
     };
 
+    struct {
+        enum DepthFunc : u32 {
+            Always      = 1,
+            GreaterThan = 6,
+        };
+
+        union {
+            // If false, logic blending is used
+            BitField<8, 1, u32> alphablend_enable;
+        };
+
+        union {
+            enum BlendEquation : u32 {
+                Add = 0,
+            };
+
+            enum BlendFactor : u32 {
+                Zero = 0,
+                One = 1,
+
+                SourceAlpha = 6,
+                OneMinusSourceAlpha = 7,
+            };
+
+            BitField< 0, 8, BlendEquation> blend_equation_rgb;
+            BitField< 8, 8, BlendEquation> blend_equation_a;
+
+            BitField<16, 4, BlendFactor> factor_source_rgb;
+            BitField<20, 4, BlendFactor> factor_dest_rgb;
+
+            BitField<24, 4, BlendFactor> factor_source_a;
+            BitField<28, 4, BlendFactor> factor_dest_a;
+        } alpha_blending;
+
+        union {
+            enum Op {
+                Set = 4,
+            };
+
+            BitField<0, 4, Op> op;
+        } logic_op;
+
+        INSERT_PADDING_WORDS(0x4);
+
+        union {
+            BitField< 0, 1, u32> depth_test_enable;
+            BitField< 4, 3, DepthFunc> depth_test_func;
+            BitField<12, 1, u32> depth_write_enable;
+        };
+
+        INSERT_PADDING_WORDS(0x8);
+    } output_merger;
+
     struct {
         enum ColorFormat : u32 {
             RGBA8    = 0,
@@ -623,6 +676,7 @@ struct Regs {
         ADD_FIELD(tev_stage3);
         ADD_FIELD(tev_stage4);
         ADD_FIELD(tev_stage5);
+        ADD_FIELD(output_merger);
         ADD_FIELD(framebuffer);
         ADD_FIELD(vertex_attributes);
         ADD_FIELD(index_array);
@@ -695,6 +749,7 @@ ASSERT_REG_POSITION(tev_stage2, 0xd0);
 ASSERT_REG_POSITION(tev_stage3, 0xd8);
 ASSERT_REG_POSITION(tev_stage4, 0xf0);
 ASSERT_REG_POSITION(tev_stage5, 0xf8);
+ASSERT_REG_POSITION(output_merger, 0x100);
 ASSERT_REG_POSITION(framebuffer, 0x110);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
 ASSERT_REG_POSITION(index_array, 0x227);

From 77bb58afeb39344b7481f6f003a9beb6c7b87199 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 02:57:56 +0100
Subject: [PATCH 06/21] Pica/Rasterizer: Further enhance Tev support.

---
 src/video_core/rasterizer.cpp | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index a801488726..04ff686153 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -279,12 +279,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                auto GetColorModifier = [](ColorModifier factor, const Math::Vec4<u8>& values) -> Math::Vec3<u8> {
+                static auto GetColorModifier = [](ColorModifier factor, const Math::Vec4<u8>& values) -> Math::Vec3<u8> {
                     switch (factor)
                     {
                     case ColorModifier::SourceColor:
                         return values.rgb();
 
+                    case ColorModifier::OneMinusSourceColor:
+                        return (Math::Vec3<u8>(255, 255, 255) - values.rgb()).Cast<u8>();
+
                     case ColorModifier::SourceAlpha:
                         return { values.a(), values.a(), values.a() };
 
@@ -295,7 +298,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                auto GetAlphaModifier = [](AlphaModifier factor, u8 value) -> u8 {
+                static auto GetAlphaModifier = [](AlphaModifier factor, u8 value) -> u8 {
                     switch (factor) {
                     case AlphaModifier::SourceAlpha:
                         return value;
@@ -310,7 +313,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                auto ColorCombine = [](Operation op, const Math::Vec3<u8> input[3]) -> Math::Vec3<u8> {
+                static auto ColorCombine = [](Operation op, const Math::Vec3<u8> input[3]) -> Math::Vec3<u8> {
                     switch (op) {
                     case Operation::Replace:
                         return input[0];
@@ -330,6 +333,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     case Operation::Lerp:
                         return ((input[0] * input[2] + input[1] * (Math::MakeVec<u8>(255, 255, 255) - input[2]).Cast<u8>()) / 255).Cast<u8>();
 
+                    case Operation::Subtract:
+                    {
+                        auto result = input[0].Cast<int>() - input[1].Cast<int>();
+                        result.r() = std::max(0, result.r());
+                        result.g() = std::max(0, result.g());
+                        result.b() = std::max(0, result.b());
+                        return result.Cast<u8>();
+                    }
+
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
                         _dbg_assert_(HW_GPU, 0);
@@ -337,7 +349,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                auto AlphaCombine = [](Operation op, const std::array<u8,3>& input) -> u8 {
+                static auto AlphaCombine = [](Operation op, const std::array<u8,3>& input) -> u8 {
                     switch (op) {
                     case Operation::Replace:
                         return input[0];
@@ -351,6 +363,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     case Operation::Lerp:
                         return (input[0] * input[2] + input[1] * (255 - input[2])) / 255;
 
+                    case Operation::Subtract:
+                        return std::max(0, (int)input[0] - (int)input[1]);
+
                     default:
                         LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
                         _dbg_assert_(HW_GPU, 0);

From e229ff8c836fa213f1bdd31cabe924457f5e7e0c Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 02:59:08 +0100
Subject: [PATCH 07/21] Pica/Rasterizer: Implement depth testing.

---
 src/video_core/pica.h         |  1 +
 src/video_core/rasterizer.cpp | 39 +++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 4afda7b4b6..810a926c9e 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -300,6 +300,7 @@ struct Regs {
     struct {
         enum DepthFunc : u32 {
             Always      = 1,
+            LessThan    = 4,
             GreaterThan = 6,
         };
 
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 04ff686153..8dff2db275 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -396,12 +396,39 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                 combiner_output = Math::MakeVec(color_output, alpha_output);
             }
 
-            // TODO: Not sure if the multiplication by 65535 has already been taken care
-            // of when transforming to screen coordinates or not.
-            u16 z = (u16)(((float)v0.screenpos[2].ToFloat32() * w0 +
-                           (float)v1.screenpos[2].ToFloat32() * w1 +
-                           (float)v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
-            SetDepth(x >> 4, y >> 4, z);
+            // TODO: Does depth indeed only get written if depth testing is enabled?
+            if (registers.output_merger.depth_test_enable) {
+                u16 z = (u16)(-((float)v0.screenpos[2].ToFloat32() * w0 +
+                            (float)v1.screenpos[2].ToFloat32() * w1 +
+                            (float)v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
+                u16 ref_z = GetDepth(x >> 4, y >> 4);
+
+                bool pass = false;
+
+                switch (registers.output_merger.depth_test_func) {
+                case registers.output_merger.Always:
+                    pass = true;
+                    break;
+
+                case registers.output_merger.LessThan:
+                    pass = z < ref_z;
+                    break;
+
+                case registers.output_merger.GreaterThan:
+                    pass = z > ref_z;
+                    break;
+
+                default:
+                    LOG_ERROR(HW_GPU, "Unknown depth test function %x", registers.output_merger.depth_test_func.Value());
+                    break;
+                }
+
+                if (!pass)
+                    continue;
+
+                if (registers.output_merger.depth_write_enable)
+                    SetDepth(x >> 4, y >> 4, z);
+            }
 
             DrawPixel(x >> 4, y >> 4, combiner_output);
         }

From a7ae0330b1e4d5aa7fab3bb07bb2cf58f8572dc5 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 03:00:25 +0100
Subject: [PATCH 08/21] Pica/Rasterizer: Implement alpha blending.

---
 src/video_core/rasterizer.cpp | 84 +++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 8dff2db275..5f7971fe20 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -25,6 +25,18 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
 }
 
+static const Math::Vec4<u8> GetPixel(int x, int y) {
+    u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetColorBufferPhysicalAddress())));
+
+    u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
+    Math::Vec4<u8> ret;
+    ret.a() = value >> 24;
+    ret.r() = (value >> 16) & 0xFF;
+    ret.g() = (value >> 8) & 0xFF;
+    ret.b() = value & 0xFF;
+    return ret;
+ }
+
 static u32 GetDepth(int x, int y) {
     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetDepthBufferPhysicalAddress())));
 
@@ -430,6 +442,78 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     SetDepth(x >> 4, y >> 4, z);
             }
 
+            auto dest = GetPixel(x >> 4, y >> 4);
+
+            if (registers.output_merger.alphablend_enable) {
+                auto params = registers.output_merger.alpha_blending;
+
+                auto LookupFactorRGB = [&](decltype(params)::BlendFactor factor) -> Math::Vec3<u8> {
+                    switch(factor) {
+                    case params.Zero:
+                        return Math::Vec3<u8>(0, 0, 0);
+
+                    case params.One:
+                        return Math::Vec3<u8>(255, 255, 255);
+
+                    case params.SourceAlpha:
+                        return Math::MakeVec(combiner_output.a(), combiner_output.a(), combiner_output.a());
+
+                    case params.OneMinusSourceAlpha:
+                        return Math::Vec3<u8>(255-combiner_output.a(), 255-combiner_output.a(), 255-combiner_output.a());
+
+                    default:
+                        LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
+                        exit(0);
+                        break;
+                    }
+                };
+
+                auto LookupFactorA = [&](decltype(params)::BlendFactor factor) -> u8 {
+                    switch(factor) {
+                    case params.Zero:
+                        return 0;
+
+                    case params.One:
+                        return 255;
+
+                    case params.SourceAlpha:
+                        return combiner_output.a();
+
+                    case params.OneMinusSourceAlpha:
+                        return 255 - combiner_output.a();
+
+                    default:
+                        LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
+                        exit(0);
+                        break;
+                    }
+                };
+
+                auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
+                                               LookupFactorA(params.factor_source_a));
+                auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
+                                               LookupFactorA(params.factor_dest_a));
+
+                switch (params.blend_equation_rgb) {
+                case params.Add:
+                {
+                    auto result = (combiner_output * srcfactor + dest * dstfactor) / 255;
+                    result.r() = std::min(255, result.r());
+                    result.g() = std::min(255, result.g());
+                    result.b() = std::min(255, result.b());
+                    combiner_output = result.Cast<u8>();
+                    break;
+                }
+
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value());
+                    exit(0);
+                }
+            } else {
+                LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
+                exit(0);
+            }
+
             DrawPixel(x >> 4, y >> 4, combiner_output);
         }
     }

From 3da52ead9badab44257fce6e606873f6abc7dc6f Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 28 Dec 2014 23:33:59 +0100
Subject: [PATCH 09/21] Pica/DebugUtils: Fix a bug in RGBA4 texture decoding.

---
 src/video_core/debug_utils/debug_utils.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 9c0fbc4538..83d585d16e 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -375,9 +375,9 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
     {
         const u8* source_ptr = source + offset * 2;
         u8 r = source_ptr[1] >> 4;
-        u8 g = source_ptr[1] & 0xFF;
+        u8 g = source_ptr[1] & 0xF;
         u8 b = source_ptr[0] >> 4;
-        u8 a = source_ptr[0] & 0xFF;
+        u8 a = source_ptr[0] & 0xF;
         r = (r << 4) | r;
         g = (g << 4) | g;
         b = (b << 4) | b;

From 3b78af904e5a4f959ab206a207bd26441886c9a8 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 02:56:32 +0100
Subject: [PATCH 10/21] Pica/Rasterizer: Textures seem to be laid out flipped
 vertically.

Not sure if this is a correct fix. Probably should instead change the decoding logic itself.
---
 src/video_core/rasterizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 5f7971fe20..08b649fb6f 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -214,7 +214,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     }
                 };
                 s = GetWrappedTexCoord(registers.texture0.wrap_s, s, registers.texture0.width);
-                t = GetWrappedTexCoord(registers.texture0.wrap_t, t, registers.texture0.height);
+                t = registers.texture0.height - 1 - GetWrappedTexCoord(registers.texture0.wrap_t, t, registers.texture0.height);
 
                 u8* texture_data = Memory::GetPointer(PAddrToVAddr(texture.config.GetPhysicalAddress()));
                 auto info = DebugUtils::TextureInfo::FromPicaRegister(texture.config, texture.format);

From 0f494240228e24e21c88bf9f3178aaa68db4fb45 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 13 Dec 2014 21:39:42 +0100
Subject: [PATCH 11/21] Pica/Rasterizer: Implement backface culling.

---
 src/video_core/pica.h         | 16 +++++++++++++++-
 src/video_core/rasterizer.cpp | 30 +++++++++++++++++++++---------
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 810a926c9e..f5771ed844 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -50,7 +50,19 @@ struct Regs {
 
     u32 trigger_irq;
 
-    INSERT_PADDING_WORDS(0x30);
+    INSERT_PADDING_WORDS(0x2f);
+
+    enum class CullMode : u32 {
+        // Select which polygons are considered to be "frontfacing".
+        KeepAll              = 0,
+        KeepClockWise        = 1,
+        KeepCounterClockWise = 2,
+        // TODO: What does the third value imply?
+    };
+
+    union {
+        BitField<0, 2, CullMode> cull_mode;
+    };
 
     BitField<0, 24, u32> viewport_size_x;
 
@@ -659,6 +671,7 @@ struct Regs {
             } while(false)
 
         ADD_FIELD(trigger_irq);
+        ADD_FIELD(cull_mode);
         ADD_FIELD(viewport_size_x);
         ADD_FIELD(viewport_size_y);
         ADD_FIELD(viewport_depth_range);
@@ -730,6 +743,7 @@ private:
 #define ASSERT_REG_POSITION(field_name, position) static_assert(offsetof(Regs, field_name) == position * 4, "Field "#field_name" has invalid position")
 
 ASSERT_REG_POSITION(trigger_irq, 0x10);
+ASSERT_REG_POSITION(cull_mode, 0x40);
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
 ASSERT_REG_POSITION(viewport_depth_range, 0x4d);
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 08b649fb6f..9148745dc6 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -82,10 +82,31 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
                                              return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
                                          };
+    static auto orient2d = [](const Math::Vec2<Fix12P4>& vtx1,
+                              const Math::Vec2<Fix12P4>& vtx2,
+                              const Math::Vec2<Fix12P4>& vtx3) {
+        const auto vec1 = Math::MakeVec(vtx2 - vtx1, 0);
+        const auto vec2 = Math::MakeVec(vtx3 - vtx1, 0);
+        // TODO: There is a very small chance this will overflow for sizeof(int) == 4
+        return Math::Cross(vec1, vec2).z;
+    };
+
     Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
                                    ScreenToRasterizerCoordinates(v1.screenpos),
                                    ScreenToRasterizerCoordinates(v2.screenpos) };
 
+    if (registers.cull_mode == Regs::CullMode::KeepClockWise) {
+        // Reverse vertex order and use the CCW code path.
+        std::swap(vtxpos[1], vtxpos[2]);
+    }
+
+    if (registers.cull_mode != Regs::CullMode::KeepAll) {
+        // Cull away triangles which are wound clockwise.
+        // TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll
+        if (orient2d(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
+            return;
+    }
+
     // TODO: Proper scissor rect test!
     u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
     u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
@@ -128,15 +149,6 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
         for (u16 x = min_x; x < max_x; x += 0x10) {
 
             // Calculate the barycentric coordinates w0, w1 and w2
-            auto orient2d = [](const Math::Vec2<Fix12P4>& vtx1,
-                               const Math::Vec2<Fix12P4>& vtx2,
-                               const Math::Vec2<Fix12P4>& vtx3) {
-                const auto vec1 = Math::MakeVec(vtx2 - vtx1, 0);
-                const auto vec2 = Math::MakeVec(vtx3 - vtx1, 0);
-                // TODO: There is a very small chance this will overflow for sizeof(int) == 4
-                return Math::Cross(vec1, vec2).z;
-            };
-
             int w0 = bias0 + orient2d(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
             int w1 = bias1 + orient2d(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
             int w2 = bias2 + orient2d(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});

From 18a5e888bbeda989aba6576e7dd7e2b6c09b45ea Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 28 Dec 2014 23:28:58 +0100
Subject: [PATCH 12/21] GPU: Pseudo-implement horizontal scaling.

It's not really known how this actually works. Some testing has shown that this probably performs no filtering, and common usage in games suggests it's not actually resizing the image at all.
However, this patch does seem to fix some homebrew showing quasi-duplicated images while still keeping other applications in a working state.
---
 src/core/hw/gpu.cpp | 6 +++++-
 src/core/hw/gpu.h   | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index dd619cb16c..0ff6c6cde1 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -94,11 +94,15 @@ inline void Write(u32 addr, const T data) {
                         int r, g, b, a;
                     } source_color = { 0, 0, 0, 0 };
 
+                    // Cheap emulation of horizontal scaling: just skip every second pixel of the
+                    // input framebuffer. The step between sampled pixels is kept in pixel_skip.
+                    unsigned pixel_skip = (config.scale_horizontally != 0) ? 2 : 1;
+
                     switch (config.input_format) {
                     case Regs::PixelFormat::RGBA8:
                     {
                         // TODO: Most likely got the component order messed up.
-                        u8* srcptr = source_pointer + x * 4 + y * config.input_width * 4;
+                        u8* srcptr = source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip;
                         source_color.r = srcptr[0]; // blue
                         source_color.g = srcptr[1]; // green
                         source_color.b = srcptr[2]; // red
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index 292f496c1e..7de0552326 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -157,6 +157,9 @@ struct Regs {
             BitField< 8, 3, PixelFormat> input_format;
             BitField<12, 3, PixelFormat> output_format;
             BitField<16, 1, u32> output_tiled;     // stores output in a tiled format
+
+            // TODO: Not really sure if this actually scales, or even resizes at all.
+            BitField<24, 1, u32> scale_horizontally;
         };
 
         INSERT_PADDING_WORDS(0x1);

From b2d461020d12b9abf06857747ed237c0c3a6647a Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 03:02:15 +0100
Subject: [PATCH 13/21] Pica/CommandProcessor: Work around games not setting
 the input position's w component.

---
 src/video_core/command_processor.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 9e1975ddbb..76acdc1775 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -112,6 +112,10 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 // Initialize data for the current vertex
                 VertexShader::InputVertex input;
 
+                // Load a debugging token to check whether this gets loaded by the running
+                // application or not.
+                input.attr[0].w = float24::FromRawFloat24(0x00abcdef);
+
                 for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
                     for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
                         const u8* srcdata = Memory::GetPointer(PAddrToVAddr(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]));
@@ -136,6 +140,16 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                     }
                 }
 
+                // HACK: Some games do not initialize the vertex position's w component. This leads
+                //       to critical issues since it messes up perspective division. As a
+                //       workaround, we force the fourth component to 1.0 if we find this to be the
+                //       case.
+                //       To do this, we additionally have to assume that the first input attribute
+                //       is the vertex position, since there's no information about this other than
+                //       the empiric observation that this is usually the case.
+                if (input.attr[0].w == float24::FromRawFloat24(0x00abcdef))
+                    input.attr[0].w = float24::FromFloat32(1.0);
+
                 if (g_debug_context)
                     g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
 

From 323a56f89835714f0973cf808b7b59b2589012d8 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 31 Dec 2014 15:01:50 +0100
Subject: [PATCH 14/21] Pica/CommandProcessor: Cleanups.

---
 src/video_core/command_processor.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 76acdc1775..0d9f4ba666 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -114,7 +114,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                 // Load a debugging token to check whether this gets loaded by the running
                 // application or not.
-                input.attr[0].w = float24::FromRawFloat24(0x00abcdef);
+                static const float24 debug_token = float24::FromRawFloat24(0x00abcdef);
+                input.attr[0].w = debug_token;
 
                 for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
                     for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
@@ -147,7 +148,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 //       To do this, we additionally have to assume that the first input attribute
                 //       is the vertex position, since there's no information about this other than
                 //       the empiric observation that this is usually the case.
-                if (input.attr[0].w == float24::FromRawFloat24(0x00abcdef))
+                if (input.attr[0].w == debug_token)
                     input.attr[0].w = float24::FromFloat32(1.0);
 
                 if (g_debug_context)
@@ -195,7 +196,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             int index = (id - PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[0], 0x2b1));
             auto values = registers.vs_int_uniforms[index];
             VertexShader::GetIntUniform(index) = Math::Vec4<u8>(values.x, values.y, values.z, values.w);
-            LOG_ERROR(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x",
+            LOG_TRACE(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x",
                       index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value());
             break;
         }

From 40c720084146e8c2c00b58bc42bf0ebd98fa1496 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 31 Dec 2014 15:02:48 +0100
Subject: [PATCH 15/21] Pica/VertexShader: Coding style fixes.

---
 src/video_core/vertex_shader.cpp | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 090ffd420a..ff825e2e1e 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -39,38 +39,31 @@ static struct {
 static std::array<u32, 1024> shader_memory;
 static std::array<u32, 1024> swizzle_data;
 
-void SubmitShaderMemoryChange(u32 addr, u32 value)
-{
+void SubmitShaderMemoryChange(u32 addr, u32 value) {
     shader_memory[addr] = value;
 }
 
-void SubmitSwizzleDataChange(u32 addr, u32 value)
-{
+void SubmitSwizzleDataChange(u32 addr, u32 value) {
     swizzle_data[addr] = value;
 }
 
-Math::Vec4<float24>& GetFloatUniform(u32 index)
-{
+Math::Vec4<float24>& GetFloatUniform(u32 index) {
     return shader_uniforms.f[index];
 }
 
-bool& GetBoolUniform(u32 index)
-{
+bool& GetBoolUniform(u32 index) {
     return shader_uniforms.b[index];
 }
 
-Math::Vec4<u8>& GetIntUniform(u32 index)
-{
+Math::Vec4<u8>& GetIntUniform(u32 index) {
     return shader_uniforms.i[index];
 }
 
-const std::array<u32, 1024>& GetShaderBinary()
-{
+const std::array<u32, 1024>& GetShaderBinary() {
     return shader_memory;
 }
 
-const std::array<u32, 1024>& GetSwizzlePatterns()
-{
+const std::array<u32, 1024>& GetSwizzlePatterns() {
     return swizzle_data;
 }
 
@@ -444,8 +437,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
     }
 }
 
-OutputVertex RunShader(const InputVertex& input, int num_attributes)
-{
+OutputVertex RunShader(const InputVertex& input, int num_attributes) {
     VertexShaderState state;
 
     const u32* main = &shader_memory[registers.vs_main_offset];

From 195d73a385c9dd88150ed9b875e313c186e7d96e Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 31 Dec 2014 15:04:39 +0100
Subject: [PATCH 16/21] Pica/Rasterizer: Clean up long code lines.

---
 src/video_core/rasterizer.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 9148745dc6..9822b36a6f 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -18,7 +18,8 @@ namespace Pica {
 namespace Rasterizer {
 
 static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
-    u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetColorBufferPhysicalAddress())));
+    const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
+    u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
     u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
 
     // Assuming RGBA8 format until actual framebuffer format handling is implemented
@@ -26,7 +27,8 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
 }
 
 static const Math::Vec4<u8> GetPixel(int x, int y) {
-    u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetColorBufferPhysicalAddress())));
+    const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
+    u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
     u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
     Math::Vec4<u8> ret;
@@ -38,14 +40,16 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
  }
 
 static u32 GetDepth(int x, int y) {
-    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetDepthBufferPhysicalAddress())));
+    const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
+    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
     return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
 }
 
 static void SetDepth(int x, int y, u16 value) {
-    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetDepthBufferPhysicalAddress())));
+    const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
+    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
     *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;

From d13bd327ba70a89f8e634afc3c9c22ba3c0f6e38 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 31 Dec 2014 15:05:33 +0100
Subject: [PATCH 17/21] Pica/Rasterizer: Fix a bug related to multitexturing
 and texture wrapping.

---
 src/video_core/rasterizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 9822b36a6f..4dfc21885c 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -229,8 +229,8 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                             return 0;
                     }
                 };
-                s = GetWrappedTexCoord(registers.texture0.wrap_s, s, registers.texture0.width);
-                t = registers.texture0.height - 1 - GetWrappedTexCoord(registers.texture0.wrap_t, t, registers.texture0.height);
+                s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
+                t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
 
                 u8* texture_data = Memory::GetPointer(PAddrToVAddr(texture.config.GetPhysicalAddress()));
                 auto info = DebugUtils::TextureInfo::FromPicaRegister(texture.config, texture.format);

From 614baa39d1bc1489c25acf3578ae7f99cc1b5ad0 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 31 Dec 2014 15:08:07 +0100
Subject: [PATCH 18/21] VideoCore: Remove some unused functions.

---
 src/video_core/utils.h | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 63ebccbde1..6fd6404258 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -8,32 +8,6 @@
 
 #include "common/common_types.h"
 
-namespace FormatPrecision {
-
-/// Adjust RGBA8 color with RGBA6 precision
-static inline u32 rgba8_with_rgba6(u32 src) {
-    u32 color = src;
-    color &= 0xFCFCFCFC;
-    color |= (color >> 6) & 0x03030303;
-    return color;
-}
-
-/// Adjust RGBA8 color with RGB565 precision
-static inline u32 rgba8_with_rgb565(u32 src) {
-    u32 color = (src & 0xF8FCF8);
-    color |= (color >> 5) & 0x070007;
-    color |= (color >> 6) & 0x000300;
-    color |= 0xFF000000;
-    return color;
-}
-
-/// Adjust Z24 depth value with Z16 precision
-static inline u32 z24_with_z16(u32 src) {
-    return (src & 0xFFFF00) | (src >> 16);
-}
-
-} // namespace
-
 namespace VideoCore {
 
 /// Structure for the TGA texture format (for dumping)

From 47543d62cf9e982598f58438ad24769c2b36ec77 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 31 Dec 2014 15:17:07 +0100
Subject: [PATCH 19/21] Pica: Cleanup color conversion.

---
 .../debugger/graphics_framebuffer.cpp         | 13 +++-----
 src/video_core/color.h                        | 32 +++++++++++++++++++
 src/video_core/debug_utils/debug_utils.cpp    | 32 ++++++++-----------
 3 files changed, 51 insertions(+), 26 deletions(-)
 create mode 100644 src/video_core/color.h

diff --git a/src/citra_qt/debugger/graphics_framebuffer.cpp b/src/citra_qt/debugger/graphics_framebuffer.cpp
index 4a45027b87..a9e9de6529 100644
--- a/src/citra_qt/debugger/graphics_framebuffer.cpp
+++ b/src/citra_qt/debugger/graphics_framebuffer.cpp
@@ -10,6 +10,7 @@
 #include <QPushButton>
 #include <QSpinBox>
 
+#include "video_core/color.h"
 #include "video_core/pica.h"
 
 #include "graphics_framebuffer.hxx"
@@ -259,14 +260,10 @@ void GraphicsFramebufferWidget::OnUpdate()
         for (unsigned y = 0; y < framebuffer_height; ++y) {
             for (unsigned x = 0; x < framebuffer_width; ++x) {
                 u16 value = *(u16*)(((u8*)color_buffer) + x * 2 + y * framebuffer_width * 2);
-                u8 r = (value >> 11) & 0x1F;
-                u8 g = (value >> 6) & 0x1F;
-                u8 b = (value >> 1) & 0x1F;
-                u8 a = value & 1;
-                r = (r << 3) | (r >> 2);
-                g = (g << 3) | (g >> 2);
-                b = (b << 3) | (b >> 2);
-                a *= 255;
+                u8 r = Color::Convert5To8((value >> 11) & 0x1F);
+                u8 g = Color::Convert5To8((value >> 6) & 0x1F);
+                u8 b = Color::Convert5To8((value >> 1) & 0x1F);
+                u8 a = Color::Convert1To8(value & 1);
 
                 decoded_image.setPixel(x, y, qRgba(r, g, b, 255/*a*/));
             }
diff --git a/src/video_core/color.h b/src/video_core/color.h
new file mode 100644
index 0000000000..e86ac12652
--- /dev/null
+++ b/src/video_core/color.h
@@ -0,0 +1,32 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Color {
+
+/// Convert a 1-bit color component to 8 bit
+static inline u8 Convert1To8(u8 value) {
+    return value * 255;
+}
+
+/// Convert a 4-bit color component to 8 bit
+static inline u8 Convert4To8(u8 value) {
+    return (value << 4) | value;
+}
+
+/// Convert a 5-bit color component to 8 bit
+static inline u8 Convert5To8(u8 value) {
+    return (value << 3) | (value >> 2);
+}
+
+/// Convert a 6-bit color component to 8 bit
+static inline u8 Convert6To8(u8 value) {
+    return (value << 2) | (value >> 4);
+}
+
+
+} // namespace
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 83d585d16e..a494465b99 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -19,6 +19,7 @@
 #include "common/log.h"
 #include "common/file_util.h"
 
+#include "video_core/color.h"
 #include "video_core/math.h"
 #include "video_core/pica.h"
 
@@ -359,29 +360,26 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
         u8 g = ((source_ptr) >> 6) & 0x1F;
         u8 b = (source_ptr >> 1) & 0x1F;
         u8 a = source_ptr & 1;
-        return Math::MakeVec<u8>((r << 3) | (r >> 2), (g << 3) | (g >> 2), (b << 3) | (b >> 2), disable_alpha ? 255 : (a * 255));
+        return Math::MakeVec<u8>(Color::Convert5To8(r), Color::Convert5To8(g),
+                                 Color::Convert5To8(b), disable_alpha ? 255 : Color::Convert1To8(a));
     }
 
     case Regs::TextureFormat::RGB565:
     {
         const u16 source_ptr = *(const u16*)(source + offset * 2);
-        u8 r = (source_ptr >> 11) & 0x1F;
-        u8 g = ((source_ptr) >> 5) & 0x3F;
-        u8 b = (source_ptr) & 0x1F;
-        return Math::MakeVec<u8>((r << 3) | (r >> 2), (g << 2) | (g >> 4), (b << 3) | (b >> 2), 255);
+        u8 r = Color::Convert5To8((source_ptr >> 11) & 0x1F);
+        u8 g = Color::Convert6To8(((source_ptr) >> 5) & 0x3F);
+        u8 b = Color::Convert5To8((source_ptr) & 0x1F);
+        return Math::MakeVec<u8>(r, g, b, 255);
     }
 
     case Regs::TextureFormat::RGBA4:
     {
         const u8* source_ptr = source + offset * 2;
-        u8 r = source_ptr[1] >> 4;
-        u8 g = source_ptr[1] & 0xF;
-        u8 b = source_ptr[0] >> 4;
-        u8 a = source_ptr[0] & 0xF;
-        r = (r << 4) | r;
-        g = (g << 4) | g;
-        b = (b << 4) | b;
-        a = (a << 4) | a;
+        u8 r = Color::Convert4To8(source_ptr[1] >> 4);
+        u8 g = Color::Convert4To8(source_ptr[1] & 0xF);
+        u8 b = Color::Convert4To8(source_ptr[0] >> 4);
+        u8 a = Color::Convert4To8(source_ptr[0] & 0xF);
         return { r, g, b, disable_alpha ? (u8)255 : a };
     }
 
@@ -418,10 +416,8 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
     {
         const u8* source_ptr = source + offset;
 
-        u8 i = ((*source_ptr) & 0xF0) >> 4;
-        u8 a = (*source_ptr) & 0xF;
-        a |= a << 4;
-        i |= i << 4;
+        u8 i = Color::Convert4To8(((*source_ptr) & 0xF0) >> 4);
+        u8 a = Color::Convert4To8((*source_ptr) & 0xF);
 
         if (disable_alpha) {
             // Show intensity as red, alpha as green
@@ -436,7 +432,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
         const u8* source_ptr = source + offset / 2;
 
         u8 a = (coarse_x % 2) ? ((*source_ptr)&0xF) : (((*source_ptr) & 0xF0) >> 4);
-        a |= a << 4;
+        a = Color::Convert4To8(a);
 
         if (disable_alpha) {
             return { a, a, a, 255 };

From 9675d19b47865c2dac5e662f5a265589bd03a283 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 31 Dec 2014 15:19:40 +0100
Subject: [PATCH 20/21] Pica/Rasterizer: Make orient2d a free function and
 rename it to SignedArea.

---
 src/video_core/rasterizer.cpp | 71 +++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 4dfc21885c..9850e517ae 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -55,30 +55,45 @@ static void SetDepth(int x, int y, u16 value) {
     *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
 }
 
+// NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
+struct Fix12P4 {
+    Fix12P4() {}
+    Fix12P4(u16 val) : val(val) {}
+
+    static u16 FracMask() { return 0xF; }
+    static u16 IntMask() { return (u16)~0xF; }
+
+    operator u16() const {
+        return val;
+    }
+
+    bool operator < (const Fix12P4& oth) const {
+        return (u16)*this < (u16)oth;
+    }
+
+private:
+    u16 val;
+};
+
+/**
+ * Calculate signed area of the triangle spanned by the three argument vertices.
+ * The sign denotes an orientation.
+ *
+ * @todo define orientation concretely.
+ */
+static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
+                       const Math::Vec2<Fix12P4>& vtx2,
+                       const Math::Vec2<Fix12P4>& vtx3) {
+    const auto vec1 = Math::MakeVec(vtx2 - vtx1, 0);
+    const auto vec2 = Math::MakeVec(vtx3 - vtx1, 0);
+    // TODO: There is a very small chance this will overflow for sizeof(int) == 4
+    return Math::Cross(vec1, vec2).z;
+};
+
 void ProcessTriangle(const VertexShader::OutputVertex& v0,
                      const VertexShader::OutputVertex& v1,
                      const VertexShader::OutputVertex& v2)
 {
-    // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
-    struct Fix12P4 {
-        Fix12P4() {}
-        Fix12P4(u16 val) : val(val) {}
-
-        static u16 FracMask() { return 0xF; }
-        static u16 IntMask() { return (u16)~0xF; }
-
-        operator u16() const {
-            return val;
-        }
-
-        bool operator < (const Fix12P4& oth) const {
-            return (u16)*this < (u16)oth;
-        }
-
-    private:
-        u16 val;
-    };
-
     // vertex positions in rasterizer coordinates
     auto FloatToFix = [](float24 flt) {
                           return Fix12P4(static_cast<unsigned short>(flt.ToFloat32() * 16.0f));
@@ -86,14 +101,6 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
                                              return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
                                          };
-    static auto orient2d = [](const Math::Vec2<Fix12P4>& vtx1,
-                              const Math::Vec2<Fix12P4>& vtx2,
-                              const Math::Vec2<Fix12P4>& vtx3) {
-        const auto vec1 = Math::MakeVec(vtx2 - vtx1, 0);
-        const auto vec2 = Math::MakeVec(vtx3 - vtx1, 0);
-        // TODO: There is a very small chance this will overflow for sizeof(int) == 4
-        return Math::Cross(vec1, vec2).z;
-    };
 
     Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
                                    ScreenToRasterizerCoordinates(v1.screenpos),
@@ -107,7 +114,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     if (registers.cull_mode != Regs::CullMode::KeepAll) {
         // Cull away triangles which are wound clockwise.
         // TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll
-        if (orient2d(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
+        if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
             return;
     }
 
@@ -153,9 +160,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
         for (u16 x = min_x; x < max_x; x += 0x10) {
 
             // Calculate the barycentric coordinates w0, w1 and w2
-            int w0 = bias0 + orient2d(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
-            int w1 = bias1 + orient2d(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
-            int w2 = bias2 + orient2d(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
+            int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
+            int w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
+            int w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
             int wsum = w0 + w1 + w2;
 
             // If current pixel is not covered by the current primitive

From bc187be0c13f66b1a714d868ab8aa18214550bdc Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 31 Dec 2014 15:29:45 +0100
Subject: [PATCH 21/21] Pica/Rasterizer: Remove some redundant casts.

---
 src/video_core/rasterizer.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 9850e517ae..025d4e4841 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -433,9 +433,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
 
             // TODO: Does depth indeed only get written even if depth testing is enabled?
             if (registers.output_merger.depth_test_enable) {
-                u16 z = (u16)(-((float)v0.screenpos[2].ToFloat32() * w0 +
-                            (float)v1.screenpos[2].ToFloat32() * w1 +
-                            (float)v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
+                u16 z = (u16)(-(v0.screenpos[2].ToFloat32() * w0 +
+                            v1.screenpos[2].ToFloat32() * w1 +
+                            v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
                 u16 ref_z = GetDepth(x >> 4, y >> 4);
 
                 bool pass = false;