From fe8e6618f2907a9262d69232ef0e2d5d58cbc6e0 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Sun, 2 Jun 2019 18:52:07 -0300
Subject: [PATCH] shader: Split SSY and PBK stack

Hardware testing revealed that SSY and PBK push to separate stacks,
allowing code like this:

        SSY label1;
        PBK label2;
        SYNC;
label1: BRK;
label2: EXIT;
---
 .../renderer_opengl/gl_shader_decompiler.cpp  | 31 ++++++++++--
 .../renderer_vulkan/vk_shader_decompiler.cpp  | 49 ++++++++++++++-----
 src/video_core/shader/decode/other.cpp        | 18 +++----
 src/video_core/shader/node.h                  |  7 ++-
 4 files changed, 78 insertions(+), 27 deletions(-)

diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index f2d0722af..afcc06afc 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -143,6 +143,24 @@ u32 GetGenericAttributeIndex(Attribute::Index index) {
     return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
 }
 
+constexpr const char* GetFlowStackPrefix(MetaStackClass stack) { // GLSL variable name prefix
+    switch (stack) {
+    case MetaStackClass::Ssy:
+        return "ssy";
+    case MetaStackClass::Pbk:
+        return "pbk";
+    }
+    return ""; // Never null: callers pass the result to fmt::format, which rejects null strings
+}
+
+std::string FlowStackName(MetaStackClass stack) { // GLSL name of the stack's array
+    return fmt::format("{}_flow_stack", GetFlowStackPrefix(stack));
+}
+
+std::string FlowStackTopName(MetaStackClass stack) { // GLSL name of the stack's top index
+    return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
+}
+
 class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
@@ -173,8 +191,10 @@ public:
         // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
         // unlikely that shaders will use 20 nested SSYs and PBKs.
         constexpr u32 FLOW_STACK_SIZE = 20;
-        code.AddLine("uint flow_stack[{}];", FLOW_STACK_SIZE);
-        code.AddLine("uint flow_stack_top = 0u;");
+        for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
+            code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
+            code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+        }
 
         code.AddLine("while (true) {{");
         ++code.scope;
@@ -1438,15 +1458,18 @@ private:
     }
 
     std::string PushFlowStack(Operation operation) {
+        const auto stack = std::get<MetaStackClass>(operation.GetMeta()); // SSY or PBK stack
         const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
 
-        code.AddLine("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue());
+        code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack),
+                     target->GetValue()); // stores the target, then post-increments the top
         return {};
     }
 
     std::string PopFlowStack(Operation operation) {
-        code.AddLine("jmp_to = flow_stack[--flow_stack_top];");
+        const auto stack = std::get<MetaStackClass>(operation.GetMeta()); // SSY or PBK stack
+        code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack));
         code.AddLine("break;");
         return {};
     }
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 547883425..33ad9764a 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -132,20 +132,16 @@ public:
             branch_labels.push_back(label);
         }
 
-        // TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
-        // that shaders will use 20 nested SSYs and PBKs.
-        constexpr u32 FLOW_STACK_SIZE = 20;
-        const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
         jmp_to = Emit(OpVariable(TypePointer(spv::StorageClass::Function, t_uint),
                                  spv::StorageClass::Function, Constant(t_uint, first_address)));
-        flow_stack = Emit(OpVariable(TypePointer(spv::StorageClass::Function, flow_stack_type),
-                                     spv::StorageClass::Function, ConstantNull(flow_stack_type)));
-        flow_stack_top =
-            Emit(OpVariable(t_func_uint, spv::StorageClass::Function, Constant(t_uint, 0)));
+        std::tie(ssy_flow_stack, ssy_flow_stack_top) = CreateFlowStack();
+        std::tie(pbk_flow_stack, pbk_flow_stack_top) = CreateFlowStack();
 
         Name(jmp_to, "jmp_to");
-        Name(flow_stack, "flow_stack");
-        Name(flow_stack_top, "flow_stack_top");
+        Name(ssy_flow_stack, "ssy_flow_stack");
+        Name(ssy_flow_stack_top, "ssy_flow_stack_top");
+        Name(pbk_flow_stack, "pbk_flow_stack");
+        Name(pbk_flow_stack_top, "pbk_flow_stack_top");
 
         Emit(OpBranch(loop_label));
         Emit(loop_label);
@@ -952,6 +948,7 @@ private:
         const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         ASSERT(target);
 
+        const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
         const Id current = Emit(OpLoad(t_uint, flow_stack_top));
         const Id next = Emit(OpIAdd(t_uint, current, Constant(t_uint, 1)));
         const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, current));
@@ -962,6 +959,7 @@ private:
     }
 
     Id PopFlowStack(Operation operation) {
+        const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
         const Id current = Emit(OpLoad(t_uint, flow_stack_top));
         const Id previous = Emit(OpISub(t_uint, current, Constant(t_uint, 1)));
         const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, previous));
@@ -1172,6 +1170,31 @@ private:
         Emit(skip_label);
     }
 
+    /// Declares a function-local flow stack and returns {stack array id, top index id}.
+    std::tuple<Id, Id> CreateFlowStack() {
+        // TODO(Rodrigo): Figure out the real flow stack depth; 20 nested SSYs/PBKs seem unlikely.
+        constexpr u32 FLOW_STACK_SIZE = 20;
+        constexpr auto function_storage = spv::StorageClass::Function;
+
+        const Id array_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
+        const Id stack_var = Emit(OpVariable(TypePointer(function_storage, array_type),
+                                             function_storage, ConstantNull(array_type)));
+        const Id top_var = Emit(OpVariable(t_func_uint, function_storage, Constant(t_uint, 0)));
+        return {stack_var, top_var};
+    }
+
+    /// Returns the {stack, top} ids matching the stack class stored in the operation's meta.
+    std::pair<Id, Id> GetFlowStack(Operation operation) {
+        switch (std::get<MetaStackClass>(operation.GetMeta())) {
+        case MetaStackClass::Ssy:
+            return std::make_pair(ssy_flow_stack, ssy_flow_stack_top);
+        case MetaStackClass::Pbk:
+            return std::make_pair(pbk_flow_stack, pbk_flow_stack_top);
+        }
+        UNREACHABLE();
+        return {};
+    }
+
     static constexpr OperationDecompilersArray operation_decompilers = {
         &SPIRVDecompiler::Assign,
 
@@ -1414,8 +1437,10 @@ private:
 
     Id execute_function{};
     Id jmp_to{};
-    Id flow_stack_top{};
-    Id flow_stack{};
+    Id ssy_flow_stack_top{};
+    Id pbk_flow_stack_top{};
+    Id ssy_flow_stack{};
+    Id pbk_flow_stack{};
     Id continue_label{};
     std::map<u32, Id> labels;
 };
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index 6fc07f213..d46a8ab82 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -109,22 +109,20 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer flow is not supported");
 
-        // The SSY opcode tells the GPU where to re-converge divergent execution paths, it sets the
-        // target of the jump that the SYNC instruction will make. The SSY opcode has a similar
-        // structure to the BRA opcode.
+        // The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
         const u32 target = pc + instr.bra.GetBranchTarget();
-        bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
+        bb.push_back(
+            Operation(OperationCode::PushFlowStack, MetaStackClass::Ssy, Immediate(target)));
         break;
     }
     case OpCode::Id::PBK: {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer PBK is not supported");
 
-        // PBK pushes to a stack the address where BRK will jump to. This shares stack with SSY but
-        // using SYNC on a PBK address will kill the shader execution. We don't emulate this because
-        // it's very unlikely a driver will emit such invalid shader.
+        // PBK pushes to a stack the address where BRK will jump to.
         const u32 target = pc + instr.bra.GetBranchTarget();
-        bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
+        bb.push_back(
+            Operation(OperationCode::PushFlowStack, MetaStackClass::Pbk, Immediate(target)));
         break;
     }
     case OpCode::Id::SYNC: {
@@ -133,7 +131,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                              static_cast<u32>(cc));
 
         // The SYNC opcode jumps to the address previously set by the SSY opcode
-        bb.push_back(Operation(OperationCode::PopFlowStack));
+        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
         break;
     }
     case OpCode::Id::BRK: {
@@ -142,7 +140,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                              static_cast<u32>(cc));
 
         // The BRK opcode jumps to the address previously set by the PBK opcode
-        bb.push_back(Operation(OperationCode::PopFlowStack));
+        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
         break;
     }
     case OpCode::Id::IPA: {
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index c002f90f9..3cfb911bb 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -174,6 +174,11 @@ enum class InternalFlag {
     Amount = 4,
 };
 
+enum class MetaStackClass { // selects which flow stack a push/pop operation uses
+    Ssy, // pushed by SSY, popped by SYNC
+    Pbk, // pushed by PBK, popped by BRK
+};
+
 class OperationNode;
 class ConditionalNode;
 class GprNode;
@@ -285,7 +290,7 @@ struct MetaTexture {
 };
 
 /// Parameters that modify an operation but are not part of any particular operand
-using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
+using Meta = std::variant<MetaArithmetic, MetaTexture, MetaStackClass, Tegra::Shader::HalfType>;
 
 /// Holds any kind of operation that can be done in the IR
 class OperationNode final {