/*
 * Copyright 2023 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_opcodes.h"

/*
 * AGX control flow instructions predicate out threads. No forward branches are
 * inserted during instruction selection, only backwards branches at the end of
 * loops exist before this pass. This means, prior to this pass, we would always
 * execute both sides of an if.
 *
 * To improve performance, this pass inserts conservative forward branches after
 * if_*cmp, else_*cmp, and break_if_*cmp instructions, jumping the
 * subgroup to their logical destination if all threads in the subgroup are
 * inactive. This has the effect of skipping over the unexecuted half of an if.
 * That means this pass is critical for control flow performance.
 */

/* Estimated cost of inserting a jmp_exec_none. This value is tuned to Dolphin
 * ubershaders. It needs to be retuned in lockstep with changes to the cost
 * estimation heuristic.
 */
#define COST_JMP (19)

static uint32_t
cost_instr(agx_instr *I)
{
   /* TODO: Better heuristic */
   switch (I->op) {
   case AGX_OPCODE_DEVICE_LOAD:
      return 10;
   case AGX_OPCODE_TEXTURE_LOAD:
   case AGX_OPCODE_TEXTURE_SAMPLE:
      return 50;
   default:
      return 1;
   }
}

/*
 * Estimate the cost between the instruction and the branch target. This is an
 * input for our heuristic. The branch target is guaranteed to be a forward
 * branch.
 */
static uint32_t
cost_between(agx_context *ctx, agx_block *from, agx_instr *from_I,
             agx_block *target, bool skip_to_end_of_target)
{
   uint32_t cost = 0;

   /* Consider the cost in the rest of this block */
   if (from_I != agx_last_instr(from)) {
      agx_foreach_instr_in_block_from(from, J, from_I) {
         /* If we reach the end, we're done */
         if (from == target && skip_to_end_of_target &&
             J == agx_last_instr(target))
            break;

         cost += cost_instr(J);
      }
   }

   if (from == target)
      return cost;

   /* Consider the cost in the subsequent blocks */
   agx_foreach_block_from(ctx, from, block) {
      if (block == from)
         continue;

      if (block == target && !skip_to_end_of_target)
         break;

      agx_foreach_instr_in_block(block, I) {
         if (block == target && I == agx_last_instr(target))
            break;

         cost += cost_instr(I);
      }

      if (block == target) {
         assert(skip_to_end_of_target);
         break;
      }
   }

   return cost;
}

static void
try_insert_jmp(agx_context *ctx, agx_block *from, agx_instr *from_I,
               agx_block *target, bool skip_to_end_of_target,
               unsigned inverse_probability)
{
   agx_builder b = agx_init_builder(ctx, agx_after_instr(from_I));

   /* If the control flow instruction was only inserted for its side effects,
    * there is nowhere to jump. Bail.
    */
   if (!target)
      return;

   /* If we do not insert a jump, we execute the predicated instructions
    * unconditionally, with an expected cost C.
    *
    * If we do insert a jump, then we pay the cost J of the jump, AND if we do
    * not take the jump, also the cost of the instructions C. The expected cost
    * if we insert a jump is therefore J + P(not all threads inactive) C.
    *
    * Therefore, we should insert a jump if:
    *
    *    J + P(not all threads inactive) C < C
    *
    * To model the implicit (i-cache, etc) costs of inserting a jump
    * instruction, we tie break conservatively, comparing with < instead of <=.
    *
    * Rearranging terms, we should NOT insert a jump if:
    *
    *    C < J / P(all threads inactive).
    */
   uint32_t cost_instructions =
      cost_between(ctx, from, from_I, target, skip_to_end_of_target);

   if (cost_instructions < COST_JMP * inverse_probability)
      return;

   /* It looks like inserting a jump will be a win. Do so. */
   if (skip_to_end_of_target)
      agx_jmp_exec_none_after(&b, target);
   else
      agx_jmp_exec_none(&b, target);
}

void
agx_opt_jmp_none(agx_context *ctx)
{
   agx_foreach_block(ctx, blk) {
      /* Handle the beginning of blocks */
      agx_instr *first_ = agx_first_instr(blk);
      if (first_ && (first_->op == AGX_OPCODE_ELSE_ICMP ||
                     first_->op == AGX_OPCODE_ELSE_FCMP)) {

         /* The target of the else is the last block of the else, so we skip
          * to the end of the block (to start execution with the pop_exec).
          */
         try_insert_jmp(ctx, blk, first_, first_->target, true, 2);
      } else if (first_ &&
                 (first_->op == AGX_OPCODE_BREAK_IF_ICMP ||
                  first_->op == AGX_OPCODE_BREAK_IF_FCMP) &&
                 first_->nest == 1) {
         /* The target of the break is the block immediately after the end of
          * the loop, so jump to the end of the previous block to get the
          * appropriate pop_exec.
          *
          * Also, note we only do this for nest=1 to ensure we don't insert
          * jumps inside if-statements inside breaks. We can't insert a
          * jmp_exec_none inside the if because it would break out of the loop
          * for threads that are still running the loop but merely predicated
          * out due to the if-condition. This is similarly why we don't bother
          * handling unconditional break.
          *
          * TODO: This is not optimal, but fixing this would require
          * considerably more CFG gymnastics.
          */
         agx_block *target = agx_prev_block(first_->target);
         try_insert_jmp(ctx, blk, first_, target, true, 10);
      }

      /* Handle end of block instructions */
      agx_foreach_instr_in_block_rev(blk, I) {
         if (!instr_after_logical_end(I))
            break;

         if (I->op == AGX_OPCODE_IF_ICMP || I->op == AGX_OPCODE_IF_FCMP) {
            try_insert_jmp(ctx, blk, I, I->target, false, 2);
            break;
         }
      }
   }
}