/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_builder.h"

#include "dev/intel_debug.h"

using namespace brw;

void
brw_optimize(fs_visitor &s)
{
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   brw_validate(s);

   /* Track how much of the shader is still non-SSA at this point. */
   {
      const brw::def_analysis &defs = s.def_analysis.require();
      s.shader_stats.non_ssa_registers_after_nir =
         defs.count() - defs.ssa_count();
   }

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

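   /* Run a single optimization or lowering pass.  OPT() bumps the pass
    * counter, dumps the IR for the optimizer debug output when the pass
    * made progress, and re-validates the shader afterwards.  It evaluates
    * to whether this particular pass made progress, so it can be used
    * directly in conditions, while `progress` accumulates across passes.
    */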
#define OPT(pass, ...) ({                                               \
      pass_num++;                                                       \
      bool this_progress = pass(s, ##__VA_ARGS__);                      \
                                                                        \
      if (this_progress)                                                \
         s.debug_optimizer(nir, #pass, iteration, pass_num);            \
                                                                        \
      brw_validate(s);                                                  \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice: once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_opt_dead_code_eliminate);

   OPT(brw_opt_remove_extra_rounding_modes);

   OPT(brw_opt_eliminate_find_live_channel);

   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_opt_algebraic);
      OPT(brw_opt_cse_defs);
      if (!OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_copy_propagation);
      OPT(brw_opt_cmod_propagation);
      OPT(brw_opt_dead_code_eliminate);
      OPT(brw_opt_saturate_propagation);
      OPT(brw_opt_register_coalesce);

      OPT(brw_opt_compact_virtual_grfs);
   } while (progress);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_OPT_LOOP);

   progress = false;
   pass_num = 0;

   if (OPT(brw_opt_combine_convergent_txf))
      OPT(brw_opt_copy_propagation_defs);

   if (OPT(brw_lower_pack)) {
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_subgroup_ops);
   OPT(brw_lower_csel);
   OPT(brw_lower_simd_width);
   OPT(brw_lower_scalar_fp64_MAD);
   OPT(brw_lower_barycentrics);
   OPT(brw_lower_logical_sends);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);

   /* After logical SEND lowering. */

   if (!OPT(brw_opt_copy_propagation_defs))
      OPT(brw_opt_copy_propagation);

   /* Identify trailing zeros in the LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_opt_zero_samples)) {
      if (!OPT(brw_opt_copy_propagation_defs)) {
         OPT(brw_opt_copy_propagation);
      }
   }

   if (s.devinfo->ver >= 30)
      OPT(brw_opt_send_to_send_gather);

   OPT(brw_opt_split_sends);
   OPT(brw_workaround_nomask_control_flow);

   if (progress) {
      /* Do both forms of copy propagation because it is important to
       * eliminate as many cases of load_payload-of-load_payload as possible.
       */
      OPT(brw_opt_copy_propagation_defs);
      OPT(brw_opt_copy_propagation);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_opt_cse_defs);
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_opt_remove_redundant_halts);

   if (OPT(brw_lower_load_payload)) {
      OPT(brw_opt_split_virtual_grfs);

      OPT(brw_opt_register_coalesce);
      OPT(brw_lower_simd_width);
      OPT(brw_opt_dead_code_eliminate);
   }

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);

   OPT(brw_lower_alu_restrictions);

   OPT(brw_opt_combine_constants);
   if (OPT(brw_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_lower_integer_multiplication);
   }
   OPT(brw_lower_sub_sat);

   progress = false;
   OPT(brw_lower_derivatives);
   OPT(brw_lower_regioning);

   /* Try both copy propagation passes.  The defs one will likely not be
    * able to handle everything at this point.
    */
   const bool cp1 = OPT(brw_opt_copy_propagation_defs);
   const bool cp2 = OPT(brw_opt_copy_propagation);
   if (cp1 || cp2)
      OPT(brw_opt_combine_constants);

   OPT(brw_opt_dead_code_eliminate);
   OPT(brw_opt_register_coalesce);

   if (progress)
      OPT(brw_lower_simd_width);

   if (s.devinfo->ver >= 30)
      OPT(brw_opt_send_gather_to_send);

   OPT(brw_lower_uniform_pull_constant_loads);

   if (OPT(brw_lower_send_descriptors)) {
      /* No need for standard copy_propagation since
       * brw_opt_address_reg_load will only optimize defs.
       */
      if (OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_algebraic);
      OPT(brw_opt_address_reg_load);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_sends_overlapping_payload);

   OPT(brw_lower_indirect_mov);

   OPT(brw_lower_find_live_channel);

   OPT(brw_lower_load_subgroup_invocation);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_LATE_LOWERING);
}

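/* Returns how many leading LOAD_PAYLOAD sources (header sources included)
 * make up exactly size_read bytes of the payload.  The header contributes
 * header_size full registers; each remaining source contributes
 * exec_size * type_size bytes.
 */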
static unsigned
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);

   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);

   /* The size read must match the cumulative size of a leading subset of
    * the sources exactly.
    */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these
 * instructions instead of reserving registers for them. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instructions that would otherwise be emitted
 * to set up the zero values.
 */
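/* A sketch of the transformation (register numbers made up; assumes SIMD8,
 * one register per parameter, no header):
 *
 *    load_payload vgrf10, vgrf4, vgrf5, 0f, 0f
 *    send ..., vgrf10, mlen(4)
 *
 * becomes
 *
 *    load_payload vgrf10, vgrf4, vgrf5, 0f, 0f
 *    send ..., vgrf10, mlen(2)
 *
 * after which DCE can drop the MOVs that fed the trailing zeros.
 */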

bool
brw_opt_zero_samples(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
      }

      /* Round down to consider only full registers. */
      const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
      if (zero_len > 0) {
         /* Note mlen is in REG_SIZE units. */
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}


/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
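/* A sketch of the split (register numbers made up):
 *
 *    load_payload vgrf20, vgrf8+0, vgrf8+1, vgrf3
 *    send ..., vgrf20, mlen(3), ex_mlen(0)
 *
 * becomes
 *
 *    load_payload vgrf21, vgrf8+0, vgrf8+1
 *    load_payload vgrf22, vgrf3
 *    send ..., vgrf21, vgrf22, mlen(2), ex_mlen(1)
 *
 * and copy propagation / register coalescing can often then eliminate the
 * new LOAD_PAYLOADs entirely.
 */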
bool
brw_opt_split_sends(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
          send->src[2].file != VGRF)
         continue;

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* The SEND's mlen might be smaller than what the LOAD_PAYLOAD
       * provides, so find out how many sources from the payload it really
       * needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      const brw_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = brw_vgrf(s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = brw_vgrf(s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}


/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 * halt        (redundant with the next halt)
 * halt        (useless; jumps to the next instruction)
 * halt-target
 */
bool
brw_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}


/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
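/* A sketch of the rewrite outside control flow (register numbers made up):
 *
 *    find_live_channel vgrf7:UD          ->  mov vgrf7:UD, 0u
 *    broadcast vgrf9:UD, vgrf2, vgrf7    ->  mov vgrf9:UD, component(vgrf2, 0)
 *
 * With packed dispatch, channel 0 is known to be live, so the first live
 * channel is always 0 and a BROADCAST of channel 0 degenerates to a MOV of
 * component 0.
 */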
bool
brw_opt_eliminate_find_live_channel(fs_visitor &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->force_writemask_all = true;

            /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
             * size_written set by hand to a smaller value. In this case,
             * munge the exec_size to match.
             */
            if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
               inst->exec_size = 8 * reg_unit(s.devinfo);

            inst->resize_sources(1);
            progress = true;

            /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
             * with a BROADCAST.  Save some work for opt_copy_propagation
             * and opt_algebraic by trivially cleaning up both together.
             */
            assert(!inst->next->is_tail_sentinel());
            fs_inst *bcast = (fs_inst *) inst->next;

            /* Ignore stride when comparing */
            if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                inst->dst.file == VGRF &&
                inst->dst.file == bcast->src[1].file &&
                inst->dst.nr == bcast->src[1].nr &&
                inst->dst.offset == bcast->src[1].offset) {
               bcast->opcode = BRW_OPCODE_MOV;
               if (!is_uniform(bcast->src[0]))
                  bcast->src[0] = component(bcast->src[0], 0);

               bcast->force_writemask_all = true;
               bcast->exec_size = 8 * reg_unit(s.devinfo);
               assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
               bcast->resize_sources(1);
            }
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}


/**
 * Rounding modes are encoded per conversion instruction, but on the
 * hardware the rounding mode is a piece of state.  Once it has been set,
 * subsequent conversions using the same mode don't need to set it again.
 *
 * This is useful for vector/matrix conversions, as setting the mode once
 * is enough for the full vector/matrix.
 */
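/* A sketch of the cleanup (illustrative): within a block,
 *
 *    rnd_mode RTNE
 *    mov f16dst1, f32src1
 *    rnd_mode RTNE        <- removed, the mode is already RTNE
 *    mov f16dst2, f32src2
 *
 * Only RND_MODE instructions that actually change the current mode are
 * kept; the tracked mode resets to the base mode at each block boundary.
 */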
bool
brw_opt_remove_extra_rounding_modes(fs_visitor &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == IMM);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

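/* Convert SENDs with large payloads into SEND_GATHER, which takes its
 * payload as a list of individual (reg_unit-sized) registers instead of
 * one or two contiguous blocks, so large payloads no longer need to be
 * allocated contiguously.  Sources 0 and 1 keep their usual meaning;
 * source 2 is reserved to be filled after register allocation, and the
 * payload registers follow from source 3 onwards.
 */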
bool
brw_opt_send_to_send_gather(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   unsigned count = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND)
         continue;

      /* For 1-2 registers, send-gather offers no benefits over split-send. */
      if (inst->mlen + inst->ex_mlen <= 2 * unit)
         continue;

      assert(inst->mlen % unit == 0);
      assert(inst->ex_mlen % unit == 0);

      struct {
         brw_reg src;
         unsigned phys_len;
      } payload[2] = {
         { inst->src[2], inst->mlen / unit },
         { inst->src[3], inst->ex_mlen / unit },
      };

      const unsigned num_payload_sources = payload[0].phys_len + payload[1].phys_len;

      /* Limited by Src0.Length in the SEND instruction. */
      if (num_payload_sources > 15)
         continue;

      if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
         count++;
         continue;
      }

      inst->resize_sources(3 + num_payload_sources);
      /* Sources 0 and 1 remain the same.  Source 2 will be filled
       * after register allocation.
       */
      inst->src[2] = {};

      int idx = 3;
      for (unsigned p = 0; p < ARRAY_SIZE(payload); p++) {
         for (unsigned i = 0; i < payload[p].phys_len; i++) {
            inst->src[idx++] = byte_offset(payload[p].src,
                                           i * reg_unit(devinfo) * REG_SIZE);
         }
      }
      assert(idx == inst->sources);

      inst->opcode = SHADER_OPCODE_SEND_GATHER;
      inst->mlen = 0;
      inst->ex_mlen = 0;

      progress = true;
   }

   if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
      fprintf(stderr, "Ignored %u opportunities to try SEND_GATHER in %s shader.\n",
              count, _mesa_shader_stage_to_string(s.stage));
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_INSTRUCTION_DATA_FLOW);

   return progress;
}


/* If, after optimizations, the sources of a SEND_GATHER are *still*
 * contiguous, prefer the regular SEND, which saves having to write the
 * ARF scalar register.
 */
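/* A sketch of the reverse conversion (register numbers made up, assuming
 * reg_unit == 2 so each payload source covers two physical registers): a
 *
 *    send_gather ..., vgrf12+0, vgrf12+2, vgrf12+4, vgrf7+0
 *
 * whose payload sources form two contiguous spans turns back into
 *
 *    send ..., vgrf12, vgrf7, mlen(6), ex_mlen(2)
 */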
bool
brw_opt_send_gather_to_send(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
         continue;

      assert(inst->sources > 2);
      assert(inst->src[2].file == BAD_FILE);

      const int num_payload_sources = inst->sources - 3;
      assert(num_payload_sources > 0);

      /* Limited by Src0.Length in the SEND instruction. */
      assert(num_payload_sources < 16);

      /* Determine whether the sources still form one or two contiguous
       * spans.  In those cases the regular SEND instruction can be used,
       * and there's no need for SEND_GATHER (which would require setting
       * the ARF scalar register, adding an extra instruction).
       */
      const brw_reg *payload = &inst->src[3];
      brw_reg payload1       = payload[0];
      brw_reg payload2       = {};
      int payload1_len       = 0;
      int payload2_len       = 0;

      for (int i = 0; i < num_payload_sources; i++) {
         if (payload[i].file == VGRF &&
             payload[i].nr == payload1.nr &&
             payload[i].offset == payload1_len * REG_SIZE * unit)
            payload1_len++;
         else {
            payload2 = payload[i];
            break;
         }
      }

      if (payload2.file == VGRF) {
         for (int i = payload1_len; i < num_payload_sources; i++) {
            if (payload[i].file == VGRF &&
                payload[i].nr == payload2.nr &&
                payload[i].offset == payload2_len * REG_SIZE * unit)
               payload2_len++;
            else
               break;
         }
      } else {
         payload2 = brw_null_reg();
      }

      if (payload1_len + payload2_len != num_payload_sources)
         continue;

      /* Bspec 57058 (r64705) says:
       *
       *    When a source data payload is used in dataport message, that payload
       *    must be specified as Source 1 portion of a Split Send message.
       *
       * But at this point the split point is not guaranteed to respect that.
       *
       * TODO: Pass LSC address length or infer it so valid splits can work.
       */
      if (payload2_len && (inst->sfid == GFX12_SFID_UGM ||
                           inst->sfid == GFX12_SFID_TGM ||
                           inst->sfid == GFX12_SFID_SLM ||
                           inst->sfid == BRW_SFID_URB)) {
         enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
         if (lsc_op_num_data_values(lsc_op) > 0)
            continue;
      }

      inst->resize_sources(4);
      inst->opcode  = SHADER_OPCODE_SEND;
      inst->src[2]  = payload1;
      inst->src[3]  = payload2;
      inst->mlen    = payload1_len * unit;
      inst->ex_mlen = payload2_len * unit;

      progress = true;
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_INSTRUCTION_DATA_FLOW);
   }

   return progress;
}