1 /*
2 * Copyright © 2019 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_scoreboard.cpp
25 *
26 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
27 * data coherency between register reads and writes in previous generations.
28 * This lowering pass runs after register allocation in order to make up for
29 * it.
30 *
31 * It works by performing global dataflow analysis in order to determine the
32 * set of potential dependencies of every instruction in the shader, and then
33 * inserts any required SWSB annotations and additional SYNC instructions in
34 * order to guarantee data coherency.
35 *
36 * WARNING - Access of the following (rarely used) ARF registers is not
37 * tracked here, and require the RegDist SWSB annotation to be set
38 * to 1 by the generator in order to avoid data races:
39 *
40 * - sp stack pointer
41 * - sr0 state register
42 * - cr0 control register
43 * - ip instruction pointer
44 * - tm0 timestamp register
45 * - dbg0 debug register
46 *
47 * The following ARF registers don't need to be tracked here because data
48 * coherency is still provided transparently by the hardware:
49 *
50 * - f0-1 flag registers
51 * - n0 notification register
52 * - tdr0 thread dependency register
53 */
54
55 #include "brw_fs.h"
56 #include "brw_cfg.h"
57
58 using namespace brw;
59
60 namespace {
61 /**
62 * In-order instruction accounting.
63 * @{
64 */
65
66 /**
67 * Number of in-order hardware instructions contained in this IR
68 * instruction. This determines the increment applied to the RegDist
69 * counter calculated for any ordered dependency that crosses this
70 * instruction.
71 */
72 unsigned
ordered_unit(const fs_inst * inst)73 ordered_unit(const fs_inst *inst)
74 {
75 switch (inst->opcode) {
76 case BRW_OPCODE_SYNC:
77 case BRW_OPCODE_DO:
78 case SHADER_OPCODE_UNDEF:
79 case FS_OPCODE_PLACEHOLDER_HALT:
80 case FS_OPCODE_SCHEDULING_FENCE:
81 return 0;
82 default:
83 /* Note that the following is inaccurate for virtual instructions
84 * that expand to more in-order instructions than assumed here, but
85 * that can only lead to suboptimal execution ordering, data
86 * coherency won't be impacted. Providing exact RegDist counts for
87 * each virtual instruction would allow better ALU performance, but
88 * it would require keeping this switch statement in perfect sync
89 * with the generator in order to avoid data corruption. Lesson is
90 * (again) don't use virtual instructions if you want optimal
91 * scheduling.
92 */
93 return is_unordered(inst) ? 0 : 1;
94 }
95 }
96
/**
 * Type for an instruction counter that increments for in-order
 * instructions only, arbitrarily denoted 'jp' throughout this lowering
 * pass in order to distinguish it from the regular instruction counter.
 *
 * Signed so that negative deltas applied when transporting a dependency
 * across CFG edges [see transport()] and the INT_MIN "no in-order
 * dependency" sentinel used by struct dependency can be represented.
 */
typedef int ordered_address;
103
104 /**
105 * Return the number of instructions in the program.
106 */
107 unsigned
num_instructions(const backend_shader * shader)108 num_instructions(const backend_shader *shader)
109 {
110 return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
111 }
112
113 /**
114 * Calculate the local ordered_address instruction counter at every
115 * instruction of the shader for subsequent constant-time look-up.
116 */
117 ordered_address *
ordered_inst_addresses(const fs_visitor * shader)118 ordered_inst_addresses(const fs_visitor *shader)
119 {
120 ordered_address *jps = new ordered_address[num_instructions(shader)];
121 ordered_address jp = 0;
122 unsigned ip = 0;
123
124 foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
125 jps[ip] = jp;
126 jp += ordered_unit(inst);
127 ip++;
128 }
129
130 return jps;
131 }
132
/**
 * Synchronization mode required for data manipulated by in-order
 * instructions.
 *
 * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
 * enum for additional type safety.  The hardware doesn't provide control
 * over the synchronization mode for RegDist annotations, this is only used
 * internally in this pass in order to optimize out redundant read
 * dependencies where possible.
 */
enum tgl_regdist_mode {
   TGL_REGDIST_NULL = 0,
   TGL_REGDIST_SRC = 1,
   TGL_REGDIST_DST = 2
};

/**
 * Allow bitwise arithmetic of tgl_regdist_mode enums, treating the
 * enumerants as a two-bit flag set.
 */
tgl_regdist_mode
operator|(tgl_regdist_mode x, tgl_regdist_mode y)
{
   return static_cast<tgl_regdist_mode>(static_cast<unsigned>(x) |
                                        static_cast<unsigned>(y));
}

tgl_regdist_mode
operator&(tgl_regdist_mode x, tgl_regdist_mode y)
{
   return static_cast<tgl_regdist_mode>(static_cast<unsigned>(x) &
                                        static_cast<unsigned>(y));
}

tgl_regdist_mode &
operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
{
   x = x | y;
   return x;
}

tgl_regdist_mode &
operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
{
   x = x & y;
   return x;
}
175
176 /** @} */
177
/**
 * Representation of an equivalence relation among the set of unsigned
 * integers.
 *
 * Its initial state is the identity relation '~' such that i ~ j if and
 * only if i == j for every pair of unsigned integers i and j.
 */
struct equivalence_relation {
   /* Start out with every element as the sole representative of its own
    * singleton equivalence class.
    */
   equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
   {
      for (unsigned i = 0; i < n; i++)
         is[i] = i;
   }

   ~equivalence_relation()
   {
      delete[] is;
   }

   /**
    * Return equivalence class index of the specified element.  Effectively
    * this is the numeric value of an arbitrary representative from the
    * equivalence class.
    *
    * Allows the evaluation of the equivalence relation according to the
    * rule that i ~ j if and only if lookup(i) == lookup(j).
    */
   unsigned
   lookup(unsigned i) const
   {
      /* Chase parent links until reaching a fixed point (an element that
       * is its own representative).  Out-of-range indices are returned
       * unchanged.
       */
      while (i < n && is[i] != i)
         i = is[i];

      return i;
   }

   /**
    * Create an array with the results of the lookup() method for
    * constant-time evaluation.  The caller takes ownership (delete[]).
    */
   unsigned *
   flatten() const
   {
      unsigned *ids = new unsigned[n];

      for (unsigned i = 0; i < n; i++)
         ids[i] = lookup(i);

      return ids;
   }

   /**
    * Mutate the existing equivalence relation minimally by imposing the
    * additional requirement that i ~ j.  Returns the representative of
    * the merged equivalence class.
    */
   unsigned
   link(unsigned i, unsigned j)
   {
      const unsigned k = lookup(i);
      assign(i, k);
      assign(j, k);
      return k;
   }

private:
   /* Non-copyable: owns a raw array. */
   equivalence_relation(const equivalence_relation &);

   equivalence_relation &
   operator=(const equivalence_relation &);

   /**
    * Assign the representative of \p from to be equivalent to \p to.
    *
    * At the same time the data structure is partially flattened: every
    * element on the chain between \p from and its old representative is
    * repointed directly at \p to.
    */
   void
   assign(unsigned from, unsigned to)
   {
      while (from != to) {
         assert(from < n);

         const unsigned parent = is[from];
         is[from] = to;

         if (parent == from)
            break;

         from = parent;
      }
   }

   unsigned *is;
   unsigned n;
};
274
/**
 * Representation of a data dependency between two instructions in the
 * program.
 * @{
 */
struct dependency {
   /**
    * No dependency information.
    */
   dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
                  unordered(TGL_SBID_NULL), id(0),
                  exec_all(false) {}

   /**
    * Construct a dependency on the in-order instruction with the provided
    * ordered_address instruction counter.
    */
   dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) :
      ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
      exec_all(exec_all) {}

   /**
    * Construct a dependency on the out-of-order instruction with the
    * specified synchronization token.
    */
   dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
      ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id),
      exec_all(exec_all) {}

   /**
    * Synchronization mode of in-order dependency, or zero if no in-order
    * dependency is present.
    */
   tgl_regdist_mode ordered;

   /**
    * Instruction counter of in-order dependency.
    *
    * For a dependency part of a different block in the program, this is
    * relative to the specific control flow path taken between the
    * dependency and the current block: It is the ordered_address such that
    * the difference between it and the ordered_address of the first
    * instruction of the current block is exactly the number of in-order
    * instructions across that control flow path.  It is not guaranteed to
    * be equal to the local ordered_address of the generating instruction
    * [as returned by ordered_inst_addresses()], except for block-local
    * dependencies.
    *
    * INT_MIN is used as "infinitely far in the past" sentinel, which
    * transport() deliberately leaves untouched.
    */
   ordered_address jp;

   /**
    * Synchronization mode of unordered dependency, or zero if no unordered
    * dependency is present.
    */
   tgl_sbid_mode unordered;

   /** Synchronization token of out-of-order dependency. */
   unsigned id;

   /**
    * Whether the dependency could be run with execution masking disabled,
    * which might lead to the unwanted execution of the generating
    * instruction in cases where a BB is executed with all channels
    * disabled due to hardware bug GEN:BUG:1407528679.
    */
   bool exec_all;

   /**
    * Trivial in-order dependency that's always satisfied.
    *
    * Note that unlike a default-constructed dependency() which is also
    * trivially satisfied, this is considered to provide dependency
    * information and can be used to clear a previously pending dependency
    * via shadow().
    */
   static const dependency done;

   /* Field-wise equality; used (via scoreboard comparison) to detect
    * convergence of the fixed-point dataflow iteration.
    */
   friend bool
   operator==(const dependency &dep0, const dependency &dep1)
   {
      return dep0.ordered == dep1.ordered &&
             dep0.jp == dep1.jp &&
             dep0.unordered == dep1.unordered &&
             dep0.id == dep1.id &&
             dep0.exec_all == dep1.exec_all;
   }

   friend bool
   operator!=(const dependency &dep0, const dependency &dep1)
   {
      return !(dep0 == dep1);
   }
};

const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN, false);
370
371 /**
372 * Return whether \p dep contains any dependency information.
373 */
374 bool
is_valid(const dependency & dep)375 is_valid(const dependency &dep)
376 {
377 return dep.ordered || dep.unordered;
378 }
379
380 /**
381 * Combine \p dep0 and \p dep1 into a single dependency object that is only
382 * satisfied when both original dependencies are satisfied. This might
383 * involve updating the equivalence relation \p eq in order to make sure
384 * that both out-of-order dependencies are assigned the same hardware SBID
385 * as synchronization token.
386 */
387 dependency
merge(equivalence_relation & eq,const dependency & dep0,const dependency & dep1)388 merge(equivalence_relation &eq,
389 const dependency &dep0, const dependency &dep1)
390 {
391 dependency dep;
392
393 if (dep0.ordered || dep1.ordered) {
394 dep.ordered = dep0.ordered | dep1.ordered;
395 dep.jp = MAX2(dep0.jp, dep1.jp);
396 }
397
398 if (dep0.unordered || dep1.unordered) {
399 dep.unordered = dep0.unordered | dep1.unordered;
400 dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
401 dep1.unordered ? dep1.id : dep0.id);
402 }
403
404 dep.exec_all = dep0.exec_all || dep1.exec_all;
405
406 return dep;
407 }
408
409 /**
410 * Override dependency information of \p dep0 with that of \p dep1.
411 */
412 dependency
shadow(const dependency & dep0,const dependency & dep1)413 shadow(const dependency &dep0, const dependency &dep1)
414 {
415 return is_valid(dep1) ? dep1 : dep0;
416 }
417
418 /**
419 * Translate dependency information across the program.
420 *
421 * This returns a dependency on the same instruction translated to the
422 * ordered_address space of a different block. The correct shift for
423 * transporting a dependency across an edge of the CFG is the difference
424 * between the local ordered_address of the first instruction of the target
425 * block and the local ordered_address of the instruction immediately after
426 * the end of the origin block.
427 */
428 dependency
transport(dependency dep,int delta)429 transport(dependency dep, int delta)
430 {
431 if (dep.ordered && dep.jp > INT_MIN)
432 dep.jp += delta;
433
434 return dep;
435 }
436
437 /**
438 * Return simplified dependency removing any synchronization modes not
439 * applicable to an instruction reading the same register location.
440 */
441 dependency
dependency_for_read(dependency dep)442 dependency_for_read(dependency dep)
443 {
444 dep.ordered &= TGL_REGDIST_DST;
445 return dep;
446 }
447
448 /**
449 * Return simplified dependency removing any synchronization modes not
450 * applicable to an instruction \p inst writing the same register location.
451 */
452 dependency
dependency_for_write(const fs_inst * inst,dependency dep)453 dependency_for_write(const fs_inst *inst, dependency dep)
454 {
455 if (!is_unordered(inst))
456 dep.ordered &= TGL_REGDIST_DST;
457 return dep;
458 }
459
460 /** @} */
461
/**
 * Scoreboard representation.  This keeps track of the data dependencies of
 * registers with GRF granularity.
 */
class scoreboard {
public:
   /**
    * Look up the most current data dependency for register \p r.
    * Registers of files not tracked by this pass yield a trivial
    * dependency() [see dep() below].
    */
   dependency
   get(const fs_reg &r) const
   {
      /* dep() is non-const because it hands out mutable pointers into the
       * internal arrays; the lookup itself doesn't modify anything, hence
       * the const_cast.
       */
      if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
         return *p;
      else
         return dependency();
   }

   /**
    * Specify the most current data dependency for register \p r.
    * Silently a no-op for untracked register files.
    */
   void
   set(const fs_reg &r, const dependency &d)
   {
      if (dependency *p = dep(r))
         *p = d;
   }

   /**
    * Component-wise merge() of corresponding dependencies from two
    * scoreboard objects.  \sa merge().
    */
   friend scoreboard
   merge(equivalence_relation &eq,
         const scoreboard &sb0, const scoreboard &sb1)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

      sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

      return sb;
   }

   /**
    * Component-wise shadow() of corresponding dependencies from two
    * scoreboard objects.  \sa shadow().
    */
   friend scoreboard
   shadow(const scoreboard &sb0, const scoreboard &sb1)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

      sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

      return sb;
   }

   /**
    * Component-wise transport() of dependencies from a scoreboard
    * object.  \sa transport().
    */
   friend scoreboard
   transport(const scoreboard &sb0, int delta)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

      sb.addr_dep = transport(sb0.addr_dep, delta);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

      return sb;
   }

   /* Field-wise equality, used by the fixed-point dataflow iteration in
    * propagate_block_scoreboards() to detect convergence.
    */
   friend bool
   operator==(const scoreboard &sb0, const scoreboard &sb1)
   {
      for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
         if (sb0.grf_deps[i] != sb1.grf_deps[i])
            return false;
      }

      if (sb0.addr_dep != sb1.addr_dep)
         return false;

      for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
         if (sb0.accum_deps[i] != sb1.accum_deps[i])
            return false;
      }

      return true;
   }

   friend bool
   operator!=(const scoreboard &sb0, const scoreboard &sb1)
   {
      return !(sb0 == sb1);
   }

private:
   /* Per-GRF dependencies; MRF registers are mapped into the tail of this
    * array via GEN7_MRF_HACK_START in dep() below.
    */
   dependency grf_deps[BRW_MAX_GRF];
   /* Dependency of the address register. */
   dependency addr_dep;
   /* Dependencies of the accumulator registers. */
   dependency accum_deps[10];

   /**
    * Return a pointer to the dependency slot tracking register \p r, or
    * NULL for register files not tracked by this pass (which makes set()
    * a no-op and get() return a trivial dependency for them — see the
    * file header for the list of deliberately untracked ARF registers).
    */
   dependency *
   dep(const fs_reg &r)
   {
      /* For VGRFs this pass runs post-RA, so nr + offset indexes a real
       * hardware GRF; other files use the generic byte offset helper.
       */
      const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                            reg_offset(r) / REG_SIZE);

      return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
              r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
              r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
              r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                reg < BRW_ARF_FLAG ? &accum_deps[
                   reg - BRW_ARF_ACCUMULATOR] :
              NULL);
   }
};
597
598 /**
599 * Dependency list handling.
600 * @{
601 */
602 struct dependency_list {
dependency_list__anonf3a7e32b0111::dependency_list603 dependency_list() : deps(NULL), n(0) {}
604
~dependency_list__anonf3a7e32b0111::dependency_list605 ~dependency_list()
606 {
607 free(deps);
608 }
609
610 void
push_back__anonf3a7e32b0111::dependency_list611 push_back(const dependency &dep)
612 {
613 deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
614 deps[n++] = dep;
615 }
616
617 unsigned
size__anonf3a7e32b0111::dependency_list618 size() const
619 {
620 return n;
621 }
622
623 const dependency &
operator []__anonf3a7e32b0111::dependency_list624 operator[](unsigned i) const
625 {
626 assert(i < n);
627 return deps[i];
628 }
629
630 dependency &
operator []__anonf3a7e32b0111::dependency_list631 operator[](unsigned i)
632 {
633 assert(i < n);
634 return deps[i];
635 }
636
637 private:
638 dependency_list(const dependency_list &);
639 dependency_list &
640 operator=(const dependency_list &);
641
642 dependency *deps;
643 unsigned n;
644 };
645
/**
 * Add dependency \p dep to the list of dependencies of an instruction
 * \p deps, merging it with any compatible dependency already on the list.
 *
 * \p ids is the unordered-ID translation table built from the equivalence
 * relation [see equivalence_relation::flatten()], applied so that two
 * dependencies on linked tokens compare equal below.
 */
void
add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
{
   if (is_valid(dep)) {
      /* Translate the unordered dependency token first in order to keep
       * the list minimally redundant.
       */
      if (dep.unordered)
         dep.id = ids[dep.id];

      /* Try to combine the specified dependency with any existing ones. */
      for (unsigned i = 0; i < deps.size(); i++) {
         /* Don't combine otherwise matching dependencies if there is an
          * exec_all mismatch which would cause a SET dependency to gain an
          * exec_all flag, since that would prevent it from being baked
          * into the instruction we want to allocate an SBID for.
          */
         if (deps[i].exec_all != dep.exec_all &&
             (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
             (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
            continue;

         /* Fold the in-order component into an existing ordered entry,
          * keeping the most recent (largest) counter value.
          */
         if (dep.ordered && deps[i].ordered) {
            deps[i].jp = MAX2(deps[i].jp, dep.jp);
            deps[i].ordered |= dep.ordered;
            deps[i].exec_all |= dep.exec_all;
            dep.ordered = TGL_REGDIST_NULL;
         }

         /* Fold the out-of-order component into an existing entry with
          * the same (already translated) token.
          */
         if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
            deps[i].unordered |= dep.unordered;
            deps[i].exec_all |= dep.exec_all;
            dep.unordered = TGL_SBID_NULL;
         }
      }

      /* Add it to the end of the list if necessary. */
      if (is_valid(dep))
         deps.push_back(dep);
   }
}
691
/**
 * Construct a tgl_swsb annotation encoding any ordered dependencies from
 * the dependency list \p deps of an instruction with ordered_address \p
 * jp.  If \p exec_all is false only dependencies known to be executed with
 * channel masking applied will be considered in the calculation.
 */
tgl_swsb
ordered_dependency_swsb(const dependency_list &deps,
                        const ordered_address &jp,
                        bool exec_all)
{
   unsigned min_dist = ~0u;

   for (unsigned i = 0; i < deps.size(); i++) {
      /* exec_all >= deps[i].exec_all skips NoMask dependencies unless the
       * instruction itself is NoMask-capable.
       */
      if (deps[i].ordered && exec_all >= deps[i].exec_all) {
         const unsigned dist = jp - deps[i].jp;
         /* Dependencies further than max_dist in-order instructions in
          * the past are treated as already satisfied and need no
          * annotation.  NOTE(review): 10 presumably matches the depth of
          * the hardware's in-order tracking window — confirm against the
          * Gfx12 documentation.
          */
         const unsigned max_dist = 10;
         assert(jp > deps[i].jp);
         if (dist <= max_dist)
            /* Clamp to 7, apparently the largest distance representable
             * in the SWSB RegDist field.
             */
            min_dist = MIN3(min_dist, dist, 7);
      }
   }

   /* A ~0u sentinel means no applicable ordered dependency was found, in
    * which case a zero RegDist (no annotation) is returned.
    */
   return { min_dist == ~0u ? 0 : min_dist };
}
717
718 /**
719 * Return whether the dependency list \p deps of an instruction with
720 * ordered_address \p jp has any non-trivial ordered dependencies. If \p
721 * exec_all is false only dependencies known to be executed with channel
722 * masking applied will be considered in the calculation.
723 */
724 bool
find_ordered_dependency(const dependency_list & deps,const ordered_address & jp,bool exec_all)725 find_ordered_dependency(const dependency_list &deps,
726 const ordered_address &jp,
727 bool exec_all)
728 {
729 return ordered_dependency_swsb(deps, jp, exec_all).regdist;
730 }
731
732 /**
733 * Return the full tgl_sbid_mode bitset for the first unordered dependency
734 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
735 * no such dependency is present. If \p exec_all is false only
736 * dependencies known to be executed with channel masking applied will be
737 * considered in the calculation.
738 */
739 tgl_sbid_mode
find_unordered_dependency(const dependency_list & deps,tgl_sbid_mode unordered,bool exec_all)740 find_unordered_dependency(const dependency_list &deps,
741 tgl_sbid_mode unordered,
742 bool exec_all)
743 {
744 if (unordered) {
745 for (unsigned i = 0; i < deps.size(); i++) {
746 if ((unordered & deps[i].unordered) &&
747 exec_all >= deps[i].exec_all)
748 return deps[i].unordered;
749 }
750 }
751
752 return TGL_SBID_NULL;
753 }
754
/**
 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
 * \p deps that can be represented directly in the SWSB annotation of the
 * instruction without additional SYNC instructions, or zero if no such
 * dependency is present.
 */
tgl_sbid_mode
baked_unordered_dependency_mode(const fs_inst *inst,
                                const dependency_list &deps,
                                const ordered_address &jp)
{
   const bool exec_all = inst->force_writemask_all;
   const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);

   /* A SET dependency (the instruction's own token allocation) always
    * takes priority: it must live on the instruction itself.
    */
   if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
      return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
   /* An out-of-order instruction with a pending ordered dependency needs
    * its SWSB slot for the RegDist annotation, so nothing can be baked.
    */
   else if (has_ordered && is_unordered(inst))
      return TGL_SBID_NULL;
   /* A DST-mode wait can share the annotation as long as the combination
    * above doesn't apply.
    */
   else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
            (!has_ordered || !is_unordered(inst)))
      return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
   /* A SRC-mode wait can only be baked when no RegDist annotation is
    * needed at all.
    */
   else if (!has_ordered)
      return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
   else
      return TGL_SBID_NULL;
}
781
782 /** @} */
783
784 /**
785 * Shader instruction dependency calculation.
786 * @{
787 */
788
/**
 * Update scoreboard object \p sb to account for the execution of
 * instruction \p inst, recording the dependencies its reads and writes
 * impose on subsequent instructions.
 */
void
update_inst_scoreboard(const ordered_address *jps,
                       const fs_inst *inst, unsigned ip, scoreboard &sb)
{
   const bool exec_all = inst->force_writemask_all;

   /* Track any source registers that may be fetched asynchronously by this
    * instruction, otherwise clear the dependency in order to avoid
    * subsequent redundant synchronization.
    */
   for (unsigned i = 0; i < inst->sources; i++) {
      /* Payload and math sources are read asynchronously and need an SBID
       * token; sources of regular in-order instructions get a RegDist
       * dependency; anything else is cleared via dependency::done.
       */
      const dependency rd_dep =
         (inst->is_payload(i) ||
          inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
         ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) :
         dependency::done;

      for (unsigned j = 0; j < regs_read(inst, i); j++)
         sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
   }

   /* SEND messages with an MRF payload read those registers
    * asynchronously as well.
    */
   if (is_send(inst) && inst->base_mrf != -1) {
      const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

      for (unsigned j = 0; j < inst->mlen; j++)
         sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
   }

   /* Track any destination registers of this instruction. */
   const dependency wr_dep =
      is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
      ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) :
      dependency();

   if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
       !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++)
         sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
   }
}
833
834 /**
835 * Calculate scoreboard objects locally that represent any pending (and
836 * unconditionally resolved) dependencies at the end of each block of the
837 * program.
838 */
839 scoreboard *
gather_block_scoreboards(const fs_visitor * shader,const ordered_address * jps)840 gather_block_scoreboards(const fs_visitor *shader,
841 const ordered_address *jps)
842 {
843 scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
844 unsigned ip = 0;
845
846 foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
847 update_inst_scoreboard(jps, inst, ip++, sbs[block->num]);
848
849 return sbs;
850 }
851
/**
 * Propagate data dependencies globally through the control flow graph
 * until a fixed point is reached.
 *
 * Calculates the set of dependencies potentially pending at the beginning
 * of each block, and returns it as an array of scoreboard objects.  Tokens
 * that must share the same hardware SBID are linked through \p eq as a
 * side effect [see merge()].
 */
scoreboard *
propagate_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps,
                            equivalence_relation &eq)
{
   const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
   scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
   scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

   /* Standard forward dataflow iteration: keep re-propagating block
    * outputs to successor inputs until nothing changes.
    */
   for (bool progress = true; progress;) {
      progress = false;

      foreach_block(block, shader->cfg) {
         /* The block's output is its input with the block's own local
          * effects (delta) overlaid on top.
          */
         const scoreboard sb = shadow(in_sbs[block->num],
                                      delta_sbs[block->num]);

         if (sb != out_sbs[block->num]) {
            foreach_list_typed(bblock_link, child_link, link,
                               &block->children) {
               scoreboard &in_sb = in_sbs[child_link->block->num];
               /* Shift ordered_address counters into the successor's
                * local counter space [see transport() for the rationale
                * behind this delta].
                */
               const int delta =
                  jps[child_link->block->start_ip] - jps[block->end_ip]
                  - ordered_unit(static_cast<const fs_inst *>(block->end()));

               in_sb = merge(eq, in_sb, transport(sb, delta));
            }

            out_sbs[block->num] = sb;
            progress = true;
         }
      }
   }

   delete[] delta_sbs;
   delete[] out_sbs;

   return in_sbs;
}
897
/**
 * Return the list of potential dependencies of each instruction in the
 * shader based on the result of global dependency analysis.  The caller
 * takes ownership of the returned array (delete[]).
 */
dependency_list *
gather_inst_dependencies(const fs_visitor *shader,
                         const ordered_address *jps)
{
   equivalence_relation eq(num_instructions(shader));
   scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
   /* Flatten the equivalence relation so equivalent tokens compare equal
    * when dependencies are combined in add_dependency().
    */
   const unsigned *ids = eq.flatten();
   dependency_list *deps = new dependency_list[num_instructions(shader)];
   unsigned ip = 0;

   foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
      const bool exec_all = inst->force_writemask_all;
      scoreboard &sb = sbs[block->num];

      /* Reads must wait on prior writes of the same locations (RAW). */
      for (unsigned i = 0; i < inst->sources; i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            add_dependency(ids, deps[ip], dependency_for_read(
               sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
      }

      /* MRF payload reads of SEND messages behave like source reads. */
      if (is_send(inst) && inst->base_mrf != -1) {
         for (unsigned j = 0; j < inst->mlen; j++)
            add_dependency(ids, deps[ip], dependency_for_read(
               sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
      }

      /* Out-of-order instructions need a SET dependency so an SBID token
       * is allocated for their own completion tracking.
       */
      if (is_unordered(inst))
         add_dependency(ids, deps[ip],
                        dependency(TGL_SBID_SET, ip, exec_all));

      /* Writes must wait on prior reads and writes of the destination
       * (WAR/WAW), unless dependency checking is explicitly disabled.
       */
      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++) {
               add_dependency(ids, deps[ip], dependency_for_write(inst,
                  sb.get(byte_offset(inst->dst, REG_SIZE * j))));
            }
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
               add_dependency(ids, deps[ip], dependency_for_write(inst,
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }
      }

      /* Fold this instruction's own effects into the running scoreboard
       * before moving on to the next instruction.
       */
      update_inst_scoreboard(jps, inst, ip, sb);
      ip++;
   }

   delete[] sbs;
   delete[] ids;

   return deps;
}
956
957 /** @} */
958
959 /**
960 * Allocate SBID tokens to track the execution of every out-of-order
961 * instruction of the shader.
962 */
963 dependency_list *
allocate_inst_dependencies(const fs_visitor * shader,const dependency_list * deps0)964 allocate_inst_dependencies(const fs_visitor *shader,
965 const dependency_list *deps0)
966 {
967 /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
968 * shaders with a large number of SEND messages.
969 */
970
971 /* Allocate an unordered dependency ID to hardware SBID translation
972 * table with as many entries as instructions there are in the shader,
973 * which is the maximum number of unordered IDs we can find in the
974 * program.
975 */
976 unsigned *ids = new unsigned[num_instructions(shader)];
977 for (unsigned ip = 0; ip < num_instructions(shader); ip++)
978 ids[ip] = ~0u;
979
980 dependency_list *deps1 = new dependency_list[num_instructions(shader)];
981 unsigned next_id = 0;
982
983 for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
984 for (unsigned i = 0; i < deps0[ip].size(); i++) {
985 const dependency &dep = deps0[ip][i];
986
987 if (dep.unordered && ids[dep.id] == ~0u)
988 ids[dep.id] = (next_id++) & 0xf;
989
990 add_dependency(ids, deps1[ip], dep);
991 }
992 }
993
994 delete[] ids;
995
996 return deps1;
997 }
998
/**
 * Emit dependency information provided by \p deps into the shader,
 * inserting additional SYNC instructions for dependencies that can't be
 * represented directly by annotating existing instructions.
 */
void
emit_inst_dependencies(fs_visitor *shader,
                       const ordered_address *jps,
                       const dependency_list *deps)
{
   unsigned ip = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
      const bool exec_all = inst->force_writemask_all;
      /* Start from the RegDist annotation covering any ordered
       * dependencies, then see which single unordered dependency (if any)
       * can share the instruction's SWSB field.
       */
      tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.unordered) {
            if (unordered_mode == dep.unordered &&
                exec_all >= dep.exec_all && !swsb.mode) {
               /* Bake unordered dependency into the instruction's SWSB if
                * possible, except in cases where the current instruction
                * isn't marked NoMask but the dependency is, since that
                * might lead to data coherency issues due to
                * GEN:BUG:1407528679.
                */
               swsb.sbid = dep.id;
               swsb.mode = dep.unordered;
            } else {
               /* Emit dependency into the SWSB of an extra SYNC
                * instruction.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched.sbid = dep.id;
               sync->sched.mode = dep.unordered;
               /* SET dependencies must always end up on the instruction
                * itself [see baked_unordered_dependency_mode()], never on
                * an auxiliary SYNC.
                */
               assert(!(sync->sched.mode & TGL_SBID_SET));
            }
         }
      }

      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.ordered && dep.exec_all > exec_all &&
             find_ordered_dependency(deps[ip], jps[ip], true)) {
            /* If the current instruction is not marked NoMask but an
             * ordered dependency is, perform the synchronization as a
             * separate NoMask SYNC instruction in order to avoid data
             * coherency issues due to GEN:BUG:1407528679.  The similar
             * scenario with unordered dependencies should have been
             * handled above.
             */
            const fs_builder ibld = fs_builder(shader, block, inst)
                                    .exec_all().group(1, 0);
            fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                      brw_imm_ud(TGL_SYNC_NOP));
            sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
            /* One SYNC covers all such ordered dependencies at once. */
            break;
         }
      }

      /* Update the IR. */
      inst->sched = swsb;
      inst->no_dd_check = inst->no_dd_clear = false;
      ip++;
   }
}
1073 }
1074
1075 bool
lower_scoreboard()1076 fs_visitor::lower_scoreboard()
1077 {
1078 if (devinfo->gen >= 12) {
1079 const ordered_address *jps = ordered_inst_addresses(this);
1080 const dependency_list *deps0 = gather_inst_dependencies(this, jps);
1081 const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
1082 emit_inst_dependencies(this, jps, deps1);
1083 delete[] deps1;
1084 delete[] deps0;
1085 delete[] jps;
1086 }
1087
1088 return true;
1089 }
1090