• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2019 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file brw_fs_scoreboard.cpp
25  *
26  * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
27  * data coherency between register reads and writes in previous generations.
28  * This lowering pass runs after register allocation in order to make up for
29  * it.
30  *
31  * It works by performing global dataflow analysis in order to determine the
32  * set of potential dependencies of every instruction in the shader, and then
33  * inserts any required SWSB annotations and additional SYNC instructions in
34  * order to guarantee data coherency.
35  *
 * WARNING - Access of the following (rarely used) ARF registers is not
 *           tracked here, and requires the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
39  *
40  *  - sp stack pointer
41  *  - sr0 state register
42  *  - cr0 control register
43  *  - ip instruction pointer
44  *  - tm0 timestamp register
45  *  - dbg0 debug register
46  *
47  * The following ARF registers don't need to be tracked here because data
48  * coherency is still provided transparently by the hardware:
49  *
50  *  - f0-1 flag registers
51  *  - n0 notification register
52  *  - tdr0 thread dependency register
53  */
54 
55 #include "brw_fs.h"
56 #include "brw_cfg.h"
57 
58 using namespace brw;
59 
60 namespace {
61    /**
62     * In-order instruction accounting.
63     * @{
64     */
65 
66    /**
67     * Number of in-order hardware instructions contained in this IR
68     * instruction.  This determines the increment applied to the RegDist
69     * counter calculated for any ordered dependency that crosses this
70     * instruction.
71     */
72    unsigned
ordered_unit(const fs_inst * inst)73    ordered_unit(const fs_inst *inst)
74    {
75       switch (inst->opcode) {
76       case BRW_OPCODE_SYNC:
77       case BRW_OPCODE_DO:
78       case SHADER_OPCODE_UNDEF:
79       case FS_OPCODE_PLACEHOLDER_HALT:
80       case FS_OPCODE_SCHEDULING_FENCE:
81          return 0;
82       default:
83          /* Note that the following is inaccurate for virtual instructions
84           * that expand to more in-order instructions than assumed here, but
85           * that can only lead to suboptimal execution ordering, data
86           * coherency won't be impacted.  Providing exact RegDist counts for
87           * each virtual instruction would allow better ALU performance, but
88           * it would require keeping this switch statement in perfect sync
89           * with the generator in order to avoid data corruption.  Lesson is
90           * (again) don't use virtual instructions if you want optimal
91           * scheduling.
92           */
93          return is_unordered(inst) ? 0 : 1;
94       }
95    }
96 
97    /**
98     * Type for an instruction counter that increments for in-order
99     * instructions only, arbitrarily denoted 'jp' throughout this lowering
100     * pass in order to distinguish it from the regular instruction counter.
101     */
102    typedef int ordered_address;
103 
104    /**
105     * Return the number of instructions in the program.
106     */
107    unsigned
num_instructions(const backend_shader * shader)108    num_instructions(const backend_shader *shader)
109    {
110       return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
111    }
112 
113    /**
114     * Calculate the local ordered_address instruction counter at every
115     * instruction of the shader for subsequent constant-time look-up.
116     */
117    ordered_address *
ordered_inst_addresses(const fs_visitor * shader)118    ordered_inst_addresses(const fs_visitor *shader)
119    {
120       ordered_address *jps = new ordered_address[num_instructions(shader)];
121       ordered_address jp = 0;
122       unsigned ip = 0;
123 
124       foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
125          jps[ip] = jp;
126          jp += ordered_unit(inst);
127          ip++;
128       }
129 
130       return jps;
131    }
132 
133    /**
134     * Synchronization mode required for data manipulated by in-order
135     * instructions.
136     *
137     * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
138     * enum for additional type safety.  The hardware doesn't provide control
139     * over the synchronization mode for RegDist annotations, this is only used
140     * internally in this pass in order to optimize out redundant read
141     * dependencies where possible.
142     */
143    enum tgl_regdist_mode {
144       TGL_REGDIST_NULL = 0,
145       TGL_REGDIST_SRC = 1,
146       TGL_REGDIST_DST = 2
147    };
148 
149    /**
150     * Allow bitwise arithmetic of tgl_regdist_mode enums.
151     */
152    tgl_regdist_mode
operator |(tgl_regdist_mode x,tgl_regdist_mode y)153    operator|(tgl_regdist_mode x, tgl_regdist_mode y)
154    {
155       return tgl_regdist_mode(unsigned(x) | unsigned(y));
156    }
157 
158    tgl_regdist_mode
operator &(tgl_regdist_mode x,tgl_regdist_mode y)159    operator&(tgl_regdist_mode x, tgl_regdist_mode y)
160    {
161       return tgl_regdist_mode(unsigned(x) & unsigned(y));
162    }
163 
164    tgl_regdist_mode &
operator |=(tgl_regdist_mode & x,tgl_regdist_mode y)165    operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
166    {
167       return x = x | y;
168    }
169 
170    tgl_regdist_mode &
operator &=(tgl_regdist_mode & x,tgl_regdist_mode y)171    operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
172    {
173       return x = x & y;
174    }
175 
176    /** @} */
177 
178    /**
179     * Representation of an equivalence relation among the set of unsigned
180     * integers.
181     *
182     * Its initial state is the identity relation '~' such that i ~ j if and
183     * only if i == j for every pair of unsigned integers i and j.
184     */
185    struct equivalence_relation {
equivalence_relation__anonf3a7e32b0111::equivalence_relation186       equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
187       {
188          for (unsigned i = 0; i < n; i++)
189             is[i] = i;
190       }
191 
~equivalence_relation__anonf3a7e32b0111::equivalence_relation192       ~equivalence_relation()
193       {
194          delete[] is;
195       }
196 
197       /**
198        * Return equivalence class index of the specified element.  Effectively
199        * this is the numeric value of an arbitrary representative from the
200        * equivalence class.
201        *
202        * Allows the evaluation of the equivalence relation according to the
203        * rule that i ~ j if and only if lookup(i) == lookup(j).
204        */
205       unsigned
lookup__anonf3a7e32b0111::equivalence_relation206       lookup(unsigned i) const
207       {
208          if (i < n && is[i] != i)
209             return lookup(is[i]);
210          else
211             return i;
212       }
213 
214       /**
215        * Create an array with the results of the lookup() method for
216        * constant-time evaluation.
217        */
218       unsigned *
flatten__anonf3a7e32b0111::equivalence_relation219       flatten() const
220       {
221          unsigned *ids = new unsigned[n];
222 
223          for (unsigned i = 0; i < n; i++)
224             ids[i] = lookup(i);
225 
226          return ids;
227       }
228 
229       /**
230        * Mutate the existing equivalence relation minimally by imposing the
231        * additional requirement that i ~ j.
232        *
233        * The algorithm updates the internal representation recursively in
234        * order to guarantee transitivity while preserving the previously
235        * specified equivalence requirements.
236        */
237       unsigned
link__anonf3a7e32b0111::equivalence_relation238       link(unsigned i, unsigned j)
239       {
240          const unsigned k = lookup(i);
241          assign(i, k);
242          assign(j, k);
243          return k;
244       }
245 
246    private:
247       equivalence_relation(const equivalence_relation &);
248 
249       equivalence_relation &
250       operator=(const equivalence_relation &);
251 
252       /**
253        * Assign the representative of \p from to be equivalent to \p to.
254        *
255        * At the same time the data structure is partially flattened as much as
256        * it's possible without increasing the number of recursive calls.
257        */
258       void
assign__anonf3a7e32b0111::equivalence_relation259       assign(unsigned from, unsigned to)
260       {
261          if (from != to) {
262             assert(from < n);
263 
264             if (is[from] != from)
265                assign(is[from], to);
266 
267             is[from] = to;
268          }
269       }
270 
271       unsigned *is;
272       unsigned n;
273    };
274 
   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such that
       * the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path.  It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no unordered
       * dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug GEN:BUG:1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      /* Compare all fields, including exec_all, so that otherwise equal
       * dependencies with different execution masking are kept distinct.
       */
      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   /* SRC mode with a jp of INT_MIN is trivially satisfied but still valid per
    * is_valid(), which is what distinguishes done from dependency().
    */
   const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN, false);
370 
371    /**
372     * Return whether \p dep contains any dependency information.
373     */
374    bool
is_valid(const dependency & dep)375    is_valid(const dependency &dep)
376    {
377       return dep.ordered || dep.unordered;
378    }
379 
380    /**
381     * Combine \p dep0 and \p dep1 into a single dependency object that is only
382     * satisfied when both original dependencies are satisfied.  This might
383     * involve updating the equivalence relation \p eq in order to make sure
384     * that both out-of-order dependencies are assigned the same hardware SBID
385     * as synchronization token.
386     */
387    dependency
merge(equivalence_relation & eq,const dependency & dep0,const dependency & dep1)388    merge(equivalence_relation &eq,
389          const dependency &dep0, const dependency &dep1)
390    {
391       dependency dep;
392 
393       if (dep0.ordered || dep1.ordered) {
394          dep.ordered = dep0.ordered | dep1.ordered;
395          dep.jp = MAX2(dep0.jp, dep1.jp);
396       }
397 
398       if (dep0.unordered || dep1.unordered) {
399          dep.unordered = dep0.unordered | dep1.unordered;
400          dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
401                           dep1.unordered ? dep1.id : dep0.id);
402       }
403 
404       dep.exec_all = dep0.exec_all || dep1.exec_all;
405 
406       return dep;
407    }
408 
409    /**
410     * Override dependency information of \p dep0 with that of \p dep1.
411     */
412    dependency
shadow(const dependency & dep0,const dependency & dep1)413    shadow(const dependency &dep0, const dependency &dep1)
414    {
415       return is_valid(dep1) ? dep1 : dep0;
416    }
417 
418    /**
419     * Translate dependency information across the program.
420     *
421     * This returns a dependency on the same instruction translated to the
422     * ordered_address space of a different block.  The correct shift for
423     * transporting a dependency across an edge of the CFG is the difference
424     * between the local ordered_address of the first instruction of the target
425     * block and the local ordered_address of the instruction immediately after
426     * the end of the origin block.
427     */
428    dependency
transport(dependency dep,int delta)429    transport(dependency dep, int delta)
430    {
431       if (dep.ordered && dep.jp > INT_MIN)
432          dep.jp += delta;
433 
434       return dep;
435    }
436 
437    /**
438     * Return simplified dependency removing any synchronization modes not
439     * applicable to an instruction reading the same register location.
440     */
441    dependency
dependency_for_read(dependency dep)442    dependency_for_read(dependency dep)
443    {
444       dep.ordered &= TGL_REGDIST_DST;
445       return dep;
446    }
447 
448    /**
449     * Return simplified dependency removing any synchronization modes not
450     * applicable to an instruction \p inst writing the same register location.
451     */
452    dependency
dependency_for_write(const fs_inst * inst,dependency dep)453    dependency_for_write(const fs_inst *inst, dependency dep)
454    {
455       if (!is_unordered(inst))
456          dep.ordered &= TGL_REGDIST_DST;
457       return dep;
458    }
459 
460    /** @} */
461 
   /**
    * Scoreboard representation.  This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       *
       * Returns a default (empty) dependency for registers not tracked by
       * this pass.
       */
      dependency
      get(const fs_reg &r) const
      {
         /* dep() is non-const because it hands back a mutable pointer into
          * the tracking arrays; the const_cast is safe here since the result
          * is only read.
          */
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       *
       * Silently ignored for registers this pass doesn't track (dep()
       * returns NULL for those).
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects.  \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects.  \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object.  \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

         return sb;
      }

      /* Used by the dataflow fixed-point iteration to detect convergence. */
      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
            if (sb0.accum_deps[i] != sb1.accum_deps[i])
               return false;
         }

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      /* Per-GRF dependencies; MRF registers are tracked in the same array at
       * an offset of GEN7_MRF_HACK_START.
       */
      dependency grf_deps[BRW_MAX_GRF];
      /* Dependency for the address ARF register. */
      dependency addr_dep;
      /* Dependencies for the accumulator ARF registers. */
      dependency accum_deps[10];

      /**
       * Return a pointer to the tracked dependency slot for register \p r,
       * or NULL if \p r isn't tracked by this pass (e.g. flag registers or
       * immediates; see the file header for the rationale).
       */
      dependency *
      dep(const fs_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                                  reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                                  reg < BRW_ARF_FLAG ? &accum_deps[
                                     reg - BRW_ARF_ACCUMULATOR] :
                 NULL);
      }
   };
597 
598    /**
599     * Dependency list handling.
600     * @{
601     */
602    struct dependency_list {
dependency_list__anonf3a7e32b0111::dependency_list603       dependency_list() : deps(NULL), n(0) {}
604 
~dependency_list__anonf3a7e32b0111::dependency_list605       ~dependency_list()
606       {
607          free(deps);
608       }
609 
610       void
push_back__anonf3a7e32b0111::dependency_list611       push_back(const dependency &dep)
612       {
613          deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
614          deps[n++] = dep;
615       }
616 
617       unsigned
size__anonf3a7e32b0111::dependency_list618       size() const
619       {
620          return n;
621       }
622 
623       const dependency &
operator []__anonf3a7e32b0111::dependency_list624       operator[](unsigned i) const
625       {
626          assert(i < n);
627          return deps[i];
628       }
629 
630       dependency &
operator []__anonf3a7e32b0111::dependency_list631       operator[](unsigned i)
632       {
633          assert(i < n);
634          return deps[i];
635       }
636 
637    private:
638       dependency_list(const dependency_list &);
639       dependency_list &
640       operator=(const dependency_list &);
641 
642       dependency *deps;
643       unsigned n;
644    };
645 
   /**
    * Add dependency \p dep to the list of dependencies of an instruction
    * \p deps.
    *
    * \p ids is the flattened equivalence relation mapping every SBID token
    * to its class representative [as returned by
    * equivalence_relation::flatten()].  Where possible the new dependency is
    * folded into an existing entry instead of growing the list.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            /* Fold the in-order component into an existing ordered entry,
             * keeping the later (larger) counter.
             */
            if (dep.ordered && deps[i].ordered) {
               deps[i].jp = MAX2(deps[i].jp, dep.jp);
               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            /* Fold the out-of-order component into an existing entry with
             * the same (translated) SBID token.
             */
            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }
691 
692    /**
693     * Construct a tgl_swsb annotation encoding any ordered dependencies from
694     * the dependency list \p deps of an instruction with ordered_address \p
695     * jp.  If \p exec_all is false only dependencies known to be executed with
696     * channel masking applied will be considered in the calculation.
697     */
698    tgl_swsb
ordered_dependency_swsb(const dependency_list & deps,const ordered_address & jp,bool exec_all)699    ordered_dependency_swsb(const dependency_list &deps,
700                            const ordered_address &jp,
701                            bool exec_all)
702    {
703       unsigned min_dist = ~0u;
704 
705       for (unsigned i = 0; i < deps.size(); i++) {
706          if (deps[i].ordered && exec_all >= deps[i].exec_all) {
707             const unsigned dist = jp - deps[i].jp;
708             const unsigned max_dist = 10;
709             assert(jp > deps[i].jp);
710             if (dist <= max_dist)
711                min_dist = MIN3(min_dist, dist, 7);
712          }
713       }
714 
715       return { min_dist == ~0u ? 0 : min_dist };
716    }
717 
718    /**
719     * Return whether the dependency list \p deps of an instruction with
720     * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
721     * exec_all is false only dependencies known to be executed with channel
722     * masking applied will be considered in the calculation.
723     */
724    bool
find_ordered_dependency(const dependency_list & deps,const ordered_address & jp,bool exec_all)725    find_ordered_dependency(const dependency_list &deps,
726                            const ordered_address &jp,
727                            bool exec_all)
728    {
729       return ordered_dependency_swsb(deps, jp, exec_all).regdist;
730    }
731 
732    /**
733     * Return the full tgl_sbid_mode bitset for the first unordered dependency
734     * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
735     * no such dependency is present.  If \p exec_all is false only
736     * dependencies known to be executed with channel masking applied will be
737     * considered in the calculation.
738     */
739    tgl_sbid_mode
find_unordered_dependency(const dependency_list & deps,tgl_sbid_mode unordered,bool exec_all)740    find_unordered_dependency(const dependency_list &deps,
741                              tgl_sbid_mode unordered,
742                              bool exec_all)
743    {
744       if (unordered) {
745          for (unsigned i = 0; i < deps.size(); i++) {
746             if ((unordered & deps[i].unordered) &&
747                 exec_all >= deps[i].exec_all)
748                return deps[i].unordered;
749          }
750       }
751 
752       return TGL_SBID_NULL;
753    }
754 
755    /**
756     * Return the tgl_sbid_mode bitset of an unordered dependency from the list
757     * \p deps that can be represented directly in the SWSB annotation of the
758     * instruction without additional SYNC instructions, or zero if no such
759     * dependency is present.
760     */
761    tgl_sbid_mode
baked_unordered_dependency_mode(const fs_inst * inst,const dependency_list & deps,const ordered_address & jp)762    baked_unordered_dependency_mode(const fs_inst *inst,
763                                    const dependency_list &deps,
764                                    const ordered_address &jp)
765    {
766       const bool exec_all = inst->force_writemask_all;
767       const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
768 
769       if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
770          return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
771       else if (has_ordered && is_unordered(inst))
772          return TGL_SBID_NULL;
773       else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
774                (!has_ordered || !is_unordered(inst)))
775          return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
776       else if (!has_ordered)
777          return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
778       else
779          return TGL_SBID_NULL;
780    }
781 
782    /** @} */
783 
784    /**
785     * Shader instruction dependency calculation.
786     * @{
787     */
788 
   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    *
    * \p jps is the per-instruction ordered_address array [as returned by
    * ordered_inst_addresses()] and \p ip is the index of \p inst within it.
    */
   void
   update_inst_scoreboard(const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;

      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) :
            dependency::done;

         /* Mark every GRF touched by this source. */
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
      }

      /* Legacy-style sends read their payload out of the MRF region; mark
       * those registers as asynchronously read as well.
       */
      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) :
         dependency();

      /* Writes to the null register carry no dependency. */
      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }
833 
834    /**
835     * Calculate scoreboard objects locally that represent any pending (and
836     * unconditionally resolved) dependencies at the end of each block of the
837     * program.
838     */
839    scoreboard *
gather_block_scoreboards(const fs_visitor * shader,const ordered_address * jps)840    gather_block_scoreboards(const fs_visitor *shader,
841                             const ordered_address *jps)
842    {
843       scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
844       unsigned ip = 0;
845 
846       foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
847          update_inst_scoreboard(jps, inst, ip++, sbs[block->num]);
848 
849       return sbs;
850    }
851 
852    /**
853     * Propagate data dependencies globally through the control flow graph
854     * until a fixed point is reached.
855     *
856     * Calculates the set of dependencies potentially pending at the beginning
857     * of each block, and returns it as an array of scoreboard objects.
858     */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      /* delta_sbs[b] holds the dependencies generated locally by block b,
       * in_sbs[b]/out_sbs[b] the dependencies pending at its entry/exit.
       */
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      /* Forward dataflow iteration: keep sweeping the CFG until no block's
       * outgoing scoreboard changes (fixed point).
       */
      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            /* Exit state of this block: its incoming dependencies overlaid
             * ("shadowed") by the ones it generates itself.
             */
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  /* Ordered-address distance from this block's end to the
                   * successor's start, used to re-base RegDist dependencies
                   * across the edge.  The trailing term discounts the
                   * block-ending instruction itself if it counts as an
                   * ordered unit.
                   */
                  const int delta =
                     jps[child_link->block->start_ip] - jps[block->end_ip]
                     - ordered_unit(static_cast<const fs_inst *>(block->end()));

                  /* Merge into the successor's entry state; eq accumulates
                   * which SBID dependencies end up equivalent.
                   */
                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      /* Caller owns the returned per-block entry scoreboards. */
      return in_sbs;
   }
897 
898    /**
899     * Return the list of potential dependencies of each instruction in the
900     * shader based on the result of global dependency analysis.
901     */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      /* Flattened equivalence relation: maps each unordered dependency ID to
       * a canonical representative so equivalent SBIDs are unified.
       */
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         /* Scoreboard state just before this instruction executes; updated
          * incrementally at the bottom of the loop, mirroring exactly the
          * replay done by gather_block_scoreboards().
          */
         scoreboard &sb = sbs[block->num];

         /* Read-after-write hazards on every register this instruction
          * reads.
          */
         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         /* Legacy SEND payloads read from the MRF region. */
         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         /* An unordered instruction allocates ("sets") its own SBID token. */
         if (is_unordered(inst))
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         /* Write-after-write/write-after-read hazards on the destination,
          * unless dependency checking was explicitly disabled on this
          * instruction.
          */
         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            /* Implied MRF writes of legacy SEND messages. */
            if (is_send(inst) && inst->base_mrf != -1) {
               for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
                  add_dependency(ids, deps[ip], dependency_for_write(inst,
                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         /* Advance the scoreboard past this instruction before moving on. */
         update_inst_scoreboard(jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      /* Caller owns the returned per-instruction dependency lists. */
      return deps;
   }
956 
957    /** @} */
958 
959    /**
960     * Allocate SBID tokens to track the execution of every out-of-order
961     * instruction of the shader.
962     */
963    dependency_list *
allocate_inst_dependencies(const fs_visitor * shader,const dependency_list * deps0)964    allocate_inst_dependencies(const fs_visitor *shader,
965                               const dependency_list *deps0)
966    {
967       /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
968        *       shaders with a large number of SEND messages.
969        */
970 
971       /* Allocate an unordered dependency ID to hardware SBID translation
972        * table with as many entries as instructions there are in the shader,
973        * which is the maximum number of unordered IDs we can find in the
974        * program.
975        */
976       unsigned *ids = new unsigned[num_instructions(shader)];
977       for (unsigned ip = 0; ip < num_instructions(shader); ip++)
978          ids[ip] = ~0u;
979 
980       dependency_list *deps1 = new dependency_list[num_instructions(shader)];
981       unsigned next_id = 0;
982 
983       for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
984          for (unsigned i = 0; i < deps0[ip].size(); i++) {
985             const dependency &dep = deps0[ip][i];
986 
987             if (dep.unordered && ids[dep.id] == ~0u)
988                ids[dep.id] = (next_id++) & 0xf;
989 
990             add_dependency(ids, deps1[ip], dep);
991          }
992       }
993 
994       delete[] ids;
995 
996       return deps1;
997    }
998 
999    /**
1000     * Emit dependency information provided by \p deps into the shader,
1001     * inserting additional SYNC instructions for dependencies that can't be
1002     * represented directly by annotating existing instructions.
1003     */
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      unsigned ip = 0;

      /* _safe variant because we insert SYNC instructions while iterating. */
      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         /* RegDist component of the instruction's SWSB annotation, derived
          * from its ordered dependencies.
          */
         tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
         /* The single unordered mode (if any) that may be folded directly
          * into this instruction's own SWSB field.
          */
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * GEN:BUG:1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  /* A SET mode would allocate a token to the SYNC itself,
                   * which would be nonsensical — it must only wait.
                   */
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered && dep.exec_all > exec_all &&
                find_ordered_dependency(deps[ip], jps[ip], true)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to GEN:BUG:1407528679.  The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               /* One NoMask SYNC covers all such ordered dependencies. */
               break;
            }
         }

         /* Update the IR.  The legacy dependency-control flags are
          * meaningless from Gen12 on, so clear them.
          */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
1073 }
1074 
1075 bool
lower_scoreboard()1076 fs_visitor::lower_scoreboard()
1077 {
1078    if (devinfo->gen >= 12) {
1079       const ordered_address *jps = ordered_inst_addresses(this);
1080       const dependency_list *deps0 = gather_inst_dependencies(this, jps);
1081       const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
1082       emit_inst_dependencies(this, jps, deps1);
1083       delete[] deps1;
1084       delete[] deps0;
1085       delete[] jps;
1086    }
1087 
1088    return true;
1089 }
1090