1 /*
2  * Copyright © 2019 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file brw_fs_scoreboard.cpp
25  *
26  * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
27  * data coherency between register reads and writes in previous generations.
28  * This lowering pass runs after register allocation in order to make up for
29  * it.
30  *
31  * It works by performing global dataflow analysis in order to determine the
32  * set of potential dependencies of every instruction in the shader, and then
33  * inserts any required SWSB annotations and additional SYNC instructions in
34  * order to guarantee data coherency.
35  *
36  * WARNING - Access of the following (rarely used) ARF registers is not
37  *           tracked here, and requires the RegDist SWSB annotation to be set
38  *           to 1 by the generator in order to avoid data races:
39  *
40  *  - sp stack pointer
41  *  - sr0 state register
42  *  - cr0 control register
43  *  - ip instruction pointer
44  *  - tm0 timestamp register
45  *  - dbg0 debug register
46  *  - acc2-9 special accumulator registers on TGL
47  *  - mme0-7 math macro extended accumulator registers
48  *
49  * The following ARF registers don't need to be tracked here because data
50  * coherency is still provided transparently by the hardware:
51  *
52  *  - f0-1 flag registers
53  *  - n0 notification register
54  *  - tdr0 thread dependency register
55  */
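/* Illustrative sketch (not part of the pass itself): on Gfx12+ a sequence
 * like
 *
 *    add(8)  r10, r2, r4
 *    mul(8)  r20, r10, r6   // reads r10 written by the previous ADD
 *
 * has a read-after-write hazard on r10 that the hardware no longer resolves
 * on its own.  This pass would annotate the MUL with a RegDist SWSB of 1
 * (wait for the in-order instruction one slot back), while dependencies on
 * out-of-order producers such as SEND are expressed with SBID tokens
 * instead.
 */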
56 
57 #include "brw_fs.h"
58 #include "brw_cfg.h"
59 
60 using namespace brw;
61 
62 namespace {
63    /**
64     * In-order instruction accounting.
65     * @{
66     */
67 
68    /**
69     * Return the RegDist pipeline the hardware will synchronize with if no
70     * pipeline information is provided in the SWSB annotation of an
71     * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
72     */
73    tgl_pipe
74    inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
75    {
76       if (devinfo->verx10 >= 125) {
77          bool has_int_src = false, has_long_src = false;
78 
79          if (is_send(inst))
80             return TGL_PIPE_NONE;
81 
82          for (unsigned i = 0; i < inst->sources; i++) {
83             if (inst->src[i].file != BAD_FILE &&
84                 !inst->is_control_source(i)) {
85                const brw_reg_type t = inst->src[i].type;
86                has_int_src |= !brw_reg_type_is_floating_point(t);
87                has_long_src |= type_sz(t) >= 8;
88             }
89          }
90 
91          return has_long_src ? TGL_PIPE_LONG :
92                 has_int_src ? TGL_PIPE_INT :
93                 TGL_PIPE_FLOAT;
94 
95       } else {
96          return TGL_PIPE_FLOAT;
97       }
98    }
99 
100    /**
101     * Return the RegDist pipeline that will execute an instruction, or
102     * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
103     * RegDist synchronization mechanism.
104     */
105    tgl_pipe
106    inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
107    {
108       const brw_reg_type t = get_exec_type(inst);
109       const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) &&
110          ((inst->opcode == BRW_OPCODE_MUL &&
111            MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
112           (inst->opcode == BRW_OPCODE_MAD &&
113            MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));
114 
115       if (is_unordered(inst))
116          return TGL_PIPE_NONE;
117       else if (devinfo->verx10 < 125)
118          return TGL_PIPE_FLOAT;
119       else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
120                type_sz(t) >= 8)
121          return TGL_PIPE_INT;
122       else if (inst->opcode == SHADER_OPCODE_BROADCAST &&
123                !devinfo->has_64bit_float && type_sz(t) >= 8)
124          return TGL_PIPE_INT;
125       else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
126          return TGL_PIPE_FLOAT;
127       else if (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 ||
128                is_dword_multiply)
129          return TGL_PIPE_LONG;
130       else if (brw_reg_type_is_floating_point(inst->dst.type))
131          return TGL_PIPE_FLOAT;
132       else
133          return TGL_PIPE_INT;
134    }
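   /* Examples of the inference above (illustrative, derived from the checks
    * in inferred_exec_pipe()): on Gfx12.5 a plain F32 ADD lands in
    * TGL_PIPE_FLOAT, a D32 ADD in TGL_PIPE_INT, and a DF-typed instruction
    * or a DWxDW integer multiply in TGL_PIPE_LONG, while unordered
    * instructions such as SEND report TGL_PIPE_NONE.
    */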
135 
136    /**
137     * Index of the \p p pipeline counter in the ordered_address vector defined
138     * below.
139     */
140 #define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) :    \
141                 (abort(), ~0u))
142 
143    /**
144     * Number of in-order hardware instructions for pipeline index \p p contained
145     * in this IR instruction.  This determines the increment applied to the
146     * RegDist counter calculated for any ordered dependency that crosses this
147     * instruction.
148     */
149    unsigned
150    ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
151                 unsigned p)
152    {
153       switch (inst->opcode) {
154       case BRW_OPCODE_SYNC:
155       case BRW_OPCODE_DO:
156       case SHADER_OPCODE_UNDEF:
157       case SHADER_OPCODE_HALT_TARGET:
158       case FS_OPCODE_SCHEDULING_FENCE:
159          return 0;
160       default:
161          /* Note that the following is inaccurate for virtual instructions
162           * that expand to more in-order instructions than assumed here, but
163           * that can only lead to suboptimal execution ordering; data
164           * coherency won't be impacted.  Providing exact RegDist counts for
165           * each virtual instruction would allow better ALU performance, but
166           * it would require keeping this switch statement in perfect sync
167           * with the generator in order to avoid data corruption.  The lesson
168           * is (again): don't use virtual instructions if you want optimal
169           * scheduling.
170           */
171          if (!is_unordered(inst) && (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
172                                      p == IDX(TGL_PIPE_ALL)))
173             return 1;
174          else
175             return 0;
176       }
177    }
178 
179    /**
180     * Type for an instruction counter that increments for in-order
181     * instructions only, arbitrarily denoted 'jp' throughout this lowering
182     * pass in order to distinguish it from the regular instruction counter.
183     * This is represented as a vector with an independent counter for each
184     * asynchronous ALU pipeline in the EU.
185     */
186    struct ordered_address {
187       /**
188        * Construct the ordered address of a dependency known to execute on a
189        * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL
190        * is provided), in which case the vector counter will be initialized
191        * with all components equal to INT_MIN (always satisfied) except for
192        * component IDX(p).
193        */
194       ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
195          for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
196             jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
197                      INT_MIN : jp0);
198       }
199 
200       int jp[IDX(TGL_PIPE_ALL)];
201 
202       friend bool
203       operator==(const ordered_address &jp0, const ordered_address &jp1)
204       {
205          for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
206             if (jp0.jp[p] != jp1.jp[p])
207                return false;
208          }
209 
210          return true;
211       }
212    };
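   /* Usage sketch (illustrative), assuming the FLOAT/INT/LONG in-order pipes
    * tracked here: ordered_address(TGL_PIPE_FLOAT, 5) yields
    * jp = { 5, INT_MIN, INT_MIN }, i.e. only the FLOAT component carries a
    * real counter value and the remaining pipes are trivially satisfied.
    */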
213 
214    /**
215     * Return true if the specified ordered address is trivially satisfied for
216     * all pipelines except potentially for the specified pipeline \p p.
217     */
218    bool
219    is_single_pipe(const ordered_address &jp, tgl_pipe p)
220    {
221       for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
222          if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
223             return false;
224       }
225 
226       return true;
227    }
228 
229    /**
230     * Return the number of instructions in the program.
231     */
232    unsigned
233    num_instructions(const backend_shader *shader)
234    {
235       return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
236    }
237 
238    /**
239     * Calculate the local ordered_address instruction counter at every
240     * instruction of the shader for subsequent constant-time look-up.
241     */
242    ordered_address *
243    ordered_inst_addresses(const fs_visitor *shader)
244    {
245       ordered_address *jps = new ordered_address[num_instructions(shader)];
246       ordered_address jp(TGL_PIPE_ALL, 0);
247       unsigned ip = 0;
248 
249       foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
250          jps[ip] = jp;
251          for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
252             jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
253          ip++;
254       }
255 
256       return jps;
257    }
258 
259    /**
260     * Synchronization mode required for data manipulated by in-order
261     * instructions.
262     *
263     * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
264     * enum for additional type safety.  The hardware doesn't provide control
265     * over the synchronization mode for RegDist annotations; this is only used
266     * internally in this pass in order to optimize out redundant read
267     * dependencies where possible.
268     */
269    enum tgl_regdist_mode {
270       TGL_REGDIST_NULL = 0,
271       TGL_REGDIST_SRC = 1,
272       TGL_REGDIST_DST = 2
273    };
274 
275    /**
276     * Allow bitwise arithmetic of tgl_regdist_mode enums.
277     */
278    tgl_regdist_mode
279    operator|(tgl_regdist_mode x, tgl_regdist_mode y)
280    {
281       return tgl_regdist_mode(unsigned(x) | unsigned(y));
282    }
283 
284    tgl_regdist_mode
285    operator&(tgl_regdist_mode x, tgl_regdist_mode y)
286    {
287       return tgl_regdist_mode(unsigned(x) & unsigned(y));
288    }
289 
290    tgl_regdist_mode &
291    operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
292    {
293       return x = x | y;
294    }
295 
296    tgl_regdist_mode &
297    operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
298    {
299       return x = x & y;
300    }
301 
302    /** @} */
303 
304    /**
305     * Representation of an equivalence relation among the set of unsigned
306     * integers.
307     *
308     * Its initial state is the identity relation '~' such that i ~ j if and
309     * only if i == j for every pair of unsigned integers i and j.
310     */
311    struct equivalence_relation {
312       equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
313       {
314          for (unsigned i = 0; i < n; i++)
315             is[i] = i;
316       }
317 
318       ~equivalence_relation()
319       {
320          delete[] is;
321       }
322 
323       /**
324        * Return equivalence class index of the specified element.  Effectively
325        * this is the numeric value of an arbitrary representative from the
326        * equivalence class.
327        *
328        * Allows the evaluation of the equivalence relation according to the
329        * rule that i ~ j if and only if lookup(i) == lookup(j).
330        */
331       unsigned
332       lookup(unsigned i) const
333       {
334          if (i < n && is[i] != i)
335             return lookup(is[i]);
336          else
337             return i;
338       }
339 
340       /**
341        * Create an array with the results of the lookup() method for
342        * constant-time evaluation.
343        */
344       unsigned *
345       flatten() const
346       {
347          unsigned *ids = new unsigned[n];
348 
349          for (unsigned i = 0; i < n; i++)
350             ids[i] = lookup(i);
351 
352          return ids;
353       }
354 
355       /**
356        * Mutate the existing equivalence relation minimally by imposing the
357        * additional requirement that i ~ j.
358        *
359        * The algorithm updates the internal representation recursively in
360        * order to guarantee transitivity while preserving the previously
361        * specified equivalence requirements.
362        */
363       unsigned
364       link(unsigned i, unsigned j)
365       {
366          const unsigned k = lookup(i);
367          assign(i, k);
368          assign(j, k);
369          return k;
370       }
371 
372    private:
373       equivalence_relation(const equivalence_relation &);
374 
375       equivalence_relation &
376       operator=(const equivalence_relation &);
377 
378       /**
379        * Assign the representative of \p from to be equivalent to \p to.
380        *
381        * At the same time the data structure is partially flattened as much
382        * as possible without increasing the number of recursive calls.
383        */
384       void
385       assign(unsigned from, unsigned to)
386       {
387          if (from != to) {
388             assert(from < n);
389 
390             if (is[from] != from)
391                assign(is[from], to);
392 
393             is[from] = to;
394          }
395       }
396 
397       unsigned *is;
398       unsigned n;
399    };
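   /* Usage sketch (illustrative): the class behaves like a minimal
    * union-find structure, e.g.
    *
    *    equivalence_relation eq(16);
    *    eq.link(1, 5);
    *    eq.link(5, 9);
    *    assert(eq.lookup(1) == eq.lookup(9));
    *
    * flatten() then snapshots lookup() for every element so later passes can
    * translate IDs in constant time.
    */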
400 
401    /**
402     * Representation of a data dependency between two instructions in the
403     * program.
404     * @{
405     */
406    struct dependency {
407       /**
408        * No dependency information.
409        */
410       dependency() : ordered(TGL_REGDIST_NULL), jp(),
411                      unordered(TGL_SBID_NULL), id(0),
412                      exec_all(false) {}
413 
414       /**
415        * Construct a dependency on the in-order instruction with the provided
416        * ordered_address instruction counter.
417        */
418       dependency(tgl_regdist_mode mode, const ordered_address &jp,
419                  bool exec_all) :
420          ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
421          exec_all(exec_all) {}
422 
423       /**
424        * Construct a dependency on the out-of-order instruction with the
425        * specified synchronization token.
426        */
427       dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
428          ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
429          exec_all(exec_all) {}
430 
431       /**
432        * Synchronization mode of in-order dependency, or zero if no in-order
433        * dependency is present.
434        */
435       tgl_regdist_mode ordered;
436 
437       /**
438        * Instruction counter of in-order dependency.
439        *
440        * For a dependency part of a different block in the program, this is
441        * relative to the specific control flow path taken between the
442        * dependency and the current block: It is the ordered_address such that
443        * the difference between it and the ordered_address of the first
444        * instruction of the current block is exactly the number of in-order
445        * instructions across that control flow path.  It is not guaranteed to
446        * be equal to the local ordered_address of the generating instruction
447        * [as returned by ordered_inst_addresses()], except for block-local
448        * dependencies.
449        */
450       ordered_address jp;
451 
452       /**
453        * Synchronization mode of unordered dependency, or zero if no unordered
454        * dependency is present.
455        */
456       tgl_sbid_mode unordered;
457 
458       /** Synchronization token of out-of-order dependency. */
459       unsigned id;
460 
461       /**
462        * Whether the dependency could be run with execution masking disabled,
463        * which might lead to the unwanted execution of the generating
464        * instruction in cases where a BB is executed with all channels
465        * disabled due to hardware bug Wa_1407528679.
466        */
467       bool exec_all;
468 
469       /**
470        * Trivial in-order dependency that's always satisfied.
471        *
472        * Note that unlike a default-constructed dependency() which is also
473        * trivially satisfied, this is considered to provide dependency
474        * information and can be used to clear a previously pending dependency
475        * via shadow().
476        */
477       static const dependency done;
478 
479       friend bool
480       operator==(const dependency &dep0, const dependency &dep1)
481       {
482          return dep0.ordered == dep1.ordered &&
483                 dep0.jp == dep1.jp &&
484                 dep0.unordered == dep1.unordered &&
485                 dep0.id == dep1.id &&
486                 dep0.exec_all == dep1.exec_all;
487       }
488 
489       friend bool
490       operator!=(const dependency &dep0, const dependency &dep1)
491       {
492          return !(dep0 == dep1);
493       }
494    };
495 
496    const dependency dependency::done =
497         dependency(TGL_REGDIST_SRC, ordered_address(), false);
498 
499    /**
500     * Return whether \p dep contains any dependency information.
501     */
502    bool
503    is_valid(const dependency &dep)
504    {
505       return dep.ordered || dep.unordered;
506    }
507 
508    /**
509     * Combine \p dep0 and \p dep1 into a single dependency object that is only
510     * satisfied when both original dependencies are satisfied.  This might
511     * involve updating the equivalence relation \p eq in order to make sure
512     * that both out-of-order dependencies are assigned the same hardware SBID
513     * as synchronization token.
514     */
515    dependency
516    merge(equivalence_relation &eq,
517          const dependency &dep0, const dependency &dep1)
518    {
519       dependency dep;
520 
521       if (dep0.ordered || dep1.ordered) {
522          dep.ordered = dep0.ordered | dep1.ordered;
523          for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
524             dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
525       }
526 
527       if (dep0.unordered || dep1.unordered) {
528          dep.unordered = dep0.unordered | dep1.unordered;
529          dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
530                           dep1.unordered ? dep1.id : dep0.id);
531       }
532 
533       dep.exec_all = dep0.exec_all || dep1.exec_all;
534 
535       return dep;
536    }
537 
538    /**
539     * Override dependency information of \p dep0 with that of \p dep1.
540     */
541    dependency
542    shadow(const dependency &dep0, const dependency &dep1)
543    {
544       return is_valid(dep1) ? dep1 : dep0;
545    }
546 
547    /**
548     * Translate dependency information across the program.
549     *
550     * This returns a dependency on the same instruction translated to the
551     * ordered_address space of a different block.  The correct shift for
552     * transporting a dependency across an edge of the CFG is the difference
553     * between the local ordered_address of the first instruction of the target
554     * block and the local ordered_address of the instruction immediately after
555     * the end of the origin block.
556     */
557    dependency
558    transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
559    {
560       if (dep.ordered) {
561          for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
562             if (dep.jp.jp[p] > INT_MIN)
563                dep.jp.jp[p] += delta[p];
564          }
565       }
566 
567       return dep;
568    }
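   /* Worked example (illustrative): a dependency with
    * jp.jp[IDX(TGL_PIPE_FLOAT)] == 10 transported across an edge with
    * delta[IDX(TGL_PIPE_FLOAT)] == -4 ends up with a FLOAT counter of 6 in
    * the target block's ordered_address space, while INT_MIN components
    * (trivially satisfied pipes) are left untouched.
    */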
569 
570    /**
571     * Return simplified dependency removing any synchronization modes not
572     * applicable to an instruction reading the same register location.
573     */
574    dependency
575    dependency_for_read(dependency dep)
576    {
577       dep.ordered &= TGL_REGDIST_DST;
578       return dep;
579    }
580 
581    /**
582     * Return simplified dependency removing any synchronization modes not
583     * applicable to an instruction \p inst writing the same register location.
584     *
585     * This clears any WaR dependency for writes performed from the same
586     * pipeline as the read, since there is no possibility for a data hazard.
587     */
588    dependency
589    dependency_for_write(const struct intel_device_info *devinfo,
590                         const fs_inst *inst, dependency dep)
591    {
592       if (!is_unordered(inst) &&
593           is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
594          dep.ordered &= TGL_REGDIST_DST;
595       return dep;
596    }
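   /* Example of the two simplifications above (illustrative): a read only
    * has to wait for an earlier write of the same location, so
    * dependency_for_read() drops the TGL_REGDIST_SRC bit; read-after-read
    * needs no synchronization.  A write normally has to wait for both
    * earlier reads and writes, but when it is issued from the same in-order
    * pipe as the pending read, dependency_for_write() drops the WaR bit too,
    * since the pipe executes both in program order.
    */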
597 
598    /** @} */
599 
600    /**
601     * Scoreboard representation.  This keeps track of the data dependencies of
602     * registers with GRF granularity.
603     */
604    class scoreboard {
605    public:
606       /**
607        * Look up the most current data dependency for register \p r.
608        */
609       dependency
610       get(const fs_reg &r) const
611       {
612          if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
613             return *p;
614          else
615             return dependency();
616       }
617 
618       /**
619        * Specify the most current data dependency for register \p r.
620        */
621       void
622       set(const fs_reg &r, const dependency &d)
623       {
624          if (dependency *p = dep(r))
625             *p = d;
626       }
627 
628       /**
629        * Component-wise merge() of corresponding dependencies from two
630        * scoreboard objects.  \sa merge().
631        */
632       friend scoreboard
633       merge(equivalence_relation &eq,
634             const scoreboard &sb0, const scoreboard &sb1)
635       {
636          scoreboard sb;
637 
638          for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
639             sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);
640 
641          sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
642          sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);
643 
644          return sb;
645       }
646 
647       /**
648        * Component-wise shadow() of corresponding dependencies from two
649        * scoreboard objects.  \sa shadow().
650        */
651       friend scoreboard
652       shadow(const scoreboard &sb0, const scoreboard &sb1)
653       {
654          scoreboard sb;
655 
656          for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
657             sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);
658 
659          sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
660          sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);
661 
662          return sb;
663       }
664 
665       /**
666        * Component-wise transport() of dependencies from a scoreboard
667        * object.  \sa transport().
668        */
669       friend scoreboard
670       transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
671       {
672          scoreboard sb;
673 
674          for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
675             sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);
676 
677          sb.addr_dep = transport(sb0.addr_dep, delta);
678          sb.accum_dep = transport(sb0.accum_dep, delta);
679 
680          return sb;
681       }
682 
683       friend bool
684       operator==(const scoreboard &sb0, const scoreboard &sb1)
685       {
686          for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
687             if (sb0.grf_deps[i] != sb1.grf_deps[i])
688                return false;
689          }
690 
691          if (sb0.addr_dep != sb1.addr_dep)
692             return false;
693 
694          if (sb0.accum_dep != sb1.accum_dep)
695             return false;
696 
697          return true;
698       }
699 
700       friend bool
701       operator!=(const scoreboard &sb0, const scoreboard &sb1)
702       {
703          return !(sb0 == sb1);
704       }
705 
706    private:
707       dependency grf_deps[BRW_MAX_GRF];
708       dependency addr_dep;
709       dependency accum_dep;
710 
711       dependency *
712       dep(const fs_reg &r)
713       {
714          const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
715                                reg_offset(r) / REG_SIZE);
716 
717          return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
718                  r.file == MRF ? &grf_deps[GFX7_MRF_HACK_START + reg] :
719                  r.file == ARF && reg >= BRW_ARF_ADDRESS &&
720                                   reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
721                  r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
722                                   reg < BRW_ARF_FLAG ? &accum_dep :
723                  NULL);
724       }
725    };
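   /* Mapping sketch (illustrative, mirrors dep() above): VGRF and FIXED_GRF
    * registers resolve to a grf_deps[] slot with GRF granularity, MRFs are
    * folded into the same array starting at GFX7_MRF_HACK_START, the address
    * ARF maps to addr_dep, the accumulator ARFs to accum_dep, and anything
    * else (e.g. flag registers or immediates) is not tracked and yields
    * NULL.
    */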
726 
727    /**
728     * Dependency list handling.
729     * @{
730     */
731    struct dependency_list {
732       dependency_list() : deps(NULL), n(0) {}
733 
734       ~dependency_list()
735       {
736          free(deps);
737       }
738 
739       void
740       push_back(const dependency &dep)
741       {
742          deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
743          deps[n++] = dep;
744       }
745 
746       unsigned
747       size() const
748       {
749          return n;
750       }
751 
752       const dependency &
753       operator[](unsigned i) const
754       {
755          assert(i < n);
756          return deps[i];
757       }
758 
759       dependency &
760       operator[](unsigned i)
761       {
762          assert(i < n);
763          return deps[i];
764       }
765 
766    private:
767       dependency_list(const dependency_list &);
768       dependency_list &
769       operator=(const dependency_list &);
770 
771       dependency *deps;
772       unsigned n;
773    };
774 
775    /**
776     * Add dependency \p dep to the list of dependencies of an instruction
777     * \p deps.
778     */
779    void
780    add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
781    {
782       if (is_valid(dep)) {
783          /* Translate the unordered dependency token first in order to keep
784           * the list minimally redundant.
785           */
786          if (dep.unordered)
787             dep.id = ids[dep.id];
788 
789          /* Try to combine the specified dependency with any existing ones. */
790          for (unsigned i = 0; i < deps.size(); i++) {
791             /* Don't combine otherwise matching dependencies if there is an
792              * exec_all mismatch which would cause a SET dependency to gain an
793              * exec_all flag, since that would prevent it from being baked
794              * into the instruction we want to allocate an SBID for.
795              */
796             if (deps[i].exec_all != dep.exec_all &&
797                 (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
798                 (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
799                continue;
800 
801             if (dep.ordered && deps[i].ordered) {
802                for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
803                   deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);
804 
805                deps[i].ordered |= dep.ordered;
806                deps[i].exec_all |= dep.exec_all;
807                dep.ordered = TGL_REGDIST_NULL;
808             }
809 
810             if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
811                deps[i].unordered |= dep.unordered;
812                deps[i].exec_all |= dep.exec_all;
813                dep.unordered = TGL_SBID_NULL;
814             }
815          }
816 
817          /* Add it to the end of the list if necessary. */
818          if (is_valid(dep))
819             deps.push_back(dep);
820       }
821    }
822 
823    /**
824     * Construct a tgl_swsb annotation encoding any ordered dependencies from
825     * the dependency list \p deps of an instruction with ordered_address \p
826     * jp.  If \p exec_all is false only dependencies known to be executed with
827     * channel masking applied will be considered in the calculation.
828     */
829    tgl_swsb
830    ordered_dependency_swsb(const dependency_list &deps,
831                            const ordered_address &jp,
832                            bool exec_all)
833    {
834       tgl_pipe p = TGL_PIPE_NONE;
835       unsigned min_dist = ~0u;
836 
837       for (unsigned i = 0; i < deps.size(); i++) {
838          if (deps[i].ordered && exec_all >= deps[i].exec_all) {
839             for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
840                const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
841                const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
842                assert(jp.jp[q] > deps[i].jp.jp[q]);
843                if (dist <= max_dist) {
844                   p = (p && IDX(p) != q ? TGL_PIPE_ALL :
845                        tgl_pipe(TGL_PIPE_FLOAT + q));
846                   min_dist = MIN3(min_dist, dist, 7);
847                }
848             }
849          }
850       }
851 
852       return { p ? min_dist : 0, p };
853    }
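   /* Worked example (illustrative): if the only ordered dependency of an
    * instruction lies 3 in-order FLOAT-pipe instructions back, the function
    * returns { regdist = 3, pipe = TGL_PIPE_FLOAT }, which the generator can
    * encode as "F@3" (or plain "@3" on Gfx12.0, which has a single in-order
    * pipe).  Dependencies farther away than the pipe's maximum distance (10,
    * or 14 for the LONG pipe) are considered already satisfied, and the
    * emitted distance is clamped to 7.
    */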
854 
855    /**
856     * Return whether the dependency list \p deps of an instruction with
857     * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
858     * exec_all is false only dependencies known to be executed with channel
859     * masking applied will be considered in the calculation.
860     */
861    bool
862    find_ordered_dependency(const dependency_list &deps,
863                            const ordered_address &jp,
864                            bool exec_all)
865    {
866       return ordered_dependency_swsb(deps, jp, exec_all).regdist;
867    }
868 
869    /**
870     * Return the full tgl_sbid_mode bitset for the first unordered dependency
871     * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
872     * no such dependency is present.  If \p exec_all is false only
873     * dependencies known to be executed with channel masking applied will be
874     * considered in the calculation.
875     */
876    tgl_sbid_mode
877    find_unordered_dependency(const dependency_list &deps,
878                              tgl_sbid_mode unordered,
879                              bool exec_all)
880    {
881       if (unordered) {
882          for (unsigned i = 0; i < deps.size(); i++) {
883             if ((unordered & deps[i].unordered) &&
884                 exec_all >= deps[i].exec_all)
885                return deps[i].unordered;
886          }
887       }
888 
889       return TGL_SBID_NULL;
890    }
891 
892    /**
893     * Return the tgl_sbid_mode bitset of an unordered dependency from the list
894     * \p deps that can be represented directly in the SWSB annotation of the
895     * instruction without additional SYNC instructions, or zero if no such
896     * dependency is present.
897     */
898    tgl_sbid_mode
899    baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
900                                    const fs_inst *inst,
901                                    const dependency_list &deps,
902                                    const ordered_address &jp)
903    {
904       const bool exec_all = inst->force_writemask_all;
905       const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
906       const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
907                                                             exec_all).pipe;
908 
909       if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
910          return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
911       else if (has_ordered && is_unordered(inst))
912          return TGL_SBID_NULL;
913       else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
914                (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
915          return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
916       else if (!has_ordered)
917          return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
918       else
919          return TGL_SBID_NULL;
920    }
921 
922    /**
923     * Return whether an ordered dependency from the list \p deps can be
924     * represented directly in the SWSB annotation of the instruction without
925     * additional SYNC instructions.
926     */
927    bool
928    baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
929                                  const fs_inst *inst,
930                                  const dependency_list &deps,
931                                  const ordered_address &jp)
932    {
933       const bool exec_all = inst->force_writemask_all;
934       const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
935       const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
936                                                             exec_all).pipe;
937       const tgl_sbid_mode unordered_mode =
938          baked_unordered_dependency_mode(devinfo, inst, deps, jp);
939 
940       if (!has_ordered)
941          return false;
942       else if (!unordered_mode)
943          return true;
944       else
945          return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&
946                 unordered_mode == (is_unordered(inst) ? TGL_SBID_SET :
947                                    TGL_SBID_DST);
948    }
949 
950    /** @} */
951 
952    /**
953     * Shader instruction dependency calculation.
954     * @{
955     */
956 
957    /**
958     * Update scoreboard object \p sb to account for the execution of
959     * instruction \p inst.
960     */
961    void
962    update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
963                           const fs_inst *inst, unsigned ip, scoreboard &sb)
964    {
965       const bool exec_all = inst->force_writemask_all;
966       const struct intel_device_info *devinfo = shader->devinfo;
967       const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
968       const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
969                                      ordered_address();
970 
971       /* Track any source registers that may be fetched asynchronously by this
972        * instruction; otherwise clear the dependency in order to avoid
973        * subsequent redundant synchronization.
974        */
975       for (unsigned i = 0; i < inst->sources; i++) {
976          const dependency rd_dep =
977             (inst->is_payload(i) ||
978              inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
979             ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?
980                dependency(TGL_REGDIST_SRC, jp, exec_all) :
981             dependency::done;
982 
983          for (unsigned j = 0; j < regs_read(inst, i); j++)
984             sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
985       }
986 
987       if (inst->reads_accumulator_implicitly())
988          sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));
989 
990       if (is_send(inst) && inst->base_mrf != -1) {
991          const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);
992 
993          for (unsigned j = 0; j < inst->mlen; j++)
994             sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
995       }
996 
997       /* Track any destination registers of this instruction. */
998       const dependency wr_dep =
999          is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
1000          ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?
1001             dependency(TGL_REGDIST_DST, jp, exec_all) :
1002          dependency();
1003 
1004       if (inst->writes_accumulator_implicitly(devinfo))
1005          sb.set(brw_acc_reg(8), wr_dep);
1006 
1007       if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
1008           !inst->dst.is_null()) {
1009          for (unsigned j = 0; j < regs_written(inst); j++)
1010             sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
1011       }
1012    }
1013 
1014    /**
1015     * Calculate scoreboard objects locally that represent any pending (and
1016     * unconditionally resolved) dependencies at the end of each block of the
1017     * program.
1018     */
1019    scoreboard *
1020    gather_block_scoreboards(const fs_visitor *shader,
1021                             const ordered_address *jps)
1022    {
1023       scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
1024       unsigned ip = 0;
1025 
1026       foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
1027          update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);
1028 
1029       return sbs;
1030    }
1031 
1032    /**
1033     * Propagate data dependencies globally through the control flow graph
1034     * until a fixed point is reached.
1035     *
1036     * Calculates the set of dependencies potentially pending at the beginning
1037     * of each block, and returns it as an array of scoreboard objects.
1038     */
1039    scoreboard *
1040    propagate_block_scoreboards(const fs_visitor *shader,
1041                                const ordered_address *jps,
1042                                equivalence_relation &eq)
1043    {
1044       const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
1045       scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
1046       scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];
1047 
1048       for (bool progress = true; progress;) {
1049          progress = false;
1050 
1051          foreach_block(block, shader->cfg) {
1052             const scoreboard sb = shadow(in_sbs[block->num],
1053                                          delta_sbs[block->num]);
1054 
1055             if (sb != out_sbs[block->num]) {
1056                foreach_list_typed(bblock_link, child_link, link,
1057                                   &block->children) {
1058                   scoreboard &in_sb = in_sbs[child_link->block->num];
1059                   int delta[IDX(TGL_PIPE_ALL)];
1060 
1061                   for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
1062                      delta[p] = jps[child_link->block->start_ip].jp[p]
1063                         - jps[block->end_ip].jp[p]
1064                         - ordered_unit(shader->devinfo,
1065                                        static_cast<const fs_inst *>(block->end()), p);
1066 
1067                   in_sb = merge(eq, in_sb, transport(sb, delta));
1068                }
1069 
1070                out_sbs[block->num] = sb;
1071                progress = true;
1072             }
1073          }
1074       }
1075 
1076       delete[] delta_sbs;
1077       delete[] out_sbs;
1078 
1079       return in_sbs;
1080    }
1081 
1082    /**
1083     * Return the list of potential dependencies of each instruction in the
1084     * shader based on the result of global dependency analysis.
1085     */
1086    dependency_list *
1087    gather_inst_dependencies(const fs_visitor *shader,
1088                             const ordered_address *jps)
1089    {
1090       const struct intel_device_info *devinfo = shader->devinfo;
1091       equivalence_relation eq(num_instructions(shader));
1092       scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
1093       const unsigned *ids = eq.flatten();
1094       dependency_list *deps = new dependency_list[num_instructions(shader)];
1095       unsigned ip = 0;
1096 
1097       foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
1098          const bool exec_all = inst->force_writemask_all;
1099          const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
1100          scoreboard &sb = sbs[block->num];
1101 
1102          for (unsigned i = 0; i < inst->sources; i++) {
1103             for (unsigned j = 0; j < regs_read(inst, i); j++)
1104                add_dependency(ids, deps[ip], dependency_for_read(
1105                   sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
1106          }
1107 
1108          if (inst->reads_accumulator_implicitly()) {
1109             /* Wa_22012725308:
1110              *
1111              * "When the accumulator registers are used as source and/or
1112              *  destination, hardware does not ensure prevention of write
1113              *  after read hazard across execution pipes."
1114              */
1115             const dependency dep = sb.get(brw_acc_reg(8));
1116             if (dep.ordered && !is_single_pipe(dep.jp, p))
1117                add_dependency(ids, deps[ip], dep);
1118          }
1119 
1120          if (is_send(inst) && inst->base_mrf != -1) {
1121             for (unsigned j = 0; j < inst->mlen; j++)
1122                add_dependency(ids, deps[ip], dependency_for_read(
1123                   sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
1124          }
1125 
1126          if (is_unordered(inst))
1127             add_dependency(ids, deps[ip],
1128                            dependency(TGL_SBID_SET, ip, exec_all));
1129 
1130          if (!inst->no_dd_check) {
1131             if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
1132                 !inst->dst.is_accumulator()) {
1133                for (unsigned j = 0; j < regs_written(inst); j++) {
1134                   add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
1135                      sb.get(byte_offset(inst->dst, REG_SIZE * j))));
1136                }
1137             }
1138 
1139             if (inst->writes_accumulator_implicitly(devinfo) ||
1140                 inst->dst.is_accumulator()) {
1141                /* Wa_22012725308:
1142                 *
1143                 * "When the accumulator registers are used as source and/or
1144                 *  destination, hardware does not ensure prevention of write
1145                 *  after read hazard across execution pipes."
1146                 */
1147                const dependency dep = sb.get(brw_acc_reg(8));
1148                if (dep.ordered && !is_single_pipe(dep.jp, p))
1149                   add_dependency(ids, deps[ip], dep);
1150             }
1151 
1152             if (is_send(inst) && inst->base_mrf != -1) {
1153                for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
1154                   add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
1155                      sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
1156             }
1157          }
1158 
1159          update_inst_scoreboard(shader, jps, inst, ip, sb);
1160          ip++;
1161       }
1162 
1163       delete[] sbs;
1164       delete[] ids;
1165 
1166       return deps;
1167    }
1168 
1169    /** @} */
1170 
1171    /**
1172     * Allocate SBID tokens to track the execution of every out-of-order
1173     * instruction of the shader.
1174     */
1175    dependency_list *
1176    allocate_inst_dependencies(const fs_visitor *shader,
1177                               const dependency_list *deps0)
1178    {
1179       /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
1180        *       shaders with a large number of SEND messages.
1181        */
1182 
1183       /* Allocate an unordered dependency ID to hardware SBID translation
1184        * table with as many entries as there are instructions in the shader,
1185        * which is the maximum number of unordered IDs we can find in the
1186        * program.
1187        */
1188       unsigned *ids = new unsigned[num_instructions(shader)];
1189       for (unsigned ip = 0; ip < num_instructions(shader); ip++)
1190          ids[ip] = ~0u;
1191 
1192       dependency_list *deps1 = new dependency_list[num_instructions(shader)];
1193       unsigned next_id = 0;
1194 
1195       for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
1196          for (unsigned i = 0; i < deps0[ip].size(); i++) {
1197             const dependency &dep = deps0[ip][i];
1198 
1199             if (dep.unordered && ids[dep.id] == ~0u)
1200                ids[dep.id] = (next_id++) & 0xf;
1201 
1202             add_dependency(ids, deps1[ip], dep);
1203          }
1204       }
1205 
1206       delete[] ids;
1207 
1208       return deps1;
1209    }
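   /* Illustrative note: SBIDs are handed out round-robin from a pool of 16
    * (next_id & 0xf), so e.g. the 17th distinct out-of-order dependency in
    * the program reuses SBID 0, which may serialize it against the earlier
    * user of that token; hence the XXX above about smarter allocation.
    */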
1210 
1211    /**
1212     * Emit dependency information provided by \p deps into the shader,
1213     * inserting additional SYNC instructions for dependencies that can't be
1214     * represented directly by annotating existing instructions.
1215     */
1216    void
1217    emit_inst_dependencies(fs_visitor *shader,
1218                           const ordered_address *jps,
1219                           const dependency_list *deps)
1220    {
1221       const struct intel_device_info *devinfo = shader->devinfo;
1222       unsigned ip = 0;
1223 
1224       foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
1225          const bool exec_all = inst->force_writemask_all;
1226          const bool ordered_mode =
1227             baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
1228          const tgl_sbid_mode unordered_mode =
1229             baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
1230          tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
1231             ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
1232 
1233          for (unsigned i = 0; i < deps[ip].size(); i++) {
1234             const dependency &dep = deps[ip][i];
1235 
1236             if (dep.unordered) {
1237                if (unordered_mode == dep.unordered &&
1238                    exec_all >= dep.exec_all && !swsb.mode) {
1239                   /* Bake unordered dependency into the instruction's SWSB if
1240                    * possible, except in cases where the current instruction
1241                    * isn't marked NoMask but the dependency is, since that
1242                    * might lead to data coherency issues due to
1243                    * Wa_1407528679.
1244                    */
1245                   swsb.sbid = dep.id;
1246                   swsb.mode = dep.unordered;
1247                } else {
1248                   /* Emit dependency into the SWSB of an extra SYNC
1249                    * instruction.
1250                    */
1251                   const fs_builder ibld = fs_builder(shader, block, inst)
1252                                           .exec_all().group(1, 0);
1253                   fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
1254                                             brw_imm_ud(TGL_SYNC_NOP));
1255                   sync->sched.sbid = dep.id;
1256                   sync->sched.mode = dep.unordered;
1257                   assert(!(sync->sched.mode & TGL_SBID_SET));
1258                }
1259             }
1260          }
1261 
1262          for (unsigned i = 0; i < deps[ip].size(); i++) {
1263             const dependency &dep = deps[ip][i];
1264 
1265             if (dep.ordered &&
1266                 find_ordered_dependency(deps[ip], jps[ip], true) &&
1267                 (!ordered_mode || dep.exec_all > exec_all)) {
1268                /* If the current instruction is not marked NoMask but an
1269                 * ordered dependency is, perform the synchronization as a
1270                 * separate NoMask SYNC instruction in order to avoid data
1271                 * coherency issues due to Wa_1407528679.  The similar
1272                 * scenario with unordered dependencies should have been
1273                 * handled above.
1274                 */
1275                const fs_builder ibld = fs_builder(shader, block, inst)
1276                                        .exec_all().group(1, 0);
1277                fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
1278                                          brw_imm_ud(TGL_SYNC_NOP));
1279                sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
1280                break;
1281             }
1282          }
1283 
1284          /* Update the IR. */
1285          inst->sched = swsb;
1286          inst->no_dd_check = inst->no_dd_clear = false;
1287          ip++;
1288       }
1289    }
1290 }
1291 
1292 bool
1293 fs_visitor::lower_scoreboard()
1294 {
1295    if (devinfo->ver >= 12) {
1296       const ordered_address *jps = ordered_inst_addresses(this);
1297       const dependency_list *deps0 = gather_inst_dependencies(this, jps);
1298       const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
1299       emit_inst_dependencies(this, jps, deps1);
1300       delete[] deps1;
1301       delete[] deps0;
1302       delete[] jps;
1303    }
1304 
1305    return true;
1306 }
1307