/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_scoreboard.cpp
 *
 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Accesses to the following (rarely used) ARF registers are not
 *           tracked here, and require the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *  - sp stack pointer
 *  - sr0 state register
 *  - cr0 control register
 *  - ip instruction pointer
 *  - tm0 timestamp register
 *  - dbg0 debug register
 *  - acc2-9 special accumulator registers on TGL
 *  - mme0-7 math macro extended accumulator registers
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 *  - f0-1 flag registers
 *  - n0 notification register
 *  - tdr0 thread dependency register
 */
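
/* A note on the two synchronization mechanisms this pass emits (an
 * illustrative sketch, not part of the original sources; the exact syntax is
 * assembler-dependent):
 *
 *   add (8)   r10:f  r20:f  r30:f  {F@1}     RegDist: stall until the FP-pipe
 *                                            instruction at distance 1 has
 *                                            retired.
 *   send (8)  r40    ...           {$3}      SBID SET: token 3 now tracks
 *                                            this out-of-order message.
 *   mov (8)   r50:f  r40:f         {$3.dst}  SBID DST: wait until the send
 *                                            has written r40.
 */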

#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Return the RegDist pipeline the hardware will synchronize with if no
    * pipeline information is provided in the SWSB annotation of an
    * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
    */
   tgl_pipe
   inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      if (devinfo->verx10 >= 125) {
         bool has_int_src = false, has_long_src = false;
         const bool has_long_pipe = !devinfo->has_64bit_float_via_math_pipe;

         if (is_send(inst))
            return TGL_PIPE_NONE;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != BAD_FILE &&
                !inst->is_control_source(i)) {
               const brw_reg_type t = inst->src[i].type;
               has_int_src |= !brw_reg_type_is_floating_point(t);
               has_long_src |= type_sz(t) >= 8;
            }
         }

         /* Avoid emitting (RegDist, SWSB) annotations for long
          * instructions on platforms where they are unordered.  It's not
          * clear what the inferred sync pipe is for them or whether we are
          * even allowed to use these annotations in this case.  Return
          * TGL_PIPE_NONE, which should prevent the
          * baked_{un,}ordered_dependency_mode functions from even trying to
          * emit these annotations.
          */
         if (!has_long_pipe && has_long_src)
            return TGL_PIPE_NONE;

         return has_long_src ? TGL_PIPE_LONG :
                has_int_src ? TGL_PIPE_INT :
                TGL_PIPE_FLOAT;

      } else {
         return TGL_PIPE_FLOAT;
      }
   }
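
   /* For instance (illustrative, not exhaustive): on a verx10 >= 125
    * platform an ADD with :d sources would infer TGL_PIPE_INT above, an ADD
    * with :f sources TGL_PIPE_FLOAT, and an instruction with a :df source
    * TGL_PIPE_LONG on platforms that retain the long pipe, while send-like
    * instructions always infer TGL_PIPE_NONE.
    */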

   /**
    * Return the RegDist pipeline that will execute an instruction, or
    * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
    * RegDist synchronization mechanism.
    */
   tgl_pipe
   inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) &&
         ((inst->opcode == BRW_OPCODE_MUL &&
           MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
          (inst->opcode == BRW_OPCODE_MAD &&
           MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));

      if (is_unordered(devinfo, inst))
         return TGL_PIPE_NONE;
      else if (devinfo->verx10 < 125)
         return TGL_PIPE_FLOAT;
      else if (inst->is_math() && devinfo->ver >= 20)
         return TGL_PIPE_MATH;
      else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT ||
               inst->opcode == SHADER_OPCODE_BROADCAST ||
               inst->opcode == SHADER_OPCODE_SHUFFLE)
         return TGL_PIPE_INT;
      else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
         return TGL_PIPE_FLOAT;
      else if (devinfo->ver >= 20 && type_sz(inst->dst.type) >= 8 &&
               brw_reg_type_is_floating_point(inst->dst.type)) {
         assert(devinfo->has_64bit_float);
         return TGL_PIPE_LONG;
      } else if (devinfo->ver < 20 &&
                 (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 ||
                  is_dword_multiply)) {
         assert(devinfo->has_64bit_float || devinfo->has_64bit_int ||
                devinfo->has_integer_dword_mul);
         return TGL_PIPE_LONG;
      } else if (brw_reg_type_is_floating_point(inst->dst.type))
         return TGL_PIPE_FLOAT;
      else
         return TGL_PIPE_INT;
   }
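
   /* E.g. (illustrative): on a Gfx12.5 platform a MUL with two :d sources is
    * a dword multiply and therefore executes on TGL_PIPE_LONG, as does any
    * instruction with a 64-bit destination or execution type, while a plain
    * :f ADD lands on TGL_PIPE_FLOAT and a :d ADD on TGL_PIPE_INT.
    */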

   /**
    * Index of the \p p pipeline counter in the ordered_address vector defined
    * below.
    */
#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) :    \
                (abort(), ~0u))
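
   /* Assuming the tgl_pipe enumerants for the hardware pipelines are
    * consecutive starting at TGL_PIPE_FLOAT (which the definition of IDX()
    * above relies on), this yields e.g. IDX(TGL_PIPE_FLOAT) == 0, and
    * IDX(TGL_PIPE_ALL) gives the number of per-pipeline counters tracked in
    * the ordered_address vector below.
    */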

   /**
    * Number of in-order hardware instructions for pipeline index \p contained
    * in this IR instruction.  This determines the increment applied to the
    * RegDist counter calculated for any ordered dependency that crosses this
    * instruction.
    */
   unsigned
   ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
                unsigned p)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted.  Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption.  The lesson
          * is (again): don't use virtual instructions if you want optimal
          * scheduling.
          */
         if (!is_unordered(devinfo, inst) &&
             (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
              p == IDX(TGL_PIPE_ALL)))
            return 1;
         else
            return 0;
      }
   }

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    * This is represented as a vector with an independent counter for each
    * asynchronous ALU pipeline in the EU.
    */
   struct ordered_address {
      /**
       * Construct the ordered address of a dependency known to execute on
       * the single specified pipeline \p p: the vector counter is
       * initialized with all components equal to INT_MIN (always satisfied)
       * except for component IDX(p), which is set to \p jp0.  TGL_PIPE_NONE
       * leaves every component at INT_MIN, while TGL_PIPE_ALL initializes
       * all of them to \p jp0.
       */
      ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
         for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
            jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
                     INT_MIN : jp0);
      }

      int jp[IDX(TGL_PIPE_ALL)];

      friend bool
      operator==(const ordered_address &jp0, const ordered_address &jp1)
      {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (jp0.jp[p] != jp1.jp[p])
               return false;
         }

         return true;
      }
   };
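
   /* For example (illustrative): ordered_address(TGL_PIPE_INT, 5) yields a
    * vector whose INT component is 5 and whose remaining components are
    * INT_MIN, i.e. a dependency that can only be outstanding on the integer
    * pipeline.
    */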

   /**
    * Return true if the specified ordered address is trivially satisfied for
    * all pipelines except potentially for the specified pipeline \p p.
    */
   bool
   is_single_pipe(const ordered_address &jp, tgl_pipe p)
   {
      for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
         if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
            return false;
      }

      return true;
   }

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const backend_shader *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp(TGL_PIPE_ALL, 0);
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
         ip++;
      }

      return jps;
   }

   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
    * enum for additional type safety.  The hardware doesn't provide control
    * over the synchronization mode for RegDist annotations; this is only
    * used internally in this pass in order to optimize out redundant read
    * dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,
      TGL_REGDIST_SRC = 1,
      TGL_REGDIST_DST = 2
   };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return equivalence class index of the specified element.  Effectively
       * this is the numeric value of an arbitrary representative from the
       * equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);

      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened, as much
       * as possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };
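
   /* The above behaves like a small union-find structure.  E.g.
    * (illustrative):
    *
    *    equivalence_relation eq(4);
    *    eq.link(0, 1);               // 0 ~ 1
    *    eq.link(1, 2);               // transitively 0 ~ 1 ~ 2
    *    assert(eq.lookup(0) == eq.lookup(2));
    *    assert(eq.lookup(3) == 3);   // 3 remains in its own class
    */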

   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, const ordered_address &jp,
                 bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such that
       * the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path.  It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no unordered
       * dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug Wa_1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done =
        dependency(TGL_REGDIST_DST, ordered_address(), false);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is only
    * satisfied when both original dependencies are satisfied.  This might
    * involve updating the equivalence relation \p eq in order to make sure
    * that both out-of-order dependencies are assigned the same hardware SBID
    * as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      dep.exec_all = dep0.exec_all || dep1.exec_all;

      return dep;
   }
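
   /* E.g. (illustrative): merging an ordered TGL_REGDIST_SRC dependency with
    * jp.jp[IDX(TGL_PIPE_INT)] == 3 and an unordered TGL_SBID_DST dependency
    * with token ID 7 yields a single dependency carrying both
    * synchronization modes, so both the RegDist wait and the SBID wait will
    * be emitted for any instruction that needs it.
    */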

   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      if (dep0.ordered == TGL_REGDIST_SRC &&
          is_valid(dep1) && !(dep1.unordered & TGL_SBID_DST) &&
                            !(dep1.ordered & TGL_REGDIST_DST)) {
         /* As an optimization (see dependency_for_read()),
          * instructions with a RaR dependency don't synchronize
          * against a previous in-order read, so we need to pass
          * through both ordered dependencies instead of simply
          * dropping the first one.  Otherwise we could encounter a
          * WaR data hazard between OP0 and OP2 in cases like:
          *
          *   OP0 r1:f r0:d
          *   OP1 r2:d r0:d
          *   OP2 r0:d r3:d
          *
          * since only the integer-pipeline r0 dependency from OP1
          * would be visible to OP2, even though OP0 could technically
          * execute after OP1 due to the floating-point and integer
          * pipelines being asynchronous on Gfx12.5+ platforms, so
          * synchronizing OP2 against OP1 would be insufficient.
          */
         dependency dep = dep1;

         dep.ordered |= dep0.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep.jp.jp[p], dep0.jp.jp[p]);

         return dep;
      } else {
         return is_valid(dep1) ? dep1 : dep0;
      }
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block.  The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the target
    * block and the local ordered_address of the instruction immediately after
    * the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
   {
      if (dep.ordered) {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (dep.jp.jp[p] > INT_MIN)
               dep.jp.jp[p] += delta[p];
         }
      }

      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction reading the same register location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register location.
    *
    * This clears any WaR dependency for writes performed from the same
    * pipeline as the read, since there is no possibility for a data hazard.
    */
   dependency
   dependency_for_write(const struct intel_device_info *devinfo,
                        const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(devinfo, inst) &&
          is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /** @} */

   /**
    * Scoreboard representation.  This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const fs_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects.  \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects.  \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object.  \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         if (sb0.accum_dep != sb1.accum_dep)
            return false;

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[XE2_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;

      dependency *
      dep(const fs_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                                  reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                                  reg < BRW_ARF_FLAG ? &accum_dep :
                 NULL);
      }
   };
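
   /* E.g. (illustrative): a VGRF register with nr == 5 and a byte offset of
    * 2 * REG_SIZE maps to grf_deps[7] above, while any ARF register in the
    * [BRW_ARF_ADDRESS, BRW_ARF_ACCUMULATOR) range shares the single addr_dep
    * slot, so dependencies are tracked with one entry per GRF.
    */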

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the list of dependencies of an instruction
    * \p deps.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            if (dep.ordered && deps[i].ordered) {
               for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                  deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);

               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp.  If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      tgl_pipe p = TGL_PIPE_NONE;
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
               const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
               const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
               assert(jp.jp[q] > deps[i].jp.jp[q]);
               if (dist <= max_dist) {
                  p = (p && IDX(p) != q ? TGL_PIPE_ALL :
                       tgl_pipe(TGL_PIPE_FLOAT + q));
                  min_dist = MIN3(min_dist, dist, 7);
               }
            }
         }
      }

      return { p ? min_dist : 0, p };
   }
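
   /* E.g. (illustrative): with a single FLOAT-pipe dependency at
    * deps[i].jp.jp[q] == 7 and the current instruction at jp.jp[q] == 10,
    * the distance is 3 and the returned annotation would be
    * { 3, TGL_PIPE_FLOAT }, i.e. "F@3" in disassembly.  Distances are
    * clamped to 7, the largest value representable in the RegDist field, and
    * dependencies farther away than max_dist are dropped as already
    * satisfied.
    */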

   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
    * exec_all is false only dependencies known to be executed with channel
    * masking applied will be considered in the calculation.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
    * no such dependency is present.  If \p exec_all is false only
    * dependencies known to be executed with channel masking applied will be
    * considered in the calculation.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered,
                             bool exec_all)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if ((unordered & deps[i].unordered) &&
                exec_all >= deps[i].exec_all)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the list
    * \p deps that can be represented directly in the SWSB annotation of the
    * instruction without additional SYNC instructions, or zero if no such
    * dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;

      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
      else if (has_ordered && is_unordered(devinfo, inst))
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
               (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
      else
         return TGL_SBID_NULL;
   }

   /**
    * Return whether an ordered dependency from the list \p deps can be
    * represented directly in the SWSB annotation of the instruction without
    * additional SYNC instructions.
    */
   bool
   baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
                                 const fs_inst *inst,
                                 const dependency_list &deps,
                                 const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(devinfo, inst, deps, jp);

      if (!has_ordered)
         return false;
      else if (!unordered_mode)
         return true;
      else
         return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&
                unordered_mode == (is_unordered(devinfo, inst) ? TGL_SBID_SET :
                                   TGL_SBID_DST);
   }

   /** @} */

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                     ordered_address();
      const bool is_ordered = ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL));
      const bool is_unordered_math =
         (inst->is_math() && devinfo->ver < 20) ||
         (devinfo->has_64bit_float_via_math_pipe &&
          (get_exec_type(inst) == BRW_REGISTER_TYPE_DF ||
           inst->dst.type == BRW_REGISTER_TYPE_DF));

      /* Track any source registers that may be fetched asynchronously by
       * this instruction; otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->opcode == BRW_OPCODE_DPAS ||
             is_unordered_math) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++) {
            const fs_reg r = byte_offset(inst->src[i], REG_SIZE * j);
            sb.set(r, shadow(sb.get(r), rd_dep));
         }
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         is_ordered ? dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<const fs_inst *>(block->end()), p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (inst->reads_accumulator_implicitly()) {
            /* Wa_22012725308:
             *
             * "When the accumulator registers are used as source and/or
             *  destination, hardware does not ensure prevention of write
             *  after read hazard across execution pipes."
             */
            const dependency dep = sb.get(brw_acc_reg(8));
            if (dep.ordered && !is_single_pipe(dep.jp, p))
               add_dependency(ids, deps[ip], dep);
         }

         if (is_unordered(devinfo, inst) && !inst->eot)
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (inst->writes_accumulator_implicitly(devinfo) ||
                inst->dst.is_accumulator()) {
               /* Wa_22012725308:
                *
                * "When the accumulator registers are used as source and/or
                *  destination, hardware does not ensure prevention of write
                *  after read hazard across execution pipes."
                */
               const dependency dep = sb.get(brw_acc_reg(8));
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
       *       shaders with a large number of SEND messages.
       *
       * XXX - Use 32 SBIDs on Xe2+ while in large GRF mode.
       */
      const unsigned num_sbids = 16;

      /* Allocate a translation table from unordered dependency IDs to
       * hardware SBIDs, with as many entries as there are instructions in
       * the shader, which is the maximum number of unordered IDs the program
       * can contain.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & (num_sbids - 1);

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }
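
   /* E.g. (illustrative): with num_sbids == 16, the 17th distinct unordered
    * ID encountered wraps around to hardware SBID 0 (16 & 15 == 0), so two
    * different out-of-order instructions may end up sharing a token.  That
    * is assumed to be functionally safe, if potentially suboptimal, since
    * reuse of a token merely forces synchronization against its previous
    * user.
    */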

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const bool ordered_mode =
            baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
            ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * Wa_1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered &&
                find_ordered_dependency(deps[ip], jps[ip], true) &&
                (!ordered_mode || dep.exec_all > exec_all)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to Wa_1407528679.  The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
}

bool
brw_fs_lower_scoreboard(fs_visitor &s)
{
   if (s.devinfo->ver >= 12) {
      const ordered_address *jps = ordered_inst_addresses(&s);
      const dependency_list *deps0 = gather_inst_dependencies(&s, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(&s, deps0);
      emit_inst_dependencies(&s, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}