/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file
 *
 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Accesses of the following (rarely used) ARF registers are not
 *           tracked here, and require the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *  - sp stack pointer
 *  - sr0 state register
 *  - cr0 control register
 *  - ip instruction pointer
 *  - tm0 timestamp register
 *  - dbg0 debug register
 *  - acc2-9 special accumulator registers on TGL
 *  - mme0-7 math macro extended accumulator registers
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 *  - f0-1 flag registers
 *  - n0 notification register
 *  - tdr0 thread dependency register
 */

#include "brw_fs.h"
#include "brw_builder.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Return the RegDist pipeline the hardware will synchronize with if no
    * pipeline information is provided in the SWSB annotation of an
    * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
    */
   tgl_pipe
   inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      if (devinfo->verx10 >= 125) {
         bool has_int_src = false, has_long_src = false;
         const bool has_long_pipe = !devinfo->has_64bit_float_via_math_pipe;

         if (is_send(inst))
            return TGL_PIPE_NONE;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != BAD_FILE &&
                !inst->is_control_source(i)) {
               const brw_reg_type t = inst->src[i].type;
               has_int_src |= !brw_type_is_float(t);
               has_long_src |= brw_type_size_bytes(t) >= 8;
            }
         }

         /* Avoid emitting (RegDist, SWSB) annotations for long
          * instructions on platforms where they are unordered. It's not clear
          * what the inferred sync pipe is for them or if we are even allowed
          * to use these annotations in this case. Return NONE, which should
          * prevent baked_{un,}ordered_dependency_mode functions from even
          * trying to emit these annotations.
          */
         if (!has_long_pipe && has_long_src)
            return TGL_PIPE_NONE;

         return has_long_src ? TGL_PIPE_LONG :
                has_int_src ? TGL_PIPE_INT :
                TGL_PIPE_FLOAT;

      } else {
         return TGL_PIPE_FLOAT;
      }
   }
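
   /* For illustration (a restatement of the logic above, not an additional
    * hardware rule): on Gfx12.5+ a send-like instruction infers
    * TGL_PIPE_NONE; otherwise an instruction whose non-control sources are
    * all sub-64-bit floats infers TGL_PIPE_FLOAT, any non-float source
    * promotes the inference to TGL_PIPE_INT, and any 64-bit source promotes
    * it to TGL_PIPE_LONG whenever a separate long pipe exists.
    */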

   /**
    * Return the RegDist pipeline that will execute an instruction, or
    * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
    * RegDist synchronization mechanism.
    */
   tgl_pipe
   inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool is_dword_multiply = !brw_type_is_float(t) &&
         ((inst->opcode == BRW_OPCODE_MUL &&
           MIN2(brw_type_size_bytes(inst->src[0].type),
                brw_type_size_bytes(inst->src[1].type)) >= 4) ||
          (inst->opcode == BRW_OPCODE_MAD &&
           MIN2(brw_type_size_bytes(inst->src[1].type),
                brw_type_size_bytes(inst->src[2].type)) >= 4));

      if (is_unordered(devinfo, inst))
         return TGL_PIPE_NONE;
      else if (devinfo->verx10 < 125)
         return TGL_PIPE_FLOAT;
      else if (devinfo->ver >= 30 &&
               inst->exec_size == 1 &&
               inst->dst.file == ARF &&
               inst->dst.nr == BRW_ARF_SCALAR &&
               inst->src[0].file == IMM) {
         /* Scalar pipe has a very narrow usage.  See Bspec 56701 (r60146),
          * in the SWSB description entry.
          */
         return TGL_PIPE_SCALAR;
      } else if (inst->is_math() && devinfo->ver >= 20)
         return TGL_PIPE_MATH;
      else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT ||
               inst->opcode == SHADER_OPCODE_BROADCAST ||
               inst->opcode == SHADER_OPCODE_SHUFFLE)
         return TGL_PIPE_INT;
      else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
         return TGL_PIPE_FLOAT;
      else if (devinfo->ver >= 20 &&
               brw_type_size_bytes(inst->dst.type) >= 8 &&
               brw_type_is_float(inst->dst.type)) {
         assert(devinfo->has_64bit_float);
         return TGL_PIPE_LONG;
      } else if (devinfo->ver < 20 &&
                 (brw_type_size_bytes(inst->dst.type) >= 8 ||
                  brw_type_size_bytes(t) >= 8 || is_dword_multiply)) {
         assert(devinfo->has_64bit_float || devinfo->has_64bit_int ||
                devinfo->has_integer_dword_mul);
         return TGL_PIPE_LONG;
      } else if (brw_type_is_float(inst->dst.type))
         return TGL_PIPE_FLOAT;
      else
         return TGL_PIPE_INT;
   }

   /**
    * Index of the \p p pipeline counter in the ordered_address vector defined
    * below.
    */
#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) :    \
                (abort(), ~0u))
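
   /* Two consequences of the definition above, spelled out for clarity:
    * IDX(TGL_PIPE_FLOAT) is 0 since it's the first in-order pipe, and
    * IDX(TGL_PIPE_ALL) is the total number of in-order pipes tracked, which
    * is why it's used as the size of the per-pipe counter vectors below.
    * Pipes that sort below TGL_PIPE_FLOAT (i.e. TGL_PIPE_NONE) abort.
    */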

   /**
    * Number of in-order hardware instructions for pipeline index \p contained
    * in this IR instruction.  This determines the increment applied to the
    * RegDist counter calculated for any ordered dependency that crosses this
    * instruction.
    */
   unsigned
   ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
                unsigned p)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted.  Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption.  The lesson
          * is (again): don't use virtual instructions if you want optimal
          * scheduling.
          */
         if (!is_unordered(devinfo, inst) &&
             (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
              p == IDX(TGL_PIPE_ALL)))
            return 1;
         else
            return 0;
      }
   }
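
   /* Worth noting (an observation about the code above, not an additional
    * rule): p == IDX(TGL_PIPE_ALL) is one past the last per-pipe index, so
    * calling ordered_unit() with it returns 1 for any in-order instruction
    * regardless of its execution pipe.  update_inst_scoreboard() below
    * relies on this as an "is this instruction in-order" test.
    */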

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    * This is represented as a vector with an independent counter for each
    * asynchronous ALU pipeline in the EU.
    */
   struct ordered_address {
      /**
       * Construct the ordered address of a dependency known to execute on
       * the single specified pipeline \p p, in which case the vector counter
       * will be initialized with all components equal to INT_MIN (always
       * satisfied) except for component IDX(p).  If TGL_PIPE_ALL is provided
       * instead, every component is initialized to \p jp0, and if
       * TGL_PIPE_NONE is provided every component remains INT_MIN.
       */
      ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
         for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
            jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
                     INT_MIN : jp0);
      }

      int jp[IDX(TGL_PIPE_ALL)];

      friend bool
      operator==(const ordered_address &jp0, const ordered_address &jp1)
      {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (jp0.jp[p] != jp1.jp[p])
               return false;
         }

         return true;
      }
   };
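
   /* A quick example of the convention, supposing for brevity that
    * IDX(TGL_PIPE_ALL) == 3: ordered_address(TGL_PIPE_FLOAT, 5) yields
    * { 5, INT_MIN, INT_MIN }, ordered_address(TGL_PIPE_ALL, 0) yields
    * { 0, 0, 0 }, and the default ordered_address() yields all INT_MIN,
    * i.e. a counter that any comparison treats as already satisfied.
    */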

   /**
    * Return true if the specified ordered address is trivially satisfied for
    * all pipelines except potentially for the specified pipeline \p p.
    */
   bool
   is_single_pipe(const ordered_address &jp, tgl_pipe p)
   {
      for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
         if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
            return false;
      }

      return true;
   }

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const fs_visitor *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp(TGL_PIPE_ALL, 0);
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
         ip++;
      }

      return jps;
   }

   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
    * enum for additional type safety.  The hardware doesn't provide control
    * over the synchronization mode for RegDist annotations; this is only used
    * internally in this pass in order to optimize out redundant read
    * dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,
      TGL_REGDIST_SRC = 1,
      TGL_REGDIST_DST = 2
   };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return equivalence class index of the specified element.  Effectively
       * this is the numeric value of an arbitrary representative from the
       * equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);

      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened, as much
       * as possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };
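
   /* This is essentially a union-find structure without union by rank.  As
    * a sanity-check example of the semantics defined above: starting from
    * the identity relation, after eq.link(1, 2) and eq.link(3, 1) the calls
    * lookup(1), lookup(2) and lookup(3) all return the same representative,
    * while lookup(0) still returns 0.
    */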

   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, const ordered_address &jp,
                 bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such that
       * the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path.  It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no unordered
       * dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug Wa_1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done =
        dependency(TGL_REGDIST_DST, ordered_address(), false);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is only
    * satisfied when both original dependencies are satisfied.  This might
    * involve updating the equivalence relation \p eq in order to make sure
    * that both out-of-order dependencies are assigned the same hardware SBID
    * as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      dep.exec_all = dep0.exec_all || dep1.exec_all;

      return dep;
   }
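
   /* A concrete sense of what merge() computes (just restating the code
    * above): merging a TGL_REGDIST_SRC dependency whose float-pipe counter
    * is 5 with a TGL_REGDIST_DST dependency whose float-pipe counter is 7
    * yields a SRC|DST dependency with counter 7, since waiting for the
    * later of two in-order instructions in the same pipe also satisfies
    * the earlier one.  Unordered dependencies are instead combined by
    * forcing their IDs into the same SBID equivalence class.
    */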

   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      if (dep0.ordered == TGL_REGDIST_SRC &&
          is_valid(dep1) && !(dep1.unordered & TGL_SBID_DST) &&
                            !(dep1.ordered & TGL_REGDIST_DST)) {
         /* As an optimization (see dependency_for_read()),
          * instructions with a RaR dependency don't synchronize
          * against a previous in-order read, so we need to pass
          * through both ordered dependencies instead of simply
          * dropping the first one.  Otherwise we could encounter a
          * WaR data hazard between OP0 and OP2 in cases like:
          *
          *   OP0 r1:f r0:d
          *   OP1 r2:d r0:d
          *   OP2 r0:d r3:d
          *
          * since only the integer-pipeline r0 dependency from OP1
          * would be visible to OP2, even though OP0 could technically
          * execute after OP1 due to the floating-point and integer
          * pipelines being asynchronous on Gfx12.5+ platforms, so
          * synchronizing OP2 against OP1 would be insufficient.
          */
         dependency dep = dep1;

         dep.ordered |= dep0.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep.jp.jp[p], dep0.jp.jp[p]);

         return dep;
      } else {
         return is_valid(dep1) ? dep1 : dep0;
      }
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block.  The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the target
    * block and the local ordered_address of the instruction immediately after
    * the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
   {
      if (dep.ordered) {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (dep.jp.jp[p] > INT_MIN)
               dep.jp.jp[p] += delta[p];
         }
      }

      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction reading the same register location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register location.
    *
    * This clears any WaR dependency for writes performed from the same
    * pipeline as the read, since there is no possibility for a data hazard.
    */
   dependency
   dependency_for_write(const struct intel_device_info *devinfo,
                        const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(devinfo, inst) &&
          is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /** @} */

   /**
    * Scoreboard representation.  This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const brw_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const brw_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects.  \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);
         sb.scalar_dep = merge(eq, sb0.scalar_dep, sb1.scalar_dep);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects.  \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);
         sb.scalar_dep = shadow(sb0.scalar_dep, sb1.scalar_dep);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object.  \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);
         sb.scalar_dep = transport(sb0.scalar_dep, delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         if (sb0.accum_dep != sb1.accum_dep)
            return false;

         if (sb0.scalar_dep != sb1.scalar_dep)
            return false;

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[XE3_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;
      dependency scalar_dep;

      dependency *
      dep(const brw_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                                  reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                                  reg < BRW_ARF_FLAG ? &accum_dep :
                 r.file == ARF && reg >= BRW_ARF_SCALAR &&
                                  reg < BRW_ARF_STATE ? &scalar_dep :
                 NULL);
      }
   };

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the list of dependencies of an instruction
    * \p deps.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            if (dep.ordered && deps[i].ordered) {
               for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                  deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);

               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp.  If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      tgl_pipe p = TGL_PIPE_NONE;
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
               const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
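               /* Presumably (this is inferred from the constants below, not
                * from documentation quoted in this file): a dependency
                * further away than max_dist has already drained from its
                * in-order pipeline, the long pipe being somewhat deeper,
                * while the RegDist field itself can only encode distances
                * up to 7, hence the clamp.
                */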
               const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
               assert(jp.jp[q] > deps[i].jp.jp[q]);
               if (dist <= max_dist) {
                  p = (p && IDX(p) != q ? TGL_PIPE_ALL :
                       tgl_pipe(TGL_PIPE_FLOAT + q));
                  min_dist = MIN3(min_dist, dist, 7);
               }
            }
         }
      }

      return { p ? min_dist : 0, p };
   }

   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
    * exec_all is false only dependencies known to be executed with channel
    * masking applied will be considered in the calculation.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
    * no such dependency is present.  If \p exec_all is false only
    * dependencies known to be executed with channel masking applied will be
    * considered in the calculation.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered,
                             bool exec_all)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if ((unordered & deps[i].unordered) &&
                exec_all >= deps[i].exec_all)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the list
    * \p deps that can be represented directly in the SWSB annotation of the
    * instruction without additional SYNC instructions, or zero if no such
    * dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;

      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
      else if (has_ordered && is_unordered(devinfo, inst))
         return TGL_SBID_NULL;
      else if (is_send(inst) && devinfo->ver >= 20)
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
               (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
      else
         return TGL_SBID_NULL;
   }

   /**
    * Return whether an ordered dependency from the list \p deps can be
    * represented directly in the SWSB annotation of the instruction without
    * additional SYNC instructions.
    */
   bool
   baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
                                 const fs_inst *inst,
                                 const dependency_list &deps,
                                 const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(devinfo, inst, deps, jp);
      const tgl_pipe inferred_pipe = inferred_sync_pipe(devinfo, inst);

      if (!has_ordered)
         return false;
      else if (!unordered_mode)
         return true;
      else if (devinfo->ver < 20)
         return ordered_pipe == inferred_pipe &&
                unordered_mode == (is_unordered(devinfo, inst) ? TGL_SBID_SET :
                                   TGL_SBID_DST);
      else if (is_send(inst))
         return unordered_mode == TGL_SBID_SET &&
                (ordered_pipe == TGL_PIPE_FLOAT ||
                 ordered_pipe == TGL_PIPE_INT ||
                 ordered_pipe == TGL_PIPE_ALL);
      else if (inst->opcode == BRW_OPCODE_DPAS)
         return ordered_pipe == inferred_pipe;
      else
         return (unordered_mode == TGL_SBID_DST && ordered_pipe == inferred_pipe) ||
                (unordered_mode == TGL_SBID_SRC && ordered_pipe == inferred_pipe) ||
                (unordered_mode == TGL_SBID_DST && ordered_pipe == TGL_PIPE_ALL);
   }

   /** @} */

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                     ordered_address();
      const bool is_ordered = ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL));
      const bool is_unordered_math =
         (inst->is_math() && devinfo->ver < 20) ||
         (devinfo->has_64bit_float_via_math_pipe &&
          (get_exec_type(inst) == BRW_TYPE_DF ||
           inst->dst.type == BRW_TYPE_DF));

      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->opcode == BRW_OPCODE_DPAS ||
             is_unordered_math) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++) {
            const brw_reg r = byte_offset(inst->src[i], REG_SIZE * j);
            sb.set(r, shadow(sb.get(r), rd_dep));
         }
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         is_ordered ? dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<const fs_inst *>(block->end()), p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (inst->reads_accumulator_implicitly()) {
            /* Wa_22012725308:
             *
             * "When the accumulator registers are used as source and/or
             *  destination, hardware does not ensure prevention of write
             *  after read hazard across execution pipes."
             */
            const dependency dep = sb.get(brw_acc_reg(8));
            if (dep.ordered && !is_single_pipe(dep.jp, p))
               add_dependency(ids, deps[ip], dep);
         }

         if (is_unordered(devinfo, inst) && !inst->eot)
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (inst->writes_accumulator_implicitly(devinfo) ||
                inst->dst.is_accumulator()) {
               /* Wa_22012725308:
                *
                * "When the accumulator registers are used as source and/or
                *  destination, hardware does not ensure prevention of write
                *  after read hazard across execution pipes."
                */
               const dependency dep = sb.get(brw_acc_reg(8));
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
       *       shaders with a large number of SEND messages.
       *
       * XXX - Use 32 SBIDs on Xe2 while in large GRF mode.
       */
      const unsigned num_sbids = (shader->devinfo->ver >= 30 ? 32 : 16);

      /* Allocate an unordered-dependency-ID to hardware-SBID translation
       * table with as many entries as there are instructions in the shader,
       * which is the maximum number of unordered IDs we can find in the
       * program.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & (num_sbids - 1);

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }
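
   /* Note on the allocation scheme above (an observation about the code, not
    * a statement of hardware policy): distinct unordered IDs are mapped to
    * hardware SBIDs round-robin via (next_id++) & (num_sbids - 1), which
    * relies on num_sbids being a power of two; e.g. with 16 SBIDs the 17th
    * distinct out-of-order instruction in the program reuses SBID 0.
    */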

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const bool ordered_mode =
            baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
            ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * Wa_1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const brw_builder ibld = brw_builder(shader, block, inst)
                                           .exec_all().group(1, 0);
                  fs_inst *sync = ibld.SYNC(TGL_SYNC_NOP);
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered &&
                find_ordered_dependency(deps[ip], jps[ip], true) &&
                (!ordered_mode || dep.exec_all > exec_all)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to Wa_1407528679.  The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const brw_builder ibld = brw_builder(shader, block, inst)
                                        .exec_all().group(1, 0);
               fs_inst *sync = ibld.SYNC(TGL_SYNC_NOP);
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
}

bool
brw_lower_scoreboard(fs_visitor &s)
{
   if (s.devinfo->ver >= 12) {
      const ordered_address *jps = ordered_inst_addresses(&s);
      const dependency_list *deps0 = gather_inst_dependencies(&s, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(&s, deps0);
      emit_inst_dependencies(&s, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}