/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file
 *
 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Access of the following (rarely used) ARF registers is not
 *           tracked here, and requires the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *  - sp stack pointer
 *  - sr0 state register
 *  - cr0 control register
 *  - ip instruction pointer
 *  - tm0 timestamp register
 *  - dbg0 debug register
 *  - acc2-9 special accumulator registers on TGL
 *  - mme0-7 math macro extended accumulator registers
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 *  - f0-1 flag registers
 *  - n0 notification register
 *  - tdr0 thread dependency register
 */
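
/*
 * Purely as an illustrative sketch (the exact disassembly syntax varies
 * between tools and generations), the annotations emitted by this pass end
 * up looking roughly like this in the final assembly, where "@N" denotes a
 * RegDist count (optionally qualified with an in-order pipeline letter such
 * as F, I or L) and "$N" references one of the SBID tokens allocated for
 * out-of-order instructions:
 *
 *    send(16)  r10  r2  ...    {$2}      allocate/track token 2 (SET)
 *    add(16)   r20  r4  r5     {@1}      RegDist wait on an in-order write
 *    mov(16)   r30  r10        {$2.dst}  wait for the send's destination
 *    sync.nop  null            {$3.src}  extra SYNC inserted when a
 *                                        dependency can't be baked into an
 *                                        existing instruction
 */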

#include "brw_fs.h"
#include "brw_builder.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Return the RegDist pipeline the hardware will synchronize with if no
    * pipeline information is provided in the SWSB annotation of an
    * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
    */
   tgl_pipe
   inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      if (devinfo->verx10 >= 125) {
         bool has_int_src = false, has_long_src = false;
         const bool has_long_pipe = !devinfo->has_64bit_float_via_math_pipe;

         if (is_send(inst))
            return TGL_PIPE_NONE;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != BAD_FILE &&
                !inst->is_control_source(i)) {
               const brw_reg_type t = inst->src[i].type;
               has_int_src |= !brw_type_is_float(t);
               has_long_src |= brw_type_size_bytes(t) >= 8;
            }
         }

         /* Avoid emitting (RegDist, SWSB) annotations for long instructions
          * on platforms where they are unordered.  It's not clear what the
          * inferred sync pipe is for them or if we are even allowed to use
          * these annotations in this case.  Return NONE, which should
          * prevent the baked_{un,}ordered_dependency_mode functions from
          * even trying to emit these annotations.
          */
         if (!has_long_pipe && has_long_src)
            return TGL_PIPE_NONE;

         return has_long_src ? TGL_PIPE_LONG :
                has_int_src ? TGL_PIPE_INT :
                TGL_PIPE_FLOAT;

      } else {
         return TGL_PIPE_FLOAT;
      }
   }

   /**
    * Return the RegDist pipeline that will execute an instruction, or
    * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
    * RegDist synchronization mechanism.
    */
   tgl_pipe
   inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool is_dword_multiply = !brw_type_is_float(t) &&
         ((inst->opcode == BRW_OPCODE_MUL &&
           MIN2(brw_type_size_bytes(inst->src[0].type),
                brw_type_size_bytes(inst->src[1].type)) >= 4) ||
          (inst->opcode == BRW_OPCODE_MAD &&
           MIN2(brw_type_size_bytes(inst->src[1].type),
                brw_type_size_bytes(inst->src[2].type)) >= 4));

      if (is_unordered(devinfo, inst))
         return TGL_PIPE_NONE;
      else if (devinfo->verx10 < 125)
         return TGL_PIPE_FLOAT;
      else if (devinfo->ver >= 30 &&
               inst->exec_size == 1 &&
               inst->dst.file == ARF &&
               inst->dst.nr == BRW_ARF_SCALAR &&
               inst->src[0].file == IMM) {
         /* The scalar pipe has a very narrow usage.  See Bspec 56701
          * (r60146), in the SWSB description entry.
          */
         return TGL_PIPE_SCALAR;
      } else if (inst->is_math() && devinfo->ver >= 20)
         return TGL_PIPE_MATH;
      else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT ||
               inst->opcode == SHADER_OPCODE_BROADCAST ||
               inst->opcode == SHADER_OPCODE_SHUFFLE)
         return TGL_PIPE_INT;
      else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
         return TGL_PIPE_FLOAT;
      else if (devinfo->ver >= 20 &&
               brw_type_size_bytes(inst->dst.type) >= 8 &&
               brw_type_is_float(inst->dst.type)) {
         assert(devinfo->has_64bit_float);
         return TGL_PIPE_LONG;
      } else if (devinfo->ver < 20 &&
                 (brw_type_size_bytes(inst->dst.type) >= 8 ||
                  brw_type_size_bytes(t) >= 8 || is_dword_multiply)) {
         assert(devinfo->has_64bit_float || devinfo->has_64bit_int ||
                devinfo->has_integer_dword_mul);
         return TGL_PIPE_LONG;
      } else if (brw_type_is_float(inst->dst.type))
         return TGL_PIPE_FLOAT;
      else
         return TGL_PIPE_INT;
   }

   /**
    * Index of the \p p pipeline counter in the ordered_address vector defined
    * below.
    */
#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \
                (abort(), ~0u))
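
   /* For reference: TGL_PIPE_FLOAT is the first in-order pipeline, so
    * IDX(TGL_PIPE_FLOAT) == 0, and IDX(TGL_PIPE_ALL) yields the number of
    * per-pipeline counters tracked, which is why it is used as the dimension
    * of ordered_address::jp below.  Passing TGL_PIPE_NONE (or anything below
    * TGL_PIPE_FLOAT) aborts, since such a "pipeline" has no counter of its
    * own.
    */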

   /**
    * Number of in-order hardware instructions for pipeline index \p p
    * contained in this IR instruction.  This determines the increment
    * applied to the RegDist counter calculated for any ordered dependency
    * that crosses this instruction.
    */
   unsigned
   ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
                unsigned p)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted.  Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption.  Lesson is
          * (again) don't use virtual instructions if you want optimal
          * scheduling.
          */
         if (!is_unordered(devinfo, inst) &&
             (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
              p == IDX(TGL_PIPE_ALL)))
            return 1;
         else
            return 0;
      }
   }

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    * This is represented as a vector with an independent counter for each
    * asynchronous ALU pipeline in the EU.
    */
   struct ordered_address {
      /**
       * Construct the ordered address of a dependency known to execute on a
       * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL
       * is provided), in which case the vector counter will be initialized
       * with all components equal to INT_MIN (always satisfied) except for
       * component IDX(p).
       */
      ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
         for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
            jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
                     INT_MIN : jp0);
      }

      int jp[IDX(TGL_PIPE_ALL)];

      friend bool
      operator==(const ordered_address &jp0, const ordered_address &jp1)
      {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (jp0.jp[p] != jp1.jp[p])
               return false;
         }

         return true;
      }
   };
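
   /* For example (values chosen arbitrarily), ordered_address(TGL_PIPE_FLOAT, 5)
    * yields a vector whose FLOAT component is 5 and whose remaining components
    * are INT_MIN (i.e. trivially satisfied), ordered_address(TGL_PIPE_ALL, 5)
    * sets every component to 5, and the default ordered_address() leaves all
    * components at INT_MIN.
    */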

   /**
    * Return true if the specified ordered address is trivially satisfied for
    * all pipelines except potentially for the specified pipeline \p p.
    */
   bool
   is_single_pipe(const ordered_address &jp, tgl_pipe p)
   {
      for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
         if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
            return false;
      }

      return true;
   }

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const fs_visitor *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp(TGL_PIPE_ALL, 0);
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
         ip++;
      }

      return jps;
   }

   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
    * enum for additional type safety.  The hardware doesn't provide control
    * over the synchronization mode for RegDist annotations; this is only
    * used internally in this pass in order to optimize out redundant read
    * dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,
      TGL_REGDIST_SRC = 1,
      TGL_REGDIST_DST = 2
   };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return equivalence class index of the specified element.  Effectively
       * this is the numeric value of an arbitrary representative from the
       * equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);

      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened as much
       * as possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };
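
   /* As a minimal usage sketch (indices chosen arbitrarily): after
    *
    *    equivalence_relation eq(8);
    *    eq.link(1, 4);
    *    eq.link(4, 6);
    *
    * lookup(1), lookup(4) and lookup(6) all return the same representative
    * while lookup(0) still returns 0, so the array produced by flatten()
    * maps every out-of-order dependency ID to a single class, which is later
    * used to assign a common SBID to dependencies that must share a token.
    */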

   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, const ordered_address &jp,
                 bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such that
       * the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path.  It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no unordered
       * dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug Wa_1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done =
      dependency(TGL_REGDIST_DST, ordered_address(), false);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is only
    * satisfied when both original dependencies are satisfied.  This might
    * involve updating the equivalence relation \p eq in order to make sure
    * that both out-of-order dependencies are assigned the same hardware SBID
    * as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      dep.exec_all = dep0.exec_all || dep1.exec_all;

      return dep;
   }

   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      if (dep0.ordered == TGL_REGDIST_SRC &&
          is_valid(dep1) && !(dep1.unordered & TGL_SBID_DST) &&
          !(dep1.ordered & TGL_REGDIST_DST)) {
         /* As an optimization (see dependency_for_read()),
          * instructions with a RaR dependency don't synchronize
          * against a previous in-order read, so we need to pass
          * through both ordered dependencies instead of simply
          * dropping the first one.  Otherwise we could encounter a
          * WaR data hazard between OP0 and OP2 in cases like:
          *
          *   OP0 r1:f r0:d
          *   OP1 r2:d r0:d
          *   OP2 r0:d r3:d
          *
          * since only the integer-pipeline r0 dependency from OP1
          * would be visible to OP2, even though OP0 could technically
          * execute after OP1 due to the floating-point and integer
          * pipelines being asynchronous on Gfx12.5+ platforms, so
          * synchronizing OP2 against OP1 would be insufficient.
          */
         dependency dep = dep1;

         dep.ordered |= dep0.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep.jp.jp[p], dep0.jp.jp[p]);

         return dep;
      } else {
         return is_valid(dep1) ? dep1 : dep0;
      }
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block.  The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the target
    * block and the local ordered_address of the instruction immediately after
    * the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
   {
      if (dep.ordered) {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (dep.jp.jp[p] > INT_MIN)
               dep.jp.jp[p] += delta[p];
         }
      }

      return dep;
   }
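
   /* As a hypothetical example of the shift described above: if the first
    * instruction of the target block has a local FLOAT counter of 5 while
    * the instruction immediately after the end of the origin block has a
    * local FLOAT counter of 20, a delta of -15 is applied to the FLOAT
    * component of every ordered dependency carried across that CFG edge, so
    * that RegDist distances keep being measured in the number of in-order
    * instructions actually executed along the chosen control flow path.
    */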

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction reading the same register location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register location.
    *
    * This clears any WaR dependency for writes performed from the same
    * pipeline as the read, since there is no possibility for a data hazard.
    */
   dependency
   dependency_for_write(const struct intel_device_info *devinfo,
                        const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(devinfo, inst) &&
          is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /** @} */

   /**
    * Scoreboard representation.  This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const brw_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const brw_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects.  \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);
         sb.scalar_dep = merge(eq, sb0.scalar_dep, sb1.scalar_dep);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects.  \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);
         sb.scalar_dep = shadow(sb0.scalar_dep, sb1.scalar_dep);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object.  \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);
         sb.scalar_dep = transport(sb0.scalar_dep, delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         if (sb0.accum_dep != sb1.accum_dep)
            return false;

         if (sb0.scalar_dep != sb1.scalar_dep)
            return false;

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[XE3_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;
      dependency scalar_dep;

      dependency *
      dep(const brw_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                 reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                 reg < BRW_ARF_FLAG ? &accum_dep :
                 r.file == ARF && reg >= BRW_ARF_SCALAR &&
                 reg < BRW_ARF_STATE ? &scalar_dep :
                 NULL);
      }
   };
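
   /* Note that tracking is per physical GRF: a read or write of a multi-
    * register VGRF is recorded as one REG_SIZE-granular look-up per
    * byte_offset(r, REG_SIZE * j) slice (see the regs_read()/regs_written()
    * loops further below), while the address, accumulator and scalar ARFs
    * each share a single dependency slot regardless of the subregister
    * accessed.
    */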

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the list of dependencies of an instruction
    * \p deps.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            if (dep.ordered && deps[i].ordered) {
               for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                  deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);

               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp.  If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      tgl_pipe p = TGL_PIPE_NONE;
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
               const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
               const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
               assert(jp.jp[q] > deps[i].jp.jp[q]);
               if (dist <= max_dist) {
                  p = (p && IDX(p) != q ? TGL_PIPE_ALL :
                       tgl_pipe(TGL_PIPE_FLOAT + q));
                  min_dist = MIN3(min_dist, dist, 7);
               }
            }
         }
      }

      return { p ? min_dist : 0, p };
   }
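
   /* As a hypothetical example: if the current instruction's FLOAT counter
    * is 12 and the most recent FLOAT dependency was recorded at counter 10,
    * the distance is 2 and the resulting annotation corresponds to F@2 in
    * the illustrative syntax from the top of this file.  Distances are
    * clamped to the maximum representable RegDist of 7, and dependencies
    * further away than 10 in-order instructions (14 for the LONG pipeline)
    * are assumed to have retired already and are dropped from the
    * annotation.
    */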

   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
    * exec_all is false only dependencies known to be executed with channel
    * masking applied will be considered in the calculation.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
    * no such dependency is present.  If \p exec_all is false only
    * dependencies known to be executed with channel masking applied will be
    * considered in the calculation.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered,
                             bool exec_all)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if ((unordered & deps[i].unordered) &&
                exec_all >= deps[i].exec_all)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the list
    * \p deps that can be represented directly in the SWSB annotation of the
    * instruction without additional SYNC instructions, or zero if no such
    * dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;

      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
      else if (has_ordered && is_unordered(devinfo, inst))
         return TGL_SBID_NULL;
      else if (is_send(inst) && devinfo->ver >= 20)
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
               (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
      else
         return TGL_SBID_NULL;
   }

   /**
    * Return whether an ordered dependency from the list \p deps can be
    * represented directly in the SWSB annotation of the instruction without
    * additional SYNC instructions.
    */
   bool
   baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
                                 const fs_inst *inst,
                                 const dependency_list &deps,
                                 const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(devinfo, inst, deps, jp);
      const tgl_pipe inferred_pipe = inferred_sync_pipe(devinfo, inst);

      if (!has_ordered)
         return false;
      else if (!unordered_mode)
         return true;
      else if (devinfo->ver < 20)
         return ordered_pipe == inferred_pipe &&
                unordered_mode == (is_unordered(devinfo, inst) ? TGL_SBID_SET :
                                   TGL_SBID_DST);
      else if (is_send(inst))
         return unordered_mode == TGL_SBID_SET &&
                (ordered_pipe == TGL_PIPE_FLOAT ||
                 ordered_pipe == TGL_PIPE_INT ||
                 ordered_pipe == TGL_PIPE_ALL);
      else if (inst->opcode == BRW_OPCODE_DPAS)
         return ordered_pipe == inferred_pipe;
      else
         return (unordered_mode == TGL_SBID_DST && ordered_pipe == inferred_pipe) ||
                (unordered_mode == TGL_SBID_SRC && ordered_pipe == inferred_pipe) ||
                (unordered_mode == TGL_SBID_DST && ordered_pipe == TGL_PIPE_ALL);
   }

   /** @} */

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                     ordered_address();
      const bool is_ordered = ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL));
      const bool is_unordered_math =
         (inst->is_math() && devinfo->ver < 20) ||
         (devinfo->has_64bit_float_via_math_pipe &&
          (get_exec_type(inst) == BRW_TYPE_DF ||
           inst->dst.type == BRW_TYPE_DF));

      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->opcode == BRW_OPCODE_DPAS ||
             is_unordered_math) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++) {
            const brw_reg r = byte_offset(inst->src[i], REG_SIZE * j);
            sb.set(r, shadow(sb.get(r), rd_dep));
         }
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         is_ordered ? dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<const fs_inst *>(block->end()), p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (inst->reads_accumulator_implicitly()) {
            /* Wa_22012725308:
             *
             * "When the accumulator registers are used as source and/or
             *  destination, hardware does not ensure prevention of write
             *  after read hazard across execution pipes."
             */
            const dependency dep = sb.get(brw_acc_reg(8));
            if (dep.ordered && !is_single_pipe(dep.jp, p))
               add_dependency(ids, deps[ip], dep);
         }

         if (is_unordered(devinfo, inst) && !inst->eot)
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (inst->writes_accumulator_implicitly(devinfo) ||
                inst->dst.is_accumulator()) {
               /* Wa_22012725308:
                *
                * "When the accumulator registers are used as source and/or
                *  destination, hardware does not ensure prevention of write
                *  after read hazard across execution pipes."
                */
               const dependency dep = sb.get(brw_acc_reg(8));
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally
       *       in shaders with a large number of SEND messages.
       *
       * XXX - Use 32 SBIDs on Xe2 while in large GRF mode.
       */
      const unsigned num_sbids = (shader->devinfo->ver >= 30 ? 32 : 16);

      /* Allocate an unordered dependency ID to hardware SBID translation
       * table with as many entries as instructions there are in the shader,
       * which is the maximum number of unordered IDs we can find in the
       * program.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & (num_sbids - 1);

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }
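
   /* In other words the hardware tokens are handed out in a simple
    * round-robin fashion: with 16 of them available, the first 16 distinct
    * out-of-order dependencies encountered get SBIDs 0 through 15 and the
    * 17th wraps around to SBID 0 again, hence the XXX note above about
    * wanting a smarter assignment strategy for shaders with many SEND
    * messages.
    */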

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const bool ordered_mode =
            baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
            ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * Wa_1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const brw_builder ibld = brw_builder(shader, block, inst)
                                           .exec_all().group(1, 0);
                  fs_inst *sync = ibld.SYNC(TGL_SYNC_NOP);
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered &&
                find_ordered_dependency(deps[ip], jps[ip], true) &&
                (!ordered_mode || dep.exec_all > exec_all)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to Wa_1407528679.  The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const brw_builder ibld = brw_builder(shader, block, inst)
                                        .exec_all().group(1, 0);
               fs_inst *sync = ibld.SYNC(TGL_SYNC_NOP);
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
}

bool
brw_lower_scoreboard(fs_visitor &s)
{
   if (s.devinfo->ver >= 12) {
      const ordered_address *jps = ordered_inst_addresses(&s);
      const dependency_list *deps0 = gather_inst_dependencies(&s, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(&s, deps0);
      emit_inst_dependencies(&s, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}