/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_scoreboard.cpp
 *
 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
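 *
 * As a rough illustrative sketch (not actual output of this pass, the
 * register numbers, opcodes and exact syntax are made up), the resulting
 * annotations look something like this in the disassembly:
 *
 *   send(16)  g10  g2  ...  {$3}       <- allocates SBID token 3
 *   add(16)   g20  g12 g13             <- in-order ALU instruction
 *   mul(16)   g30  g4  g20  {@1}       <- RegDist wait of 1 on the ADD
 *   mov(16)   g40  g10      {$3.dst}   <- waits on the destination of the SEND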
 *
 * WARNING - Accesses of the following (rarely used) ARF registers are not
 *           tracked here, and require the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 * - sp stack pointer
 * - sr0 state register
 * - cr0 control register
 * - ip instruction pointer
 * - tm0 timestamp register
 * - dbg0 debug register
 * - acc2-9 special accumulator registers on TGL
 * - mme0-7 math macro extended accumulator registers
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 * - f0-1 flag registers
 * - n0 notification register
 * - tdr0 thread dependency register
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Return the RegDist pipeline the hardware will synchronize with if no
    * pipeline information is provided in the SWSB annotation of an
    * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
    */
   tgl_pipe
   inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      if (devinfo->verx10 >= 125) {
         bool has_int_src = false, has_long_src = false;
         const bool has_long_pipe = !devinfo->has_64bit_float_via_math_pipe;

         if (is_send(inst))
            return TGL_PIPE_NONE;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != BAD_FILE &&
                !inst->is_control_source(i)) {
               const brw_reg_type t = inst->src[i].type;
               has_int_src |= !brw_reg_type_is_floating_point(t);
               has_long_src |= type_sz(t) >= 8;
            }
         }

         /* Avoid emitting (RegDist, SWSB) annotations for long
          * instructions on platforms where they are unordered. It's not clear
          * what the inferred sync pipe is for them or if we are even allowed
          * to use these annotations in this case. Return NONE, which should
          * prevent the baked_{un,}ordered_dependency_mode functions from even
          * trying to emit these annotations.
          */
         if (!has_long_pipe && has_long_src)
            return TGL_PIPE_NONE;

         return has_long_src ? TGL_PIPE_LONG :
                has_int_src ? TGL_PIPE_INT :
                TGL_PIPE_FLOAT;

      } else {
         return TGL_PIPE_FLOAT;
      }
   }

   /**
    * Return the RegDist pipeline that will execute an instruction, or
    * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
    * RegDist synchronization mechanism.
    */
   tgl_pipe
   inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) &&
         ((inst->opcode == BRW_OPCODE_MUL &&
           MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
          (inst->opcode == BRW_OPCODE_MAD &&
           MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));

      if (is_unordered(devinfo, inst))
         return TGL_PIPE_NONE;
      else if (devinfo->verx10 < 125)
         return TGL_PIPE_FLOAT;
      else if (inst->is_math() && devinfo->ver >= 20)
         return TGL_PIPE_MATH;
      else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT ||
               inst->opcode == SHADER_OPCODE_BROADCAST ||
               inst->opcode == SHADER_OPCODE_SHUFFLE)
         return TGL_PIPE_INT;
      else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
         return TGL_PIPE_FLOAT;
      else if (devinfo->ver >= 20 && type_sz(inst->dst.type) >= 8 &&
               brw_reg_type_is_floating_point(inst->dst.type)) {
         assert(devinfo->has_64bit_float);
         return TGL_PIPE_LONG;
      } else if (devinfo->ver < 20 &&
                 (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 ||
                  is_dword_multiply)) {
         assert(devinfo->has_64bit_float || devinfo->has_64bit_int ||
                devinfo->has_integer_dword_mul);
         return TGL_PIPE_LONG;
      } else if (brw_reg_type_is_floating_point(inst->dst.type))
         return TGL_PIPE_FLOAT;
      else
         return TGL_PIPE_INT;
   }

   /**
    * Index of the \p p pipeline counter in the ordered_address vector defined
    * below.
    */
   #define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \
                   (abort(), ~0u))
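
   /* To make the indexing above concrete (assuming the tgl_pipe enumerants
    * place TGL_PIPE_FLOAT first among the in-order pipelines and TGL_PIPE_ALL
    * last): IDX(TGL_PIPE_FLOAT) is 0 and IDX(TGL_PIPE_ALL) is the number of
    * per-pipeline counters tracked, which is why it is used as the array size
    * below.  Passing TGL_PIPE_NONE, which compares lower than TGL_PIPE_FLOAT,
    * aborts.
    */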

   /**
    * Number of in-order hardware instructions for pipeline index \p p
    * contained in this IR instruction. This determines the increment applied
    * to the RegDist counter calculated for any ordered dependency that
    * crosses this instruction.
    */
   unsigned
   ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
                unsigned p)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted. Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption. Lesson is
          * (again) don't use virtual instructions if you want optimal
          * scheduling.
          */
         if (!is_unordered(devinfo, inst) &&
             (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
              p == IDX(TGL_PIPE_ALL)))
            return 1;
         else
            return 0;
      }
   }

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    * This is represented as a vector with an independent counter for each
    * asynchronous ALU pipeline in the EU.
    */
   struct ordered_address {
      /**
       * Construct the ordered address of a dependency known to execute on a
       * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL
       * is provided), in which case the vector counter will be initialized
       * with all components equal to INT_MIN (always satisfied) except for
       * component IDX(p).
       */
      ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
         for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
            jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
                     INT_MIN : jp0);
      }

      int jp[IDX(TGL_PIPE_ALL)];

      friend bool
      operator==(const ordered_address &jp0, const ordered_address &jp1)
      {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (jp0.jp[p] != jp1.jp[p])
               return false;
         }

         return true;
      }
   };

   /**
    * Return true if the specified ordered address is trivially satisfied for
    * all pipelines except potentially for the specified pipeline \p p.
    */
   bool
   is_single_pipe(const ordered_address &jp, tgl_pipe p)
   {
      for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
         if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
            return false;
      }

      return true;
   }

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const backend_shader *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp(TGL_PIPE_ALL, 0);
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
         ip++;
      }

      return jps;
   }

   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without the SET mode. Defined as a
    * separate enum for additional type safety. The hardware doesn't provide
    * control over the synchronization mode for RegDist annotations; this is
    * only used internally in this pass in order to optimize out redundant
    * read dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,
      TGL_REGDIST_SRC = 1,
      TGL_REGDIST_DST = 2
   };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
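    *
    * As a small usage sketch (the indices are hypothetical, not taken from
    * the pass itself): starting from the identity relation, calling
    * link(1, 2) followed by link(2, 3) leaves lookup(1), lookup(2) and
    * lookup(3) all returning the same representative, while lookup(0) still
    * returns 0.  This is how out-of-order dependencies that merge() decides
    * must share a hardware SBID end up mapped to a single token via
    * flatten().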
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return equivalence class index of the specified element. Effectively
       * this is the numeric value of an arbitrary representative from the
       * equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);

      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened as much as
       * it's possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };

   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, const ordered_address &jp,
                 bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency that is part of a different block in the program,
       * this is relative to the specific control flow path taken between the
       * dependency and the current block: it is the ordered_address such that
       * the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path. It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no unordered
       * dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug Wa_1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done =
      dependency(TGL_REGDIST_DST, ordered_address(), false);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is only
    * satisfied when both original dependencies are satisfied. This might
    * involve updating the equivalence relation \p eq in order to make sure
    * that both out-of-order dependencies are assigned the same hardware SBID
    * as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
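         /* Link the two token IDs in the equivalence relation so that they
          * end up mapped to the same hardware SBID.  If only one of the
          * dependencies carries an unordered token its ID is passed as both
          * arguments, which leaves the relation unchanged.
          */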
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      dep.exec_all = dep0.exec_all || dep1.exec_all;

      return dep;
   }

   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      if (dep0.ordered == TGL_REGDIST_SRC &&
          is_valid(dep1) && !(dep1.unordered & TGL_SBID_DST) &&
          !(dep1.ordered & TGL_REGDIST_DST)) {
         /* As an optimization (see dependency_for_read()),
          * instructions with a RaR dependency don't synchronize
          * against a previous in-order read, so we need to pass
          * through both ordered dependencies instead of simply
          * dropping the first one. Otherwise we could encounter a
          * WaR data hazard between OP0 and OP2 in cases like:
          *
          *   OP0 r1:f r0:d
          *   OP1 r2:d r0:d
          *   OP2 r0:d r3:d
          *
          * since only the integer-pipeline r0 dependency from OP1
          * would be visible to OP2, even though OP0 could technically
          * execute after OP1 due to the floating-point and integer
          * pipelines being asynchronous on Gfx12.5+ platforms, so
          * synchronizing OP2 against OP1 would be insufficient.
          */
         dependency dep = dep1;

         dep.ordered |= dep0.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep.jp.jp[p], dep0.jp.jp[p]);

         return dep;
      } else {
         return is_valid(dep1) ? dep1 : dep0;
      }
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block. The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the target
    * block and the local ordered_address of the instruction immediately after
    * the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
   {
      if (dep.ordered) {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (dep.jp.jp[p] > INT_MIN)
               dep.jp.jp[p] += delta[p];
         }
      }

      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction reading the same register location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register location.
    *
    * This clears any WaR dependency for writes performed from the same
    * pipeline as the read, since there is no possibility for a data hazard.
    */
   dependency
   dependency_for_write(const struct intel_device_info *devinfo,
                        const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(devinfo, inst) &&
          is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /** @} */

   /**
    * Scoreboard representation. This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const fs_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects. \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects. \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object. \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         if (sb0.accum_dep != sb1.accum_dep)
            return false;

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[XE2_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;

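      /**
       * Map register \p r to the dependency slot that tracks it: one entry
       * per REG_SIZE-byte GRF for the VGRF and FIXED_GRF files, a single
       * shared entry for the address register, another one for the
       * accumulators, and NULL for register files this pass doesn't track
       * (see the list in the file header).
       */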
      dependency *
      dep(const fs_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                 reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                 reg < BRW_ARF_FLAG ? &accum_dep :
                 NULL);
      }
   };

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the list of dependencies of an instruction
    * \p deps.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            if (dep.ordered && deps[i].ordered) {
               for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                  deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);

               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp. If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      tgl_pipe p = TGL_PIPE_NONE;
      unsigned min_dist = ~0u;

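      /* A note on the magic numbers below (an interpretation, not something
       * spelled out elsewhere in this file): dependencies further away than
       * 10 in-order instructions (14 for the LONG pipeline) are assumed to
       * have drained out of the pipeline already and need no synchronization,
       * while the RegDist value actually emitted is clamped to 7, the largest
       * distance the SWSB RegDist field can encode.
       */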
      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
               const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
               const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
               assert(jp.jp[q] > deps[i].jp.jp[q]);
               if (dist <= max_dist) {
                  p = (p && IDX(p) != q ? TGL_PIPE_ALL :
                       tgl_pipe(TGL_PIPE_FLOAT + q));
                  min_dist = MIN3(min_dist, dist, 7);
               }
            }
         }
      }

      return { p ? min_dist : 0, p };
   }

   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies. If \p
    * exec_all is false only dependencies known to be executed with channel
    * masking applied will be considered in the calculation.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
    * no such dependency is present. If \p exec_all is false only
    * dependencies known to be executed with channel masking applied will be
    * considered in the calculation.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered,
                             bool exec_all)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if ((unordered & deps[i].unordered) &&
                exec_all >= deps[i].exec_all)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the list
    * \p deps that can be represented directly in the SWSB annotation of the
    * instruction without additional SYNC instructions, or zero if no such
    * dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;

      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
      else if (has_ordered && is_unordered(devinfo, inst))
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
               (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
      else
         return TGL_SBID_NULL;
   }

   /**
    * Return whether an ordered dependency from the list \p deps can be
    * represented directly in the SWSB annotation of the instruction without
    * additional SYNC instructions.
    */
   bool
   baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
                                 const fs_inst *inst,
                                 const dependency_list &deps,
                                 const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(devinfo, inst, deps, jp);

      if (!has_ordered)
         return false;
      else if (!unordered_mode)
         return true;
      else
         return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&
                unordered_mode == (is_unordered(devinfo, inst) ? TGL_SBID_SET :
                                   TGL_SBID_DST);
   }

   /** @} */

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                     ordered_address();
      const bool is_ordered = ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL));
      const bool is_unordered_math =
         (inst->is_math() && devinfo->ver < 20) ||
         (devinfo->has_64bit_float_via_math_pipe &&
          (get_exec_type(inst) == BRW_REGISTER_TYPE_DF ||
           inst->dst.type == BRW_REGISTER_TYPE_DF));

      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->opcode == BRW_OPCODE_DPAS ||
             is_unordered_math) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++) {
            const fs_reg r = byte_offset(inst->src[i], REG_SIZE * j);
            sb.set(r, shadow(sb.get(r), rd_dep));
         }
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         is_ordered ? dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

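                  /* Shift the pending dependencies of this block into the
                   * ordered_address space of the successor block; see the
                   * comment on transport() above for where this delta comes
                   * from.
                   */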
                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<const fs_inst *>(block->end()), p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (inst->reads_accumulator_implicitly()) {
            /* Wa_22012725308:
             *
             * "When the accumulator registers are used as source and/or
             *  destination, hardware does not ensure prevention of write
             *  after read hazard across execution pipes."
             */
            const dependency dep = sb.get(brw_acc_reg(8));
            if (dep.ordered && !is_single_pipe(dep.jp, p))
               add_dependency(ids, deps[ip], dep);
         }

         if (is_unordered(devinfo, inst) && !inst->eot)
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (inst->writes_accumulator_implicitly(devinfo) ||
                inst->dst.is_accumulator()) {
               /* Wa_22012725308:
                *
                * "When the accumulator registers are used as source and/or
                *  destination, hardware does not ensure prevention of write
                *  after read hazard across execution pipes."
                */
               const dependency dep = sb.get(brw_acc_reg(8));
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
       *       shaders with a large number of SEND messages.
       *
       * XXX - Use 32 SBIDs on Xe2+ while in large GRF mode.
       */
      const unsigned num_sbids = 16;

      /* Allocate an unordered dependency ID to hardware SBID translation
       * table with as many entries as there are instructions in the shader,
       * which is the maximum number of unordered IDs we can find in the
       * program.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

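            /* First time this unordered ID is seen: hand out hardware SBIDs
             * in simple round-robin order, wrapping around modulo num_sbids
             * (see the XXX note above about doing something smarter).
             */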
            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & (num_sbids - 1);

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const bool ordered_mode =
            baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
            ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * Wa_1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered &&
                find_ordered_dependency(deps[ip], jps[ip], true) &&
                (!ordered_mode || dep.exec_all > exec_all)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to Wa_1407528679. The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
}

bool
brw_fs_lower_scoreboard(fs_visitor &s)
{
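   /* Earlier platforms still have the hardware register scoreboard mentioned
    * in the file header, so the software-managed SWSB annotations and SYNC
    * instructions emitted by this pass are only needed on Gfx12 and later.
    */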
   if (s.devinfo->ver >= 12) {
      const ordered_address *jps = ordered_inst_addresses(&s);
      const dependency_list *deps0 = gather_inst_dependencies(&s, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(&s, deps0);
      emit_inst_dependencies(&s, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}