1 /*
2 * Copyright © 2019 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_scoreboard.cpp
25 *
26 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
27 * data coherency between register reads and writes in previous generations.
28 * This lowering pass runs after register allocation in order to make up for
29 * it.
30 *
31 * It works by performing global dataflow analysis in order to determine the
32 * set of potential dependencies of every instruction in the shader, and then
33 * inserts any required SWSB annotations and additional SYNC instructions in
34 * order to guarantee data coherency.
35 *
36 * WARNING - Access of the following (rarely used) ARF registers is not
37 * tracked here, and require the RegDist SWSB annotation to be set
38 * to 1 by the generator in order to avoid data races:
39 *
40 * - sp stack pointer
41 * - sr0 state register
42 * - cr0 control register
43 * - ip instruction pointer
44 * - tm0 timestamp register
45 * - dbg0 debug register
46 *
47 * The following ARF registers don't need to be tracked here because data
48 * coherency is still provided transparently by the hardware:
49 *
50 * - f0-1 flag registers
51 * - n0 notification register
52 * - tdr0 thread dependency register
53 */
54
55 #include "brw_fs.h"
56 #include "brw_cfg.h"
57
58 using namespace brw;
59
60 namespace {
61 /**
62 * In-order instruction accounting.
63 * @{
64 */
65
66 /**
67 * Number of in-order hardware instructions contained in this IR
68 * instruction. This determines the increment applied to the RegDist
69 * counter calculated for any ordered dependency that crosses this
70 * instruction.
71 */
72 unsigned
ordered_unit(const fs_inst * inst)73 ordered_unit(const fs_inst *inst)
74 {
75 switch (inst->opcode) {
76 case BRW_OPCODE_SYNC:
77 case BRW_OPCODE_DO:
78 case SHADER_OPCODE_UNDEF:
79 case FS_OPCODE_PLACEHOLDER_HALT:
80 case FS_OPCODE_SCHEDULING_FENCE:
81 return 0;
82 default:
83 /* Note that the following is inaccurate for virtual instructions
84 * that expand to more in-order instructions than assumed here, but
85 * that can only lead to suboptimal execution ordering, data
86 * coherency won't be impacted. Providing exact RegDist counts for
87 * each virtual instruction would allow better ALU performance, but
88 * it would require keeping this switch statement in perfect sync
89 * with the generator in order to avoid data corruption. Lesson is
90 * (again) don't use virtual instructions if you want optimal
91 * scheduling.
92 */
93 return is_unordered(inst) ? 0 : 1;
94 }
95 }
96
/**
 * Type for an instruction counter that increments for in-order
 * instructions only, arbitrarily denoted 'jp' throughout this lowering
 * pass in order to distinguish it from the regular instruction counter.
 *
 * Signed so that negative deltas applied when transporting a dependency
 * across CFG edges [see transport()] and the INT_MIN "no in-order
 * dependency" sentinel used by struct dependency can be represented.
 */
typedef int ordered_address;
103
104 /**
105 * Return the number of instructions in the program.
106 */
107 unsigned
num_instructions(const backend_shader * shader)108 num_instructions(const backend_shader *shader)
109 {
110 return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
111 }
112
113 /**
114 * Calculate the local ordered_address instruction counter at every
115 * instruction of the shader for subsequent constant-time look-up.
116 */
117 ordered_address *
ordered_inst_addresses(const fs_visitor * shader)118 ordered_inst_addresses(const fs_visitor *shader)
119 {
120 ordered_address *jps = new ordered_address[num_instructions(shader)];
121 ordered_address jp = 0;
122 unsigned ip = 0;
123
124 foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
125 jps[ip] = jp;
126 jp += ordered_unit(inst);
127 ip++;
128 }
129
130 return jps;
131 }
132
/**
 * Synchronization mode required for data manipulated by in-order
 * instructions.
 *
 * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
 * enum for additional type safety.  The hardware doesn't provide control
 * over the synchronization mode for RegDist annotations, this is only used
 * internally in this pass in order to optimize out redundant read
 * dependencies where possible.
 */
enum tgl_regdist_mode {
   TGL_REGDIST_NULL = 0,
   TGL_REGDIST_SRC = 1,
   TGL_REGDIST_DST = 2
};

/**
 * Allow bitwise arithmetic of tgl_regdist_mode enums, treating the
 * enumerants as a two-bit flag set.
 */
tgl_regdist_mode
operator|(tgl_regdist_mode x, tgl_regdist_mode y)
{
   return static_cast<tgl_regdist_mode>(static_cast<unsigned>(x) |
                                        static_cast<unsigned>(y));
}

tgl_regdist_mode
operator&(tgl_regdist_mode x, tgl_regdist_mode y)
{
   return static_cast<tgl_regdist_mode>(static_cast<unsigned>(x) &
                                        static_cast<unsigned>(y));
}

tgl_regdist_mode &
operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
{
   x = x | y;
   return x;
}

tgl_regdist_mode &
operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
{
   x = x & y;
   return x;
}
175
176 /** @} */
177
/**
 * Representation of an equivalence relation among the set of unsigned
 * integers.
 *
 * Its initial state is the identity relation '~' such that i ~ j if and
 * only if i == j for every pair of unsigned integers i and j.
 */
struct equivalence_relation {
   /* Start out with every element as the sole representative of its own
    * singleton equivalence class.
    */
   equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
   {
      for (unsigned i = 0; i < n; i++)
         is[i] = i;
   }

   ~equivalence_relation()
   {
      delete[] is;
   }

   /**
    * Return equivalence class index of the specified element.  Effectively
    * this is the numeric value of an arbitrary representative from the
    * equivalence class.
    *
    * Allows the evaluation of the equivalence relation according to the
    * rule that i ~ j if and only if lookup(i) == lookup(j).
    */
   unsigned
   lookup(unsigned i) const
   {
      /* Chase parent links until reaching a fixed point (an element that
       * is its own representative).  Out-of-range indices are returned
       * unchanged.
       */
      while (i < n && is[i] != i)
         i = is[i];

      return i;
   }

   /**
    * Create an array with the results of the lookup() method for
    * constant-time evaluation.  The caller takes ownership (delete[]).
    */
   unsigned *
   flatten() const
   {
      unsigned *ids = new unsigned[n];

      for (unsigned i = 0; i < n; i++)
         ids[i] = lookup(i);

      return ids;
   }

   /**
    * Mutate the existing equivalence relation minimally by imposing the
    * additional requirement that i ~ j.  Returns the representative of
    * the merged equivalence class.
    */
   unsigned
   link(unsigned i, unsigned j)
   {
      const unsigned k = lookup(i);
      assign(i, k);
      assign(j, k);
      return k;
   }

private:
   /* Non-copyable: owns a raw array. */
   equivalence_relation(const equivalence_relation &);

   equivalence_relation &
   operator=(const equivalence_relation &);

   /**
    * Assign the representative of \p from to be equivalent to \p to.
    *
    * At the same time the data structure is partially flattened: every
    * element on the chain between \p from and its old representative is
    * repointed directly at \p to.
    */
   void
   assign(unsigned from, unsigned to)
   {
      while (from != to) {
         assert(from < n);

         const unsigned parent = is[from];
         is[from] = to;

         if (parent == from)
            break;

         from = parent;
      }
   }

   unsigned *is;
   unsigned n;
};
274
/**
 * Representation of a data dependency between two instructions in the
 * program.
 * @{
 */
struct dependency {
   /**
    * No dependency information.
    */
   dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
                  unordered(TGL_SBID_NULL), id(0),
                  exec_all(false) {}

   /**
    * Construct a dependency on the in-order instruction with the provided
    * ordered_address instruction counter.
    */
   dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) :
      ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
      exec_all(exec_all) {}

   /**
    * Construct a dependency on the out-of-order instruction with the
    * specified synchronization token.
    */
   dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
      ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id),
      exec_all(exec_all) {}

   /**
    * Synchronization mode of in-order dependency, or zero if no in-order
    * dependency is present.
    */
   tgl_regdist_mode ordered;

   /**
    * Instruction counter of in-order dependency.
    *
    * For a dependency part of a different block in the program, this is
    * relative to the specific control flow path taken between the
    * dependency and the current block: It is the ordered_address such that
    * the difference between it and the ordered_address of the first
    * instruction of the current block is exactly the number of in-order
    * instructions across that control flow path.  It is not guaranteed to
    * be equal to the local ordered_address of the generating instruction
    * [as returned by ordered_inst_addresses()], except for block-local
    * dependencies.
    *
    * INT_MIN is used as "infinitely far in the past" sentinel, which
    * transport() deliberately leaves untouched.
    */
   ordered_address jp;

   /**
    * Synchronization mode of unordered dependency, or zero if no unordered
    * dependency is present.
    */
   tgl_sbid_mode unordered;

   /** Synchronization token of out-of-order dependency. */
   unsigned id;

   /**
    * Whether the dependency could be run with execution masking disabled,
    * which might lead to the unwanted execution of the generating
    * instruction in cases where a BB is executed with all channels
    * disabled due to hardware bug GEN:BUG:1407528679.
    */
   bool exec_all;

   /**
    * Trivial in-order dependency that's always satisfied.
    *
    * Note that unlike a default-constructed dependency() which is also
    * trivially satisfied, this is considered to provide dependency
    * information and can be used to clear a previously pending dependency
    * via shadow().
    */
   static const dependency done;

   /* Field-wise equality; used (via scoreboard comparison) to detect
    * convergence of the fixed-point dataflow iteration.
    */
   friend bool
   operator==(const dependency &dep0, const dependency &dep1)
   {
      return dep0.ordered == dep1.ordered &&
             dep0.jp == dep1.jp &&
             dep0.unordered == dep1.unordered &&
             dep0.id == dep1.id &&
             dep0.exec_all == dep1.exec_all;
   }

   friend bool
   operator!=(const dependency &dep0, const dependency &dep1)
   {
      return !(dep0 == dep1);
   }
};

const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN, false);
370
371 /**
372 * Return whether \p dep contains any dependency information.
373 */
374 bool
is_valid(const dependency & dep)375 is_valid(const dependency &dep)
376 {
377 return dep.ordered || dep.unordered;
378 }
379
380 /**
381 * Combine \p dep0 and \p dep1 into a single dependency object that is only
382 * satisfied when both original dependencies are satisfied. This might
383 * involve updating the equivalence relation \p eq in order to make sure
384 * that both out-of-order dependencies are assigned the same hardware SBID
385 * as synchronization token.
386 */
387 dependency
merge(equivalence_relation & eq,const dependency & dep0,const dependency & dep1)388 merge(equivalence_relation &eq,
389 const dependency &dep0, const dependency &dep1)
390 {
391 dependency dep;
392
393 if (dep0.ordered || dep1.ordered) {
394 dep.ordered = dep0.ordered | dep1.ordered;
395 dep.jp = MAX2(dep0.jp, dep1.jp);
396 }
397
398 if (dep0.unordered || dep1.unordered) {
399 dep.unordered = dep0.unordered | dep1.unordered;
400 dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
401 dep1.unordered ? dep1.id : dep0.id);
402 }
403
404 dep.exec_all = dep0.exec_all || dep1.exec_all;
405
406 return dep;
407 }
408
409 /**
410 * Override dependency information of \p dep0 with that of \p dep1.
411 */
412 dependency
shadow(const dependency & dep0,const dependency & dep1)413 shadow(const dependency &dep0, const dependency &dep1)
414 {
415 return is_valid(dep1) ? dep1 : dep0;
416 }
417
418 /**
419 * Translate dependency information across the program.
420 *
421 * This returns a dependency on the same instruction translated to the
422 * ordered_address space of a different block. The correct shift for
423 * transporting a dependency across an edge of the CFG is the difference
424 * between the local ordered_address of the first instruction of the target
425 * block and the local ordered_address of the instruction immediately after
426 * the end of the origin block.
427 */
428 dependency
transport(dependency dep,int delta)429 transport(dependency dep, int delta)
430 {
431 if (dep.ordered && dep.jp > INT_MIN)
432 dep.jp += delta;
433
434 return dep;
435 }
436
437 /**
438 * Return simplified dependency removing any synchronization modes not
439 * applicable to an instruction reading the same register location.
440 */
441 dependency
dependency_for_read(dependency dep)442 dependency_for_read(dependency dep)
443 {
444 dep.ordered &= TGL_REGDIST_DST;
445 return dep;
446 }
447
448 /**
449 * Return simplified dependency removing any synchronization modes not
450 * applicable to an instruction \p inst writing the same register location.
451 */
452 dependency
dependency_for_write(const fs_inst * inst,dependency dep)453 dependency_for_write(const fs_inst *inst, dependency dep)
454 {
455 if (!is_unordered(inst))
456 dep.ordered &= TGL_REGDIST_DST;
457 return dep;
458 }
459
460 /** @} */
461
/**
 * Scoreboard representation.  This keeps track of the data dependencies of
 * registers with GRF granularity.
 */
class scoreboard {
public:
   /**
    * Look up the most current data dependency for register \p r.
    * Registers of files not tracked by this pass yield a trivial
    * dependency() [see dep() below].
    */
   dependency
   get(const fs_reg &r) const
   {
      /* dep() is non-const because it hands out mutable pointers into the
       * internal arrays; the lookup itself doesn't modify anything, hence
       * the const_cast.
       */
      if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
         return *p;
      else
         return dependency();
   }

   /**
    * Specify the most current data dependency for register \p r.
    * Silently a no-op for untracked register files.
    */
   void
   set(const fs_reg &r, const dependency &d)
   {
      if (dependency *p = dep(r))
         *p = d;
   }

   /**
    * Component-wise merge() of corresponding dependencies from two
    * scoreboard objects.  \sa merge().
    */
   friend scoreboard
   merge(equivalence_relation &eq,
         const scoreboard &sb0, const scoreboard &sb1)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

      sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

      return sb;
   }

   /**
    * Component-wise shadow() of corresponding dependencies from two
    * scoreboard objects.  \sa shadow().
    */
   friend scoreboard
   shadow(const scoreboard &sb0, const scoreboard &sb1)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

      sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

      return sb;
   }

   /**
    * Component-wise transport() of dependencies from a scoreboard
    * object.  \sa transport().
    */
   friend scoreboard
   transport(const scoreboard &sb0, int delta)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

      sb.addr_dep = transport(sb0.addr_dep, delta);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

      return sb;
   }

   /* Field-wise equality, used by the fixed-point dataflow iteration in
    * propagate_block_scoreboards() to detect convergence.
    */
   friend bool
   operator==(const scoreboard &sb0, const scoreboard &sb1)
   {
      for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
         if (sb0.grf_deps[i] != sb1.grf_deps[i])
            return false;
      }

      if (sb0.addr_dep != sb1.addr_dep)
         return false;

      for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
         if (sb0.accum_deps[i] != sb1.accum_deps[i])
            return false;
      }

      return true;
   }

   friend bool
   operator!=(const scoreboard &sb0, const scoreboard &sb1)
   {
      return !(sb0 == sb1);
   }

private:
   /* Per-GRF dependencies; MRF registers are mapped into the tail of this
    * array via GEN7_MRF_HACK_START in dep() below.
    */
   dependency grf_deps[BRW_MAX_GRF];
   /* Dependency of the address register. */
   dependency addr_dep;
   /* Dependencies of the accumulator registers. */
   dependency accum_deps[10];

   /**
    * Return a pointer to the dependency slot tracking register \p r, or
    * NULL for register files not tracked by this pass (which makes set()
    * a no-op and get() return a trivial dependency for them — see the
    * file header for the list of deliberately untracked ARF registers).
    */
   dependency *
   dep(const fs_reg &r)
   {
      /* For VGRFs this pass runs post-RA, so nr + offset indexes a real
       * hardware GRF; other files use the generic byte offset helper.
       */
      const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                            reg_offset(r) / REG_SIZE);

      return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
              r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
              r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
              r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                reg < BRW_ARF_FLAG ? &accum_deps[
                   reg - BRW_ARF_ACCUMULATOR] :
              NULL);
   }
};
597
598 /**
599 * Dependency list handling.
600 * @{
601 */
602 struct dependency_list {
dependency_list__anonf3a7e32b0111::dependency_list603 dependency_list() : deps(NULL), n(0) {}
604
~dependency_list__anonf3a7e32b0111::dependency_list605 ~dependency_list()
606 {
607 free(deps);
608 }
609
610 void
push_back__anonf3a7e32b0111::dependency_list611 push_back(const dependency &dep)
612 {
613 deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
614 deps[n++] = dep;
615 }
616
617 unsigned
size__anonf3a7e32b0111::dependency_list618 size() const
619 {
620 return n;
621 }
622
623 const dependency &
operator []__anonf3a7e32b0111::dependency_list624 operator[](unsigned i) const
625 {
626 assert(i < n);
627 return deps[i];
628 }
629
630 dependency &
operator []__anonf3a7e32b0111::dependency_list631 operator[](unsigned i)
632 {
633 assert(i < n);
634 return deps[i];
635 }
636
637 private:
638 dependency_list(const dependency_list &);
639 dependency_list &
640 operator=(const dependency_list &);
641
642 dependency *deps;
643 unsigned n;
644 };
645
/**
 * Add dependency \p dep to the list of dependencies of an instruction
 * \p deps, merging it with any compatible dependency already on the list.
 *
 * \p ids is the unordered-ID translation table built from the equivalence
 * relation [see equivalence_relation::flatten()], applied so that two
 * dependencies on linked tokens compare equal below.
 */
void
add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
{
   if (is_valid(dep)) {
      /* Translate the unordered dependency token first in order to keep
       * the list minimally redundant.
       */
      if (dep.unordered)
         dep.id = ids[dep.id];

      /* Try to combine the specified dependency with any existing ones. */
      for (unsigned i = 0; i < deps.size(); i++) {
         /* Don't combine otherwise matching dependencies if there is an
          * exec_all mismatch which would cause a SET dependency to gain an
          * exec_all flag, since that would prevent it from being baked
          * into the instruction we want to allocate an SBID for.
          */
         if (deps[i].exec_all != dep.exec_all &&
             (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
             (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
            continue;

         /* Fold the in-order component into an existing ordered entry,
          * keeping the most recent (largest) counter value.
          */
         if (dep.ordered && deps[i].ordered) {
            deps[i].jp = MAX2(deps[i].jp, dep.jp);
            deps[i].ordered |= dep.ordered;
            deps[i].exec_all |= dep.exec_all;
            dep.ordered = TGL_REGDIST_NULL;
         }

         /* Fold the out-of-order component into an existing entry with
          * the same (already translated) token.
          */
         if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
            deps[i].unordered |= dep.unordered;
            deps[i].exec_all |= dep.exec_all;
            dep.unordered = TGL_SBID_NULL;
         }
      }

      /* Add it to the end of the list if necessary. */
      if (is_valid(dep))
         deps.push_back(dep);
   }
}
691
/**
 * Construct a tgl_swsb annotation encoding any ordered dependencies from
 * the dependency list \p deps of an instruction with ordered_address \p
 * jp.  If \p exec_all is false only dependencies known to be executed with
 * channel masking applied will be considered in the calculation.
 */
tgl_swsb
ordered_dependency_swsb(const dependency_list &deps,
                        const ordered_address &jp,
                        bool exec_all)
{
   unsigned min_dist = ~0u;

   for (unsigned i = 0; i < deps.size(); i++) {
      /* exec_all >= deps[i].exec_all skips NoMask dependencies unless the
       * instruction itself is NoMask-capable.
       */
      if (deps[i].ordered && exec_all >= deps[i].exec_all) {
         const unsigned dist = jp - deps[i].jp;
         /* Dependencies further than max_dist in-order instructions in
          * the past are treated as already satisfied and need no
          * annotation.  NOTE(review): 10 presumably matches the depth of
          * the hardware's in-order tracking window — confirm against the
          * Gfx12 documentation.
          */
         const unsigned max_dist = 10;
         assert(jp > deps[i].jp);
         if (dist <= max_dist)
            /* Clamp to 7, apparently the largest distance representable
             * in the SWSB RegDist field.
             */
            min_dist = MIN3(min_dist, dist, 7);
      }
   }

   /* A ~0u sentinel means no applicable ordered dependency was found, in
    * which case a zero RegDist (no annotation) is returned.
    */
   return { min_dist == ~0u ? 0 : min_dist };
}
717
718 /**
719 * Return whether the dependency list \p deps of an instruction with
720 * ordered_address \p jp has any non-trivial ordered dependencies. If \p
721 * exec_all is false only dependencies known to be executed with channel
722 * masking applied will be considered in the calculation.
723 */
724 bool
find_ordered_dependency(const dependency_list & deps,const ordered_address & jp,bool exec_all)725 find_ordered_dependency(const dependency_list &deps,
726 const ordered_address &jp,
727 bool exec_all)
728 {
729 return ordered_dependency_swsb(deps, jp, exec_all).regdist;
730 }
731
732 /**
733 * Return the full tgl_sbid_mode bitset for the first unordered dependency
734 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
735 * no such dependency is present. If \p exec_all is false only
736 * dependencies known to be executed with channel masking applied will be
737 * considered in the calculation.
738 */
739 tgl_sbid_mode
find_unordered_dependency(const dependency_list & deps,tgl_sbid_mode unordered,bool exec_all)740 find_unordered_dependency(const dependency_list &deps,
741 tgl_sbid_mode unordered,
742 bool exec_all)
743 {
744 if (unordered) {
745 for (unsigned i = 0; i < deps.size(); i++) {
746 if ((unordered & deps[i].unordered) &&
747 exec_all >= deps[i].exec_all)
748 return deps[i].unordered;
749 }
750 }
751
752 return TGL_SBID_NULL;
753 }
754
/**
 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
 * \p deps that can be represented directly in the SWSB annotation of the
 * instruction without additional SYNC instructions, or zero if no such
 * dependency is present.
 */
tgl_sbid_mode
baked_unordered_dependency_mode(const fs_inst *inst,
                                const dependency_list &deps,
                                const ordered_address &jp)
{
   const bool exec_all = inst->force_writemask_all;
   const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);

   /* A SET dependency (the instruction's own token allocation) always
    * takes priority: it must live on the instruction itself.
    */
   if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
      return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
   /* An out-of-order instruction with a pending ordered dependency needs
    * its SWSB slot for the RegDist annotation, so nothing can be baked.
    */
   else if (has_ordered && is_unordered(inst))
      return TGL_SBID_NULL;
   /* A DST-mode wait can share the annotation as long as the combination
    * above doesn't apply.
    */
   else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
            (!has_ordered || !is_unordered(inst)))
      return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
   /* A SRC-mode wait can only be baked when no RegDist annotation is
    * needed at all.
    */
   else if (!has_ordered)
      return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
   else
      return TGL_SBID_NULL;
}
781
782 /** @} */
783
784 /**
785 * Shader instruction dependency calculation.
786 * @{
787 */
788
/**
 * Update scoreboard object \p sb to account for the execution of
 * instruction \p inst, recording the dependencies its reads and writes
 * impose on subsequent instructions.
 */
void
update_inst_scoreboard(const ordered_address *jps,
                       const fs_inst *inst, unsigned ip, scoreboard &sb)
{
   const bool exec_all = inst->force_writemask_all;

   /* Track any source registers that may be fetched asynchronously by this
    * instruction, otherwise clear the dependency in order to avoid
    * subsequent redundant synchronization.
    */
   for (unsigned i = 0; i < inst->sources; i++) {
      /* Payload and math sources are read asynchronously and need an SBID
       * token; sources of regular in-order instructions get a RegDist
       * dependency; anything else is cleared via dependency::done.
       */
      const dependency rd_dep =
         (inst->is_payload(i) ||
          inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
         ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) :
         dependency::done;

      for (unsigned j = 0; j < regs_read(inst, i); j++)
         sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
   }

   /* SEND messages with an MRF payload read those registers
    * asynchronously as well.
    */
   if (is_send(inst) && inst->base_mrf != -1) {
      const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

      for (unsigned j = 0; j < inst->mlen; j++)
         sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
   }

   /* Track any destination registers of this instruction. */
   const dependency wr_dep =
      is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
      ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) :
      dependency();

   if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
       !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++)
         sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
   }
}
833
834 /**
835 * Calculate scoreboard objects locally that represent any pending (and
836 * unconditionally resolved) dependencies at the end of each block of the
837 * program.
838 */
839 scoreboard *
gather_block_scoreboards(const fs_visitor * shader,const ordered_address * jps)840 gather_block_scoreboards(const fs_visitor *shader,
841 const ordered_address *jps)
842 {
843 scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
844 unsigned ip = 0;
845
846 foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
847 update_inst_scoreboard(jps, inst, ip++, sbs[block->num]);
848
849 return sbs;
850 }
851
/**
 * Propagate data dependencies globally through the control flow graph
 * until a fixed point is reached.
 *
 * Calculates the set of dependencies potentially pending at the beginning
 * of each block, and returns it as an array of scoreboard objects.  Tokens
 * that must share the same hardware SBID are linked through \p eq as a
 * side effect [see merge()].
 */
scoreboard *
propagate_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps,
                            equivalence_relation &eq)
{
   const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
   scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
   scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

   /* Standard forward dataflow iteration: keep re-propagating block
    * outputs to successor inputs until nothing changes.
    */
   for (bool progress = true; progress;) {
      progress = false;

      foreach_block(block, shader->cfg) {
         /* The block's output is its input with the block's own local
          * effects (delta) overlaid on top.
          */
         const scoreboard sb = shadow(in_sbs[block->num],
                                      delta_sbs[block->num]);

         if (sb != out_sbs[block->num]) {
            foreach_list_typed(bblock_link, child_link, link,
                               &block->children) {
               scoreboard &in_sb = in_sbs[child_link->block->num];
               /* Shift ordered_address counters into the successor's
                * local counter space [see transport() for the rationale
                * behind this delta].
                */
               const int delta =
                  jps[child_link->block->start_ip] - jps[block->end_ip]
                  - ordered_unit(static_cast<const fs_inst *>(block->end()));

               in_sb = merge(eq, in_sb, transport(sb, delta));
            }

            out_sbs[block->num] = sb;
            progress = true;
         }
      }
   }

   delete[] delta_sbs;
   delete[] out_sbs;

   return in_sbs;
}
897
/**
 * Return the list of potential dependencies of each instruction in the
 * shader based on the result of global dependency analysis.  The caller
 * takes ownership of the returned array (delete[]).
 */
dependency_list *
gather_inst_dependencies(const fs_visitor *shader,
                         const ordered_address *jps)
{
   equivalence_relation eq(num_instructions(shader));
   scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
   /* Flatten the equivalence relation so equivalent tokens compare equal
    * when dependencies are combined in add_dependency().
    */
   const unsigned *ids = eq.flatten();
   dependency_list *deps = new dependency_list[num_instructions(shader)];
   unsigned ip = 0;

   foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
      const bool exec_all = inst->force_writemask_all;
      scoreboard &sb = sbs[block->num];

      /* Reads must wait on prior writes of the same locations (RAW). */
      for (unsigned i = 0; i < inst->sources; i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            add_dependency(ids, deps[ip], dependency_for_read(
               sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
      }

      /* MRF payload reads of SEND messages behave like source reads. */
      if (is_send(inst) && inst->base_mrf != -1) {
         for (unsigned j = 0; j < inst->mlen; j++)
            add_dependency(ids, deps[ip], dependency_for_read(
               sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
      }

      /* Out-of-order instructions need a SET dependency so an SBID token
       * is allocated for their own completion tracking.
       */
      if (is_unordered(inst))
         add_dependency(ids, deps[ip],
                        dependency(TGL_SBID_SET, ip, exec_all));

      /* Writes must wait on prior reads and writes of the destination
       * (WAR/WAW), unless dependency checking is explicitly disabled.
       */
      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++) {
               add_dependency(ids, deps[ip], dependency_for_write(inst,
                  sb.get(byte_offset(inst->dst, REG_SIZE * j))));
            }
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
               add_dependency(ids, deps[ip], dependency_for_write(inst,
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }
      }

      /* Fold this instruction's own effects into the running scoreboard
       * before moving on to the next instruction.
       */
      update_inst_scoreboard(jps, inst, ip, sb);
      ip++;
   }

   delete[] sbs;
   delete[] ids;

   return deps;
}
956
957 /** @} */
958
959 /**
960 * Allocate SBID tokens to track the execution of every out-of-order
961 * instruction of the shader.
962 */
963 dependency_list *
allocate_inst_dependencies(const fs_visitor * shader,const dependency_list * deps0)964 allocate_inst_dependencies(const fs_visitor *shader,
965 const dependency_list *deps0)
966 {
967 /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
968 * shaders with a large number of SEND messages.
969 */
970
971 /* Allocate an unordered dependency ID to hardware SBID translation
972 * table with as many entries as instructions there are in the shader,
973 * which is the maximum number of unordered IDs we can find in the
974 * program.
975 */
976 unsigned *ids = new unsigned[num_instructions(shader)];
977 for (unsigned ip = 0; ip < num_instructions(shader); ip++)
978 ids[ip] = ~0u;
979
980 dependency_list *deps1 = new dependency_list[num_instructions(shader)];
981 unsigned next_id = 0;
982
983 for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
984 for (unsigned i = 0; i < deps0[ip].size(); i++) {
985 const dependency &dep = deps0[ip][i];
986
987 if (dep.unordered && ids[dep.id] == ~0u)
988 ids[dep.id] = (next_id++) & 0xf;
989
990 add_dependency(ids, deps1[ip], dep);
991 }
992 }
993
994 delete[] ids;
995
996 return deps1;
997 }
998
/**
 * Emit dependency information provided by \p deps into the shader,
 * inserting additional SYNC instructions for dependencies that can't be
 * represented directly by annotating existing instructions.
 */
void
emit_inst_dependencies(fs_visitor *shader,
                       const ordered_address *jps,
                       const dependency_list *deps)
{
   unsigned ip = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
      const bool exec_all = inst->force_writemask_all;
      /* Start from the RegDist annotation covering any ordered
       * dependencies, then see which single unordered dependency (if any)
       * can share the instruction's SWSB field.
       */
      tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.unordered) {
            if (unordered_mode == dep.unordered &&
                exec_all >= dep.exec_all && !swsb.mode) {
               /* Bake unordered dependency into the instruction's SWSB if
                * possible, except in cases where the current instruction
                * isn't marked NoMask but the dependency is, since that
                * might lead to data coherency issues due to
                * GEN:BUG:1407528679.
                */
               swsb.sbid = dep.id;
               swsb.mode = dep.unordered;
            } else {
               /* Emit dependency into the SWSB of an extra SYNC
                * instruction.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched.sbid = dep.id;
               sync->sched.mode = dep.unordered;
               /* SET dependencies must always end up on the instruction
                * itself [see baked_unordered_dependency_mode()], never on
                * an auxiliary SYNC.
                */
               assert(!(sync->sched.mode & TGL_SBID_SET));
            }
         }
      }

      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.ordered && dep.exec_all > exec_all &&
             find_ordered_dependency(deps[ip], jps[ip], true)) {
            /* If the current instruction is not marked NoMask but an
             * ordered dependency is, perform the synchronization as a
             * separate NoMask SYNC instruction in order to avoid data
             * coherency issues due to GEN:BUG:1407528679.  The similar
             * scenario with unordered dependencies should have been
             * handled above.
             */
            const fs_builder ibld = fs_builder(shader, block, inst)
                                    .exec_all().group(1, 0);
            fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                      brw_imm_ud(TGL_SYNC_NOP));
            sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
            /* One SYNC covers all such ordered dependencies at once. */
            break;
         }
      }

      /* Update the IR. */
      inst->sched = swsb;
      inst->no_dd_check = inst->no_dd_clear = false;
      ip++;
   }
}
1073 }
1074
1075 bool
lower_scoreboard()1076 fs_visitor::lower_scoreboard()
1077 {
1078 if (devinfo->gen >= 12) {
1079 const ordered_address *jps = ordered_inst_addresses(this);
1080 const dependency_list *deps0 = gather_inst_dependencies(this, jps);
1081 const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
1082 emit_inst_dependencies(this, jps, deps1);
1083 delete[] deps1;
1084 delete[] deps0;
1085 delete[] jps;
1086 }
1087
1088 return true;
1089 }
1090