/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_scoreboard.cpp
 *
 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Access of the following (rarely used) ARF registers is not
 *           tracked here, and requires the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 * - sp stack pointer
 * - sr0 state register
 * - cr0 control register
 * - ip instruction pointer
 * - tm0 timestamp register
 * - dbg0 debug register
 * - acc2-9 special accumulator registers on TGL
 * - mme0-7 math macro extended accumulator registers
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 * - f0-1 flag registers
 * - n0 notification register
 * - tdr0 thread dependency register
 */
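
/*
 * Illustrative sketch of the end result (simplified, with made-up register
 * numbers, not taken from any particular shader): a consumer two in-order
 * instructions after the producer of one of its sources would get a RegDist
 * annotation of 2 ("@2" in Gfx12 assembly), while a consumer of the result of
 * an out-of-order SEND would instead wait on the SBID token allocated for
 * that SEND (e.g. "$3.dst"), possibly through an extra SYNC.NOP instruction
 * when the annotation cannot be baked into the consumer itself.
 */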

#include "brw_fs.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Return the RegDist pipeline the hardware will synchronize with if no
    * pipeline information is provided in the SWSB annotation of an
    * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
    */
   tgl_pipe
   inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      if (devinfo->verx10 >= 125) {
         bool has_int_src = false, has_long_src = false;

         if (is_send(inst))
            return TGL_PIPE_NONE;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != BAD_FILE &&
                !inst->is_control_source(i)) {
               const brw_reg_type t = inst->src[i].type;
               has_int_src |= !brw_reg_type_is_floating_point(t);
               has_long_src |= type_sz(t) >= 8;
            }
         }

         return has_long_src ? TGL_PIPE_LONG :
                has_int_src ? TGL_PIPE_INT :
                TGL_PIPE_FLOAT;

      } else {
         return TGL_PIPE_FLOAT;
      }
   }

   /**
    * Return the RegDist pipeline that will execute an instruction, or
    * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
    * RegDist synchronization mechanism.
    */
   tgl_pipe
   inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) &&
         ((inst->opcode == BRW_OPCODE_MUL &&
           MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
          (inst->opcode == BRW_OPCODE_MAD &&
           MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));

      if (is_unordered(inst))
         return TGL_PIPE_NONE;
      else if (devinfo->verx10 < 125)
         return TGL_PIPE_FLOAT;
      else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
               type_sz(t) >= 8)
         return TGL_PIPE_INT;
      else if (inst->opcode == SHADER_OPCODE_BROADCAST &&
               !devinfo->has_64bit_float && type_sz(t) >= 8)
         return TGL_PIPE_INT;
      else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
         return TGL_PIPE_FLOAT;
      else if (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 ||
               is_dword_multiply)
         return TGL_PIPE_LONG;
      else if (brw_reg_type_is_floating_point(inst->dst.type))
         return TGL_PIPE_FLOAT;
      else
         return TGL_PIPE_INT;
   }
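
   /* For intuition, an illustrative (non-exhaustive) reading of the rules
    * above for Gfx12.5+: a MUL whose sources are both 32-bit integers, or
    * any instruction with a 64-bit execution or destination type, is treated
    * as executing on the long pipeline; other floating-point ALU work lands
    * on the float pipeline and the remaining integer ALU work on the integer
    * pipeline.
    */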

   /**
    * Index of the \p p pipeline counter in the ordered_address vector defined
    * below.
    */
#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \
                (abort(), ~0u))

   /**
    * Number of in-order hardware instructions for pipeline index \p contained
    * in this IR instruction. This determines the increment applied to the
    * RegDist counter calculated for any ordered dependency that crosses this
    * instruction.
    */
   unsigned
   ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
                unsigned p)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted. Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption. Lesson is
          * (again) don't use virtual instructions if you want optimal
          * scheduling.
          */
         if (!is_unordered(inst) && (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
                                     p == IDX(TGL_PIPE_ALL)))
            return 1;
         else
            return 0;
      }
   }

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    * This is represented as a vector with an independent counter for each
    * asynchronous ALU pipeline in the EU.
    */
   struct ordered_address {
      /**
       * Construct the ordered address of a dependency known to execute on a
       * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL
       * is provided), in which case the vector counter will be initialized
       * with all components equal to INT_MIN (always satisfied) except for
       * component IDX(p).
       */
      ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
         for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
            jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
                     INT_MIN : jp0);
      }

      int jp[IDX(TGL_PIPE_ALL)];

      friend bool
      operator==(const ordered_address &jp0, const ordered_address &jp1)
      {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (jp0.jp[p] != jp1.jp[p])
               return false;
         }

         return true;
      }
   };
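
   /* How the per-pipe counters are meant to be read, with made-up numbers:
    * if a float-pipe producer executes when the float component is 5 and a
    * float-pipe consumer executes when it is 8, the producer is three
    * in-order float instructions back, so the consumer could synchronize
    * against it with a RegDist of 8 - 5 = 3. Components left at INT_MIN
    * impose no ordering requirement on their pipelines.
    */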

   /**
    * Return true if the specified ordered address is trivially satisfied for
    * all pipelines except potentially for the specified pipeline \p p.
    */
   bool
   is_single_pipe(const ordered_address &jp, tgl_pipe p)
   {
      for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
         if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
            return false;
      }

      return true;
   }

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const backend_shader *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp(TGL_PIPE_ALL, 0);
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
         ip++;
      }

      return jps;
   }
258
259 /**
260 * Synchronization mode required for data manipulated by in-order
261 * instructions.
262 *
263 * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate
264 * enum for additional type safety. The hardware doesn't provide control
265 * over the synchronization mode for RegDist annotations, this is only used
266 * internally in this pass in order to optimize out redundant read
267 * dependencies where possible.
268 */
269 enum tgl_regdist_mode {
270 TGL_REGDIST_NULL = 0,
271 TGL_REGDIST_SRC = 1,
272 TGL_REGDIST_DST = 2
273 };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return equivalence class index of the specified element. Effectively
       * this is the numeric value of an arbitrary representative from the
       * equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);

      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened as much as
       * it's possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };
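
   /* Small usage sketch of the union-find-like structure above (hypothetical
    * element numbers, purely illustrative):
    *
    *    equivalence_relation eq(4);
    *    eq.link(0, 1);
    *    eq.link(1, 2);
    *    assert(eq.lookup(0) == eq.lookup(2));
    *    assert(eq.lookup(3) == 3);
    *
    * After the two link() calls 0, 1 and 2 share an equivalence class while
    * 3 is still only related to itself.
    */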

   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, const ordered_address &jp,
                 bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such that
       * the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path. It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no unordered
       * dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug Wa_1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done =
      dependency(TGL_REGDIST_SRC, ordered_address(), false);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is only
    * satisfied when both original dependencies are satisfied. This might
    * involve updating the equivalence relation \p eq in order to make sure
    * that both out-of-order dependencies are assigned the same hardware SBID
    * as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      dep.exec_all = dep0.exec_all || dep1.exec_all;

      return dep;
   }

   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      return is_valid(dep1) ? dep1 : dep0;
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block. The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the target
    * block and the local ordered_address of the instruction immediately after
    * the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
   {
      if (dep.ordered) {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (dep.jp.jp[p] > INT_MIN)
               dep.jp.jp[p] += delta[p];
         }
      }

      return dep;
   }
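
   /* Hypothetical numeric example of the shift described above: if the
    * instruction immediately past the end of the origin block has a local
    * float-pipe counter of 20 and the first instruction of the target block
    * has a local float-pipe counter of 5, delta[IDX(TGL_PIPE_FLOAT)] is
    * 5 - 20 = -15. A pending dependency at float-pipe address 18 in the
    * origin block is transported to 18 - 15 = 3, i.e. the producer now
    * appears to be two in-order float instructions back from the start of
    * the target block, which is what the RegDist calculation needs.
    */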

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction reading the same register location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register location.
    *
    * This clears any WaR dependency for writes performed from the same
    * pipeline as the read, since there is no possibility for a data hazard.
    */
   dependency
   dependency_for_write(const struct intel_device_info *devinfo,
                        const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(inst) &&
          is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /** @} */

   /**
    * Scoreboard representation. This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const fs_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects. \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects. \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object. \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         if (sb0.accum_dep != sb1.accum_dep)
            return false;

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[BRW_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;

      dependency *
      dep(const fs_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == MRF ? &grf_deps[GFX7_MRF_HACK_START + reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                    reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                    reg < BRW_ARF_FLAG ? &accum_dep :
                 NULL);
      }
   };

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the list of dependencies of an instruction
    * \p deps.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            if (dep.ordered && deps[i].ordered) {
               for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                  deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);

               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp. If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      tgl_pipe p = TGL_PIPE_NONE;
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
               const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
               const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
               assert(jp.jp[q] > deps[i].jp.jp[q]);
               if (dist <= max_dist) {
                  p = (p && IDX(p) != q ? TGL_PIPE_ALL :
                       tgl_pipe(TGL_PIPE_FLOAT + q));
                  min_dist = MIN3(min_dist, dist, 7);
               }
            }
         }
      }

      return { p ? min_dist : 0, p };
   }
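
   /* Worked example with made-up numbers: suppose the instruction at hand
    * has a float-pipe counter of 20 and an integer-pipe counter of 12, and
    * its list contains an ordered float-pipe dependency at 17 and an ordered
    * integer-pipe dependency at 10. Both distances (3 and 2) are in range,
    * but they hit different pipelines, so the pipe collapses to TGL_PIPE_ALL
    * and the result is the conservative all-pipes annotation "A@2", using
    * the minimum of the two distances.
    */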

   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies. If \p
    * exec_all is false only dependencies known to be executed with channel
    * masking applied will be considered in the calculation.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
    * no such dependency is present. If \p exec_all is false only
    * dependencies known to be executed with channel masking applied will be
    * considered in the calculation.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered,
                             bool exec_all)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if ((unordered & deps[i].unordered) &&
                exec_all >= deps[i].exec_all)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the list
    * \p deps that can be represented directly in the SWSB annotation of the
    * instruction without additional SYNC instructions, or zero if no such
    * dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;

      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
      else if (has_ordered && is_unordered(inst))
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
               (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
      else
         return TGL_SBID_NULL;
   }

   /**
    * Return whether an ordered dependency from the list \p deps can be
    * represented directly in the SWSB annotation of the instruction without
    * additional SYNC instructions.
    */
   bool
   baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
                                 const fs_inst *inst,
                                 const dependency_list &deps,
                                 const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(devinfo, inst, deps, jp);

      if (!has_ordered)
         return false;
      else if (!unordered_mode)
         return true;
      else
         return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&
                unordered_mode == (is_unordered(inst) ? TGL_SBID_SET :
                                   TGL_SBID_DST);
   }

   /** @} */

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                     ordered_address();

      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?
            dependency(TGL_REGDIST_SRC, jp, exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?
         dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<const fs_inst *>(block->end()), p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (inst->reads_accumulator_implicitly()) {
            /* Wa_22012725308:
             *
             * "When the accumulator registers are used as source and/or
             * destination, hardware does not ensure prevention of write
             * after read hazard across execution pipes."
             */
            const dependency dep = sb.get(brw_acc_reg(8));
            if (dep.ordered && !is_single_pipe(dep.jp, p))
               add_dependency(ids, deps[ip], dep);
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         if (is_unordered(inst))
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (inst->writes_accumulator_implicitly(devinfo) ||
                inst->dst.is_accumulator()) {
               /* Wa_22012725308:
                *
                * "When the accumulator registers are used as source and/or
                * destination, hardware does not ensure prevention of write
                * after read hazard across execution pipes."
                */
               const dependency dep = sb.get(brw_acc_reg(8));
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }

            if (is_send(inst) && inst->base_mrf != -1) {
               for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
       * shaders with a large number of SEND messages.
       */

      /* Allocate an unordered dependency ID to hardware SBID translation
       * table with as many entries as there are instructions in the shader,
       * which is the maximum number of unordered IDs we can find in the
       * program.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & 0xf;

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }
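
   /* Note on the "& 0xf" above, as an observation rather than new behavior:
    * unordered dependency IDs are mapped round-robin onto the 16 SBID tokens
    * assumed here (hence the 0xf mask), so SEND-heavy shaders will see
    * tokens being reused. The XXX comment refers to replacing this simple
    * modular assignment with a smarter (e.g. bin-packing) allocation.
    */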

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const bool ordered_mode =
            baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
            ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * Wa_1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered &&
                find_ordered_dependency(deps[ip], jps[ip], true) &&
                (!ordered_mode || dep.exec_all > exec_all)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to Wa_1407528679. The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
}

bool
fs_visitor::lower_scoreboard()
{
   if (devinfo->ver >= 12) {
      const ordered_address *jps = ordered_inst_addresses(this);
      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
      emit_inst_dependencies(this, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}