/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_bank_conflicts.cpp
 *
 * This file contains a GRF bank conflict mitigation pass. The pass is
 * intended to be run after register allocation and works by rearranging the
 * layout of the GRF space (without altering the semantics of the program) in
 * a way that minimizes the number of GRF bank conflicts incurred by ternary
 * instructions.
 *
 * Unfortunately there is close to no information about bank conflicts in the
 * hardware spec, but experimentally on Gfx7-Gfx9 ternary instructions seem to
 * incur an average bank conflict penalty of one cycle per SIMD8 op whenever
 * the second and third source are stored in the same GRF bank (\sa bank_of()
 * for the exact bank layout), since the EU cannot fetch both sources during
 * the same cycle, unless the EU logic manages to optimize out the read cycle
 * of a duplicate source register (\sa is_conflict_optimized_out()).
 *
 * The asymptotic run-time of the algorithm is dominated by the
 * shader_conflict_weight_matrix() computation below, which is O(n) on the
 * number of instructions in the program. However, for small and medium-sized
 * programs the run-time is likely to be dominated by
 * optimize_reg_permutation(), which is O(m^3) on the number of GRF atoms of
 * the program (\sa partitioning); m is bounded (since the program uses a
 * bounded number of registers post-regalloc) and of the order of 100. For
 * that reason optimize_reg_permutation() is vectorized in order to keep the
 * cubic term within reasonable bounds for m close to its theoretical maximum.
 */

#include "brw_fs.h"
#include "brw_cfg.h"

#ifdef __SSE2__

#include <emmintrin.h>

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on pairs of independent SSE2 integer vectors à
 * la SIMD16 for somewhat improved throughput. SSE2 is supported by virtually
 * all platforms that care about bank conflicts, so this path should almost
 * always be available in practice.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   struct vector_type {
      __m128i v[2];
   };

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      scalar_type x;
      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
      return x;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_adds_epi16(v.v[0], w.v[0]),
         _mm_adds_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_subs_epi16(v.v[0], w.v[0]),
         _mm_subs_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_and_si128(v.v[0], w.v[0]),
         _mm_and_si128(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(const vector_type &v)
   {
      /* Tree reduction: fold the two __m128i halves together, then keep
       * halving the number of live lanes with shuffles and saturating adds
       * until a single 16-bit component remains in lane 0.
       */
      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
      const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
      const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
      const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
      return _mm_extract_epi16(v1, 0);
   }
}

#else

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on scalar values and doesn't rely on
 * any vector extensions. This is mainly intended for debugging and
 * to keep this file building on exotic platforms.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   typedef int16_t vector_type;

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 1;

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      v = x;
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      return v;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(vector_type v, vector_type w)
   {
      return v & w;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(vector_type v)
   {
      return v;
   }
}

#endif

/**
 * Swap \p x and \p y.
 */
#define SWAP(x, y) do {                    \
      __typeof(y) _swap_tmp = y;           \
      y = x;                               \
      x = _swap_tmp;                       \
   } while (0)

namespace {
   /**
    * Variable-length vector type intended to represent cycle-count costs for
    * arbitrary atom-to-bank assignments. It's indexed by a pair of integers
    * (i, p), where i is an atom index and p in {0, 1} indicates the parity of
    * the conflict (respectively, whether the cost is incurred whenever the
    * atoms are assigned the same bank b or opposite-parity banks b and b^1).
    * \sa shader_conflict_weight_matrix()
    */
   struct weight_vector_type {
      weight_vector_type() : v(NULL), size(0) {}

      weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}

      weight_vector_type(const weight_vector_type &u) :
         v(alloc(u.size)), size(u.size)
      {
         memcpy(v, u.v,
                DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
      }

      ~weight_vector_type()
      {
         free(v);
      }

      weight_vector_type &
      operator=(weight_vector_type u)
      {
         SWAP(v, u.v);
         SWAP(size, u.size);
         return *this;
      }

      vector_type *v;
      unsigned size;

   private:
      static vector_type *
      alloc(unsigned n)
      {
         const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type));
         const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
         void *p;
         if (posix_memalign(&p, align, size))
            return NULL;
         memset(p, 0, size);
         return reinterpret_cast<vector_type *>(p);
      }
   };

   /**
    * Set the (i, p)-th component of weight vector \p v to \p x.
    */
   void
   set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
   {
      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
   }

   /**
    * Get the (i, p)-th component of weight vector \p v.
    */
   scalar_type
   get(const weight_vector_type &v, unsigned i, unsigned p)
   {
      return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
   }

   /**
    * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
    */
   void
   swap(weight_vector_type &v,
        unsigned i, unsigned p,
        unsigned j, unsigned q)
   {
      const scalar_type tmp = get(v, i, p);
      set(v, i, p, get(v, j, q));
      set(v, j, q, tmp);
   }
}

namespace {
   /**
    * Object that represents the partitioning of an arbitrary register space
    * into indivisible units (referred to as atoms below) that can potentially
    * be rearranged independently from other registers. The partitioning is
    * inferred from a number of contiguity requirements specified using
    * require_contiguous(). This allows efficient look-up of the atom index a
    * given register address belongs to, or conversely the range of register
    * addresses that belong to a given atom.
    */
   struct partitioning {
      /**
       * Create a (for the moment unrestricted) partitioning of a register
       * file of size \p n. The units are arbitrary.
       */
      partitioning(unsigned n) :
         max_reg(n),
         offsets(new unsigned[n + num_terminator_atoms]),
         atoms(new unsigned[n + num_terminator_atoms])
      {
         for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
            offsets[i] = i;
            atoms[i] = i;
         }
      }

      partitioning(const partitioning &p) :
         max_reg(p.max_reg),
         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
         atoms(new unsigned[p.max_reg + num_terminator_atoms])
      {
         memcpy(offsets, p.offsets,
                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
         memcpy(atoms, p.atoms,
                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
      }

      ~partitioning()
      {
         delete[] offsets;
         delete[] atoms;
      }

      partitioning &
      operator=(partitioning p)
      {
         SWAP(max_reg, p.max_reg);
         SWAP(offsets, p.offsets);
         SWAP(atoms, p.atoms);
         return *this;
      }

      /**
       * Require register range [reg, reg + n[ to be considered part of the
       * same atom.
       */
      void
      require_contiguous(unsigned reg, unsigned n)
      {
         unsigned r = atoms[reg];

         /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
          * case that the specified contiguity requirement leads to the fusion
          * (yay) of one or more existing atoms.
          */
         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
            if (offsets[atoms[reg1]] < reg + n) {
               atoms[reg1] = r;
            } else {
               if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
                  r++;

               offsets[r] = offsets[atoms[reg1]];
               atoms[reg1] = r;
            }
         }
      }
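
      /* For example, starting from the unrestricted partitioning of a
       * four-register file (four single-register atoms), a call to
       * require_contiguous(1, 2) fuses registers 1 and 2 into a single
       * atom, leaving three atoms with base offsets 0, 1 and 3.
       */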

      /**
       * Get the atom index register address \p reg belongs to.
       */
      unsigned
      atom_of_reg(unsigned reg) const
      {
         return atoms[reg];
      }

      /**
       * Get the base register address that belongs to atom \p r.
       */
      unsigned
      reg_of_atom(unsigned r) const
      {
         return offsets[r];
      }

      /**
       * Get the size of atom \p r in register address units.
       */
      unsigned
      size_of_atom(unsigned r) const
      {
         assert(r < num_atoms());
         return reg_of_atom(r + 1) - reg_of_atom(r);
      }

      /**
       * Get the number of atoms the whole register space is partitioned into.
       */
      unsigned
      num_atoms() const
      {
         return atoms[max_reg];
      }

   private:
      /**
       * Number of trailing atoms inserted for convenience so among other
       * things we don't need to special-case the last element in
       * size_of_atom().
       */
      static const unsigned num_terminator_atoms = 1;
      unsigned max_reg;
      unsigned *offsets;
      unsigned *atoms;
   };

   /**
    * Only GRF sources (whether they have been register-allocated or not) can
    * possibly incur bank conflicts.
    */
   bool
   is_grf(const fs_reg &r)
   {
      return r.file == VGRF || r.file == FIXED_GRF;
   }

   /**
    * Register offset of \p r in GRF units. Useful because the representation
    * of GRFs post-register allocation is somewhat inconsistent and depends on
    * whether the register already had a fixed GRF offset prior to register
    * allocation or whether it was part of a VGRF allocation.
    */
   unsigned
   reg_of(const fs_reg &r)
   {
      assert(is_grf(r));
      if (r.file == VGRF)
         return r.nr + r.offset / REG_SIZE;
      else
         return reg_offset(r) / REG_SIZE;
   }

   /**
    * Calculate the finest partitioning of the GRF space compatible with the
    * register contiguity requirements derived from all instructions part of
    * the program.
    */
   partitioning
   shader_reg_partitioning(const fs_visitor *v)
   {
      partitioning p(BRW_MAX_GRF);

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (is_grf(inst->dst))
            p.require_contiguous(reg_of(inst->dst), regs_written(inst));

         for (int i = 0; i < inst->sources; i++) {
            if (is_grf(inst->src[i]))
               p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
         }
      }

      return p;
   }

   /**
    * Return the set of GRF atoms that should be left untouched at their
    * original location to avoid violating hardware or software assumptions.
    */
   bool *
   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
   {
      bool *constrained = new bool[p.num_atoms()]();

      /* These are read implicitly by some send-message instructions without
       * any indication at the IR level. Assume they are unsafe to move
       * around.
       */
      for (unsigned reg = 0; reg < 2; reg++)
         constrained[p.atom_of_reg(reg)] = true;

      /* In the Intel Broadwell PRM, vol 07, section "Instruction Set
       * Reference", subsection "EUISA Instructions", Send Message (page 990):
       *
       *    "r127 must not be used for return address when there is a src and
       *    dest overlap in send instruction."
       *
       * Register allocation ensures that, so don't move r127 around, to avoid
       * breaking that property.
       */
      if (v->devinfo->ver >= 8)
         constrained[p.atom_of_reg(127)] = true;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         /* Assume that anything referenced via fixed GRFs is baked into the
          * hardware's fixed-function logic and may be unsafe to move around.
          * Also take into account the source GRF restrictions of EOT
          * send-message instructions.
          */
         if (inst->dst.file == FIXED_GRF)
            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;

         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == FIXED_GRF ||
                (is_grf(inst->src[i]) && inst->eot))
               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
         }

         /* Preserve the original allocation of VGRFs used by the barycentric
          * source of the LINTERP instruction on Gfx6, since pair-aligned
          * barycentrics allow the PLN instruction to be used.
          */
         if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
             inst->opcode == FS_OPCODE_LINTERP)
            constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;

         /* The location of the Gfx7 MRF hack registers is hard-coded in the
          * rest of the compiler back-end. Don't attempt to move them around.
          */
         if (v->devinfo->ver >= 7) {
            assert(inst->dst.file != MRF);

            for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
               const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
               constrained[p.atom_of_reg(reg)] = true;
            }
         }
      }

      return constrained;
   }

   /**
    * Return whether the hardware will be able to prevent a bank conflict by
    * optimizing out the read cycle of a source register. The formula was
    * found experimentally.
    */
   bool
   is_conflict_optimized_out(const intel_device_info *devinfo,
                             const fs_inst *inst)
   {
      return devinfo->ver >= 9 &&
             ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
                                        reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
              reg_of(inst->src[1]) == reg_of(inst->src[2]));
   }
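
   /* E.g. a ternary instruction whose second and third sources point at the
    * same GRF (say "mad dst, src0, r20, r20", with purely illustrative
    * register numbers) is assumed not to pay any conflict penalty on Gfx9+,
    * presumably because the EU only needs to fetch the duplicated register
    * once.
    */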

   /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole program
    * for any given atom-to-bank assignment.
    *
    * More precisely, if C_r_s_p is the result of this function, the total
    * cost of all bank conflicts involving any given atom r can be readily
    * recovered as follows:
    *
    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
    *
    * where d_i_j is the Kronecker delta, and B_r indicates the bank
    * assignment of r. \sa delta_conflicts() for a vectorized implementation
    * of the expression above.
    *
    * FINISHME: Teach this about the Gfx10+ bank conflict rules, which are
    *           somewhat more relaxed than on previous generations. In the
    *           meantime optimizing based on Gfx9 weights is likely to be more
    *           helpful than not optimizing at all.
    */
   weight_vector_type *
   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
   {
      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
      for (unsigned r = 0; r < p.num_atoms(); r++)
         conflicts[r] = weight_vector_type(2 * p.num_atoms());

      /* Crude approximation of the number of times the current basic block
       * will be executed at run-time.
       */
      unsigned block_scale = 1;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (inst->opcode == BRW_OPCODE_DO) {
            block_scale *= 10;

         } else if (inst->opcode == BRW_OPCODE_WHILE) {
            block_scale /= 10;

         } else if (inst->is_3src(v->compiler) &&
                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));

            /* Estimate of the cycle-count cost of incurring a bank conflict
             * for this instruction. This is only true on the average, for a
             * sequence of back-to-back ternary instructions, since the EU
             * front-end only seems to be able to issue a new instruction at
             * an even cycle. The cost of a bank conflict incurred by an
             * isolated ternary instruction may be higher.
             */
            const unsigned exec_size = inst->dst.component_size(inst->exec_size);
            const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
                                                                    REG_SIZE);

            /* Neglect same-atom conflicts (since they're either trivial or
             * impossible to avoid without splitting the atom), and conflicts
             * known to be optimized out by the hardware.
             */
            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
               /* Calculate the parity of the sources relative to the start of
                * their respective atoms. If their parity is the same (and
                * none of the atoms straddle the 2KB mark), the instruction
                * will incur a conflict iff both atoms are assigned the same
                * bank b. If their parity is opposite, the instruction will
                * incur a conflict iff they are assigned opposite banks (b and
                * b^1).
                */
               const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
               const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
               const unsigned p = p_r ^ p_s;

               /* Calculate the updated cost of a hypothetical conflict
                * between atoms r and s. Note that the weight matrix is
                * symmetric with respect to indices r and s by construction.
                */
               const scalar_type w = MIN2(unsigned(max_scalar),
                                          get(conflicts[r], s, p) + cycle_scale);
               set(conflicts[r], s, p, w);
               set(conflicts[s], r, p, w);
            }
         }
      }

      return conflicts;
   }

   /**
    * Return the set of GRF atoms that could potentially lead to bank
    * conflicts if laid out unfavorably in the GRF space according to
    * the specified \p conflicts matrix (\sa
    * shader_conflict_weight_matrix()).
    */
   bool *
   have_any_conflicts(const partitioning &p,
                      const weight_vector_type *conflicts)
   {
      bool *any_conflicts = new bool[p.num_atoms()]();

      for (unsigned r = 0; r < p.num_atoms(); r++) {
         const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
         for (unsigned s = 0; s < m; s++)
            any_conflicts[r] |= sums(conflicts[r].v[s]);
      }

      return any_conflicts;
   }

   /**
    * Calculate the difference between two S(B) cost estimates as defined
    * above (\sa shader_conflict_weight_matrix()). This represents the
    * (partial) cycle-count benefit from moving an atom r from bank p to n.
    * The respective bank assignments Bp and Bn are encoded as the \p
    * bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
    * according to the formula:
    *
    *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
    *
    * Notice the similarity with the delta function in the S(B) expression
    * above, and how bank_mask(B) can be precomputed for every possible
    * selection of r since bank_mask(B) only depends on it via B_r that may
    * only assume one of four different values, so the caller can keep every
    * possible bank_mask(B) vector in memory without much hassle (\sa
    * bank_characteristics()).
    */
   int
   delta_conflicts(const weight_vector_type &bank_mask_p,
                   const weight_vector_type &bank_mask_n,
                   const weight_vector_type &conflicts)
   {
      const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
      vector_type s_p = {}, s_n = {};

      for (unsigned r = 0; r < m; r++) {
         s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
         s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
      }

      return sums(subs(s_p, s_n));
   }
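
   /* Note that each bank_mask component is either all-ones (-1) or zero, so
    * the bitwise AND in mask() implements the Kronecker delta above: s_p
    * accumulates the conflict cycles incurred with the current bank
    * assignment of the atom and s_n the cycles with the candidate
    * assignment, and the saturating difference of their sums is the
    * benefit of the move.
    */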

   /**
    * Register atom permutation, represented as the start GRF offset each atom
    * is mapped into.
    */
   struct permutation {
      permutation() : v(NULL), size(0) {}

      permutation(unsigned n) :
         v(new unsigned[n]()), size(n) {}

      permutation(const permutation &p) :
         v(new unsigned[p.size]), size(p.size)
      {
         memcpy(v, p.v, p.size * sizeof(unsigned));
      }

      ~permutation()
      {
         delete[] v;
      }

      permutation &
      operator=(permutation p)
      {
         SWAP(v, p.v);
         SWAP(size, p.size);
         return *this;
      }

      unsigned *v;
      unsigned size;
   };

   /**
    * Return an identity permutation of GRF atoms.
    */
   permutation
   identity_reg_permutation(const partitioning &p)
   {
      permutation map(p.num_atoms());

      for (unsigned r = 0; r < map.size; r++)
         map.v[r] = p.reg_of_atom(r);

      return map;
   }

   /**
    * Return the bank index of GRF address \p reg, numbered according to the
    * table:
    *           Even    Odd
    *    Lo      0       1
    *    Hi      2       3
    */
   unsigned
   bank_of(unsigned reg)
   {
      return (reg & 0x40) >> 5 | (reg & 1);
   }
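
   /* E.g. r0 and r2 map to bank 0, r1 to bank 1, r64 to bank 2 and r65 to
    * bank 3: bit 0 of the GRF address selects the odd banks, and bit 6
    * (i.e. whether the register lies in the upper 2KB half of the file)
    * selects the high banks.
    */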

   /**
    * Return bitmasks suitable for use as bank mask arguments for the
    * delta_conflicts() computation. Note that this is just the (negative)
    * characteristic function of each bank, if you regard it as a set
    * containing all atoms assigned to it according to the \p map array.
    */
   weight_vector_type *
   bank_characteristics(const permutation &map)
   {
      weight_vector_type *banks = new weight_vector_type[4];

      for (unsigned b = 0; b < 4; b++) {
         banks[b] = weight_vector_type(2 * map.size);

         for (unsigned j = 0; j < map.size; j++) {
            for (unsigned p = 0; p < 2; p++)
               set(banks[b], j, p,
                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
         }
      }

      return banks;
   }

   /**
    * Return an improved permutation of GRF atoms based on \p map attempting
    * to reduce the total cycle-count cost of bank conflicts greedily.
    *
    * Note that this doesn't attempt to merge multiple atoms into one, which
    * may allow it to do a better job in some cases -- it simply reorders
    * existing atoms in the GRF space without affecting their identity.
    */
   permutation
   optimize_reg_permutation(const partitioning &p,
                            const bool *constrained,
                            const weight_vector_type *conflicts,
                            permutation map)
   {
      const bool *any_conflicts = have_any_conflicts(p, conflicts);
      weight_vector_type *banks = bank_characteristics(map);

      for (unsigned r = 0; r < map.size; r++) {
         const unsigned bank_r = bank_of(map.v[r]);

         if (!constrained[r]) {
            unsigned best_s = r;
            int best_benefit = 0;

            for (unsigned s = 0; s < map.size; s++) {
               const unsigned bank_s = bank_of(map.v[s]);

               if (bank_r != bank_s && !constrained[s] &&
                   p.size_of_atom(r) == p.size_of_atom(s) &&
                   (any_conflicts[r] || any_conflicts[s])) {
                  const int benefit =
                     delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
                     delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);

                  if (benefit > best_benefit) {
                     best_s = s;
                     best_benefit = benefit;
                  }
               }
            }

            if (best_s != r) {
               for (unsigned b = 0; b < 4; b++) {
                  for (unsigned p = 0; p < 2; p++)
                     swap(banks[b], r, p, best_s, p);
               }

               SWAP(map.v[r], map.v[best_s]);
            }
         }
      }

      delete[] banks;
      delete[] any_conflicts;
      return map;
   }

   /**
    * Apply the GRF atom permutation given by \p map to register \p r and
    * return the result.
    */
   fs_reg
   transform(const partitioning &p, const permutation &map, fs_reg r)
   {
      if (r.file == VGRF) {
         const unsigned reg = reg_of(r);
         const unsigned s = p.atom_of_reg(reg);
         r.nr = map.v[s] + reg - p.reg_of_atom(s);
         r.offset = r.offset % REG_SIZE;
      }

      return r;
   }
}

bool
fs_visitor::opt_bank_conflicts()
{
   assert(grf_used || !"Must be called after register allocation");

   /* No ternary instructions -- No bank conflicts. */
   if (devinfo->ver < 6)
      return false;

   const partitioning p = shader_reg_partitioning(this);
   const bool *constrained = shader_reg_constraints(this, p);
   const weight_vector_type *conflicts =
      shader_conflict_weight_matrix(this, p);
   const permutation map =
      optimize_reg_permutation(p, constrained, conflicts,
                               identity_reg_permutation(p));

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      inst->dst = transform(p, map, inst->dst);

      for (int i = 0; i < inst->sources; i++)
         inst->src[i] = transform(p, map, inst->src[i]);
   }

   delete[] conflicts;
   delete[] constrained;
   return true;
}

/**
 * Return whether the instruction incurs GRF bank conflict cycles.
 *
 * Note that this is only accurate after register allocation because otherwise
 * we don't know which bank each VGRF is going to end up aligned to.
 */
bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst)
{
   return is_3src(isa, inst->opcode) &&
          is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
          !is_conflict_optimized_out(isa->devinfo, inst);
}