/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_combine_constants.cpp
 *
 * This file contains the opt_combine_constants() pass that runs after the
 * regular optimization loop. It passes over the instruction list and
 * selectively promotes immediate values to registers by emitting a mov(1)
 * instruction.
 *
 * This is particularly useful on Gen 7, because a few instructions can be
 * coissued (i.e., issued in the same cycle as another thread on the same EU
 * issues an instruction) under some circumstances, one of which is that they
 * cannot use immediate values.
 */
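
/* A sketch of the idea (illustrative, simplified syntax rather than exact
 * disassembly): two coissue-capable instructions use the same immediate, so
 * the pass loads it into a register once and rewrites both uses:
 *
 *    before:                          after:
 *       sel(8) g10 g4 1.0F               mov(1) g9  1.0F
 *       sel(8) g12 g6 1.0F               sel(8) g10 g4 g9<0,1,0>
 *                                        sel(8) g12 g6 g9<0,1,0>
 */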

#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
#include "util/half_float.h"

using namespace brw;

static const bool debug = false;

enum PACKED interpreted_type {
   float_only = 0,
   integer_only,
   either_type
};

struct value {
   /** Raw bit pattern of the value. */
   nir_const_value value;

   /** Instruction that uses this instance of the value. */
   unsigned instr_index;

   /** Size, in bits, of the value. */
   uint8_t bit_size;

   /**
    * Which source of instr is this value?
    *
    * \note This field is not actually used by \c brw_combine_constants, but
    * it is generally very useful to callers.
    */
   uint8_t src;

   /**
    * In what ways can instr interpret this value?
    *
    * Choices are floating-point only, integer only, or either type.
    */
   enum interpreted_type type;

   /**
    * Only try to make a single source non-constant.
    *
    * On some architectures, some instructions require that all sources be
    * non-constant. For example, the multiply-accumulate instruction on Intel
    * GPUs up to Gen11 requires that all sources be non-constant. Other
    * instructions, like the selection instruction, allow one constant source.
    *
    * If a single constant source is allowed, set this flag to true.
    *
    * If an instruction allows a single constant and it has only a single
    * constant to begin with, it should be included. Various places in
    * \c combine_constants will assume that there are multiple constants if
    * \c ::allow_one_constant is set. This may even be enforced by in-code
    * assertions.
    */
   bool allow_one_constant;

   /**
    * Restrict values that can reach this value to not include negations.
    *
    * This is useful for instructions that cannot have source modifiers. For
    * example, on Intel GPUs the integer source of a shift instruction (e.g.,
    * SHL) can have a source modifier, but the integer source of the bitfield
    * insertion instruction (i.e., BFI2) cannot. A pair of these instructions
    * might have sources that are negations of each other. Using this flag
    * will ensure that the BFI2 does not have a negated source, but the SHL
    * might.
    */
   bool no_negations;

   /**
    * \name UtilCombineConstantsPrivate
    * Private data used only by brw_combine_constants
    *
    * Any data stored in these fields will be overwritten by the call to
    * \c brw_combine_constants. No assumptions should be made about the
    * state of these fields after that function returns.
    */
   /**@{*/
   /** Mask of negations that can be generated from this value. */
   uint8_t reachable_mask;

   /** Mask of negations that can generate this value. */
   uint8_t reaching_mask;

   /**
    * Value with the next source from the same instruction.
    *
    * This pointer may be \c NULL. If it is not \c NULL, it will form a
    * singly-linked circular list of values. The list is unordered. That is,
    * as the list is iterated, the \c ::src values will be in arbitrary order.
    *
    * \todo Is it even possible for there to be more than two elements in this
    * list? This pass does not operate on vecN instructions or intrinsics, so
    * the theoretical limit should be three. However, instructions with all
    * constant sources should have been folded away.
    */
   struct value *next_src;
   /**@}*/
};

struct combine_constants_value {
   /** Raw bit pattern of the constant loaded. */
   nir_const_value value;

   /**
    * Index of the first user.
    *
    * This is the offset into \c combine_constants_result::user_map of the
    * first user of this value.
    */
   unsigned first_user;

   /** Number of users of this value. */
   unsigned num_users;

   /** Size, in bits, of the value. */
   uint8_t bit_size;
};

struct combine_constants_user {
   /** Index into the array of values passed to brw_combine_constants. */
   unsigned index;

   /**
    * Manner in which the value should be interpreted in the instruction.
    *
    * This is only useful when ::negate is set. Unless the corresponding
    * value::type is \c either_type, this field must have the same value as
    * value::type.
    */
   enum interpreted_type type;

   /** Should this value be negated to generate the original value? */
   bool negate;
};

class combine_constants_result {
public:
   combine_constants_result(unsigned num_candidates, bool &success)
      : num_values_to_emit(0), user_map(NULL)
   {
      user_map = (struct combine_constants_user *) calloc(num_candidates,
                                                          sizeof(user_map[0]));

      /* In the worst case, the number of output values will be equal to the
       * number of input values. Allocate a buffer that is known to be large
       * enough now, and it can be reduced later.
       */
      values_to_emit =
         (struct combine_constants_value *) calloc(num_candidates,
                                                   sizeof(values_to_emit[0]));

      success = (user_map != NULL && values_to_emit != NULL);
   }

   ~combine_constants_result()
   {
      free(values_to_emit);
      free(user_map);
   }

   void append_value(const nir_const_value &value, unsigned bit_size)
   {
      values_to_emit[num_values_to_emit].value = value;
      values_to_emit[num_values_to_emit].first_user = 0;
      values_to_emit[num_values_to_emit].num_users = 0;
      values_to_emit[num_values_to_emit].bit_size = bit_size;
      num_values_to_emit++;
   }

   unsigned num_values_to_emit;
   struct combine_constants_value *values_to_emit;

   struct combine_constants_user *user_map;
};

#define VALUE_INDEX 0
#define FLOAT_NEG_INDEX 1
#define INT_NEG_INDEX 2
#define MAX_NUM_REACHABLE 3

#define VALUE_EXISTS (1 << VALUE_INDEX)
#define FLOAT_NEG_EXISTS (1 << FLOAT_NEG_INDEX)
#define INT_NEG_EXISTS (1 << INT_NEG_INDEX)

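/* A negation is only "reachable" when it produces a genuinely distinct,
 * usable value: for integers, negating 0 yields 0 and negating INT*_MIN
 * wraps back to INT*_MIN in two's complement; for floating point, flipping
 * the sign bit of a NaN just yields another NaN.
 */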
static bool
negation_exists(nir_const_value v, unsigned bit_size,
                enum interpreted_type base_type)
{
   /* either_type does not make sense in this context. */
   assert(base_type == float_only || base_type == integer_only);

   switch (bit_size) {
   case 8:
      if (base_type == float_only)
         return false;
      else
         return v.i8 != 0 && v.i8 != INT8_MIN;

   case 16:
      if (base_type == float_only)
         return !util_is_half_nan(v.i16);
      else
         return v.i16 != 0 && v.i16 != INT16_MIN;

   case 32:
      if (base_type == float_only)
         return !isnan(v.f32);
      else
         return v.i32 != 0 && v.i32 != INT32_MIN;

   case 64:
      if (base_type == float_only)
         return !isnan(v.f64);
      else
         return v.i64 != 0 && v.i64 != INT64_MIN;

   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }
}

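/* Floating-point negation below is a raw sign-bit flip (e.g., for 32-bit,
 * 0x3f800000 == 1.0f becomes 0xbf800000 == -1.0f). This matches how a source
 * negation modifier treats the bits, and it is well-defined even for NaN and
 * Inf.
 */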
static nir_const_value
negate(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
{
   /* either_type does not make sense in this context. */
   assert(base_type == float_only || base_type == integer_only);

   nir_const_value ret = { 0, };

   switch (bit_size) {
   case 8:
      assert(base_type == integer_only);
      ret.i8 = -v.i8;
      break;

   case 16:
      if (base_type == float_only)
         ret.u16 = v.u16 ^ INT16_MIN;
      else
         ret.i16 = -v.i16;
      break;

   case 32:
      if (base_type == float_only)
         ret.u32 = v.u32 ^ INT32_MIN;
      else
         ret.i32 = -v.i32;
      break;

   case 64:
      if (base_type == float_only)
         ret.u64 = v.u64 ^ INT64_MIN;
      else
         ret.i64 = -v.i64;
      break;

   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }

   return ret;
}

static nir_const_value
absolute(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
{
   /* either_type does not make sense in this context. */
   assert(base_type == float_only || base_type == integer_only);

   nir_const_value ret = { 0, };

   switch (bit_size) {
   case 8:
      assert(base_type == integer_only);
      ret.i8 = abs(v.i8);
      break;

   case 16:
      if (base_type == float_only)
         ret.u16 = v.u16 & 0x7fff;
      else
         ret.i16 = abs(v.i16);
      break;

   case 32:
      if (base_type == float_only)
         ret.f32 = fabs(v.f32);
      else
         ret.i32 = abs(v.i32);
      break;

   case 64:
      if (base_type == float_only)
         ret.f64 = fabs(v.f64);
      else {
         if (sizeof(v.i64) == sizeof(long int)) {
            ret.i64 = labs((long int) v.i64);
         } else {
            assert(sizeof(v.i64) == sizeof(long long int));
            ret.i64 = llabs((long long int) v.i64);
         }
      }
      break;

   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }

   return ret;
}

static void
calculate_masks(nir_const_value v, enum interpreted_type type,
                unsigned bit_size, uint8_t *reachable_mask,
                uint8_t *reaching_mask)
{
   *reachable_mask = 0;
   *reaching_mask = 0;

   /* Calculate the extended reachable mask. */
   if (type == float_only || type == either_type) {
      if (negation_exists(v, bit_size, float_only))
         *reachable_mask |= FLOAT_NEG_EXISTS;
   }

   if (type == integer_only || type == either_type) {
      if (negation_exists(v, bit_size, integer_only))
         *reachable_mask |= INT_NEG_EXISTS;
   }

   /* Calculate the extended reaching mask. All of the "is this negation
    * possible" was already determined for the reachable_mask, so reuse that
    * data.
    */
   if (type == float_only || type == either_type) {
      if (*reachable_mask & FLOAT_NEG_EXISTS)
         *reaching_mask |= FLOAT_NEG_EXISTS;
   }

   if (type == integer_only || type == either_type) {
      if (*reachable_mask & INT_NEG_EXISTS)
         *reaching_mask |= INT_NEG_EXISTS;
   }
}

static void
calculate_reachable_values(nir_const_value v,
                           unsigned bit_size,
                           unsigned reachable_mask,
                           nir_const_value *reachable_values)
{
   memset(reachable_values, 0, MAX_NUM_REACHABLE * sizeof(reachable_values[0]));

   reachable_values[VALUE_INDEX] = v;

   if (reachable_mask & INT_NEG_EXISTS) {
      const nir_const_value neg = negate(v, bit_size, integer_only);

      reachable_values[INT_NEG_INDEX] = neg;
   }

   if (reachable_mask & FLOAT_NEG_EXISTS) {
      const nir_const_value neg = negate(v, bit_size, float_only);

      reachable_values[FLOAT_NEG_INDEX] = neg;
   }
}

static bool
value_equal(nir_const_value a, nir_const_value b, unsigned bit_size)
{
   switch (bit_size) {
   case 8:
      return a.u8 == b.u8;
   case 16:
      return a.u16 == b.u16;
   case 32:
      return a.u32 == b.u32;
   case 64:
      return a.u64 == b.u64;
   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }
}

/** Can these values be the same with one level of negation? */
static bool
value_can_equal(const nir_const_value *from, uint8_t reachable_mask,
                nir_const_value to, uint8_t reaching_mask,
                unsigned bit_size)
{
   const uint8_t combined_mask = reachable_mask & reaching_mask;

   return value_equal(from[VALUE_INDEX], to, bit_size) ||
          ((combined_mask & INT_NEG_EXISTS) &&
           value_equal(from[INT_NEG_INDEX], to, bit_size)) ||
          ((combined_mask & FLOAT_NEG_EXISTS) &&
           value_equal(from[FLOAT_NEG_INDEX], to, bit_size));
}
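
/* For example, if `from` was built for 2.0f with FLOAT_NEG_EXISTS set in
 * reachable_mask, from[] holds {2.0f, -2.0f} and a query with to == -2.0f
 * matches, provided the destination's reaching_mask also permits a float
 * negation.
 */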

static void
preprocess_candidates(struct value *candidates, unsigned num_candidates)
{
   /* Calculate the reaching_mask and reachable_mask for each candidate. */
   for (unsigned i = 0; i < num_candidates; i++) {
      calculate_masks(candidates[i].value,
                      candidates[i].type,
                      candidates[i].bit_size,
                      &candidates[i].reachable_mask,
                      &candidates[i].reaching_mask);

      /* If negations are not allowed, then only the original value is
       * reaching.
       */
      if (candidates[i].no_negations)
         candidates[i].reaching_mask = 0;
   }

   for (unsigned i = 0; i < num_candidates; i++)
      candidates[i].next_src = NULL;

   for (unsigned i = 0; i < num_candidates - 1; i++) {
      if (candidates[i].next_src != NULL)
         continue;

      struct value *prev = &candidates[i];

      for (unsigned j = i + 1; j < num_candidates; j++) {
         if (candidates[i].instr_index == candidates[j].instr_index) {
            prev->next_src = &candidates[j];
            prev = prev->next_src;
         }
      }

      /* Close the cycle. */
      if (prev != &candidates[i])
         prev->next_src = &candidates[i];
   }
}
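
/* For example, if candidates 3 and 7 are the two constant sources of the
 * same SEL, preprocessing links them as 3 -> 7 -> 3; following next_src from
 * either element visits every constant source of that instruction exactly
 * once before returning to the start.
 */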

static bool
reaching_value_exists(const struct value *c,
                      const struct combine_constants_value *values,
                      unsigned num_values)
{
   nir_const_value reachable_values[MAX_NUM_REACHABLE];

   calculate_reachable_values(c->value, c->bit_size, c->reaching_mask,
                              reachable_values);

   /* Check to see if the value is already in the result set. */
   for (unsigned j = 0; j < num_values; j++) {
      if (c->bit_size == values[j].bit_size &&
          value_can_equal(reachable_values, c->reaching_mask,
                          values[j].value, c->reaching_mask,
                          c->bit_size)) {
         return true;
      }
   }

   return false;
}

static combine_constants_result *
combine_constants_greedy(struct value *candidates, unsigned num_candidates)
{
   bool success;
   combine_constants_result *result =
      new combine_constants_result(num_candidates, success);
   if (result == NULL || !success) {
      delete result;
      return NULL;
   }

   BITSET_WORD *remain =
      (BITSET_WORD *) calloc(BITSET_WORDS(num_candidates), sizeof(remain[0]));

   if (remain == NULL) {
      delete result;
      return NULL;
   }

   memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));

   /* Operate in three passes. The first pass handles all values that must be
    * emitted and for which a negation cannot exist.
    */
   unsigned i;
   for (i = 0; i < num_candidates; i++) {
      if (candidates[i].allow_one_constant ||
          (candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS))) {
         continue;
      }

      /* Check to see if the value is already in the result set. */
      bool found = false;
      const unsigned num_values = result->num_values_to_emit;
      for (unsigned j = 0; j < num_values; j++) {
         if (candidates[i].bit_size == result->values_to_emit[j].bit_size &&
             value_equal(candidates[i].value,
                         result->values_to_emit[j].value,
                         candidates[i].bit_size)) {
            found = true;
            break;
         }
      }

      if (!found)
         result->append_value(candidates[i].value, candidates[i].bit_size);

      BITSET_CLEAR(remain, i);
   }

   /* The second pass handles all values that must be emitted and for which a
    * negation can exist.
    */
   BITSET_FOREACH_SET(i, remain, num_candidates) {
      if (candidates[i].allow_one_constant)
         continue;

      assert(candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS));

      if (!reaching_value_exists(&candidates[i], result->values_to_emit,
                                 result->num_values_to_emit)) {
         result->append_value(absolute(candidates[i].value,
                                       candidates[i].bit_size,
                                       candidates[i].type),
                              candidates[i].bit_size);
      }

      BITSET_CLEAR(remain, i);
   }

   /* The third pass handles all of the values that may not have to be
    * emitted. These are the values where allow_one_constant is set.
    */
   BITSET_FOREACH_SET(i, remain, num_candidates) {
      assert(candidates[i].allow_one_constant);

      /* The BITSET_FOREACH_SET macro does not detect changes to the bitset
       * that occur within the current word. Since code in this loop may
       * clear bits from the set, re-test here.
       */
      if (!BITSET_TEST(remain, i))
         continue;

      assert(candidates[i].next_src != NULL);

      const struct value *const other_candidate = candidates[i].next_src;
      const unsigned j = other_candidate - candidates;

      if (!reaching_value_exists(&candidates[i], result->values_to_emit,
                                 result->num_values_to_emit)) {
         /* Before emitting a value, see if a match for the other source of
          * the instruction exists.
          */
         if (!reaching_value_exists(&candidates[j], result->values_to_emit,
                                    result->num_values_to_emit)) {
            result->append_value(candidates[i].value, candidates[i].bit_size);
         }
      }

      /* Mark both sources as handled. */
      BITSET_CLEAR(remain, i);
      BITSET_CLEAR(remain, j);
   }

   /* As noted above, there will never be more values in the output than in
    * the input. If there are fewer values, reduce the size of the
    * allocation.
    */
   if (result->num_values_to_emit < num_candidates) {
      result->values_to_emit = (struct combine_constants_value *)
         realloc(result->values_to_emit, sizeof(result->values_to_emit[0]) *
                 result->num_values_to_emit);

      /* Is it even possible for a reducing realloc to fail? */
      assert(result->values_to_emit != NULL);
   }

   /* Create the mapping from "combined" constants to list of candidates
    * passed in by the caller.
    */
   memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));

   unsigned total_users = 0;

   const unsigned num_values = result->num_values_to_emit;
   for (unsigned value_idx = 0; value_idx < num_values; value_idx++) {
      result->values_to_emit[value_idx].first_user = total_users;

      uint8_t reachable_mask;
      uint8_t unused_mask;

      calculate_masks(result->values_to_emit[value_idx].value, either_type,
                      result->values_to_emit[value_idx].bit_size,
                      &reachable_mask, &unused_mask);

      nir_const_value reachable_values[MAX_NUM_REACHABLE];

      calculate_reachable_values(result->values_to_emit[value_idx].value,
                                 result->values_to_emit[value_idx].bit_size,
                                 reachable_mask, reachable_values);

      for (unsigned i = 0; i < num_candidates; i++) {
         bool matched = false;

         if (!BITSET_TEST(remain, i))
            continue;

         if (candidates[i].bit_size != result->values_to_emit[value_idx].bit_size)
            continue;

         if (value_equal(candidates[i].value, result->values_to_emit[value_idx].value,
                         result->values_to_emit[value_idx].bit_size)) {
            result->user_map[total_users].index = i;
            result->user_map[total_users].type = candidates[i].type;
            result->user_map[total_users].negate = false;
            total_users++;

            matched = true;
            BITSET_CLEAR(remain, i);
         } else {
            const uint8_t combined_mask = reachable_mask &
                                          candidates[i].reaching_mask;

            enum interpreted_type type = either_type;

            if ((combined_mask & INT_NEG_EXISTS) &&
                value_equal(candidates[i].value,
                            reachable_values[INT_NEG_INDEX],
                            candidates[i].bit_size)) {
               type = integer_only;
            }

            if (type == either_type &&
                (combined_mask & FLOAT_NEG_EXISTS) &&
                value_equal(candidates[i].value,
                            reachable_values[FLOAT_NEG_INDEX],
                            candidates[i].bit_size)) {
               type = float_only;
            }

            if (type != either_type) {
               /* Finding a match on this path implies that the user must
                * allow source negations.
                */
               assert(!candidates[i].no_negations);

               result->user_map[total_users].index = i;
               result->user_map[total_users].type = type;
               result->user_map[total_users].negate = true;
               total_users++;

               matched = true;
               BITSET_CLEAR(remain, i);
            }
         }

         /* Mark the other source of instructions that can have a constant
          * source. Selection is the prime example of this, and we want to
          * avoid generating sequences like bcsel(a, fneg(b), ineg(c)).
          *
          * This also makes sure that the assertion (below) that *all* values
          * were processed holds even when some values may be allowed to
          * remain as constants.
          *
          * FINISHME: There may be value in only doing this when type ==
          * either_type. If both sources are loaded, a register allocator may
          * be able to make a better choice about which value to "spill"
          * (i.e., replace with an immediate) under heavy register pressure.
          */
         if (matched && candidates[i].allow_one_constant) {
            const struct value *const other_src = candidates[i].next_src;
            const unsigned idx = other_src - candidates;

            assert(idx < num_candidates);
            BITSET_CLEAR(remain, idx);
         }
      }

      assert(total_users > result->values_to_emit[value_idx].first_user);
      result->values_to_emit[value_idx].num_users =
         total_users - result->values_to_emit[value_idx].first_user;
   }

   /* Verify that all of the values were emitted by the loop above. If any
    * bits are still set in remain, then some value was not emitted. The use
    * of memset to populate remain prevents the use of a more performant loop.
    */
#ifndef NDEBUG
   bool pass = true;

   BITSET_FOREACH_SET(i, remain, num_candidates) {
      fprintf(stderr, "candidate %d was not processed: { "
              ".b = %s, "
              ".f32 = %f, .f64 = %g, "
              ".i8 = %d, .u8 = 0x%02x, "
              ".i16 = %d, .u16 = 0x%04x, "
              ".i32 = %d, .u32 = 0x%08x, "
              ".i64 = %" PRId64 ", .u64 = 0x%016" PRIx64 " }\n",
              i,
              candidates[i].value.b ? "true" : "false",
              candidates[i].value.f32, candidates[i].value.f64,
              candidates[i].value.i8, candidates[i].value.u8,
              candidates[i].value.i16, candidates[i].value.u16,
              candidates[i].value.i32, candidates[i].value.u32,
              candidates[i].value.i64, candidates[i].value.u64);
      pass = false;
   }

   assert(pass && "All values should have been processed.");
#endif

   free(remain);

   return result;
}

static combine_constants_result *
brw_combine_constants(struct value *candidates, unsigned num_candidates)
{
   preprocess_candidates(candidates, num_candidates);

   return combine_constants_greedy(candidates, num_candidates);
}

/**
 * Box for storing fs_inst and some other necessary data
 *
 * \sa box_instruction
 */
struct fs_inst_box {
   fs_inst *inst;
   unsigned ip;
   bblock_t *block;
   bool must_promote;
};

/** A box for putting fs_regs in a linked list. */
struct reg_link {
   DECLARE_RALLOC_CXX_OPERATORS(reg_link)

   reg_link(fs_inst *inst, unsigned src, bool negate, enum interpreted_type type)
      : inst(inst), src(src), negate(negate), type(type) {}

   struct exec_node link;
   fs_inst *inst;
   uint8_t src;
   bool negate;
   enum interpreted_type type;
};

static struct exec_node *
link(void *mem_ctx, fs_inst *inst, unsigned src, bool negate,
     enum interpreted_type type)
{
   reg_link *l = new(mem_ctx) reg_link(inst, src, negate, type);
   return &l->link;
}

/**
 * Information about an immediate value.
 */
struct imm {
   /** The common ancestor of all blocks using this immediate value. */
   bblock_t *block;

   /**
    * The instruction generating the immediate value, if all uses are contained
    * within a single basic block. Otherwise, NULL.
    */
   fs_inst *inst;

   /**
    * A list of fs_regs that refer to this immediate. If we promote it, we'll
    * have to patch these up to refer to the new GRF.
    */
   exec_list *uses;

   /** The immediate value */
   union {
      char bytes[8];
      double df;
      int64_t d64;
      float f;
      int32_t d;
      int16_t w;
   };
   uint8_t size;

   /** When promoting half-float we need to account for certain restrictions */
   bool is_half_float;

   /**
    * The GRF register and subregister number where we've decided to store the
    * constant value.
    */
   uint8_t subreg_offset;
   uint16_t nr;

   /** The number of coissuable instructions using this immediate. */
   uint16_t uses_by_coissue;

   /**
    * Whether this constant is used by an instruction that can't handle an
    * immediate source (and already has to be promoted to a GRF).
    */
   bool must_promote;

   /** Is the value used only in a single basic block? */
   bool used_in_single_block;

   uint16_t first_use_ip;
   uint16_t last_use_ip;
};

/** The working set of information about immediates. */
struct table {
   struct value *values;
   int size;
   int num_values;

   struct imm *imm;
   int len;

   struct fs_inst_box *boxes;
   unsigned num_boxes;
   unsigned size_boxes;
};

static struct value *
new_value(struct table *table, void *mem_ctx)
{
   if (table->num_values == table->size) {
      table->size *= 2;
      table->values = reralloc(mem_ctx, table->values, struct value, table->size);
   }
   return &table->values[table->num_values++];
}

/**
 * Store an instruction with some other data in a table.
 *
 * \returns the index into the dynamic array of boxes for the instruction.
 */
static unsigned
box_instruction(struct table *table, void *mem_ctx, fs_inst *inst,
                unsigned ip, bblock_t *block, bool must_promote)
{
   /* It is common for box_instruction to be called consecutively for each
    * source of an instruction. As a result, the most common case for finding
    * an instruction in the table is when that instruction was the last one
    * added. Search the list back to front.
    */
   for (unsigned i = table->num_boxes; i > 0; /* empty */) {
      i--;

      if (table->boxes[i].inst == inst)
         return i;
   }

   if (table->num_boxes == table->size_boxes) {
      table->size_boxes *= 2;
      table->boxes = reralloc(mem_ctx, table->boxes, fs_inst_box,
                              table->size_boxes);
   }

   assert(table->num_boxes < table->size_boxes);

   const unsigned idx = table->num_boxes++;
   fs_inst_box *ib = &table->boxes[idx];

   ib->inst = inst;
   ib->block = block;
   ib->ip = ip;
   ib->must_promote = must_promote;

   return idx;
}

/**
 * Comparator used for sorting an array of imm structures.
 *
 * We sort by basic block number, then last use IP, then first use IP (least
 * to greatest). This sorting causes immediates live in the same area to be
 * allocated to the same register in the hopes that all values will be dead
 * about the same time and the register can be reused.
 */
static int
compare(const void *_a, const void *_b)
{
   const struct imm *a = (const struct imm *)_a,
                    *b = (const struct imm *)_b;

   int block_diff = a->block->num - b->block->num;
   if (block_diff)
      return block_diff;

   int end_diff = a->last_use_ip - b->last_use_ip;
   if (end_diff)
      return end_diff;

   return a->first_use_ip - b->first_use_ip;
}
static struct brw_reg
build_imm_reg_for_copy(struct imm *imm)
{
   switch (imm->size) {
   case 8:
      /* Use a 64-bit immediate; going through brw_imm_d() would truncate
       * the value to 32 bits.
       */
      return brw_imm_q(imm->d64);
   case 4:
      return brw_imm_d(imm->d);
   case 2:
      return brw_imm_w(imm->w);
   default:
      unreachable("not implemented");
   }
}

static inline uint32_t
get_alignment_for_imm(const struct imm *imm)
{
   if (imm->is_half_float)
      return 4; /* At least MAD seems to require this */
   else
      return imm->size;
}

static bool
representable_as_hf(float f, uint16_t *hf)
{
   union fi u;
   uint16_t h = _mesa_float_to_half(f);
   u.f = _mesa_half_to_float(h);

   if (u.f == f) {
      *hf = h;
      return true;
   }

   return false;
}
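
/* For example, 0.5f converts to half and back without change, so it is
 * representable as HF; 1.0000001f rounds to 1.0 in half precision, so the
 * round trip fails and the value is rejected.
 */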

static bool
representable_as_w(int d, int16_t *w)
{
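   /* Check that bits 15..31 of d are all equal, i.e., that d sign-extends
    * from 16 bits. If those bits are all 1, adding 0x8000 carries them away
    * to zero; if they are all 0, the add merely sets bit 15, which the final
    * mask discards. Any other pattern leaves res nonzero.
    */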
   int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff;
   if (!res) {
      *w = d;
      return true;
   }

   return false;
}

static bool
representable_as_uw(unsigned ud, uint16_t *uw)
{
   if (!(ud & 0xffff0000)) {
      *uw = ud;
      return true;
   }

   return false;
}

static bool
supports_src_as_imm(const struct intel_device_info *devinfo, const fs_inst *inst)
{
   if (devinfo->ver < 12)
      return false;

   switch (inst->opcode) {
   case BRW_OPCODE_ADD3:
      /* ADD3 only exists on Gfx12.5+. */
      return true;

   case BRW_OPCODE_MAD:
      /* Integer types can always mix sizes. Floating point types can mix
       * sizes on Gfx12. On Gfx12.5, floating point sources must all be HF or
       * all be F.
       */
      return devinfo->verx10 < 125 || inst->src[0].type != BRW_REGISTER_TYPE_F;

   default:
      return false;
   }
}

static bool
can_promote_src_as_imm(const struct intel_device_info *devinfo, fs_inst *inst,
                       unsigned src_idx)
{
   bool can_promote = false;

   /* Experiment shows that we can only support src0 as immediate for MAD on
    * Gfx12. ADD3 can use src0 or src2 in Gfx12.5, but constant propagation
    * only propagates into src0. It's possible that src2 works for W or UW MAD
    * on Gfx12.5.
    */
   if (src_idx != 0)
      return false;

   if (!supports_src_as_imm(devinfo, inst))
      return false;

   /* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP,
    * since HF/F mixed mode has been removed from the hardware.
    */
   switch (inst->src[src_idx].type) {
   case BRW_REGISTER_TYPE_F: {
      uint16_t hf;
      if (representable_as_hf(inst->src[src_idx].f, &hf)) {
         inst->src[src_idx] = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF);
         can_promote = true;
      }
      break;
   }
   case BRW_REGISTER_TYPE_D: {
      int16_t w;
      if (representable_as_w(inst->src[src_idx].d, &w)) {
         inst->src[src_idx] = brw_imm_w(w);
         can_promote = true;
      }
      break;
   }
   case BRW_REGISTER_TYPE_UD: {
      uint16_t uw;
      if (representable_as_uw(inst->src[src_idx].ud, &uw)) {
         inst->src[src_idx] = brw_imm_uw(uw);
         can_promote = true;
      }
      break;
   }
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_HF:
      can_promote = true;
      break;
   default:
      break;
   }

   return can_promote;
}

static void
add_candidate_immediate(struct table *table, fs_inst *inst, unsigned ip,
                        unsigned i,
                        bool must_promote,
                        bool allow_one_constant,
                        bblock_t *block,
                        const struct intel_device_info *devinfo,
                        void *const_ctx)
{
   struct value *v = new_value(table, const_ctx);

   unsigned box_idx = box_instruction(table, const_ctx, inst, ip, block,
                                      must_promote);

   v->value.u64 = inst->src[i].d64;
   v->bit_size = 8 * type_sz(inst->src[i].type);
   v->instr_index = box_idx;
   v->src = i;
   v->allow_one_constant = allow_one_constant;

   /* Right-shift instructions are special. They can have source modifiers,
    * but changing the type can change the semantic of the instruction. Only
    * allow negations on a right shift if the source type is already signed.
    */
   v->no_negations = !inst->can_do_source_mods(devinfo) ||
                     ((inst->opcode == BRW_OPCODE_SHR ||
                       inst->opcode == BRW_OPCODE_ASR) &&
                      brw_reg_type_is_unsigned_integer(inst->src[i].type));

   switch (inst->src[i].type) {
   case BRW_REGISTER_TYPE_DF:
   case BRW_REGISTER_TYPE_NF:
   case BRW_REGISTER_TYPE_F:
   case BRW_REGISTER_TYPE_HF:
      v->type = float_only;
      break;

   case BRW_REGISTER_TYPE_UQ:
   case BRW_REGISTER_TYPE_Q:
   case BRW_REGISTER_TYPE_UD:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_W:
      v->type = integer_only;
      break;

   case BRW_REGISTER_TYPE_VF:
   case BRW_REGISTER_TYPE_UV:
   case BRW_REGISTER_TYPE_V:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_B:
   default:
      unreachable("not reached");
   }

   /* It is safe to change the type of the operands of a select instruction
    * that has no conditional modifier, no source modifiers, and no saturate
    * modifier.
    */
   if (inst->opcode == BRW_OPCODE_SEL &&
       inst->conditional_mod == BRW_CONDITIONAL_NONE &&
       !inst->src[0].negate && !inst->src[0].abs &&
       !inst->src[1].negate && !inst->src[1].abs &&
       !inst->saturate) {
      v->type = either_type;
   }
}

struct register_allocation {
   /** VGRF for storing values. */
   unsigned nr;

   /**
    * Mask of currently available slots in this register.
    *
    * Each register has 16 16-bit slots. Allocations require 1, 2, or 4 slots
    * for word, double-word, or quad-word values, respectively.
    */
   uint16_t avail;
};

static fs_reg
allocate_slots(struct register_allocation *regs, unsigned num_regs,
               unsigned bytes, unsigned align_bytes,
               brw::simple_allocator &alloc)
{
   assert(bytes == 2 || bytes == 4 || bytes == 8);
   assert(align_bytes == 2 || align_bytes == 4 || align_bytes == 8);

   const unsigned words = bytes / 2;
   const unsigned align_words = align_bytes / 2;
   const uint16_t mask = (1U << words) - 1;

   for (unsigned i = 0; i < num_regs; i++) {
      for (unsigned j = 0; j <= (16 - words); j += align_words) {
         const uint16_t x = regs[i].avail >> j;

         if ((x & mask) == mask) {
            if (regs[i].nr == UINT_MAX)
               regs[i].nr = alloc.allocate(1);

            regs[i].avail &= ~(mask << j);

            fs_reg reg(VGRF, regs[i].nr);
            reg.offset = j * 2;

            return reg;
         }
      }
   }

   unreachable("No free slots found.");
}
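
/* For example, a fresh register has avail == 0xffff. Allocating a 4-byte
 * value (two word-sized slots at 4-byte alignment) claims the low two slots,
 * leaving avail == 0xfffc and returning subregister offset 0; the next
 * 4-byte allocation lands at offset 4.
 */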

static void
deallocate_slots(struct register_allocation *regs, unsigned num_regs,
                 unsigned reg_nr, unsigned subreg_offset, unsigned bytes)
{
   assert(bytes == 2 || bytes == 4 || bytes == 8);
   assert(subreg_offset % 2 == 0);
   assert(subreg_offset + bytes <= 32);

   const unsigned words = bytes / 2;
   const unsigned offset = subreg_offset / 2;
   const uint16_t mask = ((1U << words) - 1) << offset;

   for (unsigned i = 0; i < num_regs; i++) {
      if (regs[i].nr == reg_nr) {
         regs[i].avail |= mask;
         return;
      }
   }

   unreachable("No such register found.");
}

static void
parcel_out_registers(struct imm *imm, unsigned len, const bblock_t *cur_block,
                     struct register_allocation *regs, unsigned num_regs,
                     brw::simple_allocator &alloc, unsigned ver)
{
   /* Each basic block has two distinct sets of constants. There is the set
    * of constants that only have uses in that block, and there is the set of
    * constants that have uses after that block.
    *
    * Allocation proceeds in three passes.
    *
    * 1. Allocate space for the values that are used outside this block.
    *
    * 2. Allocate space for the values that are used only in this block.
    *
    * 3. Deallocate the space for the values that are used only in this block.
    */

   for (unsigned pass = 0; pass < 2; pass++) {
      const bool used_in_single_block = pass != 0;

      for (unsigned i = 0; i < len; i++) {
         if (imm[i].block == cur_block &&
             imm[i].used_in_single_block == used_in_single_block) {
            /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
             *
             * "In Align16 mode, the channel selects and channel enables apply
             * to a pair of half-floats, because these parameters are defined
             * for DWord elements ONLY. This is applicable when both source
             * and destination are half-floats."
             *
             * This means that Align16 instructions that use promoted HF
             * immediates and use a <0,1,0>:HF region would read 2 HF slots
             * instead of replicating the single one we want. To avoid this, we
             * always populate both HF slots within a DWord with the constant.
             */
            const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1;

            const fs_reg reg = allocate_slots(regs, num_regs,
                                              imm[i].size * width,
                                              get_alignment_for_imm(&imm[i]),
                                              alloc);

            imm[i].nr = reg.nr;
            imm[i].subreg_offset = reg.offset;
         }
      }
   }

   for (unsigned i = 0; i < len; i++) {
      if (imm[i].block == cur_block && imm[i].used_in_single_block) {
         const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1;

         deallocate_slots(regs, num_regs, imm[i].nr, imm[i].subreg_offset,
                          imm[i].size * width);
      }
   }
}

bool
brw_fs_opt_combine_constants(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   void *const_ctx = ralloc_context(NULL);

   struct table table;

   /* For each of the dynamic arrays in the table, allocate about a page of
    * memory. On LP64 systems, this gives 126 value objects and 169
    * fs_inst_box objects. Even larger shaders that have been observed rarely
    * need more than 20 or 30 values. Most smaller shaders, which is most
    * shaders, need at most a couple dozen fs_inst_box.
    */
   table.size = (4096 - (5 * sizeof(void *))) / sizeof(struct value);
   table.num_values = 0;
   table.values = ralloc_array(const_ctx, struct value, table.size);

   table.size_boxes = (4096 - (5 * sizeof(void *))) / sizeof(struct fs_inst_box);
   table.num_boxes = 0;
   table.boxes = ralloc_array(const_ctx, fs_inst_box, table.size_boxes);

   const brw::idom_tree &idom = s.idom_analysis.require();
   unsigned ip = -1;

   /* Make a pass through all instructions and count the number of times each
    * constant is used by coissueable instructions or instructions that cannot
    * take immediate arguments.
    */
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      ip++;

      switch (inst->opcode) {
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         if (inst->src[0].file == IMM) {
            assert(inst->opcode != SHADER_OPCODE_POW);

            add_candidate_immediate(&table, inst, ip, 0, true, false, block,
                                    devinfo, const_ctx);
         }
         break;

      case BRW_OPCODE_ADD3:
      case BRW_OPCODE_MAD: {
         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != IMM)
               continue;

            if (can_promote_src_as_imm(devinfo, inst, i))
               continue;

            add_candidate_immediate(&table, inst, ip, i, true, false, block,
                                    devinfo, const_ctx);
         }

         break;
      }

      case BRW_OPCODE_BFE:
      case BRW_OPCODE_BFI2:
      case BRW_OPCODE_LRP:
         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != IMM)
               continue;

            add_candidate_immediate(&table, inst, ip, i, true, false, block,
                                    devinfo, const_ctx);
         }

         break;

      case BRW_OPCODE_SEL:
         if (inst->src[0].file == IMM) {
            /* It is possible to have src0 be immediate but src1 not be
             * immediate for the non-commutative conditional modifiers (e.g.,
             * G).
             */
            if (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
                /* Only GE and L are commutative. */
                inst->conditional_mod == BRW_CONDITIONAL_GE ||
                inst->conditional_mod == BRW_CONDITIONAL_L) {
               assert(inst->src[1].file == IMM);

               add_candidate_immediate(&table, inst, ip, 0, true, true, block,
                                       devinfo, const_ctx);
               add_candidate_immediate(&table, inst, ip, 1, true, true, block,
                                       devinfo, const_ctx);
            } else {
               add_candidate_immediate(&table, inst, ip, 0, true, false, block,
                                       devinfo, const_ctx);
            }
         }
         break;

      case BRW_OPCODE_ASR:
      case BRW_OPCODE_BFI1:
      case BRW_OPCODE_ROL:
      case BRW_OPCODE_ROR:
      case BRW_OPCODE_SHL:
      case BRW_OPCODE_SHR:
         if (inst->src[0].file == IMM) {
            add_candidate_immediate(&table, inst, ip, 0, true, false, block,
                                    devinfo, const_ctx);
         }
         break;

      default:
         break;
      }
   }

   if (table.num_values == 0) {
      ralloc_free(const_ctx);
      return false;
   }

   combine_constants_result *result =
      brw_combine_constants(table.values, table.num_values);

   table.imm = ralloc_array(const_ctx, struct imm, result->num_values_to_emit);
   table.len = 0;

   for (unsigned i = 0; i < result->num_values_to_emit; i++) {
      struct imm *imm = &table.imm[table.len];

      imm->block = NULL;
      imm->inst = NULL;
      imm->d64 = result->values_to_emit[i].value.u64;
      imm->size = result->values_to_emit[i].bit_size / 8;

      imm->uses_by_coissue = 0;
      imm->must_promote = false;
      imm->is_half_float = false;

      imm->first_use_ip = UINT16_MAX;
      imm->last_use_ip = 0;

      imm->uses = new(const_ctx) exec_list;

      const unsigned first_user = result->values_to_emit[i].first_user;
      const unsigned last_user = first_user +
         result->values_to_emit[i].num_users;

      for (unsigned j = first_user; j < last_user; j++) {
         const unsigned idx = table.values[result->user_map[j].index].instr_index;
         fs_inst_box *const ib = &table.boxes[idx];

         const unsigned src = table.values[result->user_map[j].index].src;

         imm->uses->push_tail(link(const_ctx, ib->inst, src,
                                   result->user_map[j].negate,
                                   result->user_map[j].type));

         if (ib->must_promote)
            imm->must_promote = true;
         else
            imm->uses_by_coissue++;

         if (imm->block == NULL) {
            /* Block should only be NULL on the first pass. On the first
             * pass, inst should also be NULL.
             */
            assert(imm->inst == NULL);

            imm->inst = ib->inst;
            imm->block = ib->block;
            imm->first_use_ip = ib->ip;
            imm->last_use_ip = ib->ip;
            imm->used_in_single_block = true;
         } else {
            bblock_t *intersection = idom.intersect(ib->block,
                                                    imm->block);

            if (ib->block != imm->block)
               imm->used_in_single_block = false;

            if (imm->first_use_ip > ib->ip) {
               imm->first_use_ip = ib->ip;

               /* If the first-use instruction is to be tracked, block must be
                * the block that contains it. The old block was read in the
                * idom.intersect call above, so it is safe to overwrite it
                * here.
                */
               imm->inst = ib->inst;
               imm->block = ib->block;
            }

            if (imm->last_use_ip < ib->ip)
               imm->last_use_ip = ib->ip;

            /* The common dominator is not the block that contains the
             * first-use instruction, so don't track that instruction. The
             * load instruction will be added in the common dominator block
             * instead of before the first-use instruction.
             */
            if (intersection != imm->block)
               imm->inst = NULL;

            imm->block = intersection;
         }

         if (ib->inst->src[src].type == BRW_REGISTER_TYPE_HF)
            imm->is_half_float = true;
      }

      /* Remove constants from the table that don't have enough uses to make
       * them profitable to store in a register.
       */
      if (imm->must_promote || imm->uses_by_coissue >= 4)
         table.len++;
   }

   delete result;

   if (table.len == 0) {
      ralloc_free(const_ctx);
      return false;
   }
   if (s.cfg->num_blocks != 1)
      qsort(table.imm, table.len, sizeof(struct imm), compare);

   struct register_allocation *regs =
      (struct register_allocation *) calloc(table.len, sizeof(regs[0]));

   for (int i = 0; i < table.len; i++) {
      regs[i].nr = UINT_MAX;
      regs[i].avail = 0xffff;
   }

   foreach_block(block, s.cfg) {
      parcel_out_registers(table.imm, table.len, block, regs, table.len,
                           s.alloc, devinfo->ver);
   }

   free(regs);

   bool rebuild_cfg = false;

   /* Insert MOVs to load the constant values into GRFs. */
   for (int i = 0; i < table.len; i++) {
      struct imm *imm = &table.imm[i];

      /* Insert it either before the instruction that generated the immediate
       * or after the last non-control flow instruction of the common ancestor.
       */
      exec_node *n;
      bblock_t *insert_block;
      if (imm->inst != nullptr) {
         n = imm->inst;
         insert_block = imm->block;
      } else {
         if (imm->block->start()->opcode == BRW_OPCODE_DO) {
            /* DO blocks are weird. They can contain only the single DO
             * instruction. As a result, MOV instructions cannot be added to
             * the DO block.
             */
            bblock_t *next_block = imm->block->next();
            if (next_block->starts_with_control_flow()) {
               /* This is the difficult case. This occurs for code like
                *
                *    do {
                *       do {
                *          ...
                *       } while (...);
                *    } while (...);
                *
                * when the MOV instructions need to be inserted between the
                * two DO instructions.
                *
                * To properly handle this scenario, a new block would need to
                * be inserted. Doing so would require modifying arbitrarily
                * many CONTINUE, BREAK, and WHILE instructions to point to the
                * new block.
                *
                * It is unlikely that this would ever be correct. Instead,
                * insert the MOV instructions in the known wrong place and
                * rebuild the CFG at the end of the pass.
                */
               insert_block = imm->block;
               n = insert_block->last_non_control_flow_inst()->next;

               rebuild_cfg = true;
            } else {
               insert_block = next_block;
               n = insert_block->start();
            }
         } else {
            insert_block = imm->block;
            n = insert_block->last_non_control_flow_inst()->next;
         }
      }

      /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
       *
       * "In Align16 mode, the channel selects and channel enables apply to a
       * pair of half-floats, because these parameters are defined for DWord
       * elements ONLY. This is applicable when both source and destination
       * are half-floats."
       *
       * This means that Align16 instructions that use promoted HF immediates
       * and use a <0,1,0>:HF region would read 2 HF slots instead of
       * replicating the single one we want. To avoid this, we always populate
       * both HF slots within a DWord with the constant.
       */
      const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 2 : 1;
      const fs_builder ibld = fs_builder(&s, width).at(insert_block, n).exec_all();

      fs_reg reg(VGRF, imm->nr);
      reg.offset = imm->subreg_offset;
      reg.stride = 0;

      /* Put the immediate in an offset aligned to its size. Some instructions
       * seem to have additional alignment requirements, so account for that
       * too.
       */
      assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm)));

      struct brw_reg imm_reg = build_imm_reg_for_copy(imm);

      /* Ensure we have enough space in the register to copy the immediate */
      assert(reg.offset + type_sz(imm_reg.type) * width <= REG_SIZE);

      ibld.MOV(retype(reg, imm_reg.type), imm_reg);
   }
   s.shader_stats.promoted_constants = table.len;

   /* Rewrite the immediate sources to refer to the new GRFs. */
   for (int i = 0; i < table.len; i++) {
      foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
         fs_reg *reg = &link->inst->src[link->src];

         if (link->inst->opcode == BRW_OPCODE_SEL) {
            if (link->type == either_type) {
               /* Do not change the register type. */
            } else if (link->type == integer_only) {
               reg->type = brw_int_type(type_sz(reg->type), true);
            } else {
               assert(link->type == float_only);

               switch (type_sz(reg->type)) {
               case 2:
                  reg->type = BRW_REGISTER_TYPE_HF;
                  break;
               case 4:
                  reg->type = BRW_REGISTER_TYPE_F;
                  break;
               case 8:
                  reg->type = BRW_REGISTER_TYPE_DF;
                  break;
               default:
                  unreachable("Bad type size");
               }
            }
         } else if ((link->inst->opcode == BRW_OPCODE_SHL ||
                     link->inst->opcode == BRW_OPCODE_ASR) &&
                    link->negate) {
            reg->type = brw_int_type(type_sz(reg->type), true);
         }

#ifdef DEBUG
         switch (reg->type) {
         case BRW_REGISTER_TYPE_DF:
            assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
                   (fabs(reg->df) == fabs(table.imm[i].df)));
            break;
         case BRW_REGISTER_TYPE_F:
            assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
                   (fabsf(reg->f) == fabsf(table.imm[i].f)));
            break;
         case BRW_REGISTER_TYPE_HF:
            assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
                    isnan(_mesa_half_to_float(table.imm[i].w))) ||
                   (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
                    fabsf(_mesa_half_to_float(table.imm[i].w))));
            break;
         case BRW_REGISTER_TYPE_Q:
            assert(abs(reg->d64) == abs(table.imm[i].d64));
            break;
         case BRW_REGISTER_TYPE_UQ:
            assert(!link->negate);
            assert(reg->d64 == table.imm[i].d64);
            break;
         case BRW_REGISTER_TYPE_D:
            assert(abs(reg->d) == abs(table.imm[i].d));
            break;
         case BRW_REGISTER_TYPE_UD:
            assert(!link->negate);
            assert(reg->d == table.imm[i].d);
            break;
         case BRW_REGISTER_TYPE_W:
            assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
            break;
         case BRW_REGISTER_TYPE_UW:
            assert(!link->negate);
            assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
            break;
         default:
            break;
         }
#endif

         assert(link->inst->can_do_source_mods(devinfo) || !link->negate);

         reg->file = VGRF;
         reg->offset = table.imm[i].subreg_offset;
         reg->stride = 0;
         reg->negate = link->negate;
         reg->nr = table.imm[i].nr;
      }
   }

   /* Fixup any SEL instructions that still have src0 as an immediate. Fixup
    * the types of any SEL instruction that has a negation on one of the
    * sources. Adding the negation may have changed the type of that source,
    * so the other source (and destination) must be changed to match.
    */
   for (unsigned i = 0; i < table.num_boxes; i++) {
      fs_inst *inst = table.boxes[i].inst;

      if (inst->opcode != BRW_OPCODE_SEL)
         continue;

      /* If both sources have negation, the types had better be the same! */
      assert(!inst->src[0].negate || !inst->src[1].negate ||
             inst->src[0].type == inst->src[1].type);

      /* If either source has a negation, force the type of the other source
       * and the type of the result to be the same.
       */
      if (inst->src[0].negate) {
         inst->src[1].type = inst->src[0].type;
         inst->dst.type = inst->src[0].type;
      }

      if (inst->src[1].negate) {
         inst->src[0].type = inst->src[1].type;
         inst->dst.type = inst->src[1].type;
      }

      if (inst->src[0].file != IMM)
         continue;

      assert(inst->src[1].file != IMM);
      assert(inst->conditional_mod == BRW_CONDITIONAL_NONE ||
             inst->conditional_mod == BRW_CONDITIONAL_GE ||
             inst->conditional_mod == BRW_CONDITIONAL_L);

      fs_reg temp = inst->src[0];
      inst->src[0] = inst->src[1];
      inst->src[1] = temp;

      /* If this was predicated, flipping operands means we also need to flip
       * the predicate.
       */
      if (inst->conditional_mod == BRW_CONDITIONAL_NONE)
         inst->predicate_inverse = !inst->predicate_inverse;
   }

   if (debug) {
      for (int i = 0; i < table.len; i++) {
         struct imm *imm = &table.imm[i];

         fprintf(stderr,
                 "0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
                 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
                 (uint64_t)(imm->d64 & BITFIELD64_MASK(imm->size * 8)),
                 imm->block->num,
                 imm->nr,
                 imm->subreg_offset,
                 imm->must_promote,
                 imm->uses_by_coissue,
                 imm->first_use_ip,
                 imm->last_use_ip,
                 imm->last_use_ip - imm->first_use_ip);
      }
   }

   if (rebuild_cfg) {
      /* When the CFG is initially built, the instructions are removed from
       * the list of instructions stored in fs_visitor -- the same exec_node
       * is used for membership in that list and in a block list. So we need
       * to pull them back before rebuilding the CFG.
       */
      assert(exec_list_length(&s.instructions) == 0);
      foreach_block(block, s.cfg) {
         exec_list_append(&s.instructions, &block->instructions);
      }

      delete s.cfg;
      s.cfg = NULL;
      s.calculate_cfg();
   }

   ralloc_free(const_ctx);

   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES |
                         (rebuild_cfg ? DEPENDENCY_BLOCKS : DEPENDENCY_NOTHING));

   return true;
}