1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file
25 *
26 * This file contains the opt_combine_constants() pass that runs after the
27 * regular optimization loop. It passes over the instruction list and promotes
28 * immediate values to registers by emitting a mov(1) instruction.
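 *
 * A rough sketch of the rewrite (register numbers are illustrative only):
 *
 *    mad(8) g10, g2, g3, 0x3f000000F        mov(1) g20, 0x3f000000F
 *                                     ==>   mad(8) g10, g2, g3, g20<0,1,0>F
 *
 * Loading the immediate once lets several instructions share the same GRF
 * slot, and a negated use can be expressed with a source modifier instead of
 * a second load.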
29 */
30
31 #include "brw_fs.h"
32 #include "brw_fs_builder.h"
33 #include "brw_cfg.h"
34 #include "util/half_float.h"
35
36 using namespace brw;
37
38 static const bool debug = false;
39
40 enum PACKED interpreted_type {
41 float_only = 0,
42 integer_only,
43 either_type
44 };
45
46 struct value {
47 /** Raw bit pattern of the value. */
48 nir_const_value value;
49
50 /** Instruction that uses this instance of the value. */
51 unsigned instr_index;
52
53 /** Size, in bits, of the value. */
54 uint8_t bit_size;
55
56 /**
57 * Which source of instr is this value?
58 *
59 * \note This field is not actually used by \c brw_combine_constants, but
60 * it is generally very useful to callers.
61 */
62 uint8_t src;
63
64 /**
65 * In what ways can instr interpret this value?
66 *
67 * Choices are floating-point only, integer only, or either type.
68 */
69 enum interpreted_type type;
70
71 /**
72 * Only try to make a single source non-constant.
73 *
74 * On some architectures, some instructions require that all sources be
75 * non-constant. For example, the multiply-accumulate instruction on Intel
76 * GPUs up to Gen11 requires that all sources be non-constant. Other
77 * instructions, like the selection instruction, allow one constant source.
78 *
79 * If a single constant source is allowed, set this flag to true.
80 *
81 * If an instruction allows a single constant and it has only a single
82 * constant to begin with, it should be included. Various places in
83 * \c combine_constants will assume that there are multiple constants if
84 * \c ::allow_one_constant is set. This may even be enforced by in-code
85 * assertions.
86 */
87 bool allow_one_constant;
88
89 /**
90 * Restrict values that can reach this value to not include negations.
91 *
92 * This is useful for instructions that cannot have source modifiers. For
93 * example, on Intel GPUs the integer source of a shift instruction (e.g.,
94 * SHL) can have a source modifier, but the integer source of the bitfield
95 * insertion instruction (i.e., BFI2) cannot. A pair of these instructions
96 * might have sources that are negations of each other. Using this flag
97 * will ensure that the BFI2 does not have a negated source, but the SHL
98 * might.
99 */
100 bool no_negations;
101
102 /**
103 * \name UtilCombineConstantsPrivate
104 * Private data used only by brw_combine_constants
105 *
106 * Any data stored in these fields will be overwritten by the call to
107 * \c brw_combine_constants. No assumptions should be made about the
108 * state of these fields after that function returns.
109 */
110 /**@{*/
111 /** Mask of negations that can be generated from this value. */
112 uint8_t reachable_mask;
113
114 /** Mask of negations that can generate this value. */
115 uint8_t reaching_mask;
116
117 /**
118 * Value with the next source from the same instruction.
119 *
120 * This pointer may be \c NULL. If it is not \c NULL, it will form a
121 * singly-linked circular list of values. The list is unordered. That is,
122 * as the list is iterated, the \c ::src values will be in arbitrary order.
123 *
124 * \todo Is it even possible for there to be more than two elements in this
125 * list? This pass does not operate on vecN instructions or intrinsics, so
126 * the theoretical limit should be three. However, instructions with all
127 * constant sources should have been folded away.
128 */
129 struct value *next_src;
130 /**@}*/
131 };
132
133 struct combine_constants_value {
134 /** Raw bit pattern of the constant loaded. */
135 nir_const_value value;
136
137 /**
138 * Index of the first user.
139 *
140 * This is the offset into \c combine_constants_result::user_map of the
141 * first user of this value.
142 */
143 unsigned first_user;
144
145 /** Number of users of this value. */
146 unsigned num_users;
147
148 /** Size, in bits, of the value. */
149 uint8_t bit_size;
150 };
151
152 struct combine_constants_user {
153 /** Index into the array of values passed to brw_combine_constants. */
154 unsigned index;
155
156 /**
157 * Manner in which the value should be interpreted in the instruction.
158 *
159 * This is only useful when ::negate is set. Unless the corresponding
160 * value::type is \c either_type, this field must have the same value as
161 * value::type.
162 */
163 enum interpreted_type type;
164
165 /** Should this value be negated to generate the original value? */
166 bool negate;
167 };
168
169 class combine_constants_result {
170 public:
171 combine_constants_result(unsigned num_candidates, bool &success)
172 : num_values_to_emit(0), user_map(NULL)
173 {
174 user_map = (struct combine_constants_user *) calloc(num_candidates,
175 sizeof(user_map[0]));
176
177 /* In the worst case, the number of output values will be equal to the
178 * number of input values. Allocate a buffer that is known to be large
179 * enough now, and it can be reduced later.
180 */
181 values_to_emit =
182 (struct combine_constants_value *) calloc(num_candidates,
183 sizeof(values_to_emit[0]));
184
185 success = (user_map != NULL && values_to_emit != NULL);
186 }
187
188 ~combine_constants_result()
189 {
190 free(values_to_emit);
191 free(user_map);
192 }
193
194 void append_value(const nir_const_value &value, unsigned bit_size)
195 {
196 values_to_emit[num_values_to_emit].value = value;
197 values_to_emit[num_values_to_emit].first_user = 0;
198 values_to_emit[num_values_to_emit].num_users = 0;
199 values_to_emit[num_values_to_emit].bit_size = bit_size;
200 num_values_to_emit++;
201 }
202
203 unsigned num_values_to_emit;
204 struct combine_constants_value *values_to_emit;
205
206 struct combine_constants_user *user_map;
207 };
208
209 #define VALUE_INDEX 0
210 #define FLOAT_NEG_INDEX 1
211 #define INT_NEG_INDEX 2
212 #define MAX_NUM_REACHABLE 3
213
214 #define VALUE_EXISTS (1 << VALUE_INDEX)
215 #define FLOAT_NEG_EXISTS (1 << FLOAT_NEG_INDEX)
216 #define INT_NEG_EXISTS (1 << INT_NEG_INDEX)
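
/* Worked example (illustrative): for the 32-bit value 1.0f (0x3f800000)
 * interpreted as either_type, both negations exist. calculate_masks() sets
 * FLOAT_NEG_EXISTS and INT_NEG_EXISTS, and calculate_reachable_values()
 * fills the slots with { 1.0f, -1.0f (float negation), 0xc0800000 (integer
 * negation) }.
 */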
217
218 static bool
219 negation_exists(nir_const_value v, unsigned bit_size,
220 enum interpreted_type base_type)
221 {
222 /* either_type does not make sense in this context. */
223 assert(base_type == float_only || base_type == integer_only);
224
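/* For integers, 0 is rejected because -0 is the same value, and INTn_MIN is
 * rejected because its negation overflows two's complement. For floats, only
 * NaN is rejected; every other value has a usable negation.
 */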
225 switch (bit_size) {
226 case 8:
227 if (base_type == float_only)
228 return false;
229 else
230 return v.i8 != 0 && v.i8 != INT8_MIN;
231
232 case 16:
233 if (base_type == float_only)
234 return !util_is_half_nan(v.i16);
235 else
236 return v.i16 != 0 && v.i16 != INT16_MIN;
237
238 case 32:
239 if (base_type == float_only)
240 return !isnan(v.f32);
241 else
242 return v.i32 != 0 && v.i32 != INT32_MIN;
243
244 case 64:
245 if (base_type == float_only)
246 return !isnan(v.f64);
247 else
248 return v.i64 != 0 && v.i64 != INT64_MIN;
249
250 default:
251 unreachable("unsupported bit-size should have already been filtered.");
252 }
253 }
254
255 static nir_const_value
256 negate(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
257 {
258 /* either_type does not make sense in this context. */
259 assert(base_type == float_only || base_type == integer_only);
260
261 nir_const_value ret = { 0, };
262
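/* Floating-point negation just flips the sign bit. INTn_MIN is the bit
 * pattern with only the top bit set, so XOR-ing with it toggles the sign of
 * 16-, 32-, and 64-bit floats alike.
 */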
263 switch (bit_size) {
264 case 8:
265 assert(base_type == integer_only);
266 ret.i8 = -v.i8;
267 break;
268
269 case 16:
270 if (base_type == float_only)
271 ret.u16 = v.u16 ^ INT16_MIN;
272 else
273 ret.i16 = -v.i16;
274 break;
275
276 case 32:
277 if (base_type == float_only)
278 ret.u32 = v.u32 ^ INT32_MIN;
279 else
280 ret.i32 = -v.i32;
281 break;
282
283 case 64:
284 if (base_type == float_only)
285 ret.u64 = v.u64 ^ INT64_MIN;
286 else
287 ret.i64 = -v.i64;
288 break;
289
290 default:
291 unreachable("unsupported bit-size should have already been filtered.");
292 }
293
294 return ret;
295 }
296
297 static nir_const_value
298 absolute(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
299 {
300 /* either_type does not make sense in this context. */
301 assert(base_type == float_only || base_type == integer_only);
302
303 nir_const_value ret = { 0, };
304
305 switch (bit_size) {
306 case 8:
307 assert(base_type == integer_only);
308 ret.i8 = abs(v.i8);
309 break;
310
311 case 16:
312 if (base_type == float_only)
313 ret.u16 = v.u16 & 0x7fff;
314 else
315 ret.i16 = abs(v.i16);
316 break;
317
318 case 32:
319 if (base_type == float_only)
320 ret.f32 = fabs(v.f32);
321 else
322 ret.i32 = abs(v.i32);
323 break;
324
325 case 64:
326 if (base_type == float_only)
327 ret.f64 = fabs(v.f64);
328 else {
329 if (sizeof(v.i64) == sizeof(long int)) {
330 ret.i64 = labs((long int) v.i64);
331 } else {
332 assert(sizeof(v.i64) == sizeof(long long int));
333 ret.i64 = llabs((long long int) v.i64);
334 }
335 }
336 break;
337
338 default:
339 unreachable("unsupported bit-size should have already been filtered.");
340 }
341
342 return ret;
343 }
344
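/**
 * Determine which negated forms of \p v could be generated from it
 * (\p reachable_mask) and which negated forms could be used to regenerate it
 * (\p reaching_mask), given how the instruction may interpret the value.
 */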
345 static void
346 calculate_masks(nir_const_value v, enum interpreted_type type,
347 unsigned bit_size, uint8_t *reachable_mask,
348 uint8_t *reaching_mask)
349 {
350 *reachable_mask = 0;
351 *reaching_mask = 0;
352
353 /* Calculate the extended reachable mask. */
354 if (type == float_only || type == either_type) {
355 if (negation_exists(v, bit_size, float_only))
356 *reachable_mask |= FLOAT_NEG_EXISTS;
357 }
358
359 if (type == integer_only || type == either_type) {
360 if (negation_exists(v, bit_size, integer_only))
361 *reachable_mask |= INT_NEG_EXISTS;
362 }
363
364 /* Calculate the extended reaching mask. All of the "is this negation
365 * possible" was already determined for the reachable_mask, so reuse that
366 * data.
367 */
368 if (type == float_only || type == either_type) {
369 if (*reachable_mask & FLOAT_NEG_EXISTS)
370 *reaching_mask |= FLOAT_NEG_EXISTS;
371 }
372
373 if (type == integer_only || type == either_type) {
374 if (*reachable_mask & INT_NEG_EXISTS)
375 *reaching_mask |= INT_NEG_EXISTS;
376 }
377 }
378
379 static void
380 calculate_reachable_values(nir_const_value v,
381 unsigned bit_size,
382 unsigned reachable_mask,
383 nir_const_value *reachable_values)
384 {
385 memset(reachable_values, 0, MAX_NUM_REACHABLE * sizeof(reachable_values[0]));
386
387 reachable_values[VALUE_INDEX] = v;
388
389 if (reachable_mask & INT_NEG_EXISTS) {
390 const nir_const_value neg = negate(v, bit_size, integer_only);
391
392 reachable_values[INT_NEG_INDEX] = neg;
393 }
394
395 if (reachable_mask & FLOAT_NEG_EXISTS) {
396 const nir_const_value neg = negate(v, bit_size, float_only);
397
398 reachable_values[FLOAT_NEG_INDEX] = neg;
399 }
400 }
401
402 static bool
403 value_equal(nir_const_value a, nir_const_value b, unsigned bit_size)
404 {
405 switch (bit_size) {
406 case 8:
407 return a.u8 == b.u8;
408 case 16:
409 return a.u16 == b.u16;
410 case 32:
411 return a.u32 == b.u32;
412 case 64:
413 return a.u64 == b.u64;
414 default:
415 unreachable("unsupported bit-size should have already been filtered.");
416 }
417 }
418
419 /** Can these values be the same with one level of negation? */
420 static bool
421 value_can_equal(const nir_const_value *from, uint8_t reachable_mask,
422 nir_const_value to, uint8_t reaching_mask,
423 unsigned bit_size)
424 {
425 const uint8_t combined_mask = reachable_mask & reaching_mask;
426
427 return value_equal(from[VALUE_INDEX], to, bit_size) ||
428 ((combined_mask & INT_NEG_EXISTS) &&
429 value_equal(from[INT_NEG_INDEX], to, bit_size)) ||
430 ((combined_mask & FLOAT_NEG_EXISTS) &&
431 value_equal(from[FLOAT_NEG_INDEX], to, bit_size));
432 }
433
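/**
 * Compute the negation masks for every candidate and link candidates that
 * are sources of the same instruction into the circular \c next_src list.
 */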
434 static void
435 preprocess_candidates(struct value *candidates, unsigned num_candidates)
436 {
437 /* Calculate the reaching_mask and reachable_mask for each candidate. */
438 for (unsigned i = 0; i < num_candidates; i++) {
439 calculate_masks(candidates[i].value,
440 candidates[i].type,
441 candidates[i].bit_size,
442 &candidates[i].reachable_mask,
443 &candidates[i].reaching_mask);
444
445 /* If negations are not allowed, then only the original value is
446 * reaching.
447 */
448 if (candidates[i].no_negations)
449 candidates[i].reaching_mask = 0;
450 }
451
452 for (unsigned i = 0; i < num_candidates; i++)
453 candidates[i].next_src = NULL;
454
455 for (unsigned i = 0; i < num_candidates - 1; i++) {
456 if (candidates[i].next_src != NULL)
457 continue;
458
459 struct value *prev = &candidates[i];
460
461 for (unsigned j = i + 1; j < num_candidates; j++) {
462 if (candidates[i].instr_index == candidates[j].instr_index) {
463 prev->next_src = &candidates[j];
464 prev = prev->next_src;
465 }
466 }
467
468 /* Close the cycle. */
469 if (prev != &candidates[i])
470 prev->next_src = &candidates[i];
471 }
472 }
473
474 static bool
475 reaching_value_exists(const struct value *c,
476 const struct combine_constants_value *values,
477 unsigned num_values)
478 {
479 nir_const_value reachable_values[MAX_NUM_REACHABLE];
480
481 calculate_reachable_values(c->value, c->bit_size, c->reaching_mask,
482 reachable_values);
483
484 /* Check to see if the value is already in the result set. */
485 for (unsigned j = 0; j < num_values; j++) {
486 if (c->bit_size == values[j].bit_size &&
487 value_can_equal(reachable_values, c->reaching_mask,
488 values[j].value, c->reaching_mask,
489 c->bit_size)) {
490 return true;
491 }
492 }
493
494 return false;
495 }
496
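/**
 * Greedily choose the set of constants that must be loaded into registers.
 *
 * Illustrative example: with candidates { 2.0f, -2.0f, and a SEL whose two
 * sources are 1.0f and 3.0f with allow_one_constant set }, pass 2 emits a
 * single |2.0f| that serves both signed uses, and pass 3 emits 1.0f while
 * leaving 3.0f as the SEL's one remaining immediate source.
 */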
497 static combine_constants_result *
498 combine_constants_greedy(struct value *candidates, unsigned num_candidates)
499 {
500 bool success;
501 combine_constants_result *result =
502 new combine_constants_result(num_candidates, success);
503 if (result == NULL || !success) {
504 delete result;
505 return NULL;
506 }
507
508 BITSET_WORD *remain =
509 (BITSET_WORD *) calloc(BITSET_WORDS(num_candidates), sizeof(remain[0]));
510
511 if (remain == NULL) {
512 delete result;
513 return NULL;
514 }
515
516 memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));
517
518 /* Operate in three passes. The first pass handles all values that must be
519 * emitted and for which a negation cannot exist.
520 */
521 unsigned i;
522 for (i = 0; i < num_candidates; i++) {
523 if (candidates[i].allow_one_constant ||
524 (candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS))) {
525 continue;
526 }
527
528 /* Check to see if the value is already in the result set. */
529 bool found = false;
530 const unsigned num_values = result->num_values_to_emit;
531 for (unsigned j = 0; j < num_values; j++) {
532 if (candidates[i].bit_size == result->values_to_emit[j].bit_size &&
533 value_equal(candidates[i].value,
534 result->values_to_emit[j].value,
535 candidates[i].bit_size)) {
536 found = true;
537 break;
538 }
539 }
540
541 if (!found)
542 result->append_value(candidates[i].value, candidates[i].bit_size);
543
544 BITSET_CLEAR(remain, i);
545 }
546
547 /* The second pass handles all values that must be emitted and for which a
548 * negation can exist.
549 */
550 BITSET_FOREACH_SET(i, remain, num_candidates) {
551 if (candidates[i].allow_one_constant)
552 continue;
553
554 assert(candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS));
555
556 if (!reaching_value_exists(&candidates[i], result->values_to_emit,
557 result->num_values_to_emit)) {
558 result->append_value(absolute(candidates[i].value,
559 candidates[i].bit_size,
560 candidates[i].type),
561 candidates[i].bit_size);
562 }
563
564 BITSET_CLEAR(remain, i);
565 }
566
567 /* The third pass handles all of the values that may not have to be
568 * emitted. These are the values where allow_one_constant is set.
569 */
570 BITSET_FOREACH_SET(i, remain, num_candidates) {
571 assert(candidates[i].allow_one_constant);
572
573 /* The BITSET_FOREACH_SET macro does not detect changes to the bitset
574 * that occur within the current word. Since code in this loop may
575 * clear bits from the set, re-test here.
576 */
577 if (!BITSET_TEST(remain, i))
578 continue;
579
580 assert(candidates[i].next_src != NULL);
581
582 const struct value *const other_candidate = candidates[i].next_src;
583 const unsigned j = other_candidate - candidates;
584
585 if (!reaching_value_exists(&candidates[i], result->values_to_emit,
586 result->num_values_to_emit)) {
587 /* Before emitting a value, see if a match for the other source of
588 * the instruction exists.
589 */
590 if (!reaching_value_exists(&candidates[j], result->values_to_emit,
591 result->num_values_to_emit)) {
592 result->append_value(candidates[i].value, candidates[i].bit_size);
593 }
594 }
595
596 /* Mark both sources as handled. */
597 BITSET_CLEAR(remain, i);
598 BITSET_CLEAR(remain, j);
599 }
600
601 /* As noted above, there will never be more values in the output than in
602 * the input. If there are fewer values, reduce the size of the
603 * allocation.
604 */
605 if (result->num_values_to_emit < num_candidates) {
606 result->values_to_emit = (struct combine_constants_value *)
607 realloc(result->values_to_emit, sizeof(result->values_to_emit[0]) *
608 result->num_values_to_emit);
609
610 /* Is it even possible for a reducing realloc to fail? */
611 assert(result->values_to_emit != NULL);
612 }
613
614 /* Create the mapping from "combined" constants to list of candidates
615 * passed in by the caller.
616 */
617 memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));
618
619 unsigned total_users = 0;
620
621 const unsigned num_values = result->num_values_to_emit;
622 for (unsigned value_idx = 0; value_idx < num_values; value_idx++) {
623 result->values_to_emit[value_idx].first_user = total_users;
624
625 uint8_t reachable_mask;
626 uint8_t unused_mask;
627
628 calculate_masks(result->values_to_emit[value_idx].value, either_type,
629 result->values_to_emit[value_idx].bit_size,
630 &reachable_mask, &unused_mask);
631
632 nir_const_value reachable_values[MAX_NUM_REACHABLE];
633
634 calculate_reachable_values(result->values_to_emit[value_idx].value,
635 result->values_to_emit[value_idx].bit_size,
636 reachable_mask, reachable_values);
637
638 for (unsigned i = 0; i < num_candidates; i++) {
639 bool matched = false;
640
641 if (!BITSET_TEST(remain, i))
642 continue;
643
644 if (candidates[i].bit_size != result->values_to_emit[value_idx].bit_size)
645 continue;
646
647 if (value_equal(candidates[i].value, result->values_to_emit[value_idx].value,
648 result->values_to_emit[value_idx].bit_size)) {
649 result->user_map[total_users].index = i;
650 result->user_map[total_users].type = candidates[i].type;
651 result->user_map[total_users].negate = false;
652 total_users++;
653
654 matched = true;
655 BITSET_CLEAR(remain, i);
656 } else {
657 const uint8_t combined_mask = reachable_mask &
658 candidates[i].reaching_mask;
659
660 enum interpreted_type type = either_type;
661
662 if ((combined_mask & INT_NEG_EXISTS) &&
663 value_equal(candidates[i].value,
664 reachable_values[INT_NEG_INDEX],
665 candidates[i].bit_size)) {
666 type = integer_only;
667 }
668
669 if (type == either_type &&
670 (combined_mask & FLOAT_NEG_EXISTS) &&
671 value_equal(candidates[i].value,
672 reachable_values[FLOAT_NEG_INDEX],
673 candidates[i].bit_size)) {
674 type = float_only;
675 }
676
677 if (type != either_type) {
678 /* Finding a match on this path implies that the user must
679 * allow source negations.
680 */
681 assert(!candidates[i].no_negations);
682
683 result->user_map[total_users].index = i;
684 result->user_map[total_users].type = type;
685 result->user_map[total_users].negate = true;
686 total_users++;
687
688 matched = true;
689 BITSET_CLEAR(remain, i);
690 }
691 }
692
693 /* Mark the other source of instructions that can have a constant
694 * source. Selection is the prime example of this, and we want to
695 * avoid generating sequences like bcsel(a, fneg(b), ineg(c)).
696 *
697 * This also makes sure that the assertion (below) that *all* values
698 * were processed holds even when some values may be allowed to
699 * remain as constants.
700 *
701 * FINISHME: There may be value in only doing this when type ==
702 * either_type. If both sources are loaded, a register allocator may
703 * be able to make a better choice about which value to "spill"
704 * (i.e., replace with an immediate) under heavy register pressure.
705 */
706 if (matched && candidates[i].allow_one_constant) {
707 const struct value *const other_src = candidates[i].next_src;
708 const unsigned idx = other_src - candidates;
709
710 assert(idx < num_candidates);
711 BITSET_CLEAR(remain, idx);
712 }
713 }
714
715 assert(total_users > result->values_to_emit[value_idx].first_user);
716 result->values_to_emit[value_idx].num_users =
717 total_users - result->values_to_emit[value_idx].first_user;
718 }
719
720 /* Verify that all of the values were emitted by the loop above. If any
721 * bits are still set in remain, then some value was not emitted. The use
722 * of memset to populate remain prevents the use of a more performant loop.
723 */
724 #ifndef NDEBUG
725 bool pass = true;
726
727 BITSET_FOREACH_SET(i, remain, num_candidates) {
728 fprintf(stderr, "candidate %d was not processed: { "
729 ".b = %s, "
730 ".f32 = %f, .f64 = %g, "
731 ".i8 = %d, .u8 = 0x%02x, "
732 ".i16 = %d, .u16 = 0x%04x, "
733 ".i32 = %d, .u32 = 0x%08x, "
734 ".i64 = %" PRId64 ", .u64 = 0x%016" PRIx64 " }\n",
735 i,
736 candidates[i].value.b ? "true" : "false",
737 candidates[i].value.f32, candidates[i].value.f64,
738 candidates[i].value.i8, candidates[i].value.u8,
739 candidates[i].value.i16, candidates[i].value.u16,
740 candidates[i].value.i32, candidates[i].value.u32,
741 candidates[i].value.i64, candidates[i].value.u64);
742 pass = false;
743 }
744
745 assert(pass && "All values should have been processed.");
746 #endif
747
748 free(remain);
749
750 return result;
751 }
752
753 static combine_constants_result *
754 brw_combine_constants(struct value *candidates, unsigned num_candidates)
755 {
756 preprocess_candidates(candidates, num_candidates);
757
758 return combine_constants_greedy(candidates, num_candidates);
759 }
760
761 /**
762 * Box for storing fs_inst and some other necessary data
763 *
764 * \sa box_instruction
765 */
766 struct fs_inst_box {
767 fs_inst *inst;
768 unsigned ip;
769 bblock_t *block;
770 };
771
772 /** A box for putting fs_regs in a linked list. */
773 struct reg_link {
774 DECLARE_RALLOC_CXX_OPERATORS(reg_link)
775
776 reg_link(fs_inst *inst, unsigned src, bool negate, enum interpreted_type type)
777 : inst(inst), src(src), negate(negate), type(type) {}
778
779 struct exec_node link;
780 fs_inst *inst;
781 uint8_t src;
782 bool negate;
783 enum interpreted_type type;
784 };
785
786 static struct exec_node *
787 link(void *mem_ctx, fs_inst *inst, unsigned src, bool negate,
788 enum interpreted_type type)
789 {
790 reg_link *l = new(mem_ctx) reg_link(inst, src, negate, type);
791 return &l->link;
792 }
793
794 /**
795 * Information about an immediate value.
796 */
797 struct imm {
798 /** The common ancestor of all blocks using this immediate value. */
799 bblock_t *block;
800
801 /**
802 * The instruction generating the immediate value, if all uses are contained
803 * within a single basic block. Otherwise, NULL.
804 */
805 fs_inst *inst;
806
807 /**
808 * A list of fs_regs that refer to this immediate. If we promote it, we'll
809 * have to patch these up to refer to the new GRF.
810 */
811 exec_list *uses;
812
813 /** The immediate value */
814 union {
815 char bytes[8];
816 double df;
817 int64_t d64;
818 float f;
819 int32_t d;
820 int16_t w;
821 };
822 uint8_t size;
823
824 /** When promoting half-float we need to account for certain restrictions */
825 bool is_half_float;
826
827 /**
828 * The GRF register and subregister number where we've decided to store the
829 * constant value.
830 */
831 uint8_t subreg_offset;
832 uint16_t nr;
833
834 /** Is the value used only in a single basic block? */
835 bool used_in_single_block;
836
837 uint16_t first_use_ip;
838 uint16_t last_use_ip;
839 };
840
841 /** The working set of information about immediates. */
842 struct table {
843 struct value *values;
844 int size;
845 int num_values;
846
847 struct imm *imm;
848 int len;
849
850 struct fs_inst_box *boxes;
851 unsigned num_boxes;
852 unsigned size_boxes;
853 };
854
855 static struct value *
856 new_value(struct table *table, void *mem_ctx)
857 {
858 if (table->num_values == table->size) {
859 table->size *= 2;
860 table->values = reralloc(mem_ctx, table->values, struct value, table->size);
861 }
862 return &table->values[table->num_values++];
863 }
864
865 /**
866 * Store an instruction with some other data in a table.
867 *
868 * \returns the index into the dynamic array of boxes for the instruction.
869 */
870 static unsigned
871 box_instruction(struct table *table, void *mem_ctx, fs_inst *inst,
872 unsigned ip, bblock_t *block)
873 {
874 /* It is common for box_instruction to be called consecutively for each
875 * source of an instruction. As a result, the most common case for finding
876 * an instruction in the table is when that instruction was the last one
877 * added. Search the list back to front.
878 */
879 for (unsigned i = table->num_boxes; i > 0; /* empty */) {
880 i--;
881
882 if (table->boxes[i].inst == inst)
883 return i;
884 }
885
886 if (table->num_boxes == table->size_boxes) {
887 table->size_boxes *= 2;
888 table->boxes = reralloc(mem_ctx, table->boxes, fs_inst_box,
889 table->size_boxes);
890 }
891
892 assert(table->num_boxes < table->size_boxes);
893
894 const unsigned idx = table->num_boxes++;
895 fs_inst_box *ib = &table->boxes[idx];
896
897 ib->inst = inst;
898 ib->block = block;
899 ib->ip = ip;
900
901 return idx;
902 }
903
904 /**
905 * Comparator used for sorting an array of imm structures.
906 *
907 * We sort by basic block number, then last use IP, then first use IP (least
908 * to greatest). This sorting causes immediates live in the same area to be
909 * allocated to the same register in the hopes that all values will be dead
910 * about the same time and the register can be reused.
911 */
912 static int
913 compare(const void *_a, const void *_b)
914 {
915 const struct imm *a = (const struct imm *)_a,
916 *b = (const struct imm *)_b;
917
918 int block_diff = a->block->num - b->block->num;
919 if (block_diff)
920 return block_diff;
921
922 int end_diff = a->last_use_ip - b->last_use_ip;
923 if (end_diff)
924 return end_diff;
925
926 return a->first_use_ip - b->first_use_ip;
927 }
928
929 static struct brw_reg
930 build_imm_reg_for_copy(struct imm *imm)
931 {
932 switch (imm->size) {
933 case 8:
934 return brw_imm_q(imm->d64);
935 case 4:
936 return brw_imm_d(imm->d);
937 case 2:
938 return brw_imm_w(imm->w);
939 default:
940 unreachable("not implemented");
941 }
942 }
943
944 static inline uint32_t
945 get_alignment_for_imm(const struct imm *imm)
946 {
947 if (imm->is_half_float)
948 return 4; /* At least MAD seems to require this */
949 else
950 return imm->size;
951 }
952
953 static bool
954 representable_as_hf(float f, uint16_t *hf)
955 {
956 union fi u;
957 uint16_t h = _mesa_float_to_half(f);
958 u.f = _mesa_half_to_float(h);
959
960 if (u.f == f) {
961 *hf = h;
962 return true;
963 }
964
965 return false;
966 }
967
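/* A 32-bit signed value fits in a signed word when bits 31:15 are all equal
 * (all zero or all one); the expression below evaluates to zero exactly in
 * that case.
 */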
968 static bool
969 representable_as_w(int d, int16_t *w)
970 {
971 int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff;
972 if (!res) {
973 *w = d;
974 return true;
975 }
976
977 return false;
978 }
979
980 static bool
981 representable_as_uw(unsigned ud, uint16_t *uw)
982 {
983 if (!(ud & 0xffff0000)) {
984 *uw = ud;
985 return true;
986 }
987
988 return false;
989 }
990
991 static bool
992 supports_src_as_imm(const struct intel_device_info *devinfo, const fs_inst *inst,
993 unsigned src_idx)
994 {
995 switch (inst->opcode) {
996 case BRW_OPCODE_ADD3:
997 /* ADD3 can use src0 or src2 in Gfx12.5. */
998 return src_idx != 1;
999
1000 case BRW_OPCODE_BFE:
1001 /* BFE can use src0 or src2 in Gfx12+. */
1002 return devinfo->ver >= 12 && src_idx != 1;
1003
1004 case BRW_OPCODE_CSEL:
1005 /* While MAD can mix F and HF sources on some platforms, CSEL cannot. */
1006 return devinfo->ver >= 12 && inst->src[0].type != BRW_TYPE_F;
1007
1008 case BRW_OPCODE_MAD:
1009 switch (devinfo->verx10) {
1010 case 90:
1011 return false;
1012
1013 case 110:
1014 /* For Gfx11, experiments seem to show that HF mixed with F is not
1015 * allowed in src0 or src2. It is possible that src1 is allowed, but
1016 * that cannot have an immediate value. Experiments seem to show that
1017 * HF immediate can only ever be src0.
1018 *
1019 * W (or UW) immediate mixed with other integer sizes can occur in
1020 * either src0 or src2.
1021 */
1022 return (src_idx == 0 && inst->src[src_idx].type != BRW_TYPE_F) ||
1023 (src_idx == 2 && brw_type_is_int(inst->src[src_idx].type));
1024
1025 case 120:
1026 /* For Gfx12, experiments seem to show that HF immediate mixed with F
1027 * can only occur in src0. W (or UW) immediate mixed with other
1028 * integer sizes can occur in either src0 or src2.
1029 */
1030 return src_idx == 0 ||
1031 (src_idx == 2 && brw_type_is_int(inst->src[src_idx].type));
1032
1033 default:
1034 /* For Gfx12.5, HF mixed with F is not allowed at all (per the
1035 * Bspec). W (or UW) immediate mixed with other integer sizes can
1036 * occur in either src0 or src2.
1037 *
1038 * FINISHME: It's possible (probable?) that src2 can also be HF when
1039 * the other sources are HF. This has not been tested, so it is
1040 * currently not allowed.
1041 */
1042 assert(devinfo->verx10 >= 125);
1043 return (src_idx == 0 && inst->src[src_idx].type != BRW_TYPE_F) ||
1044 (src_idx == 2 && brw_type_is_int(inst->src[src_idx].type));
1045 }
1046 break;
1047
1048 default:
1049 return false;
1050 }
1051 }
1052
1053 static bool
1054 can_promote_src_as_imm(const struct intel_device_info *devinfo, fs_inst *inst,
1055 unsigned src_idx)
1056 {
1057 bool can_promote = false;
1058
1059 if (!supports_src_as_imm(devinfo, inst, src_idx))
1060 return false;
1061
1062 switch (inst->src[src_idx].type) {
1063 case BRW_TYPE_F: {
1064 uint16_t hf;
1065 if (representable_as_hf(inst->src[src_idx].f, &hf)) {
1066 inst->src[src_idx] = retype(brw_imm_uw(hf), BRW_TYPE_HF);
1067 can_promote = true;
1068 }
1069 break;
1070 }
1071 case BRW_TYPE_D:
1072 case BRW_TYPE_UD: {
1073 /* ADD3, CSEL, and MAD can mix signed and unsigned types. Only BFE
1074 * cannot.
1075 */
1076 if (inst->src[src_idx].type == BRW_TYPE_D ||
1077 inst->opcode != BRW_OPCODE_BFE) {
1078 int16_t w;
1079 if (representable_as_w(inst->src[src_idx].d, &w)) {
1080 inst->src[src_idx] = brw_imm_w(w);
1081 can_promote = true;
1082 break;
1083 }
1084 }
1085
1086 if (inst->src[src_idx].type == BRW_TYPE_UD ||
1087 inst->opcode != BRW_OPCODE_BFE) {
1088 uint16_t uw;
1089 if (representable_as_uw(inst->src[src_idx].ud, &uw)) {
1090 inst->src[src_idx] = brw_imm_uw(uw);
1091 can_promote = true;
1092 break;
1093 }
1094 }
1095 break;
1096 }
1097 case BRW_TYPE_W:
1098 case BRW_TYPE_UW:
1099 case BRW_TYPE_HF:
1100 can_promote = true;
1101 break;
1102 default:
1103 break;
1104 }
1105
1106 return can_promote;
1107 }
1108
1109 static void
1110 add_candidate_immediate(struct table *table, fs_inst *inst, unsigned ip,
1111 unsigned i,
1112 bool allow_one_constant,
1113 bblock_t *block,
1114 const struct intel_device_info *devinfo,
1115 void *const_ctx)
1116 {
1117 struct value *v = new_value(table, const_ctx);
1118
1119 unsigned box_idx = box_instruction(table, const_ctx, inst, ip, block);
1120
1121 v->value.u64 = inst->src[i].d64;
1122 v->bit_size = brw_type_size_bits(inst->src[i].type);
1123 v->instr_index = box_idx;
1124 v->src = i;
1125 v->allow_one_constant = allow_one_constant;
1126
1127 /* Right-shift instructions are special. They can have source modifiers,
1128 * but changing the type can change the semantic of the instruction. Only
1129 * allow negations on a right shift if the source type is already signed.
1130 */
1131 v->no_negations = !inst->can_do_source_mods(devinfo) ||
1132 ((inst->opcode == BRW_OPCODE_SHR ||
1133 inst->opcode == BRW_OPCODE_ASR) &&
1134 brw_type_is_uint(inst->src[i].type));
1135
1136 switch (inst->src[i].type) {
1137 case BRW_TYPE_DF:
1138 case BRW_TYPE_F:
1139 case BRW_TYPE_HF:
1140 v->type = float_only;
1141 break;
1142
1143 case BRW_TYPE_UQ:
1144 case BRW_TYPE_Q:
1145 case BRW_TYPE_UD:
1146 case BRW_TYPE_D:
1147 case BRW_TYPE_UW:
1148 case BRW_TYPE_W:
1149 v->type = integer_only;
1150 break;
1151
1152 case BRW_TYPE_VF:
1153 case BRW_TYPE_UV:
1154 case BRW_TYPE_V:
1155 case BRW_TYPE_UB:
1156 case BRW_TYPE_B:
1157 default:
1158 unreachable("not reached");
1159 }
1160
1161 /* It is safe to change the type of the operands of a select instruction
1162 * that has no conditional modifier, no source modifiers, and no saturate
1163 * modifier.
1164 */
1165 if (inst->opcode == BRW_OPCODE_SEL &&
1166 inst->conditional_mod == BRW_CONDITIONAL_NONE &&
1167 !inst->src[0].negate && !inst->src[0].abs &&
1168 !inst->src[1].negate && !inst->src[1].abs &&
1169 !inst->saturate) {
1170 v->type = either_type;
1171 }
1172 }
1173
1174 struct register_allocation {
1175 /** VGRF for storing values. */
1176 unsigned nr;
1177
1178 /**
1179 * Mask of currently available slots in this register.
1180 *
1181 * Each register has 16 (32 on Xe2) 16-bit slots. Allocations require 1,
1182 * 2, or 4 slots for word, double-word, or quad-word values, respectively.
1183 */
1184 uint32_t avail;
1185 };
1186
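/**
 * Find (or allocate) a VGRF with an aligned run of free 16-bit slots, mark
 * those slots used, and return a register pointing at them.
 *
 * For example, a 4-byte value with 4-byte alignment scans \c avail two slots
 * at a time and claims the first position where two consecutive bits are
 * still set.
 */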
1187 static brw_reg
1188 allocate_slots(const intel_device_info *devinfo,
1189 struct register_allocation *regs, unsigned num_regs,
1190 unsigned bytes, unsigned align_bytes,
1191 brw::simple_allocator &alloc)
1192 {
1193 assert(bytes == 2 || bytes == 4 || bytes == 8);
1194 assert(align_bytes == 2 || align_bytes == 4 || align_bytes == 8);
1195
1196 const unsigned slots_per_reg =
1197 REG_SIZE * reg_unit(devinfo) / sizeof(uint16_t);
1198
1199 const unsigned words = bytes / 2;
1200 const unsigned align_words = align_bytes / 2;
1201 const uint32_t mask = (1U << words) - 1;
1202
1203 for (unsigned i = 0; i < num_regs; i++) {
1204 for (unsigned j = 0; j <= (slots_per_reg - words); j += align_words) {
1205 const uint32_t x = regs[i].avail >> j;
1206
1207 if ((x & mask) == mask) {
1208 if (regs[i].nr == UINT_MAX)
1209 regs[i].nr = alloc.allocate(reg_unit(devinfo));
1210
1211 regs[i].avail &= ~(mask << j);
1212
1213 brw_reg reg = brw_vgrf(regs[i].nr, BRW_TYPE_F);
1214 reg.offset = j * 2;
1215
1216 return reg;
1217 }
1218 }
1219 }
1220
1221 unreachable("No free slots found.");
1222 }
1223
1224 static void
1225 deallocate_slots(const struct intel_device_info *devinfo,
1226 struct register_allocation *regs, unsigned num_regs,
1227 unsigned reg_nr, unsigned subreg_offset, unsigned bytes)
1228 {
1229 assert(bytes == 2 || bytes == 4 || bytes == 8);
1230 assert(subreg_offset % 2 == 0);
1231 assert(subreg_offset + bytes <= REG_SIZE * reg_unit(devinfo));
1232
1233 const unsigned words = bytes / 2;
1234 const unsigned offset = subreg_offset / 2;
1235 const uint32_t mask = ((1U << words) - 1) << offset;
1236
1237 for (unsigned i = 0; i < num_regs; i++) {
1238 if (regs[i].nr == reg_nr) {
1239 regs[i].avail |= mask;
1240 return;
1241 }
1242 }
1243
1244 unreachable("No such register found.");
1245 }
1246
1247 static void
1248 parcel_out_registers(const intel_device_info *devinfo,
1249 struct imm *imm, unsigned len, const bblock_t *cur_block,
1250 struct register_allocation *regs, unsigned num_regs,
1251 brw::simple_allocator &alloc)
1252 {
1253 /* Each basic block has two distinct set of constants. There is the set of
1254 * constants that only have uses in that block, and there is the set of
1255 * constants that have uses after that block.
1256 *
1257 * Allocation proceeds in three passes.
1258 *
1259 * 1. Allocate space for the values that are used outside this block.
1260 *
1261 * 2. Allocate space for the values that are used only in this block.
1262 *
1263 * 3. Deallocate the space for the values that are used only in this block.
1264 */
1265
1266 for (unsigned pass = 0; pass < 2; pass++) {
1267 const bool used_in_single_block = pass != 0;
1268
1269 for (unsigned i = 0; i < len; i++) {
1270 if (imm[i].block == cur_block &&
1271 imm[i].used_in_single_block == used_in_single_block) {
1272 const brw_reg reg = allocate_slots(devinfo, regs, num_regs,
1273 imm[i].size,
1274 get_alignment_for_imm(&imm[i]),
1275 alloc);
1276
1277 imm[i].nr = reg.nr;
1278 imm[i].subreg_offset = reg.offset;
1279 }
1280 }
1281 }
1282
1283 for (unsigned i = 0; i < len; i++) {
1284 if (imm[i].block == cur_block && imm[i].used_in_single_block) {
1285 deallocate_slots(devinfo, regs, num_regs, imm[i].nr,
1286 imm[i].subreg_offset, imm[i].size);
1287 }
1288 }
1289 }
1290
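/**
 * Rough flow of the pass (a summary of the code below): collect candidate
 * immediates from instructions that cannot (or should not) encode them
 * directly, let brw_combine_constants() pick a reduced set of values to
 * materialize, assign each value a GRF slot, emit a mov(1) to load it, and
 * rewrite every use to read the new VGRF, adding a source negation where a
 * negated form was shared.
 */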
1291 bool
1292 brw_opt_combine_constants(fs_visitor &s)
1293 {
1294 const intel_device_info *devinfo = s.devinfo;
1295 void *const_ctx = ralloc_context(NULL);
1296
1297 struct table table;
1298
1299 /* For each of the dynamic arrays in the table, allocate about a page of
1300 * memory. On LP64 systems, this gives 126 value objects and 169 fs_inst_box
1301 * objects. Even larger shaders that have been observed rarely need more
1302 * than 20 or 30 values. Most smaller shaders, which is most shaders, need
1303 * at most a couple dozen fs_inst_box.
1304 */
1305 table.size = (4096 - (5 * sizeof(void *))) / sizeof(struct value);
1306 table.num_values = 0;
1307 table.values = ralloc_array(const_ctx, struct value, table.size);
1308
1309 table.size_boxes = (4096 - (5 * sizeof(void *))) / sizeof(struct fs_inst_box);
1310 table.num_boxes = 0;
1311 table.boxes = ralloc_array(const_ctx, fs_inst_box, table.size_boxes);
1312
1313 const brw::idom_tree &idom = s.idom_analysis.require();
1314 unsigned ip = -1;
1315
1316 /* Make a pass through all instructions and mark each constant that is used in
1317 * instruction sources that cannot legally be immediate values.
1318 */
1319 foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1320 ip++;
1321
1322 switch (inst->opcode) {
1323 case SHADER_OPCODE_INT_QUOTIENT:
1324 case SHADER_OPCODE_INT_REMAINDER:
1325 case SHADER_OPCODE_POW:
1326 if (inst->src[0].file == IMM) {
1327 add_candidate_immediate(&table, inst, ip, 0, false, block,
1328 devinfo, const_ctx);
1329 }
1330 break;
1331
1332 /* FINISHME: CSEL handling could be better. For some cases, src[0] and
1333 * src[1] can be commutative (e.g., any integer comparison). In those
1334 * cases when src[1] is IMM, the sources could be exchanged. In
1335 * addition, when both sources are IMM that could be represented as
1336 * 16-bits, it would be better to add both sources with
1337 * allow_one_constant=true as is done for SEL.
1338 */
1339 case BRW_OPCODE_BFE:
1340 case BRW_OPCODE_ADD3:
1341 case BRW_OPCODE_CSEL:
1342 case BRW_OPCODE_MAD: {
1343 if (inst->opcode == BRW_OPCODE_MAD &&
1344 devinfo->ver == 11 &&
1345 can_promote_src_as_imm(devinfo, inst, 0) &&
1346 can_promote_src_as_imm(devinfo, inst, 2)) {
1347 /* MAD can have either src0 or src2 be immediate. Add both as
1348 * candidates, but mark them "allow one constant."
1349 */
1350 add_candidate_immediate(&table, inst, ip, 0, true, block,
1351 devinfo, const_ctx);
1352 add_candidate_immediate(&table, inst, ip, 2, true, block,
1353 devinfo, const_ctx);
1354 } else {
1355 for (int i = 0; i < inst->sources; i++) {
1356 if (inst->src[i].file != IMM)
1357 continue;
1358
1359 if (can_promote_src_as_imm(devinfo, inst, i))
1360 continue;
1361
1362 add_candidate_immediate(&table, inst, ip, i, false, block,
1363 devinfo, const_ctx);
1364 }
1365 }
1366
1367 break;
1368 }
1369
1370 case BRW_OPCODE_BFI2:
1371 case BRW_OPCODE_LRP:
1372 for (int i = 0; i < inst->sources; i++) {
1373 if (inst->src[i].file != IMM)
1374 continue;
1375
1376 add_candidate_immediate(&table, inst, ip, i, false, block,
1377 devinfo, const_ctx);
1378 }
1379
1380 break;
1381
1382 case BRW_OPCODE_SEL:
1383 if (inst->src[0].file == IMM) {
1384 /* It is possible to have src0 be immediate but src1 not be
1385 * immediate for the non-commutative conditional modifiers (e.g.,
1386 * G).
1387 */
1388 if (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
1389 /* Only GE and L are commutative. */
1390 inst->conditional_mod == BRW_CONDITIONAL_GE ||
1391 inst->conditional_mod == BRW_CONDITIONAL_L) {
1392 assert(inst->src[1].file == IMM);
1393
1394 add_candidate_immediate(&table, inst, ip, 0, true, block,
1395 devinfo, const_ctx);
1396 add_candidate_immediate(&table, inst, ip, 1, true, block,
1397 devinfo, const_ctx);
1398 } else {
1399 add_candidate_immediate(&table, inst, ip, 0, false, block,
1400 devinfo, const_ctx);
1401 }
1402 }
1403 break;
1404
1405 case BRW_OPCODE_ASR:
1406 case BRW_OPCODE_BFI1:
1407 case BRW_OPCODE_MUL:
1408 case BRW_OPCODE_ROL:
1409 case BRW_OPCODE_ROR:
1410 case BRW_OPCODE_SHL:
1411 case BRW_OPCODE_SHR:
1412 if (inst->src[0].file == IMM) {
1413 add_candidate_immediate(&table, inst, ip, 0, false, block,
1414 devinfo, const_ctx);
1415 }
1416 break;
1417
1418 default:
1419 break;
1420 }
1421 }
1422
1423 if (table.num_values == 0) {
1424 ralloc_free(const_ctx);
1425 return false;
1426 }
1427
1428 combine_constants_result *result =
1429 brw_combine_constants(table.values, table.num_values);
1430
1431 table.imm = ralloc_array(const_ctx, struct imm, result->num_values_to_emit);
1432 table.len = 0;
1433
1434 for (unsigned i = 0; i < result->num_values_to_emit; i++) {
1435 struct imm *imm = &table.imm[table.len];
1436
1437 imm->block = NULL;
1438 imm->inst = NULL;
1439 imm->d64 = result->values_to_emit[i].value.u64;
1440 imm->size = result->values_to_emit[i].bit_size / 8;
1441
1442 imm->is_half_float = false;
1443
1444 imm->first_use_ip = UINT16_MAX;
1445 imm->last_use_ip = 0;
1446
1447 imm->uses = new(const_ctx) exec_list;
1448
1449 const unsigned first_user = result->values_to_emit[i].first_user;
1450 const unsigned last_user = first_user +
1451 result->values_to_emit[i].num_users;
1452
1453 for (unsigned j = first_user; j < last_user; j++) {
1454 const unsigned idx = table.values[result->user_map[j].index].instr_index;
1455 fs_inst_box *const ib = &table.boxes[idx];
1456
1457 const unsigned src = table.values[result->user_map[j].index].src;
1458
1459 imm->uses->push_tail(link(const_ctx, ib->inst, src,
1460 result->user_map[j].negate,
1461 result->user_map[j].type));
1462
1463 if (imm->block == NULL) {
1464 /* Block should only be NULL on the first pass. On the first
1465 * pass, inst should also be NULL.
1466 */
1467 assert(imm->inst == NULL);
1468
1469 imm->inst = ib->inst;
1470 imm->block = ib->block;
1471 imm->first_use_ip = ib->ip;
1472 imm->last_use_ip = ib->ip;
1473 imm->used_in_single_block = true;
1474 } else {
1475 bblock_t *intersection = idom.intersect(ib->block,
1476 imm->block);
1477
1478 if (ib->block != imm->block)
1479 imm->used_in_single_block = false;
1480
1481 if (imm->first_use_ip > ib->ip) {
1482 imm->first_use_ip = ib->ip;
1483
1484 /* If the first-use instruction is to be tracked, block must be
1485 * the block that contains it. The old block was read in the
1486 * idom.intersect call above, so it is safe to overwrite it
1487 * here.
1488 */
1489 imm->inst = ib->inst;
1490 imm->block = ib->block;
1491 }
1492
1493 if (imm->last_use_ip < ib->ip)
1494 imm->last_use_ip = ib->ip;
1495
1496 /* The common dominator is not the block that contains the
1497 * first-use instruction, so don't track that instruction. The
1498 * load instruction will be added in the common dominator block
1499 * instead of before the first-use instruction.
1500 */
1501 if (intersection != imm->block)
1502 imm->inst = NULL;
1503
1504 imm->block = intersection;
1505 }
1506
1507 if (ib->inst->src[src].type == BRW_TYPE_HF)
1508 imm->is_half_float = true;
1509 }
1510
1511 table.len++;
1512 }
1513
1514 delete result;
1515
1516 if (table.len == 0) {
1517 ralloc_free(const_ctx);
1518 return false;
1519 }
1520 if (s.cfg->num_blocks != 1)
1521 qsort(table.imm, table.len, sizeof(struct imm), compare);
1522
1523 struct register_allocation *regs =
1524 (struct register_allocation *) calloc(table.len, sizeof(regs[0]));
1525
1526 const unsigned all_avail = devinfo->ver >= 20 ? 0xffffffff : 0xffff;
1527
1528 for (int i = 0; i < table.len; i++) {
1529 regs[i].nr = UINT_MAX;
1530 regs[i].avail = all_avail;
1531 }
1532
1533 foreach_block(block, s.cfg) {
1534 parcel_out_registers(devinfo, table.imm, table.len, block, regs,
1535 table.len, s.alloc);
1536 }
1537
1538 free(regs);
1539
1540 bool rebuild_cfg = false;
1541
1542 /* Insert MOVs to load the constant values into GRFs. */
1543 for (int i = 0; i < table.len; i++) {
1544 struct imm *imm = &table.imm[i];
1545
1546 /* Insert it either before the instruction that generated the immediate
1547 * or after the last non-control flow instruction of the common ancestor.
1548 */
1549 exec_node *n;
1550 bblock_t *insert_block;
1551 if (imm->inst != nullptr) {
1552 n = imm->inst;
1553 insert_block = imm->block;
1554 } else {
1555 if (imm->block->start()->opcode == BRW_OPCODE_DO) {
1556 /* DO blocks are weird. They can contain only the single DO
1557 * instruction. As a result, MOV instructions cannot be added to
1558 * the DO block.
1559 */
1560 bblock_t *next_block = imm->block->next();
1561 if (next_block->starts_with_control_flow()) {
1562 /* This is the difficult case. This occurs for code like
1563 *
1564 * do {
1565 * do {
1566 * ...
1567 * } while (...);
1568 * } while (...);
1569 *
1570 * when the MOV instructions need to be inserted between the
1571 * two DO instructions.
1572 *
1573 * To properly handle this scenario, a new block would need to
1574 * be inserted. Doing so would require modifying arbitrarily many
1575 * CONTINUE, BREAK, and WHILE instructions to point to the new
1576 * block.
1577 *
1578 * It is unlikely that this would ever be correct. Instead,
1579 * insert the MOV instructions in the known wrong place and
1580 * rebuild the CFG at the end of the pass.
1581 */
1582 insert_block = imm->block;
1583 n = insert_block->last_non_control_flow_inst()->next;
1584
1585 rebuild_cfg = true;
1586 } else {
1587 insert_block = next_block;
1588 n = insert_block->start();
1589 }
1590 } else {
1591 insert_block = imm->block;
1592 n = insert_block->last_non_control_flow_inst()->next;
1593 }
1594 }
1595
1596 /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
1597 *
1598 * "In Align16 mode, the channel selects and channel enables apply to a
1599 * pair of half-floats, because these parameters are defined for DWord
1600 * elements ONLY. This is applicable when both source and destination
1601 * are half-floats."
1602 *
1603 * This means that Align16 instructions that use promoted HF immediates
1604 * and use a <0,1,0>:HF region would read 2 HF slots instead of
1605 * replicating the single one we want. To avoid this, we always populate
1606 * both HF slots within a DWord with the constant.
1607 */
1608 const uint32_t width = 1;
1609 const fs_builder ibld = fs_builder(&s, width).at(insert_block, n).exec_all();
1610
1611 brw_reg reg = brw_vgrf(imm->nr, BRW_TYPE_F);
1612 reg.offset = imm->subreg_offset;
1613 reg.stride = 0;
1614
1615 /* Put the immediate in an offset aligned to its size. Some instructions
1616 * seem to have additional alignment requirements, so account for that
1617 * too.
1618 */
1619 assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm)));
1620
1621 struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
1622
1623 /* Ensure we have enough space in the register to copy the immediate */
1624 assert(reg.offset + brw_type_size_bytes(imm_reg.type) * width <= REG_SIZE * reg_unit(devinfo));
1625
1626 ibld.MOV(retype(reg, imm_reg.type), imm_reg);
1627 }
1628 s.shader_stats.promoted_constants = table.len;
1629
1630 /* Rewrite the immediate sources to refer to the new GRFs. */
1631 for (int i = 0; i < table.len; i++) {
1632 foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
1633 brw_reg *reg = &link->inst->src[link->src];
1634
1635 if (link->inst->opcode == BRW_OPCODE_SEL) {
1636 if (link->type == either_type) {
1637 /* Do not change the register type. */
1638 } else if (link->type == integer_only) {
1639 reg->type = brw_int_type(brw_type_size_bytes(reg->type), true);
1640 } else {
1641 assert(link->type == float_only);
1642
1643 switch (brw_type_size_bytes(reg->type)) {
1644 case 2:
1645 reg->type = BRW_TYPE_HF;
1646 break;
1647 case 4:
1648 reg->type = BRW_TYPE_F;
1649 break;
1650 case 8:
1651 reg->type = BRW_TYPE_DF;
1652 break;
1653 default:
1654 unreachable("Bad type size");
1655 }
1656 }
1657 } else if ((link->inst->opcode == BRW_OPCODE_SHL ||
1658 link->inst->opcode == BRW_OPCODE_ASR) &&
1659 link->negate) {
1660 reg->type = brw_int_type(brw_type_size_bytes(reg->type), true);
1661 }
1662
1663 #if MESA_DEBUG
1664 switch (reg->type) {
1665 case BRW_TYPE_DF:
1666 assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
1667 (fabs(reg->df) == fabs(table.imm[i].df)));
1668 break;
1669 case BRW_TYPE_F:
1670 assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
1671 (fabsf(reg->f) == fabsf(table.imm[i].f)));
1672 break;
1673 case BRW_TYPE_HF:
1674 assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
1675 isnan(_mesa_half_to_float(table.imm[i].w))) ||
1676 (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
1677 fabsf(_mesa_half_to_float(table.imm[i].w))));
1678 break;
1679 case BRW_TYPE_Q:
1680 assert(abs(reg->d64) == abs(table.imm[i].d64));
1681 break;
1682 case BRW_TYPE_UQ:
1683 assert(!link->negate);
1684 assert(reg->d64 == table.imm[i].d64);
1685 break;
1686 case BRW_TYPE_D:
1687 assert(abs(reg->d) == abs(table.imm[i].d));
1688 break;
1689 case BRW_TYPE_UD:
1690 assert(!link->negate);
1691 assert(reg->d == table.imm[i].d);
1692 break;
1693 case BRW_TYPE_W:
1694 assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
1695 break;
1696 case BRW_TYPE_UW:
1697 assert(!link->negate);
1698 assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
1699 break;
1700 default:
1701 break;
1702 }
1703 #endif
1704
1705 assert(link->inst->can_do_source_mods(devinfo) || !link->negate);
1706
1707 reg->file = VGRF;
1708 reg->offset = table.imm[i].subreg_offset;
1709 reg->stride = 0;
1710 reg->negate = link->negate;
1711 reg->nr = table.imm[i].nr;
1712 }
1713 }
1714
1715 /* Fixup any SEL instructions that have src0 still as an immediate. Fixup
1716 * the types of any SEL instruction that have a negation on one of the
1717 * sources. Adding the negation may have changed the type of that source,
1718 * so the other source (and destination) must be changed to match.
1719 */
1720 for (unsigned i = 0; i < table.num_boxes; i++) {
1721 fs_inst *inst = table.boxes[i].inst;
1722
1723 if (inst->opcode != BRW_OPCODE_SEL)
1724 continue;
1725
1726 /* If both sources have negation, the types had better be the same! */
1727 assert(!inst->src[0].negate || !inst->src[1].negate ||
1728 inst->src[0].type == inst->src[1].type);
1729
1730 /* If either source has a negation, force the type of the other source
1731 * and the type of the result to be the same.
1732 */
1733 if (inst->src[0].negate) {
1734 inst->src[1].type = inst->src[0].type;
1735 inst->dst.type = inst->src[0].type;
1736 }
1737
1738 if (inst->src[1].negate) {
1739 inst->src[0].type = inst->src[1].type;
1740 inst->dst.type = inst->src[1].type;
1741 }
1742
1743 if (inst->src[0].file != IMM)
1744 continue;
1745
1746 assert(inst->src[1].file != IMM);
1747 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE ||
1748 inst->conditional_mod == BRW_CONDITIONAL_GE ||
1749 inst->conditional_mod == BRW_CONDITIONAL_L);
1750
1751 brw_reg temp = inst->src[0];
1752 inst->src[0] = inst->src[1];
1753 inst->src[1] = temp;
1754
1755 /* If this was predicated, flipping operands means we also need to flip
1756 * the predicate.
1757 */
1758 if (inst->conditional_mod == BRW_CONDITIONAL_NONE)
1759 inst->predicate_inverse = !inst->predicate_inverse;
1760 }
1761
1762 if (debug) {
1763 for (int i = 0; i < table.len; i++) {
1764 struct imm *imm = &table.imm[i];
1765
1766 fprintf(stderr,
1767 "0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
1768 "IP: %4d to %4d, length %4d\n",
1769 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
1770 imm->block->num,
1771 imm->nr,
1772 imm->subreg_offset,
1773 imm->first_use_ip,
1774 imm->last_use_ip,
1775 imm->last_use_ip - imm->first_use_ip);
1776 }
1777 }
1778
1779 if (rebuild_cfg) {
1780 /* When the CFG is initially built, the instructions are removed from
1781 * the list of instructions stored in fs_visitor -- the same exec_node
1782 * is used for membership in that list and in a block list. So we need
1783 * to pull them back before rebuilding the CFG.
1784 */
1785 assert(exec_list_length(&s.instructions) == 0);
1786 foreach_block(block, s.cfg) {
1787 exec_list_append(&s.instructions, &block->instructions);
1788 }
1789
1790 delete s.cfg;
1791 s.cfg = NULL;
1792 brw_calculate_cfg(s);
1793 }
1794
1795 ralloc_free(const_ctx);
1796
1797 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES |
1798 (rebuild_cfg ? DEPENDENCY_BLOCKS : DEPENDENCY_NOTHING));
1799
1800 return true;
1801 }
1802