1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file elk_fs_combine_constants.cpp
25 *
26 * This file contains the opt_combine_constants() pass that runs after the
27 * regular optimization loop. It passes over the instruction list and
28 * selectively promotes immediate values to registers by emitting a mov(1)
29 * instruction.
30 *
31 * This is useful on Gen 7 particularly, because a few instructions can be
32 * coissued (i.e., issued in the same cycle as another thread on the same EU
33 * issues an instruction) under some circumstances, one of which is that they
34 * cannot use immediate values.
35 */
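/* Illustrative sketch (hypothetical instructions, not taken from this file):
 * conceptually the pass rewrites something like
 *
 *    add(16)  dst, src0, 3.5F
 *
 * into
 *
 *    mov(1)   tmp, 3.5F
 *    add(16)  dst, src0, tmp
 *
 * so the ADD no longer carries an immediate (and may therefore co-issue on
 * Gen 7), and several users of the same constant can share one GRF copy.
 */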
36
37 #include "elk_fs.h"
38 #include "elk_fs_builder.h"
39 #include "elk_cfg.h"
40 #include "util/half_float.h"
41
42 using namespace elk;
43
44 static const bool debug = false;
45
46 namespace {
47
48 enum PACKED interpreted_type {
49 float_only = 0,
50 integer_only,
51 either_type
52 };
53
54 struct value {
55 /** Raw bit pattern of the value. */
56 nir_const_value value;
57
58 /** Instruction that uses this instance of the value. */
59 unsigned instr_index;
60
61 /** Size, in bits, of the value. */
62 uint8_t bit_size;
63
64 /**
65 * Which source of instr is this value?
66 *
67 * \note This field is not actually used by \c elk_combine_constants, but
68 * it is generally very useful to callers.
69 */
70 uint8_t src;
71
72 /**
73 * In what ways can instr interpret this value?
74 *
75 * Choices are floating-point only, integer only, or either type.
76 */
77 enum interpreted_type type;
78
79 /**
80 * Only try to make a single source non-constant.
81 *
82 * On some architectures, some instructions require that all sources be
83 * non-constant. For example, the multiply-accumulate instruction on Intel
84 * GPUs up to Gen11 requires that all sources be non-constant. Other
85 * instructions, like the selection instruction, allow one constant source.
86 *
87 * If a single constant source is allowed, set this flag to true.
88 *
89 * If an instruction allows a single constant and it has only a single
90 * constant to begin with, it should not be included. Various places in
91 * \c combine_constants will assume that there are multiple constants if
92 * \c ::allow_one_constant is set. This may even be enforced by in-code
93 * assertions.
94 */
95 bool allow_one_constant;
96
97 /**
98 * Restrict values that can reach this value to not include negations.
99 *
100 * This is useful for instructions that cannot have source modifiers. For
101 * example, on Intel GPUs the integer source of a shift instruction (e.g.,
102 * SHL) can have a source modifier, but the integer source of the bitfield
103 * insertion instruction (i.e., BFI2) cannot. A pair of these instructions
104 * might have sources that are negations of each other. Using this flag
105 * will ensure that the BFI2 does not have a negated source, but the SHL
106 * might.
107 */
108 bool no_negations;
109
110 /**
111 * \name UtilCombineConstantsPrivate
112 * Private data used only by elk_combine_constants
113 *
114 * Any data stored in these fields will be overwritten by the call to
115 * \c elk_combine_constants. No assumptions should be made about the
116 * state of these fields after that function returns.
117 */
118 /**@{*/
119 /** Mask of negations that can be generated from this value. */
120 uint8_t reachable_mask;
121
122 /** Mask of negations that can generate this value. */
123 uint8_t reaching_mask;
124
125 /**
126 * Value with the next source from the same instruction.
127 *
128 * This pointer may be \c NULL. If it is not \c NULL, it will form a
129 * singly-linked circular list of values. The list is unordered. That is,
130 * as the list is iterated, the \c ::src values will be in arbitrary order.
131 *
132 * \todo Is it even possible for there to be more than two elements in this
133 * list? This pass does not operate on vecN instructions or intrinsics, so
134 * the theoretical limit should be three. However, instructions with all
135 * constant sources should have been folded away.
136 */
137 struct value *next_src;
138 /**@}*/
139 };
140
141 struct combine_constants_value {
142 /** Raw bit pattern of the constant loaded. */
143 nir_const_value value;
144
145 /**
146 * Index of the first user.
147 *
148 * This is the offset into \c combine_constants_result::user_map of the
149 * first user of this value.
150 */
151 unsigned first_user;
152
153 /** Number of users of this value. */
154 unsigned num_users;
155
156 /** Size, in bits, of the value. */
157 uint8_t bit_size;
158 };
159
160 struct combine_constants_user {
161 /** Index into the array of values passed to elk_combine_constants. */
162 unsigned index;
163
164 /**
165 * Manner in which the value should be interpreted in the instruction.
166 *
167 * This is only useful when ::negate is set. Unless the corresponding
168 * value::type is \c either_type, this field must have the same value as
169 * value::type.
170 */
171 enum interpreted_type type;
172
173 /** Should this value be negated to generate the original value? */
174 bool negate;
175 };
176
177 class combine_constants_result {
178 public:
179 combine_constants_result(unsigned num_candidates, bool &success)
180 : num_values_to_emit(0), user_map(NULL)
181 {
182 user_map = (struct combine_constants_user *) calloc(num_candidates,
183 sizeof(user_map[0]));
184
185 /* In the worst case, the number of output values will be equal to the
186 * number of input values. Allocate a buffer that is known to be large
187 * enough now, and it can be reduced later.
188 */
189 values_to_emit =
190 (struct combine_constants_value *) calloc(num_candidates,
191 sizeof(values_to_emit[0]));
192
193 success = (user_map != NULL && values_to_emit != NULL);
194 }
195
196 ~combine_constants_result()
197 {
198 free(values_to_emit);
199 free(user_map);
200 }
201
202 void append_value(const nir_const_value &value, unsigned bit_size)
203 {
204 values_to_emit[num_values_to_emit].value = value;
205 values_to_emit[num_values_to_emit].first_user = 0;
206 values_to_emit[num_values_to_emit].num_users = 0;
207 values_to_emit[num_values_to_emit].bit_size = bit_size;
208 num_values_to_emit++;
209 }
210
211 unsigned num_values_to_emit;
212 struct combine_constants_value *values_to_emit;
213
214 struct combine_constants_user *user_map;
215 };
216
217 #define VALUE_INDEX 0
218 #define FLOAT_NEG_INDEX 1
219 #define INT_NEG_INDEX 2
220 #define MAX_NUM_REACHABLE 3
221
222 #define VALUE_EXISTS (1 << VALUE_INDEX)
223 #define FLOAT_NEG_EXISTS (1 << FLOAT_NEG_INDEX)
224 #define INT_NEG_EXISTS (1 << INT_NEG_INDEX)
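
/* A sketch of how these are used: calculate_reachable_values() fills a
 * 3-entry array indexed by VALUE_INDEX, FLOAT_NEG_INDEX, and INT_NEG_INDEX
 * with the original bit pattern and, when the corresponding *_EXISTS bit is
 * set, its float-negated and integer-negated forms. The *_EXISTS bits thus
 * also say which of those array slots hold meaningful data.
 */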
225
226 static bool
227 negation_exists(nir_const_value v, unsigned bit_size,
228 enum interpreted_type base_type)
229 {
230 /* either_type does not make sense in this context. */
231 assert(base_type == float_only || base_type == integer_only);
232
233 switch (bit_size) {
234 case 8:
235 if (base_type == float_only)
236 return false;
237 else
238 return v.i8 != 0 && v.i8 != INT8_MIN;
239
240 case 16:
241 if (base_type == float_only)
242 return !util_is_half_nan(v.i16);
243 else
244 return v.i16 != 0 && v.i16 != INT16_MIN;
245
246 case 32:
247 if (base_type == float_only)
248 return !isnan(v.f32);
249 else
250 return v.i32 != 0 && v.i32 != INT32_MIN;
251
252 case 64:
253 if (base_type == float_only)
254 return !isnan(v.f64);
255 else
256 return v.i64 != 0 && v.i64 != INT64_MIN;
257
258 default:
259 unreachable("unsupported bit-size should have already been filtered.");
260 }
261 }
262
263 static nir_const_value
264 negate(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
265 {
266 /* either_type does not make sense in this context. */
267 assert(base_type == float_only || base_type == integer_only);
268
269 nir_const_value ret = { 0, };
270
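/* For the float cases below, negation is just a sign-bit flip: XOR with a
 * constant whose only set bit is the MSB (INT16_MIN / INT32_MIN / INT64_MIN
 * reinterpreted as a bit pattern). This flips the sign of any float value,
 * including infinities and NaNs, without touching the rest of the bits.
 */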
271 switch (bit_size) {
272 case 8:
273 assert(base_type == integer_only);
274 ret.i8 = -v.i8;
275 break;
276
277 case 16:
278 if (base_type == float_only)
279 ret.u16 = v.u16 ^ INT16_MIN;
280 else
281 ret.i16 = -v.i16;
282 break;
283
284 case 32:
285 if (base_type == float_only)
286 ret.u32 = v.u32 ^ INT32_MIN;
287 else
288 ret.i32 = -v.i32;
289 break;
290
291 case 64:
292 if (base_type == float_only)
293 ret.u64 = v.u64 ^ INT64_MIN;
294 else
295 ret.i64 = -v.i64;
296 break;
297
298 default:
299 unreachable("unsupported bit-size should have already been filtered.");
300 }
301
302 return ret;
303 }
304
305 static nir_const_value
306 absolute(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
307 {
308 /* either_type does not make sense in this context. */
309 assert(base_type == float_only || base_type == integer_only);
310
311 nir_const_value ret = { 0, };
312
313 switch (bit_size) {
314 case 8:
315 assert(base_type == integer_only);
316 ret.i8 = abs(v.i8);
317 break;
318
319 case 16:
320 if (base_type == float_only)
321 ret.u16 = v.u16 & 0x7fff;
322 else
323 ret.i16 = abs(v.i16);
324 break;
325
326 case 32:
327 if (base_type == float_only)
328 ret.f32 = fabs(v.f32);
329 else
330 ret.i32 = abs(v.i32);
331 break;
332
333 case 64:
334 if (base_type == float_only)
335 ret.f64 = fabs(v.f64);
336 else {
337 if (sizeof(v.i64) == sizeof(long int)) {
338 ret.i64 = labs((long int) v.i64);
339 } else {
340 assert(sizeof(v.i64) == sizeof(long long int));
341 ret.i64 = llabs((long long int) v.i64);
342 }
343 }
344 break;
345
346 default:
347 unreachable("unsupported bit-size should have already been filtered.");
348 }
349
350 return ret;
351 }
352
353 static void
354 calculate_masks(nir_const_value v, enum interpreted_type type,
355 unsigned bit_size, uint8_t *reachable_mask,
356 uint8_t *reaching_mask)
357 {
358 *reachable_mask = 0;
359 *reaching_mask = 0;
360
361 /* Calculate the extended reachable mask. */
362 if (type == float_only || type == either_type) {
363 if (negation_exists(v, bit_size, float_only))
364 *reachable_mask |= FLOAT_NEG_EXISTS;
365 }
366
367 if (type == integer_only || type == either_type) {
368 if (negation_exists(v, bit_size, integer_only))
369 *reachable_mask |= INT_NEG_EXISTS;
370 }
371
372 /* Calculate the extended reaching mask. All of the "is this negation
373 * possible" was already determined for the reachable_mask, so reuse that
374 * data.
375 */
376 if (type == float_only || type == either_type) {
377 if (*reachable_mask & FLOAT_NEG_EXISTS)
378 *reaching_mask |= FLOAT_NEG_EXISTS;
379 }
380
381 if (type == integer_only || type == either_type) {
382 if (*reachable_mask & INT_NEG_EXISTS)
383 *reaching_mask |= INT_NEG_EXISTS;
384 }
385 }
386
387 static void
388 calculate_reachable_values(nir_const_value v,
389 unsigned bit_size,
390 unsigned reachable_mask,
391 nir_const_value *reachable_values)
392 {
393 memset(reachable_values, 0, MAX_NUM_REACHABLE * sizeof(reachable_values[0]));
394
395 reachable_values[VALUE_INDEX] = v;
396
397 if (reachable_mask & INT_NEG_EXISTS) {
398 const nir_const_value neg = negate(v, bit_size, integer_only);
399
400 reachable_values[INT_NEG_INDEX] = neg;
401 }
402
403 if (reachable_mask & FLOAT_NEG_EXISTS) {
404 const nir_const_value neg = negate(v, bit_size, float_only);
405
406 reachable_values[FLOAT_NEG_INDEX] = neg;
407 }
408 }
409
410 static bool
411 value_equal(nir_const_value a, nir_const_value b, unsigned bit_size)
412 {
413 switch (bit_size) {
414 case 8:
415 return a.u8 == b.u8;
416 case 16:
417 return a.u16 == b.u16;
418 case 32:
419 return a.u32 == b.u32;
420 case 64:
421 return a.u64 == b.u64;
422 default:
423 unreachable("unsupported bit-size should have already been filtered.");
424 }
425 }
426
427 /** Can these values be the same with one level of negation? */
428 static bool
429 value_can_equal(const nir_const_value *from, uint8_t reachable_mask,
430 nir_const_value to, uint8_t reaching_mask,
431 unsigned bit_size)
432 {
433 const uint8_t combined_mask = reachable_mask & reaching_mask;
434
435 return value_equal(from[VALUE_INDEX], to, bit_size) ||
436 ((combined_mask & INT_NEG_EXISTS) &&
437 value_equal(from[INT_NEG_INDEX], to, bit_size)) ||
438 ((combined_mask & FLOAT_NEG_EXISTS) &&
439 value_equal(from[FLOAT_NEG_INDEX], to, bit_size));
440 }
441
442 static void
443 preprocess_candidates(struct value *candidates, unsigned num_candidates)
444 {
445 /* Calculate the reaching_mask and reachable_mask for each candidate. */
446 for (unsigned i = 0; i < num_candidates; i++) {
447 calculate_masks(candidates[i].value,
448 candidates[i].type,
449 candidates[i].bit_size,
450 &candidates[i].reachable_mask,
451 &candidates[i].reaching_mask);
452
453 /* If negations are not allowed, then only the original value is
454 * reaching.
455 */
456 if (candidates[i].no_negations)
457 candidates[i].reaching_mask = 0;
458 }
459
460 for (unsigned i = 0; i < num_candidates; i++)
461 candidates[i].next_src = NULL;
462
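/* Link all candidates that come from the same instruction into a circular
 * list via ::next_src. For example (hypothetical indices), if candidates 3
 * and 7 are two constant sources of the same SEL, this builds 3 -> 7 -> 3,
 * which lets later passes find "the other constant source" of an
 * instruction.
 */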
463 for (unsigned i = 0; i < num_candidates - 1; i++) {
464 if (candidates[i].next_src != NULL)
465 continue;
466
467 struct value *prev = &candidates[i];
468
469 for (unsigned j = i + 1; j < num_candidates; j++) {
470 if (candidates[i].instr_index == candidates[j].instr_index) {
471 prev->next_src = &candidates[j];
472 prev = prev->next_src;
473 }
474 }
475
476 /* Close the cycle. */
477 if (prev != &candidates[i])
478 prev->next_src = &candidates[i];
479 }
480 }
481
482 static bool
483 reaching_value_exists(const struct value *c,
484 const struct combine_constants_value *values,
485 unsigned num_values)
486 {
487 nir_const_value reachable_values[MAX_NUM_REACHABLE];
488
489 calculate_reachable_values(c->value, c->bit_size, c->reaching_mask,
490 reachable_values);
491
492 /* Check to see if the value is already in the result set. */
493 for (unsigned j = 0; j < num_values; j++) {
494 if (c->bit_size == values[j].bit_size &&
495 value_can_equal(reachable_values, c->reaching_mask,
496 values[j].value, c->reaching_mask,
497 c->bit_size)) {
498 return true;
499 }
500 }
501
502 return false;
503 }
504
505 static combine_constants_result *
506 combine_constants_greedy(struct value *candidates, unsigned num_candidates)
507 {
508 bool success;
509 combine_constants_result *result =
510 new combine_constants_result(num_candidates, success);
511 if (result == NULL || !success) {
512 delete result;
513 return NULL;
514 }
515
516 BITSET_WORD *remain =
517 (BITSET_WORD *) calloc(BITSET_WORDS(num_candidates), sizeof(remain[0]));
518
519 if (remain == NULL) {
520 delete result;
521 return NULL;
522 }
523
524 memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));
525
526 /* Operate in three passes. The first pass handles all values that must be
527 * emitted and for which a negation cannot exist.
528 */
529 unsigned i;
530 for (i = 0; i < num_candidates; i++) {
531 if (candidates[i].allow_one_constant ||
532 (candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS))) {
533 continue;
534 }
535
536 /* Check to see if the value is already in the result set. */
537 bool found = false;
538 const unsigned num_values = result->num_values_to_emit;
539 for (unsigned j = 0; j < num_values; j++) {
540 if (candidates[i].bit_size == result->values_to_emit[j].bit_size &&
541 value_equal(candidates[i].value,
542 result->values_to_emit[j].value,
543 candidates[i].bit_size)) {
544 found = true;
545 break;
546 }
547 }
548
549 if (!found)
550 result->append_value(candidates[i].value, candidates[i].bit_size);
551
552 BITSET_CLEAR(remain, i);
553 }
554
555 /* The second pass handles all values that must be emitted and for which a
556 * negation can exist.
557 */
558 BITSET_FOREACH_SET(i, remain, num_candidates) {
559 if (candidates[i].allow_one_constant)
560 continue;
561
562 assert(candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS));
563
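/* Store the absolute value rather than the candidate itself. Since a
 * negation of this value exists, |v| canonicalizes +v and -v to a single
 * emitted constant; users that need the negated form are given a source
 * negate when the user_map is built below.
 */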
564 if (!reaching_value_exists(&candidates[i], result->values_to_emit,
565 result->num_values_to_emit)) {
566 result->append_value(absolute(candidates[i].value,
567 candidates[i].bit_size,
568 candidates[i].type),
569 candidates[i].bit_size);
570 }
571
572 BITSET_CLEAR(remain, i);
573 }
574
575 /* The third pass handles all of the values that may not have to be
576 * emitted. These are the values where allow_one_constant is set.
577 */
578 BITSET_FOREACH_SET(i, remain, num_candidates) {
579 assert(candidates[i].allow_one_constant);
580
581 /* The BITSET_FOREACH_SET macro does not detect changes to the bitset
582 * that occur within the current word. Since code in this loop may
583 * clear bits from the set, re-test here.
584 */
585 if (!BITSET_TEST(remain, i))
586 continue;
587
588 assert(candidates[i].next_src != NULL);
589
590 const struct value *const other_candidate = candidates[i].next_src;
591 const unsigned j = other_candidate - candidates;
592
593 if (!reaching_value_exists(&candidates[i], result->values_to_emit,
594 result->num_values_to_emit)) {
595 /* Before emitting a value, see if a match for the other source of
596 * the instruction exists.
597 */
598 if (!reaching_value_exists(&candidates[j], result->values_to_emit,
599 result->num_values_to_emit)) {
600 result->append_value(candidates[i].value, candidates[i].bit_size);
601 }
602 }
603
604 /* Mark both sources as handled. */
605 BITSET_CLEAR(remain, i);
606 BITSET_CLEAR(remain, j);
607 }
608
609 /* As noted above, there will never be more values in the output than in
610 * the input. If there are fewer values, reduce the size of the
611 * allocation.
612 */
613 if (result->num_values_to_emit < num_candidates) {
614 result->values_to_emit = (struct combine_constants_value *)
615 realloc(result->values_to_emit, sizeof(result->values_to_emit[0]) *
616 result->num_values_to_emit);
617
618 /* Is it even possible for a reducing realloc to fail? */
619 assert(result->values_to_emit != NULL);
620 }
621
622 /* Create the mapping from "combined" constants to list of candidates
623 * passed in by the caller.
624 */
625 memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));
626
627 unsigned total_users = 0;
628
629 const unsigned num_values = result->num_values_to_emit;
630 for (unsigned value_idx = 0; value_idx < num_values; value_idx++) {
631 result->values_to_emit[value_idx].first_user = total_users;
632
633 uint8_t reachable_mask;
634 uint8_t unused_mask;
635
636 calculate_masks(result->values_to_emit[value_idx].value, either_type,
637 result->values_to_emit[value_idx].bit_size,
638 &reachable_mask, &unused_mask);
639
640 nir_const_value reachable_values[MAX_NUM_REACHABLE];
641
642 calculate_reachable_values(result->values_to_emit[value_idx].value,
643 result->values_to_emit[value_idx].bit_size,
644 reachable_mask, reachable_values);
645
646 for (unsigned i = 0; i < num_candidates; i++) {
647 bool matched = false;
648
649 if (!BITSET_TEST(remain, i))
650 continue;
651
652 if (candidates[i].bit_size != result->values_to_emit[value_idx].bit_size)
653 continue;
654
655 if (value_equal(candidates[i].value, result->values_to_emit[value_idx].value,
656 result->values_to_emit[value_idx].bit_size)) {
657 result->user_map[total_users].index = i;
658 result->user_map[total_users].type = candidates[i].type;
659 result->user_map[total_users].negate = false;
660 total_users++;
661
662 matched = true;
663 BITSET_CLEAR(remain, i);
664 } else {
665 const uint8_t combined_mask = reachable_mask &
666 candidates[i].reaching_mask;
667
668 enum interpreted_type type = either_type;
669
670 if ((combined_mask & INT_NEG_EXISTS) &&
671 value_equal(candidates[i].value,
672 reachable_values[INT_NEG_INDEX],
673 candidates[i].bit_size)) {
674 type = integer_only;
675 }
676
677 if (type == either_type &&
678 (combined_mask & FLOAT_NEG_EXISTS) &&
679 value_equal(candidates[i].value,
680 reachable_values[FLOAT_NEG_INDEX],
681 candidates[i].bit_size)) {
682 type = float_only;
683 }
684
685 if (type != either_type) {
686 /* Finding a match on this path implies that the user must
687 * allow source negations.
688 */
689 assert(!candidates[i].no_negations);
690
691 result->user_map[total_users].index = i;
692 result->user_map[total_users].type = type;
693 result->user_map[total_users].negate = true;
694 total_users++;
695
696 matched = true;
697 BITSET_CLEAR(remain, i);
698 }
699 }
700
701 /* Mark the other source of instructions that can have a constant
702 * source. Selection is the prime example of this, and we want to
703 * avoid generating sequences like bcsel(a, fneg(b), ineg(c)).
704 *
705 * This also makes sure that the assertion (below) that *all* values
706 * were processed holds even when some values may be allowed to
707 * remain as constants.
708 *
709 * FINISHME: There may be value in only doing this when type ==
710 * either_type. If both sources are loaded, a register allocator may
711 * be able to make a better choice about which value to "spill"
712 * (i.e., replace with an immediate) under heavy register pressure.
713 */
714 if (matched && candidates[i].allow_one_constant) {
715 const struct value *const other_src = candidates[i].next_src;
716 const unsigned idx = other_src - candidates;
717
718 assert(idx < num_candidates);
719 BITSET_CLEAR(remain, idx);
720 }
721 }
722
723 assert(total_users > result->values_to_emit[value_idx].first_user);
724 result->values_to_emit[value_idx].num_users =
725 total_users - result->values_to_emit[value_idx].first_user;
726 }
727
728 /* Verify that all of the values were emitted by the loop above. If any
729 * bits are still set in remain, then some value was not emitted. The use
730 * of memset to populate remain prevents the use of a more performant loop.
731 */
732 #ifndef NDEBUG
733 bool pass = true;
734
735 BITSET_FOREACH_SET(i, remain, num_candidates) {
736 fprintf(stderr, "candidate %d was not processed: { "
737 ".b = %s, "
738 ".f32 = %f, .f64 = %g, "
739 ".i8 = %d, .u8 = 0x%02x, "
740 ".i16 = %d, .u16 = 0x%04x, "
741 ".i32 = %d, .u32 = 0x%08x, "
742 ".i64 = %" PRId64 ", .u64 = 0x%016" PRIx64 " }\n",
743 i,
744 candidates[i].value.b ? "true" : "false",
745 candidates[i].value.f32, candidates[i].value.f64,
746 candidates[i].value.i8, candidates[i].value.u8,
747 candidates[i].value.i16, candidates[i].value.u16,
748 candidates[i].value.i32, candidates[i].value.u32,
749 candidates[i].value.i64, candidates[i].value.u64);
750 pass = false;
751 }
752
753 assert(pass && "All values should have been processed.");
754 #endif
755
756 free(remain);
757
758 return result;
759 }
760
761 static combine_constants_result *
762 elk_combine_constants(struct value *candidates, unsigned num_candidates)
763 {
764 preprocess_candidates(candidates, num_candidates);
765
766 return combine_constants_greedy(candidates, num_candidates);
767 }
768
769 /* Returns whether an instruction could co-issue if its immediate source were
770 * replaced with a GRF source.
771 */
772 static bool
773 could_coissue(const struct intel_device_info *devinfo, const elk_fs_inst *inst)
774 {
775 assert(inst->opcode == ELK_OPCODE_MOV ||
776 inst->opcode == ELK_OPCODE_CMP ||
777 inst->opcode == ELK_OPCODE_ADD ||
778 inst->opcode == ELK_OPCODE_MUL);
779
780 if (devinfo->ver != 7)
781 return false;
782
783 /* Only float instructions can coissue. We don't have a great
784 * understanding of whether or not something like float(int(a) + int(b))
785 * would be considered float (based on the destination type) or integer
786 * (based on the source types), so we take the conservative choice of
787 * only promoting when both destination and source are float.
788 */
789 return inst->dst.type == ELK_REGISTER_TYPE_F &&
790 inst->src[0].type == ELK_REGISTER_TYPE_F;
791 }
792
793 /**
794 * Box for storing elk_fs_inst and some other necessary data
795 *
796 * \sa box_instruction
797 */
798 struct fs_inst_box {
799 elk_fs_inst *inst;
800 unsigned ip;
801 elk_bblock_t *block;
802 bool must_promote;
803 };
804
805 /** A box for putting fs_regs in a linked list. */
806 struct reg_link {
807 DECLARE_RALLOC_CXX_OPERATORS(reg_link)
808
809 reg_link(elk_fs_inst *inst, unsigned src, bool negate, enum interpreted_type type)
810 : inst(inst), src(src), negate(negate), type(type) {}
811
812 struct exec_node link;
813 elk_fs_inst *inst;
814 uint8_t src;
815 bool negate;
816 enum interpreted_type type;
817 };
818
819 static struct exec_node *
820 link(void *mem_ctx, elk_fs_inst *inst, unsigned src, bool negate,
821 enum interpreted_type type)
822 {
823 reg_link *l = new(mem_ctx) reg_link(inst, src, negate, type);
824 return &l->link;
825 }
826
827 /**
828 * Information about an immediate value.
829 */
830 struct imm {
831 /** The common ancestor of all blocks using this immediate value. */
832 elk_bblock_t *block;
833
834 /**
835 * The instruction generating the immediate value, if all uses are contained
836 * within a single basic block. Otherwise, NULL.
837 */
838 elk_fs_inst *inst;
839
840 /**
841 * A list of fs_regs that refer to this immediate. If we promote it, we'll
842 * have to patch these up to refer to the new GRF.
843 */
844 exec_list *uses;
845
846 /** The immediate value */
847 union {
848 char bytes[8];
849 double df;
850 int64_t d64;
851 float f;
852 int32_t d;
853 int16_t w;
854 };
855 uint8_t size;
856
857 /** When promoting half-float we need to account for certain restrictions */
858 bool is_half_float;
859
860 /**
861 * The GRF register and subregister number where we've decided to store the
862 * constant value.
863 */
864 uint8_t subreg_offset;
865 uint16_t nr;
866
867 /** The number of coissuable instructions using this immediate. */
868 uint16_t uses_by_coissue;
869
870 /**
871 * Whether this constant is used by an instruction that can't handle an
872 * immediate source (and already has to be promoted to a GRF).
873 */
874 bool must_promote;
875
876 /** Is the value used only in a single basic block? */
877 bool used_in_single_block;
878
879 uint16_t first_use_ip;
880 uint16_t last_use_ip;
881 };
882
883 /** The working set of information about immediates. */
884 struct table {
885 struct value *values;
886 int size;
887 int num_values;
888
889 struct imm *imm;
890 int len;
891
892 struct fs_inst_box *boxes;
893 unsigned num_boxes;
894 unsigned size_boxes;
895 };
896
897 static struct value *
898 new_value(struct table *table, void *mem_ctx)
899 {
900 if (table->num_values == table->size) {
901 table->size *= 2;
902 table->values = reralloc(mem_ctx, table->values, struct value, table->size);
903 }
904 return &table->values[table->num_values++];
905 }
906
907 /**
908 * Store an instruction with some other data in a table.
909 *
910 * \returns the index into the dynamic array of boxes for the instruction.
911 */
912 static unsigned
913 box_instruction(struct table *table, void *mem_ctx, elk_fs_inst *inst,
914 unsigned ip, elk_bblock_t *block, bool must_promote)
915 {
916 /* It is common for box_instruction to be called consecutively for each
917 * source of an instruction. As a result, the most common case for finding
918 * an instruction in the table is when that instruction was the last one
919 * added. Search the list back to front.
920 */
921 for (unsigned i = table->num_boxes; i > 0; /* empty */) {
922 i--;
923
924 if (table->boxes[i].inst == inst)
925 return i;
926 }
927
928 if (table->num_boxes == table->size_boxes) {
929 table->size_boxes *= 2;
930 table->boxes = reralloc(mem_ctx, table->boxes, fs_inst_box,
931 table->size_boxes);
932 }
933
934 assert(table->num_boxes < table->size_boxes);
935
936 const unsigned idx = table->num_boxes++;
937 fs_inst_box *ib = &table->boxes[idx];
938
939 ib->inst = inst;
940 ib->block = block;
941 ib->ip = ip;
942 ib->must_promote = must_promote;
943
944 return idx;
945 }
946
947 /**
948 * Comparator used for sorting an array of imm structures.
949 *
950 * We sort by basic block number, then last use IP, then first use IP (least
951 * to greatest). This sorting causes immediates live in the same area to be
952 * allocated to the same register in the hopes that all values will be dead
953 * about the same time and the register can be reused.
954 */
955 static int
956 compare(const void *_a, const void *_b)
957 {
958 const struct imm *a = (const struct imm *)_a,
959 *b = (const struct imm *)_b;
960
961 int block_diff = a->block->num - b->block->num;
962 if (block_diff)
963 return block_diff;
964
965 int end_diff = a->last_use_ip - b->last_use_ip;
966 if (end_diff)
967 return end_diff;
968
969 return a->first_use_ip - b->first_use_ip;
970 }
971
972 static struct elk_reg
973 build_imm_reg_for_copy(struct imm *imm)
974 {
975 switch (imm->size) {
976 case 8:
977 return elk_imm_d(imm->d64);
978 case 4:
979 return elk_imm_d(imm->d);
980 case 2:
981 return elk_imm_w(imm->w);
982 default:
983 unreachable("not implemented");
984 }
985 }
986
987 static inline uint32_t
988 get_alignment_for_imm(const struct imm *imm)
989 {
990 if (imm->is_half_float)
991 return 4; /* At least MAD seems to require this */
992 else
993 return imm->size;
994 }
995
996 static bool
997 representable_as_hf(float f, uint16_t *hf)
998 {
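/* Round-trip the value through half precision; the conversion was lossless
 * exactly when the result still compares equal to the original float (NaNs
 * are rejected as a side effect of the comparison).
 */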
999 union fi u;
1000 uint16_t h = _mesa_float_to_half(f);
1001 u.f = _mesa_half_to_float(h);
1002
1003 if (u.f == f) {
1004 *hf = h;
1005 return true;
1006 }
1007
1008 return false;
1009 }
1010
1011 static bool
1012 representable_as_w(int d, int16_t *w)
1013 {
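/* Bit trick (sketch of the reasoning): res is zero exactly when bits 15..31
 * of d are either all zeros or all ones, i.e. when d already fits in a
 * signed 16-bit word.
 */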
1014 int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff;
1015 if (!res) {
1016 *w = d;
1017 return true;
1018 }
1019
1020 return false;
1021 }
1022
1023 static bool
1024 representable_as_uw(unsigned ud, uint16_t *uw)
1025 {
1026 if (!(ud & 0xffff0000)) {
1027 *uw = ud;
1028 return true;
1029 }
1030
1031 return false;
1032 }
1033
1034 static bool
1035 supports_src_as_imm(const struct intel_device_info *devinfo, const elk_fs_inst *inst)
1036 {
1037 if (devinfo->ver < 12)
1038 return false;
1039
1040 switch (inst->opcode) {
1041 case ELK_OPCODE_ADD3:
1042 /* ADD3 only exists on Gfx12.5+. */
1043 return true;
1044
1045 case ELK_OPCODE_MAD:
1046 /* Integer types can always mix sizes. Floating point types can mix
1047 * sizes on Gfx12. On Gfx12.5, floating point sources must all be HF or
1048 * all be F.
1049 */
1050 return devinfo->verx10 < 125 || inst->src[0].type != ELK_REGISTER_TYPE_F;
1051
1052 default:
1053 return false;
1054 }
1055 }
1056
1057 static bool
1058 can_promote_src_as_imm(const struct intel_device_info *devinfo, elk_fs_inst *inst,
1059 unsigned src_idx)
1060 {
1061 bool can_promote = false;
1062
1063 /* Experiment shows that we can only support src0 as immediate for MAD on
1064 * Gfx12. ADD3 can use src0 or src2 in Gfx12.5, but constant propagation
1065 * only propagates into src0. It's possible that src2 works for W or UW MAD
1066 * on Gfx12.5.
1067 */
1068 if (src_idx != 0)
1069 return false;
1070
1071 if (!supports_src_as_imm(devinfo, inst))
1072 return false;
1073
1074 /* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP,
1075 * since HF/F mixed mode has been removed from the hardware.
1076 */
1077 switch (inst->src[src_idx].type) {
1078 case ELK_REGISTER_TYPE_F: {
1079 uint16_t hf;
1080 if (representable_as_hf(inst->src[src_idx].f, &hf)) {
1081 inst->src[src_idx] = retype(elk_imm_uw(hf), ELK_REGISTER_TYPE_HF);
1082 can_promote = true;
1083 }
1084 break;
1085 }
1086 case ELK_REGISTER_TYPE_D: {
1087 int16_t w;
1088 if (representable_as_w(inst->src[src_idx].d, &w)) {
1089 inst->src[src_idx] = elk_imm_w(w);
1090 can_promote = true;
1091 }
1092 break;
1093 }
1094 case ELK_REGISTER_TYPE_UD: {
1095 uint16_t uw;
1096 if (representable_as_uw(inst->src[src_idx].ud, &uw)) {
1097 inst->src[src_idx] = elk_imm_uw(uw);
1098 can_promote = true;
1099 }
1100 break;
1101 }
1102 case ELK_REGISTER_TYPE_W:
1103 case ELK_REGISTER_TYPE_UW:
1104 case ELK_REGISTER_TYPE_HF:
1105 can_promote = true;
1106 break;
1107 default:
1108 break;
1109 }
1110
1111 return can_promote;
1112 }
1113
1114 static void
1115 add_candidate_immediate(struct table *table, elk_fs_inst *inst, unsigned ip,
1116 unsigned i,
1117 bool must_promote,
1118 bool allow_one_constant,
1119 elk_bblock_t *block,
1120 const struct intel_device_info *devinfo,
1121 void *const_ctx)
1122 {
1123 struct value *v = new_value(table, const_ctx);
1124
1125 unsigned box_idx = box_instruction(table, const_ctx, inst, ip, block,
1126 must_promote);
1127
1128 v->value.u64 = inst->src[i].d64;
1129 v->bit_size = 8 * type_sz(inst->src[i].type);
1130 v->instr_index = box_idx;
1131 v->src = i;
1132 v->allow_one_constant = allow_one_constant;
1133
1134 /* Right-shift instructions are special. They can have source modifiers,
1135 * but changing the type can change the semantics of the instruction. Only
1136 * allow negations on a right shift if the source type is already signed.
1137 */
1138 v->no_negations = !inst->can_do_source_mods(devinfo) ||
1139 ((inst->opcode == ELK_OPCODE_SHR ||
1140 inst->opcode == ELK_OPCODE_ASR) &&
1141 elk_reg_type_is_unsigned_integer(inst->src[i].type));
1142
1143 switch (inst->src[i].type) {
1144 case ELK_REGISTER_TYPE_DF:
1145 case ELK_REGISTER_TYPE_NF:
1146 case ELK_REGISTER_TYPE_F:
1147 case ELK_REGISTER_TYPE_HF:
1148 v->type = float_only;
1149 break;
1150
1151 case ELK_REGISTER_TYPE_UQ:
1152 case ELK_REGISTER_TYPE_Q:
1153 case ELK_REGISTER_TYPE_UD:
1154 case ELK_REGISTER_TYPE_D:
1155 case ELK_REGISTER_TYPE_UW:
1156 case ELK_REGISTER_TYPE_W:
1157 v->type = integer_only;
1158 break;
1159
1160 case ELK_REGISTER_TYPE_VF:
1161 case ELK_REGISTER_TYPE_UV:
1162 case ELK_REGISTER_TYPE_V:
1163 case ELK_REGISTER_TYPE_UB:
1164 case ELK_REGISTER_TYPE_B:
1165 default:
1166 unreachable("not reached");
1167 }
1168
1169 /* It is safe to change the type of the operands of a select instruction
1170 * that has no conditional modifier, no source modifiers, and no saturate
1171 * modifier.
1172 */
1173 if (inst->opcode == ELK_OPCODE_SEL &&
1174 inst->conditional_mod == ELK_CONDITIONAL_NONE &&
1175 !inst->src[0].negate && !inst->src[0].abs &&
1176 !inst->src[1].negate && !inst->src[1].abs &&
1177 !inst->saturate) {
1178 v->type = either_type;
1179 }
1180 }
1181
1182 struct register_allocation {
1183 /** VGRF for storing values. */
1184 unsigned nr;
1185
1186 /**
1187 * Mask of currently available slots in this register.
1188 *
1189 * Each register is divided into 16 16-bit slots. Allocations require 1, 2, or 4 slots
1190 * for word, double-word, or quad-word values, respectively.
1191 */
1192 uint16_t avail;
1193 };
1194
1195 static elk_fs_reg
1196 allocate_slots(struct register_allocation *regs, unsigned num_regs,
1197 unsigned bytes, unsigned align_bytes,
1198 elk::simple_allocator &alloc)
1199 {
1200 assert(bytes == 2 || bytes == 4 || bytes == 8);
1201 assert(align_bytes == 2 || align_bytes == 4 || align_bytes == 8);
1202
1203 const unsigned words = bytes / 2;
1204 const unsigned align_words = align_bytes / 2;
1205 const uint16_t mask = (1U << words) - 1;
1206
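/* Scan each register's 16-slot availability bitmap for `words` consecutive
 * free slots at the requested alignment; e.g. a 4-byte value needs two
 * consecutive slots (mask 0b11). The VGRF itself is allocated lazily, the
 * first time one of its slots is claimed.
 */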
1207 for (unsigned i = 0; i < num_regs; i++) {
1208 for (unsigned j = 0; j <= (16 - words); j += align_words) {
1209 const uint16_t x = regs[i].avail >> j;
1210
1211 if ((x & mask) == mask) {
1212 if (regs[i].nr == UINT_MAX)
1213 regs[i].nr = alloc.allocate(1);
1214
1215 regs[i].avail &= ~(mask << j);
1216
1217 elk_fs_reg reg(VGRF, regs[i].nr);
1218 reg.offset = j * 2;
1219
1220 return reg;
1221 }
1222 }
1223 }
1224
1225 unreachable("No free slots found.");
1226 }
1227
1228 static void
1229 deallocate_slots(struct register_allocation *regs, unsigned num_regs,
1230 unsigned reg_nr, unsigned subreg_offset, unsigned bytes)
1231 {
1232 assert(bytes == 2 || bytes == 4 || bytes == 8);
1233 assert(subreg_offset % 2 == 0);
1234 assert(subreg_offset + bytes <= 32);
1235
1236 const unsigned words = bytes / 2;
1237 const unsigned offset = subreg_offset / 2;
1238 const uint16_t mask = ((1U << words) - 1) << offset;
1239
1240 for (unsigned i = 0; i < num_regs; i++) {
1241 if (regs[i].nr == reg_nr) {
1242 regs[i].avail |= mask;
1243 return;
1244 }
1245 }
1246
1247 unreachable("No such register found.");
1248 }
1249
1250 static void
1251 parcel_out_registers(struct imm *imm, unsigned len, const elk_bblock_t *cur_block,
1252 struct register_allocation *regs, unsigned num_regs,
1253 elk::simple_allocator &alloc, unsigned ver)
1254 {
1255 /* Each basic block has two distinct set of constants. There is the set of
1256 * constants that only have uses in that block, and there is the set of
1257 * constants that have uses after that block.
1258 *
1259 * Allocation proceeds in three passes.
1260 *
1261 * 1. Allocate space for the values that are used outside this block.
1262 *
1263 * 2. Allocate space for the values that are used only in this block.
1264 *
1265 * 3. Deallocate the space for the values that are used only in this block.
1266 */
1267
1268 for (unsigned pass = 0; pass < 2; pass++) {
1269 const bool used_in_single_block = pass != 0;
1270
1271 for (unsigned i = 0; i < len; i++) {
1272 if (imm[i].block == cur_block &&
1273 imm[i].used_in_single_block == used_in_single_block) {
1274 /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
1275 *
1276 * "In Align16 mode, the channel selects and channel enables apply
1277 * to a pair of half-floats, because these parameters are defined
1278 * for DWord elements ONLY. This is applicable when both source
1279 * and destination are half-floats."
1280 *
1281 * This means that Align16 instructions that use promoted HF
1282 * immediates and use a <0,1,0>:HF region would read 2 HF slots
1283 * instead of replicating the single one we want. To avoid this, we
1284 * always populate both HF slots within a DWord with the constant.
1285 */
1286 const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1;
1287
1288 const elk_fs_reg reg = allocate_slots(regs, num_regs,
1289 imm[i].size * width,
1290 get_alignment_for_imm(&imm[i]),
1291 alloc);
1292
1293 imm[i].nr = reg.nr;
1294 imm[i].subreg_offset = reg.offset;
1295 }
1296 }
1297 }
1298
1299 for (unsigned i = 0; i < len; i++) {
1300 if (imm[i].block == cur_block && imm[i].used_in_single_block) {
1301 const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1;
1302
1303 deallocate_slots(regs, num_regs, imm[i].nr, imm[i].subreg_offset,
1304 imm[i].size * width);
1305 }
1306 }
1307 }
1308
1309 } /* namespace */
1310
1311 bool
1312 elk_fs_visitor::opt_combine_constants()
1313 {
1314 void *const_ctx = ralloc_context(NULL);
1315
1316 struct table table;
1317
1318 /* For each of the dynamic arrays in the table, allocate about a page of
1319 * memory. On LP64 systems, this gives 126 value objects and 169 fs_inst_box
1320 * objects. Even larger shaders that have been observed rarely need more
1321 * than 20 or 30 values. Most smaller shaders, which is most shaders, need
1322 * at most a couple dozen fs_inst_box.
1323 */
1324 table.size = (4096 - (5 * sizeof(void *))) / sizeof(struct value);
1325 table.num_values = 0;
1326 table.values = ralloc_array(const_ctx, struct value, table.size);
1327
1328 table.size_boxes = (4096 - (5 * sizeof(void *))) / sizeof(struct fs_inst_box);
1329 table.num_boxes = 0;
1330 table.boxes = ralloc_array(const_ctx, fs_inst_box, table.size_boxes);
1331
1332 const elk::idom_tree &idom = idom_analysis.require();
1333 unsigned ip = -1;
1334
1335 /* Make a pass through all instructions and count the number of times each
1336 * constant is used by coissuable instructions or instructions that cannot
1337 * take immediate arguments.
1338 */
1339 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1340 ip++;
1341
1342 switch (inst->opcode) {
1343 case ELK_SHADER_OPCODE_INT_QUOTIENT:
1344 case ELK_SHADER_OPCODE_INT_REMAINDER:
1345 case ELK_SHADER_OPCODE_POW:
1346 if (inst->src[0].file == IMM) {
1347 assert(inst->opcode != ELK_SHADER_OPCODE_POW);
1348
1349 add_candidate_immediate(&table, inst, ip, 0, true, false, block,
1350 devinfo, const_ctx);
1351 }
1352
1353 if (inst->src[1].file == IMM && devinfo->ver < 8) {
1354 add_candidate_immediate(&table, inst, ip, 1, true, false, block,
1355 devinfo, const_ctx);
1356 }
1357
1358 break;
1359
1360 case ELK_OPCODE_ADD3:
1361 case ELK_OPCODE_MAD: {
1362 for (int i = 0; i < inst->sources; i++) {
1363 if (inst->src[i].file != IMM)
1364 continue;
1365
1366 if (can_promote_src_as_imm(devinfo, inst, i))
1367 continue;
1368
1369 add_candidate_immediate(&table, inst, ip, i, true, false, block,
1370 devinfo, const_ctx);
1371 }
1372
1373 break;
1374 }
1375
1376 case ELK_OPCODE_BFE:
1377 case ELK_OPCODE_BFI2:
1378 case ELK_OPCODE_LRP:
1379 for (int i = 0; i < inst->sources; i++) {
1380 if (inst->src[i].file != IMM)
1381 continue;
1382
1383 add_candidate_immediate(&table, inst, ip, i, true, false, block,
1384 devinfo, const_ctx);
1385 }
1386
1387 break;
1388
1389 case ELK_OPCODE_SEL:
1390 if (inst->src[0].file == IMM) {
1391 /* It is possible to have src0 be immediate but src1 not be
1392 * immediate for the non-commutative conditional modifiers (e.g.,
1393 * G).
1394 */
1395 if (inst->conditional_mod == ELK_CONDITIONAL_NONE ||
1396 /* Only GE and L are commutative. */
1397 inst->conditional_mod == ELK_CONDITIONAL_GE ||
1398 inst->conditional_mod == ELK_CONDITIONAL_L) {
1399 assert(inst->src[1].file == IMM);
1400
1401 add_candidate_immediate(&table, inst, ip, 0, true, true, block,
1402 devinfo, const_ctx);
1403 add_candidate_immediate(&table, inst, ip, 1, true, true, block,
1404 devinfo, const_ctx);
1405 } else {
1406 add_candidate_immediate(&table, inst, ip, 0, true, false, block,
1407 devinfo, const_ctx);
1408 }
1409 }
1410 break;
1411
1412 case ELK_OPCODE_ASR:
1413 case ELK_OPCODE_BFI1:
1414 case ELK_OPCODE_ROL:
1415 case ELK_OPCODE_ROR:
1416 case ELK_OPCODE_SHL:
1417 case ELK_OPCODE_SHR:
1418 if (inst->src[0].file == IMM) {
1419 add_candidate_immediate(&table, inst, ip, 0, true, false, block,
1420 devinfo, const_ctx);
1421 }
1422 break;
1423
1424 case ELK_OPCODE_MOV:
1425 if (could_coissue(devinfo, inst) && inst->src[0].file == IMM) {
1426 add_candidate_immediate(&table, inst, ip, 0, false, false, block,
1427 devinfo, const_ctx);
1428 }
1429 break;
1430
1431 case ELK_OPCODE_CMP:
1432 case ELK_OPCODE_ADD:
1433 case ELK_OPCODE_MUL:
1434 assert(inst->src[0].file != IMM);
1435
1436 if (could_coissue(devinfo, inst) && inst->src[1].file == IMM) {
1437 add_candidate_immediate(&table, inst, ip, 1, false, false, block,
1438 devinfo, const_ctx);
1439 }
1440 break;
1441
1442 default:
1443 break;
1444 }
1445 }
1446
1447 if (table.num_values == 0) {
1448 ralloc_free(const_ctx);
1449 return false;
1450 }
1451
1452 combine_constants_result *result =
1453 elk_combine_constants(table.values, table.num_values);
1454
1455 table.imm = ralloc_array(const_ctx, struct imm, result->num_values_to_emit);
1456 table.len = 0;
1457
1458 for (unsigned i = 0; i < result->num_values_to_emit; i++) {
1459 struct imm *imm = &table.imm[table.len];
1460
1461 imm->block = NULL;
1462 imm->inst = NULL;
1463 imm->d64 = result->values_to_emit[i].value.u64;
1464 imm->size = result->values_to_emit[i].bit_size / 8;
1465
1466 imm->uses_by_coissue = 0;
1467 imm->must_promote = false;
1468 imm->is_half_float = false;
1469
1470 imm->first_use_ip = UINT16_MAX;
1471 imm->last_use_ip = 0;
1472
1473 imm->uses = new(const_ctx) exec_list;
1474
1475 const unsigned first_user = result->values_to_emit[i].first_user;
1476 const unsigned last_user = first_user +
1477 result->values_to_emit[i].num_users;
1478
1479 for (unsigned j = first_user; j < last_user; j++) {
1480 const unsigned idx = table.values[result->user_map[j].index].instr_index;
1481 fs_inst_box *const ib = &table.boxes[idx];
1482
1483 const unsigned src = table.values[result->user_map[j].index].src;
1484
1485 imm->uses->push_tail(link(const_ctx, ib->inst, src,
1486 result->user_map[j].negate,
1487 result->user_map[j].type));
1488
1489 if (ib->must_promote)
1490 imm->must_promote = true;
1491 else
1492 imm->uses_by_coissue++;
1493
1494 if (imm->block == NULL) {
1495 /* Block should only be NULL on the first pass. On the first
1496 * pass, inst should also be NULL.
1497 */
1498 assert(imm->inst == NULL);
1499
1500 imm->inst = ib->inst;
1501 imm->block = ib->block;
1502 imm->first_use_ip = ib->ip;
1503 imm->last_use_ip = ib->ip;
1504 imm->used_in_single_block = true;
1505 } else {
1506 elk_bblock_t *intersection = idom.intersect(ib->block,
1507 imm->block);
1508
1509 if (ib->block != imm->block)
1510 imm->used_in_single_block = false;
1511
1512 if (imm->first_use_ip > ib->ip) {
1513 imm->first_use_ip = ib->ip;
1514
1515 /* If the first-use instruction is to be tracked, block must be
1516 * the block that contains it. The old block was read in the
1517 * idom.intersect call above, so it is safe to overwrite it
1518 * here.
1519 */
1520 imm->inst = ib->inst;
1521 imm->block = ib->block;
1522 }
1523
1524 if (imm->last_use_ip < ib->ip)
1525 imm->last_use_ip = ib->ip;
1526
1527 /* The common dominator is not the block that contains the
1528 * first-use instruction, so don't track that instruction. The
1529 * load instruction will be added in the common dominator block
1530 * instead of before the first-use instruction.
1531 */
1532 if (intersection != imm->block)
1533 imm->inst = NULL;
1534
1535 imm->block = intersection;
1536 }
1537
1538 if (ib->inst->src[src].type == ELK_REGISTER_TYPE_HF)
1539 imm->is_half_float = true;
1540 }
1541
1542 /* Remove constants from the table that don't have enough uses to make
1543 * them profitable to store in a register.
1544 */
1545 if (imm->must_promote || imm->uses_by_coissue >= 4)
1546 table.len++;
1547 }
1548
1549 delete result;
1550
1551 if (table.len == 0) {
1552 ralloc_free(const_ctx);
1553 return false;
1554 }
1555 if (cfg->num_blocks != 1)
1556 qsort(table.imm, table.len, sizeof(struct imm), compare);
1557
1558 if (devinfo->ver > 7) {
1559 struct register_allocation *regs =
1560 (struct register_allocation *) calloc(table.len, sizeof(regs[0]));
1561
1562 for (int i = 0; i < table.len; i++) {
1563 regs[i].nr = UINT_MAX;
1564 regs[i].avail = 0xffff;
1565 }
1566
1567 foreach_block(block, cfg) {
1568 parcel_out_registers(table.imm, table.len, block, regs, table.len,
1569 alloc, devinfo->ver);
1570 }
1571
1572 free(regs);
1573 } else {
1574 elk_fs_reg reg(VGRF, alloc.allocate(1));
1575 reg.stride = 0;
1576
1577 for (int i = 0; i < table.len; i++) {
1578 struct imm *imm = &table.imm[i];
1579
1580 /* Put the immediate in an offset aligned to its size. Some
1581 * instructions seem to have additional alignment requirements, so
1582 * account for that too.
1583 */
1584 reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
1585
1586 /* Ensure we have enough space in the register to copy the immediate */
1587 if (reg.offset + imm->size > REG_SIZE) {
1588 reg.nr = alloc.allocate(1);
1589 reg.offset = 0;
1590 }
1591
1592 imm->nr = reg.nr;
1593 imm->subreg_offset = reg.offset;
1594
1595 reg.offset += imm->size;
1596 }
1597 }
1598
1599 bool rebuild_cfg = false;
1600
1601 /* Insert MOVs to load the constant values into GRFs. */
1602 for (int i = 0; i < table.len; i++) {
1603 struct imm *imm = &table.imm[i];
1604
1605 /* Insert it either before the instruction that generated the immediate
1606 * or after the last non-control flow instruction of the common ancestor.
1607 */
1608 exec_node *n;
1609 elk_bblock_t *insert_block;
1610 if (imm->inst != nullptr) {
1611 n = imm->inst;
1612 insert_block = imm->block;
1613 } else {
1614 if (imm->block->start()->opcode == ELK_OPCODE_DO) {
1615 /* DO blocks are weird. They can contain only the single DO
1616 * instruction. As a result, MOV instructions cannot be added to
1617 * the DO block.
1618 */
1619 elk_bblock_t *next_block = imm->block->next();
1620 if (next_block->starts_with_control_flow()) {
1621 /* This is the difficult case. This occurs for code like
1622 *
1623 * do {
1624 * do {
1625 * ...
1626 * } while (...);
1627 * } while (...);
1628 *
1629 * when the MOV instructions need to be inserted between the
1630 * two DO instructions.
1631 *
1632 * To properly handle this scenario, a new block would need to
1633 * be inserted. Doing so would require modifying arbitrary many
1634 * CONTINUE, BREAK, and WHILE instructions to point to the new
1635 * block.
1636 *
1637 * It is unlikely that this would ever be correct. Instead,
1638 * insert the MOV instructions in the known wrong place and
1639 * rebuild the CFG at the end of the pass.
1640 */
1641 insert_block = imm->block;
1642 n = insert_block->last_non_control_flow_inst()->next;
1643
1644 rebuild_cfg = true;
1645 } else {
1646 insert_block = next_block;
1647 n = insert_block->start();
1648 }
1649 } else {
1650 insert_block = imm->block;
1651 n = insert_block->last_non_control_flow_inst()->next;
1652 }
1653 }
1654
1655 /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
1656 *
1657 * "In Align16 mode, the channel selects and channel enables apply to a
1658 * pair of half-floats, because these parameters are defined for DWord
1659 * elements ONLY. This is applicable when both source and destination
1660 * are half-floats."
1661 *
1662 * This means that Align16 instructions that use promoted HF immediates
1663 * and use a <0,1,0>:HF region would read 2 HF slots instead of
1664 * replicating the single one we want. To avoid this, we always populate
1665 * both HF slots within a DWord with the constant.
1666 */
1667 const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 2 : 1;
1668 const fs_builder ibld = fs_builder(this, width).at(insert_block, n).exec_all();
1669
1670 elk_fs_reg reg(VGRF, imm->nr);
1671 reg.offset = imm->subreg_offset;
1672 reg.stride = 0;
1673
1674 /* Put the immediate in an offset aligned to its size. Some instructions
1675 * seem to have additional alignment requirements, so account for that
1676 * too.
1677 */
1678 assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm)));
1679
1680 struct elk_reg imm_reg = build_imm_reg_for_copy(imm);
1681
1682 /* Ensure we have enough space in the register to copy the immediate */
1683 assert(reg.offset + type_sz(imm_reg.type) * width <= REG_SIZE);
1684
1685 ibld.MOV(retype(reg, imm_reg.type), imm_reg);
1686 }
1687 shader_stats.promoted_constants = table.len;
1688
1689 /* Rewrite the immediate sources to refer to the new GRFs. */
1690 for (int i = 0; i < table.len; i++) {
1691 foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
1692 elk_fs_reg *reg = &link->inst->src[link->src];
1693
1694 if (link->inst->opcode == ELK_OPCODE_SEL) {
1695 if (link->type == either_type) {
1696 /* Do not change the register type. */
1697 } else if (link->type == integer_only) {
1698 reg->type = elk_int_type(type_sz(reg->type), true);
1699 } else {
1700 assert(link->type == float_only);
1701
1702 switch (type_sz(reg->type)) {
1703 case 2:
1704 reg->type = ELK_REGISTER_TYPE_HF;
1705 break;
1706 case 4:
1707 reg->type = ELK_REGISTER_TYPE_F;
1708 break;
1709 case 8:
1710 reg->type = ELK_REGISTER_TYPE_DF;
1711 break;
1712 default:
1713 unreachable("Bad type size");
1714 }
1715 }
1716 } else if ((link->inst->opcode == ELK_OPCODE_SHL ||
1717 link->inst->opcode == ELK_OPCODE_ASR) &&
1718 link->negate) {
1719 reg->type = elk_int_type(type_sz(reg->type), true);
1720 }
1721
1722 #ifdef DEBUG
1723 switch (reg->type) {
1724 case ELK_REGISTER_TYPE_DF:
1725 assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
1726 (fabs(reg->df) == fabs(table.imm[i].df)));
1727 break;
1728 case ELK_REGISTER_TYPE_F:
1729 assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
1730 (fabsf(reg->f) == fabsf(table.imm[i].f)));
1731 break;
1732 case ELK_REGISTER_TYPE_HF:
1733 assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
1734 isnan(_mesa_half_to_float(table.imm[i].w))) ||
1735 (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
1736 fabsf(_mesa_half_to_float(table.imm[i].w))));
1737 break;
1738 case ELK_REGISTER_TYPE_Q:
1739 assert(abs(reg->d64) == abs(table.imm[i].d64));
1740 break;
1741 case ELK_REGISTER_TYPE_UQ:
1742 assert(!link->negate);
1743 assert(reg->d64 == table.imm[i].d64);
1744 break;
1745 case ELK_REGISTER_TYPE_D:
1746 assert(abs(reg->d) == abs(table.imm[i].d));
1747 break;
1748 case ELK_REGISTER_TYPE_UD:
1749 assert(!link->negate);
1750 assert(reg->d == table.imm[i].d);
1751 break;
1752 case ELK_REGISTER_TYPE_W:
1753 assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
1754 break;
1755 case ELK_REGISTER_TYPE_UW:
1756 assert(!link->negate);
1757 assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
1758 break;
1759 default:
1760 break;
1761 }
1762 #endif
1763
1764 assert(link->inst->can_do_source_mods(devinfo) || !link->negate);
1765
1766 reg->file = VGRF;
1767 reg->offset = table.imm[i].subreg_offset;
1768 reg->stride = 0;
1769 reg->negate = link->negate;
1770 reg->nr = table.imm[i].nr;
1771 }
1772 }
1773
1774 /* Fixup any SEL instructions that have src0 still as an immediate. Fixup
1775 * the types of any SEL instruction that have a negation on one of the
1776 * sources. Adding the negation may have changed the type of that source,
1777 * so the other source (and destination) must be changed to match.
1778 */
1779 for (unsigned i = 0; i < table.num_boxes; i++) {
1780 elk_fs_inst *inst = table.boxes[i].inst;
1781
1782 if (inst->opcode != ELK_OPCODE_SEL)
1783 continue;
1784
1785 /* If both sources have negation, the types had better be the same! */
1786 assert(!inst->src[0].negate || !inst->src[1].negate ||
1787 inst->src[0].type == inst->src[1].type);
1788
1789 /* If either source has a negation, force the type of the other source
1790 * and the type of the result to be the same.
1791 */
1792 if (inst->src[0].negate) {
1793 inst->src[1].type = inst->src[0].type;
1794 inst->dst.type = inst->src[0].type;
1795 }
1796
1797 if (inst->src[1].negate) {
1798 inst->src[0].type = inst->src[1].type;
1799 inst->dst.type = inst->src[1].type;
1800 }
1801
1802 if (inst->src[0].file != IMM)
1803 continue;
1804
1805 assert(inst->src[1].file != IMM);
1806 assert(inst->conditional_mod == ELK_CONDITIONAL_NONE ||
1807 inst->conditional_mod == ELK_CONDITIONAL_GE ||
1808 inst->conditional_mod == ELK_CONDITIONAL_L);
1809
1810 elk_fs_reg temp = inst->src[0];
1811 inst->src[0] = inst->src[1];
1812 inst->src[1] = temp;
1813
1814 /* If this was predicated, flipping operands means we also need to flip
1815 * the predicate.
1816 */
1817 if (inst->conditional_mod == ELK_CONDITIONAL_NONE)
1818 inst->predicate_inverse = !inst->predicate_inverse;
1819 }
1820
1821 if (debug) {
1822 for (int i = 0; i < table.len; i++) {
1823 struct imm *imm = &table.imm[i];
1824
1825 fprintf(stderr,
1826 "0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
1827 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
1828 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
1829 imm->block->num,
1830 imm->nr,
1831 imm->subreg_offset,
1832 imm->must_promote,
1833 imm->uses_by_coissue,
1834 imm->first_use_ip,
1835 imm->last_use_ip,
1836 imm->last_use_ip - imm->first_use_ip);
1837 }
1838 }
1839
1840 if (rebuild_cfg) {
1841 /* When the CFG is initially built, the instructions are removed from
1842 * the list of instructions stored in elk_fs_visitor -- the same exec_node
1843 * is used for membership in that list and in a block list. So we need
1844 * to pull them back before rebuilding the CFG.
1845 */
1846 assert(exec_list_length(&instructions) == 0);
1847 foreach_block(block, cfg) {
1848 exec_list_append(&instructions, &block->instructions);
1849 }
1850
1851 delete cfg;
1852 cfg = NULL;
1853 calculate_cfg();
1854 }
1855
1856 ralloc_free(const_ctx);
1857
1858 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES |
1859 (rebuild_cfg ? DEPENDENCY_BLOCKS : DEPENDENCY_NOTHING));
1860
1861 return true;
1862 }
1863