/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_fs.h"
#include "elk_cfg.h"
#include "elk_fs_builder.h"

using namespace elk;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    *    "A mov with the same source and destination type, no source modifier,
    *     and no saturation is a raw move.  A packed byte destination region (B
    *     or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *     using raw move."
    */
   bool
   is_byte_raw_mov(const elk_fs_inst *inst)
   {
      return type_sz(inst->dst.type) == 1 &&
             inst->opcode == ELK_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const elk_fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone.  We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.stride * type_sz(inst->dst.type);
      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
                 !is_byte_raw_mov(inst)) {
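         /* Narrowing conversions (other than raw byte MOVs) get a
          * destination byte stride equal to the execution type size, so
          * each destination channel starts at the same byte offset as the
          * corresponding execution channel.
          */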
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
         unsigned min_size = type_sz(inst->dst.type);
         unsigned max_size = type_sz(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = type_sz(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
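      /* If any non-uniform data source starts at a different sub-register
       * offset than the destination, returning zero here forces the
       * destination to be placed at the start of a register.
       */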
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   elk_reg_type
   required_exec_type(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      const elk_reg_type t = get_exec_type(inst);
      const bool has_64bit = elk_reg_type_is_floating_point(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case ELK_SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int ||
              devinfo->platform == INTEL_PLATFORM_CHV ||
              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return elk_int_type(type_sz(t), false);
         else
            return t;

      case ELK_SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else
            return t;

      case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return elk_int_type(type_sz(t), false);
         else
            return t;

      case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
          * Therefore we need to lower cluster broadcast using 32-bit int ops.
          *
          * For gfx12.5+ platforms that support int64, the register regions
          * used by cluster broadcast aren't supported by the 64-bit pipeline.
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->verx10 >= 125 ||
              devinfo->platform == INTEL_PLATFORM_CHV ||
              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else
            return elk_int_type(type_sz(t), false);

      case ELK_SHADER_OPCODE_BROADCAST:
      case ELK_SHADER_OPCODE_MOV_INDIRECT:
         if (((devinfo->verx10 == 70 ||
               devinfo->platform == INTEL_PLATFORM_CHV ||
               intel_device_info_is_9lp(devinfo) ||
               devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
             (devinfo->verx10 >= 125 &&
              elk_reg_type_is_floating_point(inst->src[0].type)))
            return elk_int_type(type_sz(t), false);
         else
            return t;

      default:
         return t;
      }
   }

   /*
    * Return the stride between channels of the specified register in
    * byte units, or ~0u if the region cannot be represented with a
    * single one-dimensional stride.
    */
   unsigned
   byte_stride(const elk_fs_reg &reg)
   {
      switch (reg.file) {
      case BAD_FILE:
      case UNIFORM:
      case IMM:
      case VGRF:
      case MRF:
      case ATTR:
         return reg.stride * type_sz(reg.type);
      case ARF:
      case FIXED_GRF:
         if (reg.is_null()) {
            return 0;
         } else {
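            /* Hardware regions store their parameters in a log2 encoding:
             * an hstride or vstride code of zero means a stride of zero,
             * otherwise the stride is 1 << (code - 1), and the width is
             * always 1 << code.  The region collapses to a single
             * one-dimensional stride only for a one-channel row, or when
             * consecutive rows are contiguous with the columns.
             */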
            const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
            const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
            const unsigned width = 1 << reg.width;

            if (width == 1) {
               return vstride * type_sz(reg.type);
            } else if (hstride * width == vstride) {
               return hstride * type_sz(reg.type);
            } else {
               return ~0u;
            }
         }
      default:
         unreachable("Invalid register file");
      }
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const elk_fs_inst *inst,
                          unsigned i)
   {
      if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
          inst->opcode == ELK_OPCODE_DPAS) {
         return false;
      }

      /* Empirical testing shows that Broadwell has a bug affecting half-float
       * MAD instructions when any of its sources has a non-zero offset, such
       * as:
       *
       *    mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
       *
       * We used to generate code like this for SIMD8 executions where we
       * used to pack components Y and W of a vector at offset 16B of a SIMD
       * register.  The problem doesn't occur if the stride of the source is 0.
       */
      if (devinfo->ver == 8 &&
          inst->opcode == ELK_OPCODE_MAD &&
          inst->src[i].type == ELK_REGISTER_TYPE_HF &&
          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
          inst->src[i].stride != 0) {
         return true;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return has_dst_aligned_region_restriction(devinfo, inst) &&
             !is_uniform(inst->src[i]) &&
             (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
              src_byte_offset != dst_byte_offset);
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const elk_fs_inst *inst)
   {
      if (is_send(inst) || inst->is_math()) {
         return false;
      } else {
         const elk_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            type_sz(inst->dst.type) < type_sz(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported.  The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case ELK_SHADER_OPCODE_SHUFFLE:
         case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
         case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
         case ELK_SHADER_OPCODE_BROADCAST:
         case ELK_SHADER_OPCODE_MOV_INDIRECT:
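            /* Only src[0] carries data of the execution type for these
             * opcodes; the remaining sources are indices or control values.
             */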
            return 0x1;

         case ELK_SHADER_OPCODE_SEL_EXEC:
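            /* Both sources of the SEL carry data of the execution type, so
             * both are bit-cast along with the destination.
             */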
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const elk_fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst)));
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      switch (inst->opcode) {
      case ELK_OPCODE_MOV:
         return false;
      case ELK_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const elk_fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const elk_fs_inst *inst)
   {
      return inst->opcode == ELK_OPCODE_SEL ||
             inst->opcode == ELK_OPCODE_CSEL ||
             inst->opcode == ELK_OPCODE_IF ||
             inst->opcode == ELK_OPCODE_WHILE;
   }

   bool
   lower_instruction(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst);
}

namespace elk {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type.  Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != ELK_OPCODE_MUL ||
             elk_reg_type_is_floating_point(get_exec_type(inst)) ||
             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
             type_sz(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const elk_fs_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const elk_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      elk_fs_reg tmp = ibld.vgrf(type, stride);
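      /* The fresh temporary is marked as undefined, which should keep the
       * strided (partial) writes below from appearing to depend on its
       * previous contents.
       */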
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      elk_fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != ELK_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = ELK_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
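      /* Match the destination's byte stride per channel, so the copy of the
       * source into the temporary absorbs all of the shuffling and the
       * region read by the original instruction lines up with its
       * destination.
       */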
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                              type_sz(inst->src[i].type);
      assert(stride > 0);
      elk_fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const elk_reg_type raw_type = elk_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      elk_fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      elk_fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != ELK_OPCODE_MUL || !inst->dst.is_accumulator() ||
             elk_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      elk_fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const elk_reg_type raw_type = elk_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != ELK_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself.  Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
   bool
   lower_exec_type(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const elk_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
      const fs_builder ibld(v, block, inst);

      elk_fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

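      /* Emit one copy of the instruction per raw_type-sized slice of the
       * original execution type.  Each copy computes its slice into the
       * temporary, and a trailing MOV writes that slice back to the original
       * destination, so a wide (e.g. 64-bit) operation is rebuilt out of
       * narrower integer ones.
       */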
      for (unsigned j = 0; j < n; j++) {
         elk_fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         elk_fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                     subscript(tmp, raw_type, j));
         if (inst->opcode != ELK_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

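      /* Note that the destination is handled first: the source checks below
       * compare each source region against the (possibly already rewritten)
       * destination region.
       */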
      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

bool
elk_fs_visitor::lower_regioning()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg)
      progress |= lower_instruction(this, block, inst);

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}