/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    * and no saturation is a raw move. A packed byte destination region (B
    * or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    * using raw move."
    */
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return brw_type_size_bytes(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the specified source of an
    * instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_stride(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return MAX2(brw_type_size_bytes(inst->dst.type),
                     byte_stride(inst->dst));

      } else if (has_subdword_integer_region_restriction(devinfo, inst,
                                                         &inst->src[i], 1)) {
         /* Use a stride of 32bits if possible, since that will guarantee that
          * the copy emitted to lower this region won't be affected by the
          * sub-dword integer region restrictions.  This may not be possible
          * for the second source of an instruction if we're required to use
          * packed data due to Wa_16012383669.
          */
         return (i == 1 ? brw_type_size_bytes(inst->src[i].type) : 4);

      } else {
         return byte_stride(inst->src[i]);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the specified source
    * of an instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_offset(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);

      } else if (has_subdword_integer_region_restriction(devinfo, inst,
                                                         &inst->src[i], 1)) {
         const unsigned dst_byte_stride =
            MAX2(byte_stride(inst->dst), brw_type_size_bytes(inst->dst.type));
         const unsigned src_byte_stride = required_src_byte_stride(devinfo, inst, i);
         const unsigned dst_byte_offset =
            reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const unsigned src_byte_offset =
            reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

         if (src_byte_stride > brw_type_size_bytes(inst->src[i].type)) {
            assert(src_byte_stride >= dst_byte_stride);
            /* The source is affected by the Xe2+ sub-dword integer regioning
             * restrictions.  For the case of source 0, BSpec#56640 specifies a
             * number of equations relating the source and destination
             * sub-register numbers in all cases where a source stride of
             * 32bits is allowed.  These equations have the form:
             *
             *   k * Dst.SubReg % m = Src.SubReg / l
             *
             * for some constants k, l and m that differ for each combination
             * of source and destination types and strides.  The expression in
             * the return statement below computes a valid source offset by
             * inverting the equation like:
             *
             *   Src.SubReg = l * k * (Dst.SubReg % m)
             *
             * and then scaling by the element type sizes in order to get an
             * expression in terms of byte offsets instead of sub-register
             * numbers.  It can be easily verified that in all cases listed on
             * the hardware spec where the source has a well-defined uniform
             * stride the product l*k is equal to the ratio between the source
             * and destination strides.
             */
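            /* Illustrative example (values chosen here, not taken from the
             * bspec): with a packed W destination (dst_byte_stride == 2) and
             * a source widened to a dword stride (src_byte_stride == 4), we
             * get m == 64 * 2 / 4 == 32, so a destination byte offset of 34
             * maps to (34 % 32) * 4 / 2 == 4 bytes into the temporary.
             */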
            const unsigned m = 64 * dst_byte_stride / src_byte_stride;
            return dst_byte_offset % m * src_byte_stride / dst_byte_stride;
         } else {
            assert(src_byte_stride == brw_type_size_bytes(inst->src[i].type));
            /* A packed source is required, likely due to the stricter
             * requirements of the second source region.  The source being
             * packed guarantees that the region of the original instruction
             * will be valid, but the copy may break the regioning
             * restrictions.  Do our best to prevent that from happening by
             * making sure the offset of the temporary matches the original
             * source based on the same equation above.  However, that may
             * not be sufficient if the source had a stride larger than
             * 32bits, in which case the copy may need to be lowered
             * recursively.
             */
            return src_byte_offset * src_byte_stride / byte_stride(inst->src[i]);
         }

      } else {
         return reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
      }
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone.  We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.hstride * brw_type_size_bytes(inst->dst.type);
      } else if (brw_type_size_bytes(inst->dst.type) < get_exec_type_size(inst) &&
                 !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * brw_type_size_bytes(inst->dst.type);
         unsigned min_size = brw_type_size_bytes(inst->dst.type);
         unsigned max_size = brw_type_size_bytes(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = brw_type_size_bytes(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
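         /* For instance (illustrative case, not from the spec): a packed D
          * destination together with a packed W source gives max_stride == 4
          * and min_size == 2, so the result is MIN2(4, 8) == 4 bytes.
          */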
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   brw_reg_type
   required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool has_64bit = brw_type_is_float(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          * "When source or destination datatype is 64b or operation is
          * integer DWord multiply, indirect addressing must not be
          * used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return t;

      case SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          * "When source or destination datatype is 64b or operation is
          * integer DWord multiply, indirect addressing must not be
          * used."
          *
          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
          * Therefore we need to lower cluster broadcast using 32-bit int ops.
          *
          * For gfx12.5+ platforms that support int64, the register regions
          * used by cluster broadcast aren't supported by the 64-bit pipeline.
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->verx10 >= 125 ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return brw_int_type(brw_type_size_bytes(t), false);

      default:
         return t;
      }
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      /* Wa_22016140776:
       *
       * Scalar broadcast on HF math (packed or unpacked) must not be used.
       * Compiler must use a mov instruction to expand the scalar value to
       * a vector before using in a HF (packed or unpacked) math operation.
       */
      if (inst->is_math() && intel_needs_workaround(devinfo, 22016140776) &&
          is_uniform(inst->src[i]) && inst->src[i].type == BRW_TYPE_HF) {
         return true;
      }

      if (is_send(inst) || inst->is_control_source(i) ||
          inst->opcode == BRW_OPCODE_DPAS) {
         return false;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return (has_dst_aligned_region_restriction(devinfo, inst) &&
              !is_uniform(inst->src[i]) &&
              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
               src_byte_offset != dst_byte_offset)) ||
             (has_subdword_integer_region_restriction(devinfo, inst) &&
              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
               src_byte_offset != required_src_byte_offset(devinfo, inst, i)));
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_send(inst)) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            brw_type_size_bytes(inst->dst.type) < brw_type_size_bytes(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported.  The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case SHADER_OPCODE_SHUFFLE:
         case SHADER_OPCODE_QUAD_SWIZZLE:
         case SHADER_OPCODE_CLUSTER_BROADCAST:
         case SHADER_OPCODE_BROADCAST:
         case SHADER_OPCODE_MOV_INDIRECT:
            return 0x1;

         case SHADER_OPCODE_SEL_EXEC:
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /**
    * Return whether the instruction has an unsupported type conversion
    * that must be handled by expanding the source operand.
    */
   bool
   has_invalid_src_conversion(const intel_device_info *devinfo,
                              const fs_inst *inst)
   {
      /* Scalar byte to float conversion is not allowed on DG2+ */
      return devinfo->verx10 >= 125 &&
             inst->opcode == BRW_OPCODE_MOV &&
             brw_type_is_float(inst->dst.type) &&
             brw_type_size_bits(inst->src[0].type) == 8 &&
             is_uniform(inst->src[0]);
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst))) ||
             has_invalid_src_conversion(devinfo, inst);
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

/**
 * Remove any modifiers from the \p i-th source region of the instruction,
 * including negate, abs and any implicit type conversion to the execution
 * type.  Instead any source modifiers will be implemented as a separate
 * MOV instruction prior to the original instruction.
 */
bool
brw_lower_src_modifiers(fs_visitor &s, bblock_t *block, fs_inst *inst, unsigned i)
{
   assert(inst->components_read(i) == 1);
   assert(s.devinfo->has_integer_dword_mul ||
          inst->opcode != BRW_OPCODE_MUL ||
          brw_type_is_float(get_exec_type(inst)) ||
          MIN2(brw_type_size_bytes(inst->src[0].type), brw_type_size_bytes(inst->src[1].type)) >= 4 ||
          brw_type_size_bytes(inst->src[i].type) == get_exec_type_size(inst));

   const brw_builder ibld(&s, block, inst);
   const brw_reg tmp = ibld.vgrf(get_exec_type(inst));

   lower_instruction(&s, block, ibld.MOV(tmp, inst->src[i]));
   inst->src[i] = tmp;

   return true;
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const brw_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
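      /* Illustrative example (values chosen here): a packed HF destination
       * with an F execution type gives 2 * 1 <= 4, so a packed (stride 1)
       * temporary of the execution type is used.
       */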
      const unsigned stride =
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride <= brw_type_size_bytes(type) ? 1 :
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride / brw_type_size_bytes(type);
      brw_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const intel_device_info *devinfo = v->devinfo;
      const brw_builder ibld(v, block, inst);
      const unsigned stride = required_src_byte_stride(devinfo, inst, i) /
                              brw_type_size_bytes(inst->src[i].type);
      assert(stride > 0);
      /* Calculate the size of the temporary allocation manually instead of
       * relying on the builder, since we may have to add some amount of
       * padding mandated by the hardware for Xe2+ instructions with sub-dword
       * integer regions.
       */
      const unsigned size =
         DIV_ROUND_UP(required_src_byte_offset(v->devinfo, inst, i) +
                      inst->exec_size * stride *
                      brw_type_size_bytes(inst->src[i].type),
                      reg_unit(devinfo) * REG_SIZE) * reg_unit(devinfo);
      brw_reg tmp = brw_vgrf(v->alloc.allocate(size), inst->src[i].type);
      ibld.UNDEF(tmp);
      tmp = byte_offset(horiz_stride(tmp, stride),
                        required_src_byte_offset(devinfo, inst, i));

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4),
                                                 false);
      const unsigned n = brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);
      brw_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++) {
         fs_inst *jnst = ibld.MOV(subscript(tmp, raw_type, j),
                                  subscript(raw_src, raw_type, j));
         if (has_subdword_integer_region_restriction(devinfo, jnst)) {
            /* The copy isn't guaranteed to comply with all subdword integer
             * regioning restrictions in some cases.  Lower it recursively.
             */
            lower_instruction(v, block, jnst);
         }
      }

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      brw_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_type_is_float(inst->dst.type));

      const brw_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              brw_type_size_bytes(inst->dst.type);
      assert(stride > 0);
      brw_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      if (!inst->dst.is_null()) {
         /* Emit a series of 32-bit integer copies from the temporary into the
          * original destination.
          */
         const brw_reg_type raw_type =
            brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4), false);

         const unsigned n =
            brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);

         if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
            /* Note that in general we cannot simply predicate the copies on
             * the same flag register as the original instruction, since it
             * may have been overwritten by the instruction itself.  Instead
             * initialize the temporary with the previous contents of the
             * destination register.
             */
            for (unsigned j = 0; j < n; j++)
               ibld.MOV(subscript(tmp, raw_type, j),
                        subscript(inst->dst, raw_type, j));
         }

         for (unsigned j = 0; j < n; j++) {
            fs_inst *jnst = ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                                           subscript(tmp, raw_type, j));
            if (has_subdword_integer_region_restriction(v->devinfo, jnst)) {
               /* The copy isn't guaranteed to comply with all subdword integer
                * regioning restrictions in some cases.  Lower it recursively.
                */
               lower_instruction(v, block, jnst);
            }
         }

         /* If the destination was an accumulator, after lowering it will be a
          * GRF.  Clear writes_accumulator for the instruction.
          */
         if (inst->dst.is_accumulator())
            inst->writes_accumulator = false;
      }

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
   bool
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / brw_type_size_bytes(raw_type);
      const brw_builder ibld(v, block, inst);

      brw_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

      for (unsigned j = 0; j < n; j++) {
         fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                 subscript(tmp, raw_type, j));
         if (inst->opcode != BRW_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

   /**
    * Fast-path for very specific kinds of invalid regions.
    *
    * Gfx12.5+ does not allow moves of B or UB sources to floating-point
    * destinations.  This restriction can be resolved more efficiently than by
    * the general lowering in lower_src_modifiers or lower_src_region.
    */
   void
   lower_src_conversion(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      const brw_builder ibld = brw_builder(v, block, inst).scalar_group();

      /* We only handle scalar conversions from small types for now. */
      assert(is_uniform(inst->src[0]));

      brw_reg tmp = ibld.vgrf(brw_type_with_size(inst->src[0].type, 32));
      fs_inst *mov = ibld.MOV(tmp, inst->src[0]);

      inst->src[0] = component(tmp, 0);

      /* Assert that neither the added MOV nor the original instruction will
       * need any additional lowering.
       */
      assert(!has_invalid_src_region(devinfo, mov, 0));
      assert(!has_invalid_src_modifiers(devinfo, mov, 0));
      assert(!has_invalid_dst_region(devinfo, mov));

      assert(!has_invalid_src_region(devinfo, inst, 0));
      assert(!has_invalid_src_modifiers(devinfo, inst, 0));
   }

   /*
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      /* BROADCAST is special.  Its destination region is a bit of a lie, and
       * it gets lowered in brw_eu_emit.  For the purposes of region
       * restrictions, let's assume that the final code emission will do the
       * right thing.  Doing a bunch of shuffling here is only going to make a
       * mess of things.
       */
      if (inst->opcode == SHADER_OPCODE_BROADCAST)
         return false;

      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      if (has_invalid_src_conversion(devinfo, inst)) {
         lower_src_conversion(v, block, inst);
         progress = true;
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= brw_lower_src_modifiers(*v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

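/**
 * Entry point of the pass: walk every instruction in the program, legalize
 * its regioning controls, and invalidate the analyses that depend on the
 * instruction stream if anything changed.
 */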
bool
brw_lower_regioning(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg)
      progress |= lower_instruction(&s, block, inst);

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}