/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move. A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
    */
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return type_sz(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone. We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.stride * type_sz(inst->dst.type);
      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
                 !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
         unsigned min_size = type_sz(inst->dst.type);
         unsigned max_size = type_sz(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = type_sz(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
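         /* Worked example (hypothetical operands, added for illustration):
          * a raw byte MOV with a packed UB destination and a UB source with
          * a byte stride of 8 (e.g. one byte of each 64-bit channel) gives
          * max_stride = 8 and min_size = 1, so the result is clamped to
          * MIN2(8, 4 * 1) = 4 bytes.
          */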
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
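   /* Illustrative example (hypothetical offsets): if the destination starts
    * at byte 16 of its register while a non-uniform source starts at byte 0,
    * the offsets disagree and 0 is returned, so the lowered destination
    * temporary ends up register-aligned.
    */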
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   brw_reg_type
   required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool has_64bit = brw_reg_type_is_floating_point(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int ||
              devinfo->platform == INTEL_PLATFORM_CHV ||
              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
            return BRW_REGISTER_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(type_sz(t), false);
         else
            return t;

      case SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             type_sz(t) > 4)
            return BRW_REGISTER_TYPE_UD;
         else
            return t;

      case SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(type_sz(t), false);
         else
            return t;

      case SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be
          *     used."
          *
          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
          * Therefore we need to lower cluster broadcast using 32-bit int ops.
          *
          * For gfx12.5+ platforms that support int64, the register regions
          * used by cluster broadcast aren't supported by the 64-bit pipeline.
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->verx10 >= 125 ||
              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
            return BRW_REGISTER_TYPE_UD;
         else
            return brw_int_type(type_sz(t), false);

      case SHADER_OPCODE_BROADCAST:
      case SHADER_OPCODE_MOV_INDIRECT:
         if (((intel_device_info_is_9lp(devinfo) ||
               devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
             (devinfo->verx10 >= 125 &&
              brw_reg_type_is_floating_point(inst->src[0].type)))
            return brw_int_type(type_sz(t), false);
         else
            return t;

      default:
         return t;
      }
   }

   /*
    * Return the stride between channels of the specified register in
    * byte units, or ~0u if the region cannot be represented with a
    * single one-dimensional stride.
    */
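   /* Illustrative example (hypothetical regions): a FIXED_GRF region of the
    * form <4;4,1> with a D-typed register has an effective hstride of 1,
    * vstride of 4 and width of 4, so hstride * width == vstride and the
    * result is 1 * 4 = 4 bytes; a scalar <0;1,0> region has width 1 and
    * vstride 0, so the result is 0.
    */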
   unsigned
   byte_stride(const fs_reg &reg)
   {
      switch (reg.file) {
      case BAD_FILE:
      case UNIFORM:
      case IMM:
      case VGRF:
      case ATTR:
         return reg.stride * type_sz(reg.type);
      case ARF:
      case FIXED_GRF:
         if (reg.is_null()) {
            return 0;
         } else {
            const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
            const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
            const unsigned width = 1 << reg.width;

            if (width == 1) {
               return vstride * type_sz(reg.type);
            } else if (hstride * width == vstride) {
               return hstride * type_sz(reg.type);
            } else {
               return ~0u;
            }
         }
      default:
         unreachable("Invalid register file");
      }
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
          inst->opcode == BRW_OPCODE_DPAS) {
         return false;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return has_dst_aligned_region_restriction(devinfo, inst) &&
             !is_uniform(inst->src[i]) &&
             (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
              src_byte_offset != dst_byte_offset);
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_send(inst) || inst->is_math()) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            type_sz(inst->dst.type) < type_sz(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported. The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case SHADER_OPCODE_SHUFFLE:
         case SHADER_OPCODE_QUAD_SWIZZLE:
         case SHADER_OPCODE_CLUSTER_BROADCAST:
         case SHADER_OPCODE_BROADCAST:
         case SHADER_OPCODE_MOV_INDIRECT:
            return 0x1;

         case SHADER_OPCODE_SEL_EXEC:
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst)));
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

namespace brw {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type. Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
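   /* Illustrative before/after (hypothetical instruction): lowering the
    * negated source of "add dst, -src0, src1" yields "mov tmp, -src0"
    * followed by "add dst, tmp, src1", with tmp typed as the execution type.
    */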
   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != BRW_OPCODE_MUL ||
             brw_reg_type_is_floating_point(get_exec_type(inst)) ||
             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
             type_sz(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type. Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
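   /* Illustrative before/after (hypothetical instruction): lowering the
    * saturate modifier of "add.sat dst, src0, src1" yields
    * "add tmp, src0, src1" followed by "mov.sat dst, tmp".
    */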
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      fs_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction. Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
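   /* Illustrative example (hypothetical operands): if a Q-typed source does
    * not match the destination's channel layout, a Q-typed temporary with
    * the destination's stride is filled with two raw UD copies per channel
    * (low and high dwords), and the original instruction then reads the
    * temporary with its source modifiers reapplied.
    */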
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                              type_sz(inst->src[i].type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction. Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
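   /* Illustrative example (hypothetical operands): if the destination's
    * stride is illegal for the instruction, the result is written to a
    * temporary with the required stride, and raw 32-bit copies from the
    * temporary into the original destination are emitted after the
    * instruction. For predicated instructions (other than SEL) the
    * temporary is first seeded with the old destination contents so the
    * unconditional copies preserve the values of disabled channels.
    */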
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself. Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
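   /* Illustrative example (hypothetical operands): a Q-typed SEL_EXEC on a
    * platform without 64-bit integer support is rewritten as two UD-typed
    * SEL_EXECs operating on the low and high dwords of a temporary, followed
    * by dword copies of the temporary into the original destination.
    */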
   bool
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
      const fs_builder ibld(v, block, inst);

      fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

      for (unsigned j = 0; j < n; j++) {
         fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                 subscript(tmp, raw_type, j));
         if (inst->opcode != BRW_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

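/**
 * Lower every instruction in the program whose regioning controls, execution
 * type or modifiers are unsupported by the hardware, and invalidate the
 * analyses that depend on the instruction stream if anything changed.
 */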
bool
brw_fs_lower_regioning(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg)
      progress |= lower_instruction(&s, block, inst);

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}