/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move. A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
    */
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return type_sz(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone.  We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.stride * type_sz(inst->dst.type);
      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
          !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
         unsigned min_size = type_sz(inst->dst.type);
         unsigned max_size = type_sz(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = type_sz(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   brw_reg_type
   required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool has_64bit = brw_reg_type_is_floating_point(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be
          *    used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int ||
              devinfo->platform == INTEL_PLATFORM_CHV ||
              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
            return BRW_REGISTER_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(type_sz(t), false);
         else
            return t;

      case SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             type_sz(t) > 4)
            return BRW_REGISTER_TYPE_UD;
         else
            return t;

      case SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(type_sz(t), false);
         else
            return t;

      case SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be
          *    used."
          *
          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
          * Therefore we need to lower cluster broadcast using 32-bit int ops.
          *
          * For gfx12.5+ platforms that support int64, the register regions
          * used by cluster broadcast aren't supported by the 64-bit pipeline.
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->verx10 >= 125 ||
              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
            return BRW_REGISTER_TYPE_UD;
         else
            return brw_int_type(type_sz(t), false);

      case SHADER_OPCODE_BROADCAST:
      case SHADER_OPCODE_MOV_INDIRECT:
         if (((intel_device_info_is_9lp(devinfo) ||
               devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
             (devinfo->verx10 >= 125 &&
              brw_reg_type_is_floating_point(inst->src[0].type)))
            return brw_int_type(type_sz(t), false);
         else
            return t;

      default:
         return t;
      }
   }

   /*
    * Return the stride between channels of the specified register in
    * byte units, or ~0u if the region cannot be represented with a
    * single one-dimensional stride.
    */
   unsigned
   byte_stride(const fs_reg &reg)
   {
      switch (reg.file) {
      case BAD_FILE:
      case UNIFORM:
      case IMM:
      case VGRF:
      case ATTR:
         return reg.stride * type_sz(reg.type);
      case ARF:
      case FIXED_GRF:
         if (reg.is_null()) {
            return 0;
         } else {
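            /* The hardware region parameters are stored log2-encoded and
             * biased by one, with an encoding of 0 meaning a stride of 0,
             * while the width field is a plain log2.
             */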
            const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
            const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
            const unsigned width = 1 << reg.width;

            if (width == 1) {
               return vstride * type_sz(reg.type);
            } else if (hstride * width == vstride) {
               return hstride * type_sz(reg.type);
            } else {
               return ~0u;
            }
         }
      default:
         unreachable("Invalid register file");
      }
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
          inst->opcode == BRW_OPCODE_DPAS) {
         return false;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return has_dst_aligned_region_restriction(devinfo, inst) &&
             !is_uniform(inst->src[i]) &&
             (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
              src_byte_offset != dst_byte_offset);
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_send(inst) || inst->is_math()) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            type_sz(inst->dst.type) < type_sz(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported.  The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case SHADER_OPCODE_SHUFFLE:
         case SHADER_OPCODE_QUAD_SWIZZLE:
         case SHADER_OPCODE_CLUSTER_BROADCAST:
         case SHADER_OPCODE_BROADCAST:
         case SHADER_OPCODE_MOV_INDIRECT:
            return 0x1;

         case SHADER_OPCODE_SEL_EXEC:
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst)));
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

namespace brw {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type.  Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != BRW_OPCODE_MUL ||
             brw_reg_type_is_floating_point(get_exec_type(inst)) ||
             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
             type_sz(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      fs_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                              type_sz(inst->src[i].type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself.  Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

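      /* Copy the result from the temporary back into the original
       * destination after the lowered instruction has executed.
       */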
      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
   bool
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
      const fs_builder ibld(v, block, inst);

      fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

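      /* Emit one copy of the instruction per sub-word of the raw execution
       * type, writing each partial result into the temporary and then moving
       * it into the corresponding sub-word of the original destination.
       */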
      for (unsigned j = 0; j < n; j++) {
         fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                 subscript(tmp, raw_type, j));
         if (inst->opcode != BRW_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

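/**
 * Legalize the regioning controls, execution types and modifiers of every
 * instruction in the program, emitting additional copies where necessary.
 */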
bool
brw_fs_lower_regioning(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg)
      progress |= lower_instruction(&s, block, inst);

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}