/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_builder.h"

using namespace brw;

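/* Overview (orientation comment, not part of the upstream sources): this pass
 * legalizes instructions whose destination or source regions, source or
 * destination modifiers, or execution type cannot be encoded directly by the
 * hardware.  Offending operands are rewritten in terms of temporary registers
 * plus extra MOV copies, and those copies are themselves lowered recursively.
 * The individual restrictions handled are documented in the helpers below.
 */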
namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move. A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
    */
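   /* Illustrative example (not from the PRM): mov(16) dst<1>:UB src<16;16,1>:UB
    * qualifies as a raw byte move, whereas mov(16) dst<1>:UB src<16;16,1>:UW
    * does not, since the implicit UW->UB conversion makes it a non-raw move.
    */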
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return brw_type_size_bytes(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the specified source of an
    * instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_stride(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return MAX2(brw_type_size_bytes(inst->dst.type),
                     byte_stride(inst->dst));

      } else if (has_subdword_integer_region_restriction(devinfo, inst,
                                                         &inst->src[i], 1)) {
         /* Use a stride of 32bits if possible, since that will guarantee that
          * the copy emitted to lower this region won't be affected by the
          * sub-dword integer region restrictions.  This may not be possible
          * for the second source of an instruction if we're required to use
          * packed data due to Wa_16012383669.
          */
         return (i == 1 ? brw_type_size_bytes(inst->src[i].type) : 4);

      } else {
         return byte_stride(inst->src[i]);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the specified source
    * of an instruction affected by a regioning restriction.
    */
   unsigned
   required_src_byte_offset(const intel_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
   {
      if (has_dst_aligned_region_restriction(devinfo, inst)) {
         return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);

      } else if (has_subdword_integer_region_restriction(devinfo, inst,
                                                         &inst->src[i], 1)) {
         const unsigned dst_byte_stride =
            MAX2(byte_stride(inst->dst), brw_type_size_bytes(inst->dst.type));
         const unsigned src_byte_stride = required_src_byte_stride(devinfo, inst, i);
         const unsigned dst_byte_offset =
            reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const unsigned src_byte_offset =
            reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

         if (src_byte_stride > brw_type_size_bytes(inst->src[i].type)) {
            assert(src_byte_stride >= dst_byte_stride);
            /* The source is affected by the Xe2+ sub-dword integer regioning
             * restrictions.  For the case of source 0 BSpec#56640 specifies a
             * number of equations relating the source and destination
             * sub-register numbers in all cases where a source stride of
             * 32bits is allowed.  These equations have the form:
             *
             *   k * Dst.SubReg % m = Src.SubReg / l
             *
             * For some constants k, l and m different for each combination of
             * source and destination types and strides.  The expression in
             * the return statement below computes a valid source offset by
             * inverting the equation like:
             *
             *   Src.SubReg = l * k * (Dst.SubReg % m)
             *
             * and then scaling by the element type sizes in order to get an
             * expression in terms of byte offsets instead of sub-register
             * numbers.  It can be easily verified that in all cases listed on
             * the hardware spec where the source has a well-defined uniform
             * stride the product l*k is equal to the ratio between the source
             * and destination strides.
             */
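            /* Worked example (illustrative only, not from the BSpec): with a
             * packed byte destination (dst_byte_stride == 1) and a required
             * 32-bit source stride (src_byte_stride == 4), m == 16, so the
             * return expression below places the source at a byte offset of
             * 4 * (dst_byte_offset % 16) within its GRF.
             */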
            const unsigned m = 64 * dst_byte_stride / src_byte_stride;
            return dst_byte_offset % m * src_byte_stride / dst_byte_stride;
         } else {
            assert(src_byte_stride == brw_type_size_bytes(inst->src[i].type));
            /* A packed source is required, likely due to the stricter
             * requirements of the second source region.  The source being
             * packed guarantees that the region of the original instruction
             * will be valid, but the copy may break the regioning
             * restrictions.  Do our best to try to prevent that from
             * happening by making sure the offset of the temporary matches
             * the original source based on the same equation above.  However,
             * that may not be sufficient if the source had a stride larger
             * than 32 bits; in that case lowering the copy recursively may be
             * necessary.
             */
            return src_byte_offset * src_byte_stride / byte_stride(inst->src[i]);
         }

      } else {
         return reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
      }
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone.  We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.hstride * brw_type_size_bytes(inst->dst.type);
      } else if (brw_type_size_bytes(inst->dst.type) < get_exec_type_size(inst) &&
          !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * brw_type_size_bytes(inst->dst.type);
         unsigned min_size = brw_type_size_bytes(inst->dst.type);
         unsigned max_size = brw_type_size_bytes(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = brw_type_size_bytes(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
               return 0;
      }

      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
   }

   /*
    * Return the closest legal execution type for an instruction on
    * the specified platform.
    */
   brw_reg_type
   required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool has_64bit = brw_type_is_float(t) ?
         devinfo->has_64bit_float : devinfo->has_64bit_int;

      switch (inst->opcode) {
      case SHADER_OPCODE_SHUFFLE:
         /* IVB has an issue (which we found empirically) where it reads
          * two address register components per channel for indirectly
          * addressed 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be
          *    used."
          *
          * Work around both of the above and handle platforms that
          * don't support 64-bit types at all.
          */
         if ((!devinfo->has_64bit_int ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_SEL_EXEC:
         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
             brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return t;

      case SHADER_OPCODE_QUAD_SWIZZLE:
         if (has_dst_aligned_region_restriction(devinfo, inst))
            return brw_int_type(brw_type_size_bytes(t), false);
         else
            return t;

      case SHADER_OPCODE_CLUSTER_BROADCAST:
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be
          *    used."
          *
          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
          * Therefore we need to lower cluster broadcast using 32-bit int ops.
          *
          * For gfx12.5+ platforms that support int64, the register regions
          * used by cluster broadcast aren't supported by the 64-bit pipeline.
          *
          * Work around the above and handle platforms that don't
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->verx10 >= 125 ||
              intel_device_info_is_9lp(devinfo) ||
              devinfo->ver >= 20) && brw_type_size_bytes(t) > 4)
            return BRW_TYPE_UD;
         else
            return brw_int_type(brw_type_size_bytes(t), false);

      default:
         return t;
      }
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      /* Wa_22016140776:
       *
       *    Scalar broadcast on HF math (packed or unpacked) must not be used.
       *    Compiler must use a mov instruction to expand the scalar value to
       *    a vector before using in a HF (packed or unpacked) math operation.
       */
      if (inst->is_math() && intel_needs_workaround(devinfo, 22016140776) &&
          is_uniform(inst->src[i]) && inst->src[i].type == BRW_TYPE_HF) {
         return true;
      }

      if (is_send(inst) || inst->is_control_source(i) ||
          inst->opcode == BRW_OPCODE_DPAS) {
         return false;
      }

      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);

      return (has_dst_aligned_region_restriction(devinfo, inst) &&
              !is_uniform(inst->src[i]) &&
              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
               src_byte_offset != dst_byte_offset)) ||
             (has_subdword_integer_region_restriction(devinfo, inst) &&
              (byte_stride(inst->src[i]) != required_src_byte_stride(devinfo, inst, i) ||
               src_byte_offset != required_src_byte_offset(devinfo, inst, i)));
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_send(inst)) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            brw_type_size_bytes(inst->dst.type) < brw_type_size_bytes(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported.  The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
         switch (inst->opcode) {
         case SHADER_OPCODE_SHUFFLE:
         case SHADER_OPCODE_QUAD_SWIZZLE:
         case SHADER_OPCODE_CLUSTER_BROADCAST:
         case SHADER_OPCODE_BROADCAST:
         case SHADER_OPCODE_MOV_INDIRECT:
            return 0x1;

         case SHADER_OPCODE_SEL_EXEC:
            return 0x3;

         default:
            unreachable("Unknown invalid execution type source mask.");
         }
      } else {
         return 0;
      }
   }

   /**
    * Return whether the instruction has an unsupported type conversion
    * that must be handled by expanding the source operand.
    */
   bool
   has_invalid_src_conversion(const intel_device_info *devinfo,
                              const fs_inst *inst)
   {
      /* Scalar byte to float conversion is not allowed on DG2+ */
      return devinfo->verx10 >= 125 &&
             inst->opcode == BRW_OPCODE_MOV &&
             brw_type_is_float(inst->dst.type) &&
             brw_type_size_bits(inst->src[0].type) == 8 &&
             is_uniform(inst->src[0]);
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst))) ||
             has_invalid_src_conversion(devinfo, inst);
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

/**
 * Remove any modifiers from the \p i-th source region of the instruction,
 * including negate, abs and any implicit type conversion to the execution
 * type.  Instead any source modifiers will be implemented as a separate
 * MOV instruction prior to the original instruction.
 */
bool
brw_lower_src_modifiers(fs_visitor &s, bblock_t *block, fs_inst *inst, unsigned i)
{
   assert(inst->components_read(i) == 1);
   assert(s.devinfo->has_integer_dword_mul ||
          inst->opcode != BRW_OPCODE_MUL ||
          brw_type_is_float(get_exec_type(inst)) ||
          MIN2(brw_type_size_bytes(inst->src[0].type), brw_type_size_bytes(inst->src[1].type)) >= 4 ||
          brw_type_size_bytes(inst->src[i].type) == get_exec_type_size(inst));

   const brw_builder ibld(&s, block, inst);
   const brw_reg tmp = ibld.vgrf(get_exec_type(inst));

   lower_instruction(&s, block, ibld.MOV(tmp, inst->src[i]));
   inst->src[i] = tmp;

   return true;
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const brw_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
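      /* For instance (illustrative): a packed W destination of a D-typed
       * operation gets a packed D temporary (stride 1), while a W destination
       * with a stride of 4 elements (8 bytes) gets a D temporary with a
       * stride of 2, preserving the channel alignment of the original
       * destination.
       */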
      const unsigned stride =
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride <= brw_type_size_bytes(type) ? 1 :
         brw_type_size_bytes(inst->dst.type) * inst->dst.stride / brw_type_size_bytes(type);
      brw_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const intel_device_info *devinfo = v->devinfo;
      const brw_builder ibld(v, block, inst);
      const unsigned stride = required_src_byte_stride(devinfo, inst, i) /
                              brw_type_size_bytes(inst->src[i].type);
      assert(stride > 0);
      /* Calculate the size of the temporary allocation manually instead of
       * relying on the builder, since we may have to add some amount of
       * padding mandated by the hardware for Xe2+ instructions with sub-dword
       * integer regions.
       */
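      /* E.g. (illustrative): a SIMD16 W-typed source lowered to a 32-bit
       * stride occupies 16 * 2 * 2 = 64 bytes starting at the required byte
       * offset, which the DIV_ROUND_UP below rounds up to a whole number of
       * physical registers.
       */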
      const unsigned size =
         DIV_ROUND_UP(required_src_byte_offset(v->devinfo, inst, i) +
                      inst->exec_size * stride *
                      brw_type_size_bytes(inst->src[i].type),
                      reg_unit(devinfo) * REG_SIZE) * reg_unit(devinfo);
      brw_reg tmp = brw_vgrf(v->alloc.allocate(size), inst->src[i].type);
      ibld.UNDEF(tmp);
      tmp = byte_offset(horiz_stride(tmp, stride),
                        required_src_byte_offset(devinfo, inst, i));

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4),
                                                 false);
      const unsigned n = brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);
      brw_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++) {
         fs_inst *jnst = ibld.MOV(subscript(tmp, raw_type, j),
                                  subscript(raw_src, raw_type, j));
         if (has_subdword_integer_region_restriction(devinfo, jnst)) {
            /* The copy isn't guaranteed to comply with all subdword integer
             * regioning restrictions in some cases.  Lower it recursively.
             */
            lower_instruction(v, block, jnst);
         }
      }

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      brw_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_type_is_float(inst->dst.type));

      const brw_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              brw_type_size_bytes(inst->dst.type);
      assert(stride > 0);
      brw_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      if (!inst->dst.is_null()) {
         /* Emit a series of 32-bit integer copies from the temporary into the
          * original destination.
          */
         const brw_reg_type raw_type =
            brw_int_type(MIN2(brw_type_size_bytes(tmp.type), 4), false);

         const unsigned n =
            brw_type_size_bytes(tmp.type) / brw_type_size_bytes(raw_type);

         if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
            /* Note that in general we cannot simply predicate the copies on
             * the same flag register as the original instruction, since it
             * may have been overwritten by the instruction itself.  Instead
             * initialize the temporary with the previous contents of the
             * destination register.
             */
            for (unsigned j = 0; j < n; j++)
               ibld.MOV(subscript(tmp, raw_type, j),
                        subscript(inst->dst, raw_type, j));
         }

         for (unsigned j = 0; j < n; j++) {
            fs_inst *jnst = ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                                           subscript(tmp, raw_type, j));
            if (has_subdword_integer_region_restriction(v->devinfo, jnst)) {
               /* The copy isn't guaranteed to comply with all subdword integer
                * regioning restrictions in some cases.  Lower it recursively.
                */
               lower_instruction(v, block, jnst);
            }
         }

         /* If the destination was an accumulator, after lowering it will be a
          * GRF. Clear writes_accumulator for the instruction.
          */
         if (inst->dst.is_accumulator())
            inst->writes_accumulator = false;
      }

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Change sources and destination of the instruction to an
    * appropriate legal type, splitting the instruction into multiple
    * ones of smaller execution type if necessary, to be used in cases
    * where the execution type of an instruction is unsupported.
    */
   bool
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
      const unsigned n = get_exec_type_size(inst) / brw_type_size_bytes(raw_type);
      const brw_builder ibld(v, block, inst);

      brw_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, inst->dst.stride);

      for (unsigned j = 0; j < n; j++) {
         fs_inst sub_inst = *inst;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (mask & (1u << i)) {
               assert(inst->src[i].type == inst->dst.type);
               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
            }
         }

         sub_inst.dst = subscript(tmp, raw_type, j);

         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
         ibld.emit(sub_inst);

         fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
                                 subscript(tmp, raw_type, j));
         if (inst->opcode != BRW_OPCODE_SEL) {
            mov->predicate = inst->predicate;
            mov->predicate_inverse = inst->predicate_inverse;
         }
         lower_instruction(v, block, mov);
      }

      inst->remove(block);

      return true;
   }

   /**
    * Fast-path for very specific kinds of invalid regions.
    *
    * Gfx12.5+ does not allow moves of B or UB sources to floating-point
    * destinations. This restriction can be resolved more efficiently than by
    * the general lowering in lower_src_modifiers or lower_src_region.
    */
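   /* Sketch of the effect (editorial note): a scalar B->F MOV becomes a
    * B->D MOV into a 32-bit temporary followed by the original MOV reading
    * component 0 of that temporary, i.e. a D->F conversion.
    */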
   void
   lower_src_conversion(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      const brw_builder ibld = brw_builder(v, block, inst).scalar_group();

      /* We only handle scalar conversions from small types for now. */
      assert(is_uniform(inst->src[0]));

      brw_reg tmp = ibld.vgrf(brw_type_with_size(inst->src[0].type, 32));
      fs_inst *mov = ibld.MOV(tmp, inst->src[0]);

      inst->src[0] = component(tmp, 0);

      /* Assert that neither the added MOV nor the original instruction will need
       * any additional lowering.
       */
      assert(!has_invalid_src_region(devinfo, mov, 0));
      assert(!has_invalid_src_modifiers(devinfo, mov, 0));
      assert(!has_invalid_dst_region(devinfo, mov));

      assert(!has_invalid_src_region(devinfo, inst, 0));
      assert(!has_invalid_src_modifiers(devinfo, inst, 0));
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      /* BROADCAST is special. Its destination region is a bit of a lie, and
       * it gets lowered in brw_eu_emit. For the purposes of region
       * restrictions, let's assume that the final code emission will do the
       * right thing. Doing a bunch of shuffling here is only going to make a
       * mess of things.
       */
      if (inst->opcode == SHADER_OPCODE_BROADCAST)
         return false;

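      /* Ordering note: destination modifiers and regions are legalized before
       * the sources, and the execution type last.  Many of the copies emitted
       * by the helpers are passed back through lower_instruction(), so they
       * get legalized recursively where needed.
       */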
      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      if (has_invalid_src_conversion(devinfo, inst)) {
         lower_src_conversion(v, block, inst);
         progress = true;
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= brw_lower_src_modifiers(*v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

bool
brw_lower_regioning(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg)
      progress |= lower_instruction(&s, block, inst);

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}