/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

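/**
 * Check whether \p inst has a 32-bit float destination and at least one
 * half-float source, i.e. whether it's a mixed-float instruction with an
 * f32 destination.
 */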
static bool
is_mixed_float_with_fp32_dst(const fs_inst *inst)
{
   if (inst->dst.type != BRW_REGISTER_TYPE_F)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_REGISTER_TYPE_HF)
         return true;
   }

   return false;
}

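/**
 * Check whether \p inst has a packed (stride 1) half-float destination and
 * at least one 32-bit float source, i.e. whether it's a mixed-float
 * instruction with a packed fp16 destination.
 */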
static bool
is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
{
   if (inst->dst.type != BRW_REGISTER_TYPE_HF ||
       inst->dst.stride != 1)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_REGISTER_TYPE_F)
         return true;
   }

   return false;
}

/**
 * Get the closest allowed SIMD width for instruction \p inst accounting for
 * some common regioning and execution control restrictions that apply to FPU
 * instructions.  These restrictions don't necessarily have any relevance to
 * instructions not executed by the FPU pipeline like extended math, control
 * flow or send message instructions.
 *
 * For virtual opcodes it's really up to the instruction -- in some cases
 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
 * instructions) it may simplify virtual instruction lowering if we can
 * enforce FPU-like regioning restrictions already on the virtual instruction,
 * while in other cases (e.g. virtual send-like instructions) this may be
 * excessively restrictive.
 */
static unsigned
get_fpu_lowered_simd_width(const fs_visitor *shader,
                           const fs_inst *inst)
{
   const struct brw_compiler *compiler = shader->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;

   /* Maximum execution size representable in the instruction controls. */
   unsigned max_width = MIN2(32, inst->exec_size);

   /* Number of channels per polygon handled by a multipolygon PS shader. */
   const unsigned poly_width = shader->dispatch_width /
                               MAX2(1, shader->max_polygons);

   /* Number of registers that will be read by an ATTR source if
    * present for multipolygon PS shaders, since the PS vertex setup
    * data for each polygon is stored in different contiguous GRFs.
    */
   const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT ||
                                    shader->max_polygons < 2 ? 0 :
                                    DIV_ROUND_UP(inst->exec_size,
                                                 poly_width) * reg_unit(devinfo));

   /* According to the PRMs:
    *  "A. In Direct Addressing mode, a source cannot span more than 2
    *      adjacent GRF registers.
    *   B. A destination cannot span more than 2 adjacent GRF registers."
    *
    * Look for the source or destination with the largest register region
    * which is the one that is going to limit the overall execution size of
    * the instruction due to this rule.
    */
   unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);

   for (unsigned i = 0; i < inst->sources; i++)
      reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE),
                       (inst->src[i].file == ATTR ? attr_reg_count : 0));

   /* Calculate the maximum execution size of the instruction based on the
    * factor by which it goes over the hardware limit of 2 GRFs.
    */
   const unsigned max_reg_count = 2 * reg_unit(devinfo);
   if (reg_count > max_reg_count)
      max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));

   /* From the IVB PRMs (applies to HSW too):
    *  "Instructions with condition modifiers must not use SIMD32."
    *
    * From the BDW PRMs (applies to later hardware too):
    *  "Ternary instruction with condition modifiers must not use SIMD32."
    */
   if (inst->conditional_mod && inst->is_3src(compiler) && devinfo->ver < 12)
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to other devices that don't have the
    * intel_device_info::supports_simd16_3src flag set):
    *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
    *   SIMD8 is not allowed for DF operations."
    */
   if (inst->is_3src(compiler) && !devinfo->supports_simd16_3src)
      max_width = MIN2(max_width, inst->exec_size / reg_count);

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is f32. Instruction
    *     execution size must be no more than 8."
    *
    * FIXME: The simulator doesn't seem to complain if we don't do this, and
    * empirical testing with existing CTS tests shows that they pass just fine
    * without implementing it.  However, since our interpretation of the PRM
    * is that conversion MOVs between HF and F are still mixed-float
    * instructions (and therefore subject to this restriction), we decided to
    * split them to be safe.  It might be worth investigating whether the
    * restriction can be lifted safely, since these conversions are common
    * when half-float types are involved: many instructions do not support HF
    * types, so conversions from/to F are required.
    */
   if (is_mixed_float_with_fp32_dst(inst) && devinfo->ver < 20)
      max_width = MIN2(max_width, 8);

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is packed f16 for both
    *     Align1 and Align16."
    */
   if (is_mixed_float_with_packed_fp16_dst(inst) && devinfo->ver < 20)
      max_width = MIN2(max_width, 8);

   /* Only power-of-two execution sizes are representable in the instruction
    * control fields.
    */
   return 1 << util_logbase2(max_width);
}

/**
 * Get the maximum allowed SIMD width for instruction \p inst accounting for
 * various payload size restrictions that apply to sampler message
 * instructions.
 *
 * This is only intended to provide a maximum theoretical bound for the
 * execution size of the message based on the number of argument components
 * alone, which in most cases will determine whether the SIMD8 or SIMD16
 * variant of the message can be used, though some messages may have
 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
 * the message length to determine the exact SIMD width and argument count,
 * which makes a number of sampler message combinations impossible to
 * represent).
 *
 * Note: On platforms with monolithic SIMD16 the possible SIMD widths double,
 * changing from (SIMD8, SIMD16) to (SIMD16, SIMD32).
 */
static unsigned
get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
                               const fs_inst *inst)
{
   /* If we have a min_lod parameter on anything other than a simple sample
    * message, it pushes the payload over 5 arguments and we have to fall
    * back to SIMD8 (SIMD16 on Xe2).
    */
   if (inst->opcode != SHADER_OPCODE_TEX &&
       inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
      return devinfo->ver < 20 ? 8 : 16;

   /* On Gfx9+ the LOD argument is for free if we're able to use the LZ
    * variant of the TXL or TXF message.
    */
   const bool implicit_lod = (inst->opcode == SHADER_OPCODE_TXL ||
                              inst->opcode == SHADER_OPCODE_TXF) &&
                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();

   /* Calculate the total number of argument components that need to be passed
    * to the sampler unit.
    */
   const unsigned num_payload_components =
      inst->components_read(TEX_LOGICAL_SRC_COORDINATE) +
      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
      inst->components_read(TEX_LOGICAL_SRC_MCS);

   const unsigned simd_limit = reg_unit(devinfo) *
      (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);

   /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
    * maximum message size supported by the sampler, regardless of whether a
    * header is provided or not.
    */
   return MIN2(inst->exec_size, simd_limit);
}

/**
 * Get the closest native SIMD width supported by the hardware for instruction
 * \p inst.  The instruction will be left untouched by
 * fs_visitor::lower_simd_width() if the returned value is equal to the
 * original execution size.
 */
unsigned
brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
{
   const struct brw_compiler *compiler = shader->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;

   switch (inst->opcode) {
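   /* Regular ALU instructions use the common FPU regioning and execution
    * control restrictions handled by get_fpu_lowered_simd_width().
    */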
   case BRW_OPCODE_DP4A:
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_SEL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_CSEL:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_MUL:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_SAD2:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
   case BRW_OPCODE_ADD3:
   case FS_OPCODE_PACK:
   case SHADER_OPCODE_SEL_EXEC:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_RELOC_IMM:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
      return get_fpu_lowered_simd_width(shader, inst);

   case BRW_OPCODE_IF:
      assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
      return inst->exec_size;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS: {
      if (inst->dst.type == BRW_REGISTER_TYPE_HF)
         return MIN2(8, inst->exec_size);
      return MIN2(16, inst->exec_size);
   }

   case SHADER_OPCODE_POW: {
      /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
       * to SIMD8 with half-float
       */
      if (inst->dst.type == BRW_REGISTER_TYPE_HF)
         return MIN2(8, inst->exec_size);
      return MIN2(16, inst->exec_size);
   }

   case SHADER_OPCODE_USUB_SAT:
   case SHADER_OPCODE_ISUB_SAT:
      return get_fpu_lowered_simd_width(shader, inst);

   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Integer division is limited to SIMD8 on all generations. */
      return MIN2(8, inst->exec_size);

   case FS_OPCODE_LINTERP:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   case FS_OPCODE_DDX_COARSE:
   case FS_OPCODE_DDX_FINE:
   case FS_OPCODE_DDY_COARSE:
   case FS_OPCODE_DDY_FINE:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_MULH:
      /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
       * is 8-wide on Gfx7+.
       */
      return devinfo->ver >= 20 ? 16 : 8;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      /* Dual-source FB writes are unsupported in SIMD16 mode. */
      return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
              8 : MIN2(16, inst->exec_size));

   case FS_OPCODE_FB_READ_LOGICAL:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_LOGICAL:
   case SHADER_OPCODE_TXF_UMS_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
      return get_sampler_lowered_simd_width(devinfo, inst);

   /* On gfx12 parameters are fixed to 16-bit values and therefore they all
    * always fit regardless of the execution size.
    */
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_TXD_LOGICAL:
      /* TXD is unsupported in SIMD16 mode prior to Xe2. SIMD32 is still
       * unsupported on Xe2.
       */
      return devinfo->ver < 20 ? 8 : 16;

   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      return 8;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      assert(inst->exec_size <= 16);
      return inst->exec_size;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8;

   case SHADER_OPCODE_URB_READ_LOGICAL:
   case SHADER_OPCODE_URB_WRITE_LOGICAL:
      return MIN2(devinfo->ver < 20 ? 8 : 16, inst->exec_size);

   case SHADER_OPCODE_QUAD_SWIZZLE: {
      const unsigned swiz = inst->src[1].ud;
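      /* Uniform sources can be handled with the regular FPU width limit.
       * Otherwise 32-bit types prior to Gfx11 are limited to SIMD8, the
       * XYXY and ZWZW swizzles to SIMD4, and anything else falls back to
       * the regular FPU limit as well.
       */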
      return (is_uniform(inst->src[0]) ?
                 get_fpu_lowered_simd_width(shader, inst) :
              devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
              get_fpu_lowered_simd_width(shader, inst));
   }
   case SHADER_OPCODE_MOV_INDIRECT: {
      /* From IVB and HSW PRMs:
       *
       * "2.When the destination requires two registers and the sources are
       *  indirect, the sources must use 1x1 regioning mode.
       *
       * In case of DF instructions in HSW/IVB, the exec_size is limited by
       * the EU decompression logic not handling VxH indirect addressing
       * correctly.
       */
      const unsigned max_size = 2 * REG_SIZE;
      /* Prior to Broadwell, we only have 8 address subregisters. */
      return MIN3(16,
                  max_size / (inst->dst.stride * type_sz(inst->dst.type)),
                  inst->exec_size);
   }

   case SHADER_OPCODE_LOAD_PAYLOAD: {
      const unsigned reg_count =
         DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);

      if (reg_count > 2) {
         /* Only LOAD_PAYLOAD instructions with per-channel destination region
          * can be easily lowered (which excludes headers and heterogeneous
          * types).
          */
         assert(!inst->header_size);
         for (unsigned i = 0; i < inst->sources; i++)
            assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
                   inst->src[i].file == BAD_FILE);

         return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
      } else {
         return inst->exec_size;
      }
   }
   default:
      return inst->exec_size;
   }
}

/**
 * Return true if splitting out the group of channels of instruction \p inst
 * given by lbld.group() requires allocating a temporary for the i-th source
 * of the lowered instruction.
 */
static inline bool
needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
{
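   /* A temporary copy of the i-th source is needed unless the source region
    * is either periodic (i.e. invariant for every channel group of the
    * lowered width) or reads a single component and the lowered width
    * doesn't exceed the original execution size.  A copy is also needed if
    * the instruction writes a flag register that this source reads, since
    * one lowered instruction's flag write could otherwise clobber the value
    * read by the others.
    */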
   return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
            (inst->components_read(i) == 1 &&
             lbld.dispatch_width() <= inst->exec_size)) ||
          (inst->flags_written(lbld.shader->devinfo) &
           brw_fs_flag_mask(inst->src[i], type_sz(inst->src[i].type)));
}

/**
 * Extract the data that would be consumed by the channel group given by
 * lbld.group() from the i-th source region of instruction \p inst and return
 * it as result in packed form.
 */
static fs_reg
emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
{
   assert(lbld.group() >= inst->group);

   /* Specified channel group from the source region. */
   const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);

   if (needs_src_copy(lbld, inst, i)) {
      /* Builder of the right width to perform the copy avoiding uninitialized
       * data if the lowered execution size is greater than the original
       * execution size of the instruction.
       */
      const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
                                              inst->exec_size), 0);
      const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));

      for (unsigned k = 0; k < inst->components_read(i); ++k)
         cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));

      return tmp;

   } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
      /* The source is invariant for all dispatch_width-wide groups of the
       * original region.
       */
      return inst->src[i];

   } else {
      /* We can just point the lowered instruction at the right channel group
       * from the original region.
       */
      return src;
   }
}

/**
 * Return true if splitting out the group of channels of instruction \p inst
 * given by lbld.group() requires allocating a temporary for the destination
 * of the lowered instruction and copying the data back to the original
 * destination region.
 */
static inline bool
needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
{
   if (inst->dst.is_null())
      return false;

   /* If the instruction writes more than one component we'll have to shuffle
    * the results of multiple lowered instructions in order to make sure that
    * they end up arranged correctly in the original destination region.
    */
   if (inst->size_written > inst->dst.component_size(inst->exec_size))
      return true;

   /* If the lowered execution size is larger than the original the result of
    * the instruction won't fit in the original destination, so we'll have to
    * allocate a temporary in any case.
    */
   if (lbld.dispatch_width() > inst->exec_size)
      return true;

   for (unsigned i = 0; i < inst->sources; i++) {
      /* If we already made a copy of the source for other reasons there won't
       * be any overlap with the destination.
       */
      if (needs_src_copy(lbld, inst, i))
         continue;

      /* In order to keep the logic simple we emit a copy whenever the
       * destination region doesn't exactly match an overlapping source.  Such
       * a mismatch may mean that the source and destination are not aligned
       * group by group, in which case one of the lowered instructions could
       * overwrite data read from the same source by other lowered
       * instructions.
       */
      if (regions_overlap(inst->dst, inst->size_written,
                          inst->src[i], inst->size_read(i)) &&
          !inst->dst.equals(inst->src[i]))
        return true;
   }

   return false;
}

/**
 * Insert data from a packed temporary into the channel group given by
 * lbld.group() of the destination region of instruction \p inst and return
 * the temporary as result.  Any copy instructions that are required for
 * unzipping the previous value (in the case of partial writes) will be
 * inserted using \p lbld_before and any copy instructions required for
 * zipping up the destination of \p inst will be inserted using \p lbld_after.
 */
static fs_reg
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
         fs_inst *inst)
{
   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
   assert(lbld_before.group() == lbld_after.group());
   assert(lbld_after.group() >= inst->group);

   const struct intel_device_info *devinfo = lbld_before.shader->devinfo;

   /* Specified channel group from the destination region. */
   const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);

   if (!needs_dst_copy(lbld_after, inst)) {
      /* No need to allocate a temporary for the lowered instruction, just
       * take the right group of channels from the original region.
       */
      return dst;
   }

   /* Deal with the residency data part later */
   const unsigned residency_size = inst->has_sampler_residency() ?
      (reg_unit(devinfo) * REG_SIZE) : 0;
   const unsigned dst_size = (inst->size_written - residency_size) /
      inst->dst.component_size(inst->exec_size);

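   /* Packed temporary wide enough to hold the result of the lowered
    * instruction, with one extra component reserved for the sampler
    * residency data if present.
    */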
   const fs_reg tmp = lbld_after.vgrf(inst->dst.type,
                                      dst_size + inst->has_sampler_residency());

   if (inst->predicate) {
      /* Handle predication by copying the original contents of the
       * destination into the temporary before emitting the lowered
       * instruction.
       */
      const fs_builder gbld_before =
         lbld_before.group(MIN2(lbld_before.dispatch_width(),
                                inst->exec_size), 0);
      for (unsigned k = 0; k < dst_size; ++k) {
         gbld_before.MOV(offset(tmp, lbld_before, k),
                         offset(dst, inst->exec_size, k));
      }
   }

   const fs_builder gbld_after =
      lbld_after.group(MIN2(lbld_after.dispatch_width(),
                            inst->exec_size), 0);
   for (unsigned k = 0; k < dst_size; ++k) {
      /* Use a builder of the right width to perform the copy avoiding
       * uninitialized data if the lowered execution size is greater than the
       * original execution size of the instruction.
       */
      gbld_after.MOV(offset(dst, inst->exec_size, k),
                     offset(tmp, lbld_after, k));
   }

   if (inst->has_sampler_residency()) {
      /* Sampler messages with residency need special attention.  The first
       * lane of the last component holds the Pixel Null Mask (bits 0:15)
       * together with some upper bits we need to discard (bits 16:31).  We
       * have to build a single 32-bit value for the SIMD32 message out of
       * two SIMD16 16-bit values.
       */
      const fs_builder rbld = gbld_after.exec_all().group(1, 0);
      fs_reg local_res_reg = component(
         retype(offset(tmp, lbld_before, dst_size),
                BRW_REGISTER_TYPE_UW), 0);
      fs_reg final_res_reg =
         retype(byte_offset(inst->dst,
                            inst->size_written - residency_size +
                            gbld_after.group() / 8),
                BRW_REGISTER_TYPE_UW);
      rbld.MOV(final_res_reg, local_res_reg);
   }

   return tmp;
}

bool
brw_fs_lower_simd_width(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const unsigned lower_width = brw_fs_get_lowered_simd_width(&s, inst);

      if (lower_width != inst->exec_size) {
         /* Builder matching the original instruction.  We may also need to
          * emit an instruction of width larger than the original, set the
          * execution size of the builder to the highest of both for now so
          * we're sure that both cases can be handled.
          */
         const unsigned max_width = MAX2(inst->exec_size, lower_width);

         const fs_builder bld = fs_builder(&s).at_end();
         const fs_builder ibld = bld.at(block, inst)
                                    .exec_all(inst->force_writemask_all)
                                    .group(max_width, inst->group / max_width);

         /* Split the copies in chunks of the execution width of either the
          * original or the lowered instruction, whichever is lower.
          */
         const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
         const unsigned residency_size = inst->has_sampler_residency() ?
            (reg_unit(s.devinfo) * REG_SIZE) : 0;
         const unsigned dst_size =
            (inst->size_written - residency_size) /
            inst->dst.component_size(inst->exec_size);

         assert(!inst->writes_accumulator && !inst->mlen);

         /* Inserting the zip, unzip, and duplicated instructions in all of
          * the right spots is somewhat tricky.  All of the unzip and any
          * instructions from the zip which unzip the destination prior to
          * writing need to happen before all of the per-group instructions
          * and the zip instructions need to happen after.  In order to sort
          * this all out, we insert the unzip instructions before \p inst,
          * insert the per-group instructions after \p inst (i.e. before
          * inst->next), and insert the zip instructions before the
          * instruction after \p inst.  Since we are inserting instructions
          * after \p inst, inst->next is a moving target and we need to save
          * it off here so that we insert the zip instructions in the right
          * place.
          *
          * Since we're inserting split instructions after after_inst, the
          * instructions will end up in the reverse order that we insert them.
          * However, certain render target writes require that the low group
          * instructions come before the high group.  From the Ivy Bridge PRM
          * Vol. 4, Pt. 1, Section 3.9.11:
          *
          *    "If multiple SIMD8 Dual Source messages are delivered by the
          *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
          *    issued before the SIMD8_DUALSRC_HI message with the same Slot
          *    Group Select setting."
          *
          * And, from Section 3.9.11.1 of the same PRM:
          *
          *    "When SIMD32 or SIMD16 PS threads send render target writes
          *    with multiple SIMD8 and SIMD16 messages, the following must
          *    hold:
          *
          *    All the slots (as described above) must have a corresponding
          *    render target write irrespective of the slot's validity. A slot
          *    is considered valid when at least one sample is enabled. For
          *    example, a SIMD16 PS thread must send two SIMD8 render target
          *    writes to cover all the slots.
          *
          *    PS thread must send SIMD render target write messages with
          *    increasing slot numbers. For example, SIMD16 thread has
          *    Slot[15:0] and if two SIMD8 render target writes are used, the
          *    first SIMD8 render target write must send Slot[7:0] and the
          *    next one must send Slot[15:8]."
          *
          * In order to make low group instructions come before high group
          * instructions (this is required for some render target writes), we
          * split from the highest group to lowest.
          */
         exec_node *const after_inst = inst->next;
         for (int i = n - 1; i >= 0; i--) {
            /* Emit a copy of the original instruction with the lowered width.
             * If the EOT flag was set throw it away except for the last
             * instruction to avoid killing the thread prematurely.
             */
            fs_inst split_inst = *inst;
            split_inst.exec_size = lower_width;
            split_inst.eot = inst->eot && i == int(n - 1);

            /* Select the correct channel enables for the i-th group, then
             * transform the sources and destination and emit the lowered
             * instruction.
             */
            const fs_builder lbld = ibld.group(lower_width, i);

            for (unsigned j = 0; j < inst->sources; j++)
               split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);

            split_inst.dst = emit_zip(lbld.at(block, inst),
                                      lbld.at(block, after_inst), inst);
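            /* Update the number of bytes written to match the lowered width:
             * dst_size components at the new execution size plus the
             * residency data, if any.
             */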
            split_inst.size_written =
               split_inst.dst.component_size(lower_width) * dst_size +
               residency_size;

            lbld.at(block, inst->next).emit(split_inst);
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
743