/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

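/* Return true if the instruction mixes half-float sources with a
 * single-precision float destination.
 */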
static bool
is_mixed_float_with_fp32_dst(const fs_inst *inst)
{
   if (inst->dst.type != BRW_TYPE_F)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_TYPE_HF)
         return true;
   }

   return false;
}

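/* Return true if the instruction mixes single-precision float sources with a
 * packed (stride 1) half-float destination.
 */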
static bool
is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
{
   if (inst->dst.type != BRW_TYPE_HF || inst->dst.stride != 1)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_TYPE_F)
         return true;
   }

   return false;
}

/**
 * Get the closest allowed SIMD width for instruction \p inst accounting for
 * some common regioning and execution control restrictions that apply to FPU
 * instructions.  These restrictions don't necessarily have any relevance to
 * instructions not executed by the FPU pipeline like extended math, control
 * flow or send message instructions.
 *
 * For virtual opcodes it's really up to the instruction -- in some cases
 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
 * instructions) it may simplify virtual instruction lowering if we can
 * enforce FPU-like regioning restrictions already on the virtual instruction,
 * in other cases (e.g. virtual send-like instructions) this may be
 * excessively restrictive.
 */
static unsigned
get_fpu_lowered_simd_width(const fs_visitor *shader,
                           const fs_inst *inst)
{
   const struct brw_compiler *compiler = shader->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;

   /* Maximum execution size representable in the instruction controls. */
   unsigned max_width = MIN2(32, inst->exec_size);

   /* Number of channels per polygon handled by a multipolygon PS shader. */
   const unsigned poly_width = shader->dispatch_width /
                               MAX2(1, shader->max_polygons);

   /* Number of registers that will be read by an ATTR source if
    * present for multipolygon PS shaders, since the PS vertex setup
    * data for each polygon is stored in different contiguous GRFs.
    */
   const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT ||
                                    shader->max_polygons < 2 ? 0 :
                                    DIV_ROUND_UP(inst->exec_size,
                                                 poly_width) * reg_unit(devinfo));

   /* According to the PRMs:
    *  "A. In Direct Addressing mode, a source cannot span more than 2
    *      adjacent GRF registers.
    *   B. A destination cannot span more than 2 adjacent GRF registers."
    *
    * Look for the source or destination with the largest register region
    * which is the one that is going to limit the overall execution size of
    * the instruction due to this rule.
    */
   unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);

   for (unsigned i = 0; i < inst->sources; i++)
      reg_count = MAX3(reg_count,
                       DIV_ROUND_UP(inst->size_read(devinfo, i), REG_SIZE),
                       (inst->src[i].file == ATTR ? attr_reg_count : 0));

   /* Calculate the maximum execution size of the instruction based on the
    * factor by which it goes over the hardware limit of 2 GRFs.
    */
   const unsigned max_reg_count = 2 * reg_unit(devinfo);
   if (reg_count > max_reg_count)
      max_width = MIN2(max_width,
                       inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
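   /* For example, a SIMD16 instruction reading a source with 64-bit channels
    * spans four GRFs on a platform with 32B registers, twice the two-GRF
    * limit, so the clamp above limits max_width to 16 / 2 = 8.
    */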

   /* From the IVB PRMs (applies to HSW too):
    *  "Instructions with condition modifiers must not use SIMD32."
    *
    * From the BDW PRMs (applies to later hardware too):
    *  "Ternary instruction with condition modifiers must not use SIMD32."
    */
   if (inst->conditional_mod && inst->is_3src(compiler) && devinfo->ver < 12)
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to other devices that don't have the
    * intel_device_info::supports_simd16_3src flag set):
    *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
    *   SIMD8 is not allowed for DF operations."
    */
   if (inst->is_3src(compiler) && !devinfo->supports_simd16_3src)
      max_width = MIN2(max_width, inst->exec_size / reg_count);

   if (inst->opcode != BRW_OPCODE_MOV) {
      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
       * Float Operations:
       *
       *  "No SIMD16 in mixed mode when destination is f32. Instruction
       *   execution size must be no more than 8."
       *
       * Testing indicates that this restriction does not apply to MOVs.
       */
      if (is_mixed_float_with_fp32_dst(inst) && devinfo->ver < 20)
         max_width = MIN2(max_width, 8);

      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
       * Float Operations:
       *
       *  "No SIMD16 in mixed mode when destination is packed f16 for both
       *   Align1 and Align16."
       */
      if (is_mixed_float_with_packed_fp16_dst(inst) && devinfo->ver < 20)
         max_width = MIN2(max_width, 8);
   }

   /* Only power-of-two execution sizes are representable in the instruction
    * control fields.
    */
   return 1 << util_logbase2(max_width);
}

/**
 * Get the maximum allowed SIMD width for instruction \p inst accounting for
 * various payload size restrictions that apply to sampler message
 * instructions.
 *
 * This is only intended to provide a maximum theoretical bound for the
 * execution size of the message based on the number of argument components
 * alone, which in most cases will determine whether the SIMD8 or SIMD16
 * variant of the message can be used, though some messages may have
 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
 * the message length to determine the exact SIMD width and argument count,
 * which makes a number of sampler message combinations impossible to
 * represent).
 *
 * Note: On platforms with monolithic SIMD16 the possible SIMD widths double,
 * from (SIMD8, SIMD16) to (SIMD16, SIMD32).
 */
static unsigned
get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
                               const fs_inst *inst)
{
   /* If we have a min_lod parameter on anything other than a simple sample
    * message, it will push it over 5 arguments and we have to fall back to
    * SIMD8.
    */
   if (inst->opcode != SHADER_OPCODE_TEX_LOGICAL &&
       inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
      return devinfo->ver < 20 ? 8 : 16;

   /* On Gfx9+ the LOD argument is for free if we're able to use the LZ
    * variant of the TXL or TXF message.
    */
   const bool implicit_lod = (inst->opcode == SHADER_OPCODE_TXL_LOGICAL ||
                              inst->opcode == SHADER_OPCODE_TXF_LOGICAL) &&
                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();

   /* Calculate the total number of argument components that need to be passed
    * to the sampler unit.
    */
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components =
      inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components =
      inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;

   unsigned num_payload_components =
      coord_components +
      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
      inst->components_read(TEX_LOGICAL_SRC_MCS) +
      inst->components_read(TEX_LOGICAL_SRC_MIN_LOD);

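   /* Account for the padding of the coordinate and gradient arguments up to
    * their full parameter slots: 3 coordinate components for TXB on Xe2+ and
    * for TXD on gfx12.5 (with gradients padded to 2 components each), and 4
    * coordinate components (with TXD gradients padded to 3 components each)
    * otherwise.
    */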
   if (inst->opcode == FS_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) {
      num_payload_components += 3 - coord_components;
   } else if (inst->opcode == SHADER_OPCODE_TXD_LOGICAL &&
              devinfo->verx10 >= 125 && devinfo->ver < 20) {
      num_payload_components +=
         3 - coord_components + (2 - grad_components) * 2;
   } else {
      num_payload_components += 4 - coord_components;
      if (inst->opcode == SHADER_OPCODE_TXD_LOGICAL)
         num_payload_components += (3 - grad_components) * 2;
   }

   /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
    * maximum message size supported by the sampler, regardless of whether a
    * header is provided or not.
    */
   const unsigned simd_limit = reg_unit(devinfo) *
      (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);

   return MIN2(inst->exec_size, simd_limit);
}

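/* Return true if the instruction has a half-float destination or at least
 * one half-float source.
 */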
static bool
is_half_float_src_dst(const fs_inst *inst)
{
   if (inst->dst.type == BRW_TYPE_HF)
      return true;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_TYPE_HF)
         return true;
   }

   return false;
}

/**
 * Get the closest native SIMD width supported by the hardware for instruction
 * \p inst.  The instruction will be left untouched by
 * fs_visitor::lower_simd_width() if the returned value is equal to the
 * original execution size.
 */
unsigned
brw_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
{
   const struct brw_compiler *compiler = shader->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;

   switch (inst->opcode) {
   case BRW_OPCODE_DP4A:
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_SEL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_CSEL:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_MUL:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
   case BRW_OPCODE_ADD3:
   case FS_OPCODE_PACK:
   case SHADER_OPCODE_SEL_EXEC:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_RELOC_IMM:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
      return get_fpu_lowered_simd_width(shader, inst);

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS: {
      /* Xe2+: BSpec 56797
       *
       * Math operation rules when half-floats are used on both source and
       * destination operands and both source and destinations are packed.
       *
       * The execution size must be 16.
       */
      if (is_half_float_src_dst(inst))
         return devinfo->ver < 20 ? MIN2(8, inst->exec_size) :
                                    MIN2(16, inst->exec_size);
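      /* Prior to Xe2, extended math is limited to SIMD16 (and to SIMD8 with
       * half-float operands, handled above).
       */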
      if (devinfo->ver < 20)
         return MIN2(16, inst->exec_size);

      return inst->exec_size;
   }

   case SHADER_OPCODE_POW: {
      /* SIMD16 is only allowed on Gfx7+.  Extended Math Function is limited
       * to SIMD8 with half-float.
       */
      if (is_half_float_src_dst(inst))
         return MIN2(8, inst->exec_size);
      return MIN2(16, inst->exec_size);
   }

   case SHADER_OPCODE_USUB_SAT:
   case SHADER_OPCODE_ISUB_SAT:
      return get_fpu_lowered_simd_width(shader, inst);

   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Integer division is limited to SIMD8 on all generations. */
      return MIN2(8, inst->exec_size);

   case BRW_OPCODE_PLN:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   case FS_OPCODE_DDX_COARSE:
   case FS_OPCODE_DDX_FINE:
   case FS_OPCODE_DDY_COARSE:
   case FS_OPCODE_DDY_FINE:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_MULH:
      /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
       * is 8-wide on Gfx7+.
       */
      return devinfo->ver >= 20 ? 16 : 8;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      if (devinfo->ver >= 20) {
         /* Dual-source FB writes are unsupported in SIMD32 mode. */
         return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
                 16 : MIN2(32, inst->exec_size));
      } else {
         /* Dual-source FB writes are unsupported in SIMD16 mode. */
         return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
                 8 : MIN2(16, inst->exec_size));
      }

   case FS_OPCODE_FB_READ_LOGICAL:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
      return get_sampler_lowered_simd_width(devinfo, inst);

   case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
   case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
   case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
      if (devinfo->ver >= 20)
         return inst->exec_size;

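      /* Typed messages are limited to SIMD8 on platforms prior to Xe2. */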
      if (inst->src[MEMORY_LOGICAL_MODE].ud == MEMORY_MODE_TYPED)
         return 8;

      /* HDC A64 atomics are limited to SIMD8 */
      if (!devinfo->has_lsc &&
          inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud == LSC_ADDR_SURFTYPE_FLAT &&
          lsc_opcode_is_atomic((enum lsc_opcode)
                               inst->src[MEMORY_LOGICAL_OPCODE].ud))
         return 8;

      return MIN2(16, inst->exec_size);

   /* On gfx12 parameters are fixed to 16-bit values and therefore they all
    * always fit regardless of the execution size.
    */
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_TXD_LOGICAL:
      /* TXD is unsupported in SIMD16 mode prior to Xe2.  SIMD32 is still
       * unsupported on Xe2.
       */
      return devinfo->ver < 20 ? 8 : 16;

   case SHADER_OPCODE_URB_READ_LOGICAL:
   case SHADER_OPCODE_URB_WRITE_LOGICAL:
      return MIN2(devinfo->ver < 20 ? 8 : 16, inst->exec_size);

   case SHADER_OPCODE_QUAD_SWIZZLE: {
      const unsigned swiz = inst->src[1].ud;
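      /* Uniform sources are subject only to the regular FPU limits.  For
       * non-uniform sources, 32-bit types are limited to SIMD8 prior to
       * Gfx11; otherwise the XYXY and ZWZW swizzles are limited to SIMD4 and
       * everything else falls back to the regular FPU limits.
       */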
      return (is_uniform(inst->src[0]) ?
                 get_fpu_lowered_simd_width(shader, inst) :
              devinfo->ver < 11 && brw_type_size_bytes(inst->src[0].type) == 4 ? 8 :
              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
              get_fpu_lowered_simd_width(shader, inst));
   }
   case SHADER_OPCODE_MOV_INDIRECT: {
      /* From IVB and HSW PRMs:
       *
       *  "2. When the destination requires two registers and the sources are
       *   indirect, the sources must use 1x1 regioning mode."
       *
       * In case of DF instructions in HSW/IVB, the exec_size is limited by
       * the EU decompression logic not handling VxH indirect addressing
       * correctly.
       */
      const unsigned max_size = 2 * REG_SIZE;
      /* Prior to Broadwell, we only have 8 address subregisters. */
      return MIN3(16,
                  max_size / (inst->dst.stride * brw_type_size_bytes(inst->dst.type)),
                  inst->exec_size);
   }

   case SHADER_OPCODE_LOAD_PAYLOAD: {
      const unsigned reg_count =
         DIV_ROUND_UP(inst->dst.component_size(inst->exec_size),
                      REG_SIZE * reg_unit(devinfo));

      if (reg_count > 2) {
         /* Only LOAD_PAYLOAD instructions with per-channel destination region
          * can be easily lowered (which excludes headers and heterogeneous
          * types).
          */
         assert(!inst->header_size);
         for (unsigned i = 0; i < inst->sources; i++)
            assert(brw_type_size_bits(inst->dst.type) ==
                   brw_type_size_bits(inst->src[i].type) ||
                   inst->src[i].file == BAD_FILE);

         return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
      } else {
         return inst->exec_size;
      }
   }

   default:
      return inst->exec_size;
   }
}

/**
 * Return true if splitting out the group of channels of instruction \p inst
 * given by lbld.group() requires allocating a temporary for the i-th source
 * of the lowered instruction.
 */
static inline bool
needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
{
   /* The indirectly indexed register stays the same even if we split the
    * instruction.
    */
   if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
      return false;

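   /* Scalar sources never need a copy.  Otherwise a temporary is required
    * unless the source region is invariant for every dispatch_width-wide
    * group of channels (periodic) or only a single component is read within
    * the lowered width.  A copy is also required when the instruction writes
    * flag bits that overlap the flag register read through this source, so
    * that later groups don't see the updated flags.
    */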
   return !inst->src[i].is_scalar &&
          (!(is_periodic(inst->src[i], lbld.dispatch_width()) ||
             (inst->components_read(i) == 1 &&
              lbld.dispatch_width() <= inst->exec_size)) ||
           (inst->flags_written(lbld.shader->devinfo) &
            brw_fs_flag_mask(inst->src[i],
                             brw_type_size_bytes(inst->src[i].type))));
}

/**
 * Extract the data that would be consumed by the channel group given by
 * lbld.group() from the i-th source region of instruction \p inst and return
 * it as result in packed form.
 */
static brw_reg
emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
{
   assert(lbld.group() >= inst->group);

   /* Specified channel group from the source region. */
   const brw_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);

   if (needs_src_copy(lbld, inst, i)) {
      const unsigned num_components = inst->components_read(i);
      const brw_reg tmp = lbld.vgrf(inst->src[i].type, num_components);

      assert(num_components <= NIR_MAX_VEC_COMPONENTS);
      brw_reg comps[NIR_MAX_VEC_COMPONENTS];

      for (unsigned k = 0; k < num_components; ++k)
         comps[k] = offset(src, inst->exec_size, k);
      lbld.VEC(tmp, comps, num_components);

      return tmp;
   } else if (is_periodic(inst->src[i], lbld.dispatch_width()) ||
              (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) ||
              inst->src[i].is_scalar) {
      /* The source is invariant for all dispatch_width-wide groups of the
       * original region.
       *
       * The src[0] of MOV_INDIRECT is invariant regardless of the execution
       * size.
       */
      return inst->src[i];

   } else {
      /* We can just point the lowered instruction at the right channel group
       * from the original region.
       */
      return src;
   }
}


/**
 * Return true if splitting out the group of channels of instruction \p inst
 * given by lbld.group() requires allocating a temporary for the destination
 * of the lowered instruction and copying the data back to the original
 * destination region.
 */
static inline bool
needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
{
   if (inst->dst.is_null())
      return false;

   /* If the instruction writes more than one component we'll have to shuffle
    * the results of multiple lowered instructions in order to make sure that
    * they end up arranged correctly in the original destination region.
    */
   if (inst->size_written > inst->dst.component_size(inst->exec_size))
      return true;

   for (unsigned i = 0; i < inst->sources; i++) {
      /* If we already made a copy of the source for other reasons there won't
       * be any overlap with the destination.
       */
      if (needs_src_copy(lbld, inst, i))
         continue;

      /* In order to keep the logic simple we emit a copy whenever the
       * destination region doesn't exactly match an overlapping source.  In
       * that case the source and destination may not be aligned group by
       * group, which could cause one of the lowered instructions to overwrite
       * data that another lowered instruction still needs to read from the
       * same source.
       */
      if (regions_overlap(inst->dst, inst->size_written,
                          inst->src[i], inst->size_read(lbld.shader->devinfo, i)) &&
          !inst->dst.equals(inst->src[i]))
         return true;
   }

   return false;
}

/**
 * Insert data from a packed temporary into the channel group given by
 * lbld.group() of the destination region of instruction \p inst and return
 * the temporary as result.  Any copy instructions that are required for
 * unzipping the previous value (in the case of partial writes) will be
 * inserted using \p lbld_before and any copy instructions required for
 * zipping up the destination of \p inst will be inserted using \p lbld_after.
 */
static brw_reg
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
         fs_inst *inst)
{
   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
   assert(lbld_before.group() == lbld_after.group());
   assert(lbld_after.group() >= inst->group);

   const struct intel_device_info *devinfo = lbld_before.shader->devinfo;

   /* Specified channel group from the destination region. */
   const brw_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);

   if (!needs_dst_copy(lbld_after, inst)) {
      /* No need to allocate a temporary for the lowered instruction, just
       * take the right group of channels from the original region.
       */
      return dst;
   }

   /* Deal with the residency data part later */
   const unsigned residency_size = inst->has_sampler_residency() ?
      (reg_unit(devinfo) * REG_SIZE) : 0;
   const unsigned dst_size = (inst->size_written - residency_size) /
      inst->dst.component_size(inst->exec_size);

   const brw_reg tmp = lbld_after.vgrf(inst->dst.type,
                                       dst_size + inst->has_sampler_residency());

   if (inst->predicate) {
      /* Handle predication by copying the original contents of the
       * destination into the temporary before emitting the lowered
       * instruction.
       */
      for (unsigned k = 0; k < dst_size; ++k) {
         lbld_before.MOV(offset(tmp, lbld_before, k),
                         offset(dst, inst->exec_size, k));
      }
   }

   for (unsigned k = 0; k < dst_size; ++k) {
      /* Copy the (split) temp into the original (larger) destination */
      lbld_after.MOV(offset(dst, inst->exec_size, k),
                     offset(tmp, lbld_after, k));
   }

   if (inst->has_sampler_residency()) {
      /* Sampler messages with residency need special attention.  The Pixel
       * Null Mask is located in the first lane of the last component
       * (bits 0:15), together with some upper bits we need to discard
       * (bits 16:31).  We have to build a single 32-bit value for the SIMD32
       * message out of two SIMD16 16-bit values.
       */
      const fs_builder rbld = lbld_after.exec_all().group(1, 0);
      brw_reg local_res_reg = component(
         retype(offset(tmp, lbld_before, dst_size), BRW_TYPE_UW), 0);
      brw_reg final_res_reg =
         retype(byte_offset(inst->dst,
                            inst->size_written - residency_size +
                            lbld_after.group() / 8), BRW_TYPE_UW);
      rbld.MOV(final_res_reg, local_res_reg);
   }

   return tmp;
}

bool
brw_lower_simd_width(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const unsigned lower_width = brw_get_lowered_simd_width(&s, inst);

      /* No splitting required */
      if (lower_width == inst->exec_size)
         continue;

      assert(lower_width < inst->exec_size);

      /* Builder matching the original instruction. */
      const fs_builder bld = fs_builder(&s).at_end();
      const fs_builder ibld =
         bld.at(block, inst).exec_all(inst->force_writemask_all)
            .group(inst->exec_size, inst->group / inst->exec_size);

      /* Split the copies in chunks of the execution width of either the
       * original or the lowered instruction, whichever is lower.
       */
      const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
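      /* Size of the destination in whole per-channel components, leaving out
       * the trailing sampler residency data (one register) which emit_zip()
       * handles separately.
       */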
      const unsigned residency_size = inst->has_sampler_residency() ?
         (reg_unit(s.devinfo) * REG_SIZE) : 0;
      const unsigned dst_size =
         (inst->size_written - residency_size) /
         inst->dst.component_size(inst->exec_size);

      assert(!inst->writes_accumulator && !inst->mlen);

      /* Inserting the zip, unzip, and duplicated instructions in all of
       * the right spots is somewhat tricky.  All of the unzip and any
       * instructions from the zip which unzip the destination prior to
       * writing need to happen before all of the per-group instructions
       * and the zip instructions need to happen after.  In order to sort
       * this all out, we insert the unzip instructions before \p inst,
       * insert the per-group instructions after \p inst (i.e. before
       * inst->next), and insert the zip instructions before the
       * instruction after \p inst.  Since we are inserting instructions
       * after \p inst, inst->next is a moving target and we need to save
       * it off here so that we insert the zip instructions in the right
       * place.
       *
       * Since we're inserting split instructions after after_inst, the
       * instructions will end up in the reverse order that we insert them.
       * However, certain render target writes require that the low group
       * instructions come before the high group.  From the Ivy Bridge PRM
       * Vol. 4, Pt. 1, Section 3.9.11:
       *
       *    "If multiple SIMD8 Dual Source messages are delivered by the
       *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
       *    issued before the SIMD8_DUALSRC_HI message with the same Slot
       *    Group Select setting."
       *
       * And, from Section 3.9.11.1 of the same PRM:
       *
       *    "When SIMD32 or SIMD16 PS threads send render target writes
       *    with multiple SIMD8 and SIMD16 messages, the following must
       *    hold:
       *
       *    All the slots (as described above) must have a corresponding
       *    render target write irrespective of the slot's validity.  A slot
       *    is considered valid when at least one sample is enabled.  For
       *    example, a SIMD16 PS thread must send two SIMD8 render target
       *    writes to cover all the slots.
       *
       *    PS thread must send SIMD render target write messages with
       *    increasing slot numbers.  For example, SIMD16 thread has
       *    Slot[15:0] and if two SIMD8 render target writes are used, the
       *    first SIMD8 render target write must send Slot[7:0] and the
       *    next one must send Slot[15:8]."
       *
       * In order to make low group instructions come before high group
       * instructions (this is required for some render target writes), we
       * split from the highest group to lowest.
       */
      exec_node *const after_inst = inst->next;
      for (int i = n - 1; i >= 0; i--) {
         /* Emit a copy of the original instruction with the lowered width.
          * If the EOT flag was set throw it away except for the last
          * instruction to avoid killing the thread prematurely.
          */
         fs_inst split_inst = *inst;
         split_inst.exec_size = lower_width;
         split_inst.eot = inst->eot && i == int(n - 1);

         /* Select the correct channel enables for the i-th group, then
          * transform the sources and destination and emit the lowered
          * instruction.
          */
         const fs_builder lbld = ibld.group(lower_width, i);

         for (unsigned j = 0; j < inst->sources; j++)
            split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);

         split_inst.dst = emit_zip(lbld.at(block, inst),
                                   lbld.at(block, after_inst), inst);
         split_inst.size_written =
            split_inst.dst.component_size(lower_width) * dst_size +
            residency_size;

         lbld.at(block, inst->next).emit(split_inst);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}