/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

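/**
 * Run the backend optimization and lowering pipeline: iterate the core
 * optimization passes to a fixed point, then apply the early, middle, and
 * late lowering stages, each followed by targeted cleanup passes.
 */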
void
brw_optimize(fs_visitor &s)
{
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   brw_validate(s);

   /* Track how much of the shader is non-SSA at this point. */
   {
      const brw::def_analysis &defs = s.def_analysis.require();
      s.shader_stats.non_ssa_registers_after_nir =
         defs.count() - defs.ssa_count();
   }

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

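   /* OPT() invokes a pass, dumps the IR for the optimizer debug output
    * when the pass reports progress, validates the result, and accumulates
    * the overall progress flag. The expression itself evaluates to whether
    * this particular invocation made progress, so it can be used in
    * conditions like "if (OPT(...))".
    */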
#define OPT(pass, ...) ({                                             \
      pass_num++;                                                     \
      bool this_progress = pass(s, ##__VA_ARGS__);                    \
                                                                      \
      if (this_progress)                                              \
         s.debug_optimizer(nir, #pass, iteration, pass_num);          \
                                                                      \
      brw_validate(s);                                                \
                                                                      \
      progress = progress || this_progress;                           \
      this_progress;                                                  \
   })

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code. The results of some NIR
    * instructions may effectively be calculated twice: once when the
    * instruction is encountered, and again when the user of that result is
    * encountered. Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_opt_dead_code_eliminate);

   OPT(brw_opt_remove_extra_rounding_modes);

   OPT(brw_opt_eliminate_find_live_channel);

   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_opt_algebraic);
      OPT(brw_opt_cse_defs);
      if (!OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_copy_propagation);
      OPT(brw_opt_cmod_propagation);
      OPT(brw_opt_dead_code_eliminate);
      OPT(brw_opt_saturate_propagation);
      OPT(brw_opt_register_coalesce);

      OPT(brw_opt_compact_virtual_grfs);
   } while (progress);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_OPT_LOOP);

   progress = false;
   pass_num = 0;

   if (OPT(brw_opt_combine_convergent_txf))
      OPT(brw_opt_copy_propagation_defs);

   if (OPT(brw_lower_pack)) {
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_subgroup_ops);
   OPT(brw_lower_csel);
   OPT(brw_lower_simd_width);
   OPT(brw_lower_scalar_fp64_MAD);
   OPT(brw_lower_barycentrics);
   OPT(brw_lower_logical_sends);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);

   /* After logical SEND lowering. */

   if (!OPT(brw_opt_copy_propagation_defs))
      OPT(brw_opt_copy_propagation);

   /* Trim trailing zero sources from the LOAD_PAYLOADs of sampler
    * messages. Do this before splitting SENDs.
    */
   if (OPT(brw_opt_zero_samples)) {
      if (!OPT(brw_opt_copy_propagation_defs)) {
         OPT(brw_opt_copy_propagation);
      }
   }

   OPT(brw_opt_split_sends);
   OPT(brw_workaround_nomask_control_flow);

   if (progress) {
      /* Do both forms of copy propagation because it is important to
       * eliminate as many cases of load_payload-of-load_payload as possible.
       */
      OPT(brw_opt_copy_propagation_defs);
      OPT(brw_opt_copy_propagation);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_opt_cse_defs);
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_opt_remove_redundant_halts);

   if (OPT(brw_lower_load_payload)) {
      OPT(brw_opt_split_virtual_grfs);

      OPT(brw_opt_register_coalesce);
      OPT(brw_lower_simd_width);
      OPT(brw_opt_dead_code_eliminate);
   }

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);

   OPT(brw_lower_alu_restrictions);

   OPT(brw_opt_combine_constants);
   if (OPT(brw_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_lower_integer_multiplication);
   }
   OPT(brw_lower_sub_sat);

   progress = false;
   OPT(brw_lower_derivatives);
   OPT(brw_lower_regioning);

   /* Try both copy propagation passes. The defs one will likely not be
    * able to handle everything at this point.
    */
   const bool cp1 = OPT(brw_opt_copy_propagation_defs);
   const bool cp2 = OPT(brw_opt_copy_propagation);
   if (cp1 || cp2)
      OPT(brw_opt_combine_constants);

   OPT(brw_opt_dead_code_eliminate);
   OPT(brw_opt_register_coalesce);

   if (progress)
      OPT(brw_lower_simd_width);

   OPT(brw_lower_uniform_pull_constant_loads);

   if (OPT(brw_lower_send_descriptors)) {
      /* No need for standard copy_propagation since
       * brw_opt_address_reg_load will only optimize defs.
       */
      if (OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_algebraic);
      OPT(brw_opt_address_reg_load);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_sends_overlapping_payload);

   OPT(brw_lower_indirect_mov);

   OPT(brw_lower_find_live_channel);

   OPT(brw_lower_load_subgroup_invocation);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_LATE_LOWERING);
}

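/**
 * Return the number of leading LOAD_PAYLOAD sources (including the header)
 * needed to produce the first size_read bytes of the payload. size_read
 * must end exactly on a source boundary.
 */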
static unsigned
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);

   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);

   /* The size read must correspond exactly to a prefix of the sources. */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these instructions
 * instead of reserving registers for them. Trailing parameters that aren't
 * sent default to zero anyway. This will cause the dead code eliminator to
 * remove the MOV instructions that would otherwise be emitted to set up the
 * zero values.
 */
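
/* As a rough illustration (not exact IR dump syntax), a sampler SEND whose
 * payload ends in a literal zero source:
 *
 *    load_payload vgrf8, <header>, x, y, 0
 *    send ..., mlen 4
 *
 * can have its mlen reduced to 3; the hardware supplies zero for the
 * missing trailing parameter, and DCE can then delete the instruction that
 * produced the zero.
 */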

bool
brw_opt_zero_samples(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

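      /* The payload must be built by the LOAD_PAYLOAD immediately
       * preceding the SEND.
       */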
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message,
       *      which has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) *
                      lp->dst.stride;
      }

      /* Round down to ensure we only consider full registers. */
      const unsigned zero_len =
         ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
      if (zero_len > 0) {
         /* Note that mlen is in REG_SIZE units. */
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated. If we find a SEND message with a single payload,
 * we can split that payload in two. This results in smaller contiguous
 * register blocks for us to allocate. But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere. In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF. So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
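
/* As a rough illustration (not exact IR dump syntax), for a payload built
 * from consecutive offsets of one VGRF followed by a different VGRF:
 *
 *    load_payload vgrf20, vgrf5+0, vgrf5+1, vgrf7
 *    send ..., vgrf20, mlen 3
 *
 * we split between vgrf5+1 and vgrf7 so the SEND takes two payloads:
 *
 *    load_payload vgrf21, vgrf5+0, vgrf5+1
 *    load_payload vgrf22, vgrf7
 *    send ..., vgrf21, vgrf22, mlen 2, ex_mlen 1
 */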
bool
brw_opt_split_sends(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
          send->src[2].file != VGRF)
         continue;

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* The SEND's mlen might be smaller than what LOAD_PAYLOAD provides,
       * so find out how many sources from the payload it really needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 =
         ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 =
         ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = brw_vgrf(s.alloc.allocate(lp1->size_written / REG_SIZE),
                          lp1->dst.type);
      lp2->dst = brw_vgrf(s.alloc.allocate(lp2->size_written / REG_SIZE),
                          lp2->dst.type);

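      /* Sources 2 and 3 of a SEND hold the two payloads; ex_mlen gives the
       * length of the second payload in REG_SIZE units.
       */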
      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 *    halt        (redundant with the next halt)
 *    halt        (useless; jumps to the next instruction)
 *    halt-target
 */
bool
brw_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow. We could probably do better here with some form of divergence
 * analysis.
 */
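
/* Outside of control flow, packed dispatch guarantees that channel 0 is
 * live, so the pair emitted by emit_uniformize() folds away (illustrative,
 * not exact IR dump syntax):
 *
 *    find_live_channel dst        ->   mov dst, 0u
 *    broadcast dst2, src, dst     ->   mov dst2, component(src, 0)
 */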
bool
brw_opt_eliminate_find_live_channel(fs_visitor &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->force_writemask_all = true;

            /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
             * size_written set by hand to a smaller value. In this case,
             * munge the exec_size to match.
             */
            if (inst->size_written ==
                inst->dst.component_size(8 * reg_unit(s.devinfo)))
               inst->exec_size = 8 * reg_unit(s.devinfo);

            inst->resize_sources(1);
            progress = true;

            /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
             * with a BROADCAST. Save some work for opt_copy_propagation
             * and opt_algebraic by trivially cleaning up both together.
             */
            assert(!inst->next->is_tail_sentinel());
            fs_inst *bcast = (fs_inst *) inst->next;

            /* Ignore stride when comparing. */
            if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                inst->dst.file == VGRF &&
                inst->dst.file == bcast->src[1].file &&
                inst->dst.nr == bcast->src[1].nr &&
                inst->dst.offset == bcast->src[1].offset) {
               bcast->opcode = BRW_OPCODE_MOV;
               if (!is_uniform(bcast->src[0]))
                  bcast->src[0] = component(bcast->src[0], 0);

               bcast->force_writemask_all = true;
               bcast->exec_size = 8 * reg_unit(s.devinfo);
               assert(bcast->size_written ==
                      bcast->dst.component_size(bcast->exec_size));
               bcast->resize_sources(1);
            }
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * In the IR, a rounding mode is specified per conversion instruction, but
 * on the hardware the rounding mode is global state. Once it has been set,
 * we don't need to set it again for subsequent conversions that use the
 * same mode.
 *
 * This is useful for vector/matrix conversions, as setting the mode once
 * is enough for the full vector/matrix.
 */
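
/* For example (illustrative), converting the components of a dvec4 with
 * RTZ would otherwise emit a RND_MODE before every conversion; after this
 * pass only the first one in the block remains:
 *
 *    rnd_mode rtz              rnd_mode rtz
 *    mov dst0, src0            mov dst0, src0
 *    rnd_mode rtz        ->    mov dst1, src1
 *    mov dst1, src1            ...
 *    ...
 */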
bool
brw_opt_remove_extra_rounding_modes(fs_visitor &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == IMM);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}