/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

void
brw_optimize(fs_visitor &s)
{
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   brw_validate(s);

   /* Track how many registers are still non-SSA at this point. */
   {
      const brw::def_analysis &defs = s.def_analysis.require();
      s.shader_stats.non_ssa_registers_after_nir =
         defs.count() - defs.ssa_count();
   }

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;
#define OPT(pass, ...) ({                                               \
      pass_num++;                                                       \
      bool this_progress = pass(s, ##__VA_ARGS__);                      \
                                                                        \
      if (this_progress)                                                \
         s.debug_optimizer(nir, #pass, iteration, pass_num);            \
                                                                        \
      brw_validate(s);                                                  \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice: once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_opt_dead_code_eliminate);

   OPT(brw_opt_remove_extra_rounding_modes);

   OPT(brw_opt_eliminate_find_live_channel);

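   /* Main optimization loop: keep re-running these passes until none of
    * them makes further progress, since each pass can expose new
    * opportunities for the others.
    */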
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_opt_algebraic);
      OPT(brw_opt_cse_defs);
      if (!OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_copy_propagation);
      OPT(brw_opt_cmod_propagation);
      OPT(brw_opt_dead_code_eliminate);
      OPT(brw_opt_saturate_propagation);
      OPT(brw_opt_register_coalesce);

      OPT(brw_opt_compact_virtual_grfs);
   } while (progress);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_OPT_LOOP);

   progress = false;
   pass_num = 0;

   if (OPT(brw_opt_combine_convergent_txf))
      OPT(brw_opt_copy_propagation_defs);

   if (OPT(brw_lower_pack)) {
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_subgroup_ops);
   OPT(brw_lower_csel);
   OPT(brw_lower_simd_width);
   OPT(brw_lower_scalar_fp64_MAD);
   OPT(brw_lower_barycentrics);
   OPT(brw_lower_logical_sends);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);

   /* After logical SEND lowering. */

   if (!OPT(brw_opt_copy_propagation_defs))
      OPT(brw_opt_copy_propagation);

   /* Identify trailing zeros in the LOAD_PAYLOADs of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_opt_zero_samples)) {
      if (!OPT(brw_opt_copy_propagation_defs)) {
         OPT(brw_opt_copy_propagation);
      }
   }

   OPT(brw_opt_split_sends);
   OPT(brw_workaround_nomask_control_flow);

   if (progress) {
      /* Do both forms of copy propagation because it is important to
       * eliminate as many cases of load_payload-of-load_payload as possible.
       */
      OPT(brw_opt_copy_propagation_defs);
      OPT(brw_opt_copy_propagation);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_opt_cse_defs);
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_opt_remove_redundant_halts);

   if (OPT(brw_lower_load_payload)) {
      OPT(brw_opt_split_virtual_grfs);

      OPT(brw_opt_register_coalesce);
      OPT(brw_lower_simd_width);
      OPT(brw_opt_dead_code_eliminate);
   }

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);

   OPT(brw_lower_alu_restrictions);

   OPT(brw_opt_combine_constants);
   if (OPT(brw_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_lower_integer_multiplication);
   }
   OPT(brw_lower_sub_sat);

   progress = false;
   OPT(brw_lower_derivatives);
   OPT(brw_lower_regioning);

   /* Try both copy propagation passes.  The defs one will likely not be
    * able to handle everything at this point.
    */
   const bool cp1 = OPT(brw_opt_copy_propagation_defs);
   const bool cp2 = OPT(brw_opt_copy_propagation);
   if (cp1 || cp2)
      OPT(brw_opt_combine_constants);

   OPT(brw_opt_dead_code_eliminate);
   OPT(brw_opt_register_coalesce);

   if (progress)
      OPT(brw_lower_simd_width);

   OPT(brw_lower_uniform_pull_constant_loads);

   if (OPT(brw_lower_send_descriptors)) {
      /* No need for standard copy_propagation since
       * brw_opt_address_reg_load will only optimize defs.
       */
      if (OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_algebraic);
      OPT(brw_opt_address_reg_load);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_sends_overlapping_payload);

   OPT(brw_lower_indirect_mov);

   OPT(brw_lower_find_live_channel);

   OPT(brw_lower_load_subgroup_invocation);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_LATE_LOWERING);
}

static unsigned
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);

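   /* Walk the sources after the header, adding the bytes each one
    * contributes (exec_size channels of the source's type), until the
    * accumulated size reaches size_read.
    */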
   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);

   /* The size read must cover an exact prefix of the sources. */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters.  We can simply reduce the message length for these
 * instructions instead of reserving registers for the zeros; trailing
 * parameters that aren't sent default to zero anyway.  This lets the dead
 * code eliminator remove the MOV instructions that would otherwise be
 * emitted to set up the zero values.
 */

bool
brw_opt_zero_samples(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

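      /* The message payload is expected to be built by a LOAD_PAYLOAD
       * immediately preceding the SEND; bail if the previous instruction is
       * anything else.
       */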
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
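      /* Walk backwards from the last parameter the SEND reads, accumulating
       * the size of trailing sources that are either unset or constant zero,
       * and stop at the first source that is neither.
       */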
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
      }

      /* Round down so that only full registers are considered. */
      const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
      if (zero_len > 0) {
         /* Note mlen is in REG_SIZE units. */
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources that change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
bool
brw_opt_split_sends(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
          send->src[2].file != VGRF)
         continue;

      /* For now, don't split SENDs that reuse a previously built payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* The SEND's mlen might be smaller than what the LOAD_PAYLOAD provides,
       * so find out how many of the payload's sources it really needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = brw_vgrf(s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = brw_vgrf(s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

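      /* Point the SEND at the two new payloads: src[2] carries the first
       * part (mlen registers) and src[3] the second part (ex_mlen registers).
       */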
      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 * halt        (redundant with the next halt)
 * halt        (useless; jumps to the next instruction)
 * halt-target
 */
bool
brw_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
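   /* Find the HALT_TARGET and count the HALTs that precede it. */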
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
brw_opt_eliminate_find_live_channel(fs_visitor &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

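   /* Track the nesting depth of structured control flow.  At depth zero, and
    * before any HALT is seen, channel 0 is still live, so FIND_LIVE_CHANNEL
    * can be replaced with a constant 0.
    */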
   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->force_writemask_all = true;

            /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
             * size_written set by hand to a smaller value. In this case,
             * munge the exec_size to match.
             */
            if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
               inst->exec_size = 8 * reg_unit(s.devinfo);

            inst->resize_sources(1);
            progress = true;

            /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
             * with a BROADCAST.  Save some work for opt_copy_propagation
             * and opt_algebraic by trivially cleaning up both together.
             */
            assert(!inst->next->is_tail_sentinel());
            fs_inst *bcast = (fs_inst *) inst->next;

            /* Ignore stride when comparing */
            if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                inst->dst.file == VGRF &&
                inst->dst.file == bcast->src[1].file &&
                inst->dst.nr == bcast->src[1].nr &&
                inst->dst.offset == bcast->src[1].offset) {
               bcast->opcode = BRW_OPCODE_MOV;
               if (!is_uniform(bcast->src[0]))
                  bcast->src[0] = component(bcast->src[0], 0);

               bcast->force_writemask_all = true;
               bcast->exec_size = 8 * reg_unit(s.devinfo);
               assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
               bcast->resize_sources(1);
            }
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * The rounding mode is emitted with each conversion instruction, but on the
 * hardware it is a piece of state.  Once it has been set, subsequent
 * conversions that use the same mode don't need to set it again.
 *
 * This is useful for vector/matrix conversions, as setting the mode once is
 * enough for the whole vector/matrix.
 */
bool
brw_opt_remove_extra_rounding_modes(fs_visitor &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

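   /* Within each block, drop RND_MODE instructions that set the rounding
    * mode already in effect, starting from the base mode implied by the
    * shader's float controls.
    */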
   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == IMM);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}