/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_dead_control_flow.h"
#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

void
brw_fs_optimize(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   s.validate();

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

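/* Run a single optimization pass: bump the pass counter, invoke the pass on
 * the visitor, dump the IR via debug_optimizer() if the pass made progress,
 * re-validate the shader, and fold the result into the overall progress flag.
 * The statement expression evaluates to whether this particular pass made
 * progress, so OPT() can be used directly in if() conditions below.
 */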
#define OPT(pass, ...) ({                                               \
      pass_num++;                                                       \
      bool this_progress = pass(s, ##__VA_ARGS__);                      \
                                                                        \
      if (this_progress)                                                \
         s.debug_optimizer(nir, #pass, iteration, pass_num);            \
                                                                        \
      s.validate();                                                     \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })

   s.assign_constant_locations();
   OPT(brw_fs_lower_constant_loads);

   s.validate();

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_fs_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice: once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_fs_opt_dead_code_eliminate);

   OPT(brw_fs_opt_remove_extra_rounding_modes);

   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_copy_propagation);
      OPT(opt_predicated_break);
      OPT(brw_fs_opt_cmod_propagation);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
      OPT(dead_control_flow_eliminate);
      OPT(brw_fs_opt_saturate_propagation);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_eliminate_find_live_channel);

      OPT(brw_fs_opt_compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(brw_fs_lower_pack)) {
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_simd_width);
   OPT(brw_fs_lower_barycentrics);
   OPT(brw_fs_lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   /* Identify trailing zeros in LOAD_PAYLOADs of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_fs_opt_zero_samples) && OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   OPT(brw_fs_opt_split_sends);
   OPT(brw_fs_workaround_nomask_control_flow);

   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
   }

   OPT(brw_fs_opt_remove_redundant_halts);

   if (OPT(brw_fs_lower_load_payload)) {
      OPT(brw_fs_opt_split_virtual_grfs);

      /* Lower 64 bit MOVs generated by payload lowering. */
      if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
         OPT(brw_fs_opt_algebraic);

      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_lower_simd_width);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_opt_combine_constants);
   if (OPT(brw_fs_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_fs_lower_integer_multiplication);
   }
   OPT(brw_fs_lower_sub_sat);

   progress = false;
   OPT(brw_fs_lower_derivatives);
   OPT(brw_fs_lower_regioning);
   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_lower_simd_width);
   }

   OPT(brw_fs_lower_sends_overlapping_payload);

   OPT(brw_fs_lower_uniform_pull_constant_loads);

   OPT(brw_fs_lower_find_live_channel);

   s.validate();
}

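/* Return the number of LOAD_PAYLOAD sources (including the header) covered
 * by the first size_read bytes of the payload.  size_read must end exactly
 * on a source boundary.
 */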
static unsigned
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);

   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * type_sz(lp->src[i].type);

   /* Size read must cover exactly a subset of sources. */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters.  We can just reduce the message length for these instructions
 * instead of reserving registers for them.  Trailing parameters that aren't
 * sent default to zero anyway.  This will cause the dead code eliminator to
 * remove the MOV instructions that would otherwise be emitted to set up the
 * zero values.
 */
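/* As a rough sketch (not actual compiler output), a SIMD8 sample message
 * whose payload ends in constant zero parameters:
 *
 *    load_payload vgrf10: header, u, v, 0.0f, 0.0f
 *    send ... vgrf10, mlen 5
 *
 * can instead be sent with mlen 3, after which dead code elimination removes
 * the MOVs that produced the trailing zeros.
 */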

bool
brw_fs_opt_zero_samples(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed; see the Haswell PRM,
       * Volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
      }

      const unsigned zero_len = zero_size / (reg_unit(s.devinfo) * REG_SIZE);
      if (zero_len > 0) {
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
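/* As a rough sketch (not actual compiler output), a payload whose first
 * three sources come from one VGRF and whose last source comes from another:
 *
 *    load_payload vgrf20: vgrf8+0, vgrf8+1, vgrf8+2, vgrf14
 *    send ... vgrf20, mlen 4
 *
 * can be split at the VGRF boundary into two payloads:
 *
 *    load_payload vgrf21: vgrf8+0, vgrf8+1, vgrf8+2
 *    load_payload vgrf22: vgrf14
 *    send ... vgrf21, vgrf22, mlen 3, ex_mlen 1
 */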
bool
brw_fs_opt_split_sends(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0)
         continue;

      assert(send->src[2].file == VGRF);

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload it really needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = fs_reg(VGRF, s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = fs_reg(VGRF, s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 * halt        (redundant with the next halt)
 * halt        (useless; jumps to the next instruction)
 * halt-target
 */
bool
brw_fs_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
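/* As a rough sketch (not actual compiler output): with packed dispatch and
 * outside any control flow, channel 0 is known to be live, so
 *
 *    find_live_channel dst
 *
 * can simply be rewritten as
 *
 *    mov dst, 0u   (with force_writemask_all set)
 */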
bool
brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.stage_prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * The rounding mode is emitted with every conversion instruction, but on the
 * hardware it is really a piece of state, so once it has been set we don't
 * need to set it again for subsequent conversions.
 *
 * This is useful for vector/matrix conversions, where setting the mode once
 * is enough for the whole vector or matrix.
 */
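/* As a rough sketch (not actual compiler output), the second rnd_mode below
 * is redundant because the requested mode is already in effect, so this pass
 * removes it:
 *
 *    rnd_mode rtne
 *    mov dst0:HF src0:F
 *    rnd_mode rtne
 *    mov dst1:HF src1:F
 */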
bool
brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}