/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_dead_control_flow.h"
#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

void
brw_fs_optimize(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   s.validate();

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

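   /* Run a single optimization or lowering pass.  This evaluates to whether
    * the pass reported progress, dumps the shader for debugging when it did,
    * and re-validates the IR after every pass.
    */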
#define OPT(pass, ...) ({                                      \
      pass_num++;                                              \
      bool this_progress = pass(s, ##__VA_ARGS__);             \
                                                               \
      if (this_progress)                                       \
         s.debug_optimizer(nir, #pass, iteration, pass_num);   \
                                                               \
      s.validate();                                            \
                                                               \
      progress = progress || this_progress;                    \
      this_progress;                                           \
   })

   s.assign_constant_locations();
   OPT(brw_fs_lower_constant_loads);

   s.validate();

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_fs_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice: once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_fs_opt_dead_code_eliminate);

   OPT(brw_fs_opt_remove_extra_rounding_modes);

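   /* Iterate the main simplification passes to a fixed point; each pass can
    * expose new opportunities for the others.
    */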
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_copy_propagation);
      OPT(opt_predicated_break);
      OPT(brw_fs_opt_cmod_propagation);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
      OPT(dead_control_flow_eliminate);
      OPT(brw_fs_opt_saturate_propagation);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_eliminate_find_live_channel);

      OPT(brw_fs_opt_compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(brw_fs_lower_pack)) {
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_simd_width);
   OPT(brw_fs_lower_barycentrics);
   OPT(brw_fs_lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   /* Identify trailing zeros in the LOAD_PAYLOAD of sampler messages and
    * trim them.  Do this before splitting SENDs.
    */
   if (OPT(brw_fs_opt_zero_samples) && OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   OPT(brw_fs_opt_split_sends);
   OPT(brw_fs_workaround_nomask_control_flow);

   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
   }

   OPT(brw_fs_opt_remove_redundant_halts);

   if (OPT(brw_fs_lower_load_payload)) {
      OPT(brw_fs_opt_split_virtual_grfs);

      /* Lower 64 bit MOVs generated by payload lowering. */
      if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
         OPT(brw_fs_opt_algebraic);

      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_lower_simd_width);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_opt_combine_constants);
   if (OPT(brw_fs_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_fs_lower_integer_multiplication);
   }
   OPT(brw_fs_lower_sub_sat);

   progress = false;
   OPT(brw_fs_lower_derivatives);
   OPT(brw_fs_lower_regioning);
   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_lower_simd_width);
   }

   OPT(brw_fs_lower_sends_overlapping_payload);

   OPT(brw_fs_lower_uniform_pull_constant_loads);

   OPT(brw_fs_lower_find_live_channel);

   s.validate();
}

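/* Return the number of LOAD_PAYLOAD sources (header included) that make up
 * the first size_read bytes of the payload.  size_read is expected to land
 * exactly on a source boundary.
 */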
static unsigned
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);

   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * type_sz(lp->src[i].type);

   /* Size read must cover exactly a subset of sources. */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters.  We can just reduce the message length for these instructions
 * instead of reserving a register for it.  Trailing parameters that aren't
 * sent default to zero anyway.  This will cause the dead code eliminator to
 * remove the MOV instruction that would otherwise be emitted to set up the
 * zero value.
 */
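/* Illustrative sketch (operands are made up for clarity, not taken from a
 * real shader dump):
 *
 *    load_payload vgrf8, coord_x, coord_y, 0f, 0f
 *    send         ..., vgrf8, mlen 4
 *
 * The two trailing zero parameters can be dropped and mlen reduced, after
 * which DCE removes the MOVs that produced the zeros.
 */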

bool
brw_fs_opt_zero_samples(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *    "Parameter 0 is required except for the sampleinfo message, which
       *     has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
      }

      const unsigned zero_len = zero_size / (reg_unit(s.devinfo) * REG_SIZE);
      if (zero_len > 0) {
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
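/* Illustrative sketch (register numbers are made up):
 *
 *    load_payload vgrf20, vgrf5+0, vgrf5+1, vgrf5+2, vgrf9
 *    send         ..., vgrf20, mlen 4
 *
 * is split at the source that switches VGRFs into
 *
 *    load_payload vgrf21, vgrf5+0, vgrf5+1, vgrf5+2
 *    load_payload vgrf22, vgrf9
 *    send         ..., vgrf21, vgrf22, mlen 3, ex_mlen 1
 *
 * so that later copy propagation and register coalescing can often remove
 * the smaller payload copies entirely.
 */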
bool
brw_fs_opt_split_sends(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0)
         continue;

      assert(send->src[2].file == VGRF);

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload it really needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = fs_reg(VGRF, s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = fs_reg(VGRF, s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 *    halt          (redundant with the next halt)
 *    halt          (useless; jumps to the next instruction)
 *    halt-target
 */
bool
brw_fs_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
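/* With packed dispatch, channel 0 is guaranteed to be enabled outside of any
 * control flow, so (illustratively)
 *
 *    find_live_channel dst
 *
 * can simply become
 *
 *    mov dst, 0u    (with force_writemask_all set)
 */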
bool
brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.stage_prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Rounding modes for conversion instructions are emitted for each
 * conversion, but in the generated code the rounding mode is a piece of
 * state: once it is set, it stays in effect for subsequent instructions, so
 * a SHADER_OPCODE_RND_MODE that sets the mode already in effect is redundant
 * and can be removed.
 *
 * This is useful for vector/matrix conversions, as setting the mode once is
 * enough for the full vector/matrix.
 */
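/* Illustrative example (pseudo-assembly): converting a vec2 to fp16 with RTZ
 * might initially emit
 *
 *    rnd_mode rtz
 *    mov  dst.x<hf>, src.x<f>
 *    rnd_mode rtz
 *    mov  dst.y<hf>, src.y<f>
 *
 * and this pass drops the second rnd_mode, since the mode is unchanged.
 */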
bool
brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}