/*
 * Copyright © 2020 Google LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * Removes unused components of SSA defs.
 *
 * Due to various optimization passes (or frontend implementations,
 * particularly prog_to_nir), we may have instructions generating vectors
 * whose components don't get read by any instruction.
 *
 * For memory loads, while it can be tricky to eliminate unused low components
 * or channels in the middle of a writemask (you might need to increment some
 * offset from a load_uniform, for example), it is trivial to just drop the
 * trailing components. This pass drops trailing components on selected load
 * intrinsics and, when shrink_start is enabled, leading components as well.
 * For vector ALU instructions and load_const that are only used by other ALU
 * instructions, this pass eliminates arbitrary channels, folds duplicate
 * channels together, and reswizzles the uses.
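 *
 * As an illustrative example (NIR-like pseudocode, not exact syntax), if
 * only the y channel of a vec4 ALU result is read:
 *
 *    vec4 ssa_2 = fadd ssa_0.xyzw, ssa_1.xyzw
 *    vec1 ssa_3 = fmul ssa_2.y, ssa_2.y
 *
 * the def is shrunk to a single component and the use is reswizzled:
 *
 *    vec1 ssa_2 = fadd ssa_0.y, ssa_1.y
 *    vec1 ssa_3 = fmul ssa_2.x, ssa_2.x
 *
 * Drivers typically invoke the pass as
 * NIR_PASS(progress, shader, nir_opt_shrink_vectors, shrink_start), where
 * shrink_start selects whether leading channels may be removed too.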
 *
 * This pass is probably only of use to vector backends -- scalar backends
 * typically get unused def channel trimming by scalarizing and dead code
 * elimination.
 */

#include "util/u_math.h"
#include "nir.h"
#include "nir_builder.h"

static void
reswizzle_alu_uses(nir_def *def, uint8_t *reswizzle)
{
   nir_foreach_use(use_src, def) {
      /* all uses must be ALU instructions */
      assert(nir_src_parent_instr(use_src)->type == nir_instr_type_alu);
      nir_alu_src *alu_src = (nir_alu_src *)use_src;

      /* reswizzle ALU sources */
      for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++)
         alu_src->swizzle[i] = reswizzle[alu_src->swizzle[i]];
   }
}

static bool
is_only_used_by_alu(nir_def *def)
{
   nir_foreach_use(use_src, def) {
      if (nir_src_parent_instr(use_src)->type != nir_instr_type_alu)
         return false;
   }

   return true;
}

static bool
shrink_dest_to_read_mask(nir_def *def, bool shrink_start)
{
   /* early out if there's nothing to do. */
   if (def->num_components == 1)
      return false;

   /* don't remove any channels if used by an intrinsic: intrinsic sources
    * have no swizzle we could rewrite to compensate.
    */
   nir_foreach_use(use_src, def) {
      if (nir_src_parent_instr(use_src)->type == nir_instr_type_intrinsic)
         return false;
   }

   unsigned mask = nir_def_components_read(def);

   /* If nothing was read, leave it up to DCE. */
   if (!mask)
      return false;

   nir_intrinsic_instr *intr = NULL;
   nir_src *offset_src = NULL;

   if (def->parent_instr->type == nir_instr_type_intrinsic) {
      intr = nir_instr_as_intrinsic(def->parent_instr);
      offset_src = nir_get_io_offset_src(intr);
   }

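   /* Leading channels can only be dropped when the skipped components can
    * be folded into the intrinsic's component index or byte offset, and
    * when every reader is an ALU instruction whose swizzles we can rewrite.
    */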
   shrink_start &= intr && (nir_intrinsic_has_component(intr) || offset_src) &&
                   is_only_used_by_alu(def);

   int last_bit = util_last_bit(mask);
   int first_bit = shrink_start ? (ffs(mask) - 1) : 0;

   const unsigned comps = last_bit - first_bit;
   const unsigned rounded = nir_round_up_components(comps);
   assert(rounded <= def->num_components);

   if ((def->num_components > rounded) || first_bit > 0) {
      def->num_components = rounded;

      if (first_bit) {
         assert(shrink_start);

         if (nir_intrinsic_has_component(intr)) {
            unsigned new_component = nir_intrinsic_component(intr) + first_bit;
            nir_intrinsic_set_component(intr, new_component);
         } else {
            /* Add the component offset into the src offset. */
            unsigned offset = (def->bit_size / 8) * first_bit;

            if (nir_intrinsic_has_align_offset(intr)) {
               unsigned align_offset = (nir_intrinsic_align_offset(intr) + offset) %
                                       nir_intrinsic_align_mul(intr);
               nir_intrinsic_set_align_offset(intr, align_offset);
            }

            nir_builder b = nir_builder_at(nir_before_instr(&intr->instr));
            nir_src_rewrite(offset_src, nir_iadd_imm(&b, offset_src->ssa, offset));
         }

         /* Reswizzle the uses, which must be ALU since only ALU sources
          * have swizzles.
          */
         assert(first_bit + comps <= NIR_MAX_VEC_COMPONENTS);
         uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
         for (unsigned i = 0; i < comps; ++i) {
            swizzle[first_bit + i] = i;
         }

         reswizzle_alu_uses(def, swizzle);
      }

      return true;
   }

   return false;
}

static bool
shrink_intrinsic_to_non_sparse(nir_intrinsic_instr *instr)
{
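   /* Sparse image loads return an extra trailing component carrying the
    * residency code; when nothing reads it, the load can be demoted to its
    * regular, non-sparse form.
    */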
   unsigned mask = nir_def_components_read(&instr->def);
   int last_bit = util_last_bit(mask);

   /* If the sparse component is used, do nothing. */
   if (last_bit == instr->def.num_components)
      return false;

   instr->def.num_components -= 1;
   instr->num_components = instr->def.num_components;

   /* Switch to the non-sparse intrinsic. */
   switch (instr->intrinsic) {
   case nir_intrinsic_image_sparse_load:
      instr->intrinsic = nir_intrinsic_image_load;
      break;
   case nir_intrinsic_bindless_image_sparse_load:
      instr->intrinsic = nir_intrinsic_bindless_image_load;
      break;
   case nir_intrinsic_image_deref_sparse_load:
      instr->intrinsic = nir_intrinsic_image_deref_load;
      break;
   default:
      break;
   }

   return true;
}

static bool
opt_shrink_vector(nir_builder *b, nir_alu_instr *instr)
{
   nir_def *def = &instr->def;
   unsigned mask = nir_def_components_read(def);

   /* If nothing was read, leave it up to DCE. */
   if (mask == 0)
      return false;

   /* don't remove any channels if used by non-ALU */
   if (!is_only_used_by_alu(def))
      return false;

   uint8_t reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   nir_scalar srcs[NIR_MAX_VEC_COMPONENTS] = { 0 };
   unsigned num_components = 0;
   for (unsigned i = 0; i < def->num_components; i++) {
      if (!((mask >> i) & 0x1))
         continue;

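      /* Each source of a vecN instruction supplies exactly one scalar, so
       * only swizzle[0] is meaningful.
       */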
      nir_scalar scalar = nir_get_scalar(instr->src[i].src.ssa, instr->src[i].swizzle[0]);

      /* Try to reuse a component with the same value */
      unsigned j;
      for (j = 0; j < num_components; j++) {
         if (nir_scalar_equal(scalar, srcs[j])) {
            reswizzle[i] = j;
            break;
         }
      }

      /* Otherwise, just append the value */
      if (j == num_components) {
         srcs[num_components] = scalar;
         reswizzle[i] = num_components++;
      }
   }

   /* return if no component was removed */
   if (num_components == def->num_components)
      return false;

   /* create new vecN and replace uses */
   nir_def *new_vec = nir_vec_scalars(b, srcs, num_components);
   nir_def_rewrite_uses(def, new_vec);
   reswizzle_alu_uses(new_vec, reswizzle);

   return true;
}

static bool
opt_shrink_vectors_alu(nir_builder *b, nir_alu_instr *instr)
{
   nir_def *def = &instr->def;

   /* Nothing to shrink */
   if (def->num_components == 1)
      return false;

   switch (instr->op) {
   /* don't use nir_op_is_vec() as not all vector sizes are supported. */
   case nir_op_vec4:
   case nir_op_vec3:
   case nir_op_vec2:
      return opt_shrink_vector(b, instr);
   default:
      if (nir_op_infos[instr->op].output_size != 0)
         return false;
      break;
   }

   /* don't remove any channels if used by non-ALU */
   if (!is_only_used_by_alu(def))
      return false;

   unsigned mask = nir_def_components_read(def);
   /* If nothing was read, leave it up to DCE. */
   if (mask == 0)
      return false;

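   /* reswizzle[i] maps each used channel of the old def to its position in
    * the shrunk def; a channel whose swizzles match an earlier channel in
    * every source is folded into that earlier channel.
    */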
   uint8_t reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   unsigned num_components = 0;
   bool progress = false;
   for (unsigned i = 0; i < def->num_components; i++) {
      /* skip unused components */
      if (!((mask >> i) & 0x1))
         continue;

      /* Try to reuse a component with the same swizzles */
      unsigned j;
      for (j = 0; j < num_components; j++) {
         bool duplicate_channel = true;
         for (unsigned k = 0; k < nir_op_infos[instr->op].num_inputs; k++) {
            if (nir_op_infos[instr->op].input_sizes[k] != 0 ||
                instr->src[k].swizzle[i] != instr->src[k].swizzle[j]) {
               duplicate_channel = false;
               break;
            }
         }

         if (duplicate_channel) {
            reswizzle[i] = j;
            progress = true;
            break;
         }
      }

      /* Otherwise, just append the value */
      if (j == num_components) {
         for (unsigned k = 0; k < nir_op_infos[instr->op].num_inputs; k++) {
            instr->src[k].swizzle[num_components] = instr->src[k].swizzle[i];
         }
         if (i != num_components)
            progress = true;
         reswizzle[i] = num_components++;
      }
   }

   /* update uses */
   if (progress)
      reswizzle_alu_uses(def, reswizzle);

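   /* Round the live channel count up to a vector size NIR can represent,
    * so the shrunk def stays a valid vector.
    */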
   unsigned rounded = nir_round_up_components(num_components);
   assert(rounded <= def->num_components);
   if (rounded < def->num_components)
      progress = true;

   /* update dest */
   def->num_components = rounded;

   return progress;
}

static bool
opt_shrink_vectors_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
                             bool shrink_start)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_primitive_input:
   case nir_intrinsic_load_input_vertex:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_push_constant:
   case nir_intrinsic_load_constant:
   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_kernel_input:
   case nir_intrinsic_load_scratch:
   case nir_intrinsic_load_attribute_pan: {
      /* Must be a vectorized intrinsic that we can resize. */
      assert(instr->num_components != 0);

      /* Trim the dest to the used channels */
      if (!shrink_dest_to_read_mask(&instr->def, shrink_start))
         return false;

      instr->num_components = instr->def.num_components;
      return true;
   }
   case nir_intrinsic_image_sparse_load:
   case nir_intrinsic_bindless_image_sparse_load:
   case nir_intrinsic_image_deref_sparse_load:
      return shrink_intrinsic_to_non_sparse(instr);
   default:
      return false;
   }
}

static bool
opt_shrink_vectors_tex(nir_builder *b, nir_tex_instr *tex)
{
   if (!tex->is_sparse)
      return false;

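   /* As with sparse image loads, the residency code occupies the last
    * result component; if it is never read, drop it and clear is_sparse.
    */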
   unsigned mask = nir_def_components_read(&tex->def);
   int last_bit = util_last_bit(mask);

   /* If the sparse component is used, do nothing. */
   if (last_bit == tex->def.num_components)
      return false;

   tex->def.num_components -= 1;
   tex->is_sparse = false;

   return true;
}

static bool
opt_shrink_vectors_load_const(nir_load_const_instr *instr)
{
   nir_def *def = &instr->def;

   /* early out if there's nothing to do. */
   if (def->num_components == 1)
      return false;

   /* don't remove any channels if used by non-ALU */
   if (!is_only_used_by_alu(def))
      return false;

   unsigned mask = nir_def_components_read(def);

   /* If nothing was read, leave it up to DCE. */
   if (!mask)
      return false;

   uint8_t reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   unsigned num_components = 0;
   bool progress = false;
   for (unsigned i = 0; i < def->num_components; i++) {
      if (!((mask >> i) & 0x1))
         continue;

      /* Try to reuse a component with the same constant */
      unsigned j;
      for (j = 0; j < num_components; j++) {
         if (instr->value[i].u64 == instr->value[j].u64) {
            reswizzle[i] = j;
            progress = true;
            break;
         }
      }

      /* Otherwise, just append the value */
      if (j == num_components) {
         instr->value[num_components] = instr->value[i];
         if (i != num_components)
            progress = true;
         reswizzle[i] = num_components++;
      }
   }

   if (progress)
      reswizzle_alu_uses(def, reswizzle);

   unsigned rounded = nir_round_up_components(num_components);
   assert(rounded <= def->num_components);
   if (rounded < def->num_components)
      progress = true;

   def->num_components = rounded;

   return progress;
}

static bool
opt_shrink_vectors_ssa_undef(nir_undef_instr *instr)
{
   return shrink_dest_to_read_mask(&instr->def, false);
}

static bool
opt_shrink_vectors_phi(nir_builder *b, nir_phi_instr *instr)
{
   nir_def *def = &instr->def;

   /* early out if there's nothing to do. */
   if (def->num_components == 1)
      return false;

   /* Ignore large vectors for now. */
   if (def->num_components > 4)
      return false;

   /* Check the uses. */
   nir_component_mask_t mask = 0;
   nir_foreach_use(src, def) {
      if (nir_src_parent_instr(src)->type != nir_instr_type_alu)
         return false;

      nir_alu_instr *alu = nir_instr_as_alu(nir_src_parent_instr(src));

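      /* Recover this source's index within the ALU instruction via pointer
       * arithmetic, then query its per-source read mask.
       */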
      nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
      int src_idx = alu_src - &alu->src[0];
      nir_component_mask_t src_read_mask = nir_alu_instr_src_read_mask(alu, src_idx);

      nir_def *alu_def = &alu->def;

      /* We don't mark the channels used if the only reader is the original phi.
       * This can happen in the case of loops.
       */
      nir_foreach_use(alu_use_src, alu_def) {
         if (nir_src_parent_instr(alu_use_src) != &instr->instr) {
            mask |= src_read_mask;
         }
      }

      /* However, even if the instruction only points back at the phi, we still
       * need to check that the swizzles are trivial.
       */
      if (nir_op_is_vec(alu->op)) {
         if (src_idx != alu->src[src_idx].swizzle[0]) {
            mask |= src_read_mask;
         }
      } else if (!nir_alu_src_is_trivial_ssa(alu, src_idx)) {
         mask |= src_read_mask;
      }
   }

   /* DCE will handle this. */
   if (mask == 0)
      return false;

   /* Nothing to shrink? */
   if (BITFIELD_MASK(def->num_components) == mask)
      return false;

   /* Set up the reswizzles. */
   unsigned num_components = 0;
   uint8_t reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   uint8_t src_reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   for (unsigned i = 0; i < def->num_components; i++) {
      if (!((mask >> i) & 0x1))
         continue;
      src_reswizzle[num_components] = i;
      reswizzle[i] = num_components++;
   }

   /* Shrink the phi, this part is simple. */
   def->num_components = num_components;

   /* We can't swizzle phi sources directly, so just insert an extra mov
    * with the correct swizzle and let the other parts of
    * nir_opt_shrink_vectors do their job on the original source
    * instruction. If the original source was used only by the phi, the
    * movs will disappear later after copy propagation.
    */
   nir_foreach_phi_src(phi_src, instr) {
      b->cursor = nir_after_instr_and_phis(phi_src->src.ssa->parent_instr);

      nir_alu_src alu_src = {
         .src = nir_src_for_ssa(phi_src->src.ssa)
      };

      for (unsigned i = 0; i < num_components; i++)
         alu_src.swizzle[i] = src_reswizzle[i];
      nir_def *mov = nir_mov_alu(b, alu_src, num_components);

      nir_src_rewrite(&phi_src->src, mov);
   }
   b->cursor = nir_before_instr(&instr->instr);

   /* Reswizzle readers. */
   reswizzle_alu_uses(def, reswizzle);

   return true;
}

static bool
opt_shrink_vectors_instr(nir_builder *b, nir_instr *instr, bool shrink_start)
{
   b->cursor = nir_before_instr(instr);

   switch (instr->type) {
   case nir_instr_type_alu:
      return opt_shrink_vectors_alu(b, nir_instr_as_alu(instr));

   case nir_instr_type_tex:
      return opt_shrink_vectors_tex(b, nir_instr_as_tex(instr));

   case nir_instr_type_intrinsic:
      return opt_shrink_vectors_intrinsic(b, nir_instr_as_intrinsic(instr),
                                          shrink_start);

   case nir_instr_type_load_const:
      return opt_shrink_vectors_load_const(nir_instr_as_load_const(instr));

   case nir_instr_type_undef:
      return opt_shrink_vectors_ssa_undef(nir_instr_as_undef(instr));

   case nir_instr_type_phi:
      return opt_shrink_vectors_phi(b, nir_instr_as_phi(instr));

   default:
      return false;
   }

   return true;
}

bool
nir_opt_shrink_vectors(nir_shader *shader, bool shrink_start)
{
   bool progress = false;

   nir_foreach_function_impl(impl, shader) {
      nir_builder b = nir_builder_create(impl);

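      /* Walk blocks and instructions in reverse so that uses are shrunk
       * before the defs they read; each def then sees its users' already
       * reduced read masks, letting whole chains shrink in a single pass.
       */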
      nir_foreach_block_reverse(block, impl) {
         nir_foreach_instr_reverse(instr, block) {
            progress |= opt_shrink_vectors_instr(&b, instr, shrink_start);
         }
      }

      if (progress) {
         nir_metadata_preserve(impl, nir_metadata_control_flow);
      } else {
         nir_metadata_preserve(impl, nir_metadata_all);
      }
   }

   return progress;
}