/*
 * Copyright © 2020 Google LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * Removes unused components of SSA defs.
 *
 * Due to various optimization passes (or frontend implementations,
 * particularly prog_to_nir), we may have instructions generating vectors
 * whose components don't get read by any instruction.
 *
 * For memory loads, while it can be tricky to eliminate unused low components
 * or channels in the middle of a writemask (you might need to increment some
 * offset from a load_uniform, for example), it is trivial to just drop the
 * trailing components. When requested, this pass also shrinks unused low
 * components for selected load intrinsics by adjusting their component index
 * or offset source. For vector ALU instructions and for load_const
 * instructions that are only used by other ALU instructions, this pass
 * eliminates arbitrary channels as well as duplicate channels, and
 * reswizzles the uses.
 *
 * This pass is probably only of use to vector backends -- scalar backends
 * typically get unused def channel trimming by scalarizing and dead code
 * elimination.
 */
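
/*
 * A minimal sketch of the effect, in NIR-like pseudocode (illustrative, not
 * actual pass output): given a vec4 whose .y and .w channels are never read,
 *
 *    vec4 32 ssa_1 = vec4 ssa_0.x, ssa_0.y, ssa_0.z, ssa_0.w
 *    vec1 32 ssa_2 = fadd ssa_1.x, ssa_1.z
 *
 * the pass shrinks ssa_1 to a vec2 and reswizzles the use:
 *
 *    vec2 32 ssa_1 = vec2 ssa_0.x, ssa_0.z
 *    vec1 32 ssa_2 = fadd ssa_1.x, ssa_1.y
 */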

#include "util/u_math.h"
#include "nir.h"
#include "nir_builder.h"

static void
reswizzle_alu_uses(nir_def *def, uint8_t *reswizzle)
{
   nir_foreach_use(use_src, def) {
      /* all uses must be ALU instructions */
      assert(nir_src_parent_instr(use_src)->type == nir_instr_type_alu);
      nir_alu_src *alu_src = (nir_alu_src *)use_src;

      /* reswizzle ALU sources */
      for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++)
         alu_src->swizzle[i] = reswizzle[alu_src->swizzle[i]];
   }
}

static bool
is_only_used_by_alu(nir_def *def)
{
   nir_foreach_use(use_src, def) {
      if (nir_src_parent_instr(use_src)->type != nir_instr_type_alu)
         return false;
   }

   return true;
}

static bool
shrink_dest_to_read_mask(nir_def *def, bool shrink_start)
{
   /* early out if there's nothing to do. */
   if (def->num_components == 1)
      return false;

   /* don't remove any channels if used by an intrinsic */
   nir_foreach_use(use_src, def) {
      if (nir_src_parent_instr(use_src)->type == nir_instr_type_intrinsic)
         return false;
   }

   unsigned mask = nir_def_components_read(def);

   /* If nothing was read, leave it up to DCE. */
   if (!mask)
      return false;

   nir_intrinsic_instr *intr = NULL;
   nir_src *offset_src = NULL;

   if (def->parent_instr->type == nir_instr_type_intrinsic) {
      intr = nir_instr_as_intrinsic(def->parent_instr);
      offset_src = nir_get_io_offset_src(intr);
   }

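   /* Shrinking unused leading components is only safe when the instruction is
    * a load whose start we can adjust (via its component index or offset
    * source) and when every reader is an ALU instruction whose swizzles we
    * can rewrite.
    */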
   shrink_start &= intr && (nir_intrinsic_has_component(intr) || offset_src) &&
                   is_only_used_by_alu(def);

   int last_bit = util_last_bit(mask);
   int first_bit = shrink_start ? (ffs(mask) - 1) : 0;

   const unsigned comps = last_bit - first_bit;
   const unsigned rounded = nir_round_up_components(comps);
   assert(rounded <= def->num_components);

   if ((def->num_components > rounded) || first_bit > 0) {
      def->num_components = rounded;

      if (first_bit) {
         assert(shrink_start);

         if (nir_intrinsic_has_component(intr)) {
            unsigned new_component = nir_intrinsic_component(intr) + first_bit;
            nir_intrinsic_set_component(intr, new_component);
         } else {
            /* Add the component offset into the src offset. */
            unsigned offset = (def->bit_size / 8) * first_bit;

            if (nir_intrinsic_has_align_offset(intr)) {
               unsigned align_offset = (nir_intrinsic_align_offset(intr) + offset) %
                                       nir_intrinsic_align_mul(intr);
               nir_intrinsic_set_align_offset(intr, align_offset);
            }

            nir_builder b = nir_builder_at(nir_before_instr(&intr->instr));
            nir_src_rewrite(offset_src, nir_iadd_imm(&b, offset_src->ssa, offset));
         }

         /* Reswizzle the uses, which must be ALU instructions since they have
          * swizzles.
          */
         assert(first_bit + comps <= NIR_MAX_VEC_COMPONENTS);
         uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
         for (unsigned i = 0; i < comps; ++i) {
            swizzle[first_bit + i] = i;
         }

         reswizzle_alu_uses(def, swizzle);
      }

      return true;
   }

   return false;
}

static bool
shrink_intrinsic_to_non_sparse(nir_intrinsic_instr *instr)
{
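   /* Sparse loads return the texel data plus a trailing residency code in the
    * last component. If that code is never read, demote the load to its
    * non-sparse variant and drop the extra component.
    */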
   unsigned mask = nir_def_components_read(&instr->def);
   int last_bit = util_last_bit(mask);

   /* If the sparse component is used, do nothing. */
   if (last_bit == instr->def.num_components)
      return false;

   instr->def.num_components -= 1;
   instr->num_components = instr->def.num_components;

   /* Switch to the non-sparse intrinsic. */
   switch (instr->intrinsic) {
   case nir_intrinsic_image_sparse_load:
      instr->intrinsic = nir_intrinsic_image_load;
      break;
   case nir_intrinsic_bindless_image_sparse_load:
      instr->intrinsic = nir_intrinsic_bindless_image_load;
      break;
   case nir_intrinsic_image_deref_sparse_load:
      instr->intrinsic = nir_intrinsic_image_deref_load;
      break;
   default:
      break;
   }

   return true;
}

static bool
opt_shrink_vector(nir_builder *b, nir_alu_instr *instr)
{
   nir_def *def = &instr->def;
   unsigned mask = nir_def_components_read(def);

   /* If nothing was read, leave it up to DCE. */
   if (mask == 0)
      return false;

   /* don't remove any channels if used by non-ALU */
   if (!is_only_used_by_alu(def))
      return false;

   uint8_t reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   nir_scalar srcs[NIR_MAX_VEC_COMPONENTS] = { 0 };
   unsigned num_components = 0;
   for (unsigned i = 0; i < def->num_components; i++) {
      if (!((mask >> i) & 0x1))
         continue;

      nir_scalar scalar = nir_get_scalar(instr->src[i].src.ssa, instr->src[i].swizzle[0]);

      /* Try to reuse a component with the same value */
      unsigned j;
      for (j = 0; j < num_components; j++) {
         if (nir_scalar_equal(scalar, srcs[j])) {
            reswizzle[i] = j;
            break;
         }
      }

      /* Otherwise, just append the value */
      if (j == num_components) {
         srcs[num_components] = scalar;
         reswizzle[i] = num_components++;
      }
   }

   /* return if no component was removed */
   if (num_components == def->num_components)
      return false;

   /* create new vecN and replace uses */
   nir_def *new_vec = nir_vec_scalars(b, srcs, num_components);
   nir_def_rewrite_uses(def, new_vec);
   reswizzle_alu_uses(new_vec, reswizzle);

   return true;
}

static bool
opt_shrink_vectors_alu(nir_builder *b, nir_alu_instr *instr)
{
   nir_def *def = &instr->def;

   /* Nothing to shrink */
   if (def->num_components == 1)
      return false;

   switch (instr->op) {
   /* don't use nir_op_is_vec() as not all vector sizes are supported. */
   case nir_op_vec4:
   case nir_op_vec3:
   case nir_op_vec2:
      return opt_shrink_vector(b, instr);
   default:
      if (nir_op_infos[instr->op].output_size != 0)
         return false;
      break;
   }

   /* don't remove any channels if used by non-ALU */
   if (!is_only_used_by_alu(def))
      return false;

   unsigned mask = nir_def_components_read(def);
   /* return if there is nothing to do */
   if (mask == 0)
      return false;

   uint8_t reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   unsigned num_components = 0;
   bool progress = false;
   for (unsigned i = 0; i < def->num_components; i++) {
      /* skip unused components */
      if (!((mask >> i) & 0x1))
         continue;

      /* Try to reuse a component with the same swizzles */
      unsigned j;
      for (j = 0; j < num_components; j++) {
         bool duplicate_channel = true;
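         /* Channels can only be merged for ops that are fully per-component
          * (input_sizes[k] == 0) and that read the same swizzle in channels
          * i and j on every source.
          */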
         for (unsigned k = 0; k < nir_op_infos[instr->op].num_inputs; k++) {
            if (nir_op_infos[instr->op].input_sizes[k] != 0 ||
                instr->src[k].swizzle[i] != instr->src[k].swizzle[j]) {
               duplicate_channel = false;
               break;
            }
         }

         if (duplicate_channel) {
            reswizzle[i] = j;
            progress = true;
            break;
         }
      }

      /* Otherwise, just append the value */
      if (j == num_components) {
         for (unsigned k = 0; k < nir_op_infos[instr->op].num_inputs; k++) {
            instr->src[k].swizzle[num_components] = instr->src[k].swizzle[i];
         }
         if (i != num_components)
            progress = true;
         reswizzle[i] = num_components++;
      }
   }

   /* update uses */
   if (progress)
      reswizzle_alu_uses(def, reswizzle);

   unsigned rounded = nir_round_up_components(num_components);
   assert(rounded <= def->num_components);
   if (rounded < def->num_components)
      progress = true;

   /* update dest */
   def->num_components = rounded;

   return progress;
}

static bool
opt_shrink_vectors_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
                             bool shrink_start)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_primitive_input:
   case nir_intrinsic_load_input_vertex:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_push_constant:
   case nir_intrinsic_load_constant:
   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_kernel_input:
   case nir_intrinsic_load_scratch:
   case nir_intrinsic_load_attribute_pan: {
      /* Must be a vectorized intrinsic that we can resize. */
      assert(instr->num_components != 0);

      /* Trim the dest to the used channels */
      if (!shrink_dest_to_read_mask(&instr->def, shrink_start))
         return false;

      instr->num_components = instr->def.num_components;
      return true;
   }
   case nir_intrinsic_image_sparse_load:
   case nir_intrinsic_bindless_image_sparse_load:
   case nir_intrinsic_image_deref_sparse_load:
      return shrink_intrinsic_to_non_sparse(instr);
   default:
      return false;
   }
}

static bool
opt_shrink_vectors_tex(nir_builder *b, nir_tex_instr *tex)
{
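   /* As with sparse image loads above, sparse texture instructions append a
    * residency code as the last def component; drop it when nothing reads it.
    */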
   if (!tex->is_sparse)
      return false;

   unsigned mask = nir_def_components_read(&tex->def);
   int last_bit = util_last_bit(mask);

   /* If the sparse component is used, do nothing. */
   if (last_bit == tex->def.num_components)
      return false;

   tex->def.num_components -= 1;
   tex->is_sparse = false;

   return true;
}

static bool
opt_shrink_vectors_load_const(nir_load_const_instr *instr)
{
   nir_def *def = &instr->def;

   /* early out if there's nothing to do. */
   if (def->num_components == 1)
      return false;

   /* don't remove any channels if used by non-ALU */
   if (!is_only_used_by_alu(def))
      return false;

   unsigned mask = nir_def_components_read(def);

   /* If nothing was read, leave it up to DCE. */
   if (!mask)
      return false;

   uint8_t reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   unsigned num_components = 0;
   bool progress = false;
   for (unsigned i = 0; i < def->num_components; i++) {
      if (!((mask >> i) & 0x1))
         continue;

      /* Try to reuse a component with the same constant */
      unsigned j;
      for (j = 0; j < num_components; j++) {
         if (instr->value[i].u64 == instr->value[j].u64) {
            reswizzle[i] = j;
            progress = true;
            break;
         }
      }

      /* Otherwise, just append the value */
      if (j == num_components) {
         instr->value[num_components] = instr->value[i];
         if (i != num_components)
            progress = true;
         reswizzle[i] = num_components++;
      }
   }

   if (progress)
      reswizzle_alu_uses(def, reswizzle);

   unsigned rounded = nir_round_up_components(num_components);
   assert(rounded <= def->num_components);
   if (rounded < def->num_components)
      progress = true;

   def->num_components = rounded;

   return progress;
}

static bool
opt_shrink_vectors_ssa_undef(nir_undef_instr *instr)
{
   return shrink_dest_to_read_mask(&instr->def, false);
}

static bool
opt_shrink_vectors_phi(nir_builder *b, nir_phi_instr *instr)
{
   nir_def *def = &instr->def;

   /* early out if there's nothing to do. */
   if (def->num_components == 1)
      return false;

   /* Ignore large vectors for now. */
   if (def->num_components > 4)
      return false;

   /* Check the uses. */
   nir_component_mask_t mask = 0;
   nir_foreach_use(src, def) {
      if (nir_src_parent_instr(src)->type != nir_instr_type_alu)
         return false;

      nir_alu_instr *alu = nir_instr_as_alu(nir_src_parent_instr(src));

      nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
      int src_idx = alu_src - &alu->src[0];
      nir_component_mask_t src_read_mask = nir_alu_instr_src_read_mask(alu, src_idx);

      nir_def *alu_def = &alu->def;

      /* We don't mark the channels used if the only reader is the original phi.
       * This can happen in the case of loops.
       */
      nir_foreach_use(alu_use_src, alu_def) {
         if (nir_src_parent_instr(alu_use_src) != &instr->instr) {
            mask |= src_read_mask;
         }
      }

      /* However, even if the instruction only points back at the phi, we still
       * need to check that the swizzles are trivial.
       */
      if (nir_op_is_vec(alu->op)) {
         if (src_idx != alu->src[src_idx].swizzle[0]) {
            mask |= src_read_mask;
         }
      } else if (!nir_alu_src_is_trivial_ssa(alu, src_idx)) {
         mask |= src_read_mask;
      }
   }

   /* DCE will handle this. */
   if (mask == 0)
      return false;

   /* Nothing to shrink? */
   if (BITFIELD_MASK(def->num_components) == mask)
      return false;

   /* Set up the reswizzles. */
   unsigned num_components = 0;
   uint8_t reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   uint8_t src_reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
   for (unsigned i = 0; i < def->num_components; i++) {
      if (!((mask >> i) & 0x1))
         continue;
      src_reswizzle[num_components] = i;
      reswizzle[i] = num_components++;
   }

   /* Shrink the phi; this part is simple. */
   def->num_components = num_components;

   /* We can't swizzle phi sources directly, so insert extra movs with the
    * correct swizzles and let the other parts of nir_opt_shrink_vectors do
    * their job on the original source instructions. If an original source was
    * only used by the phi, the movs will disappear later after copy
    * propagation.
    */
   nir_foreach_phi_src(phi_src, instr) {
      b->cursor = nir_after_instr_and_phis(phi_src->src.ssa->parent_instr);

      nir_alu_src alu_src = {
         .src = nir_src_for_ssa(phi_src->src.ssa)
      };

      for (unsigned i = 0; i < num_components; i++)
         alu_src.swizzle[i] = src_reswizzle[i];
      nir_def *mov = nir_mov_alu(b, alu_src, num_components);

      nir_src_rewrite(&phi_src->src, mov);
   }
   b->cursor = nir_before_instr(&instr->instr);

   /* Reswizzle readers. */
   reswizzle_alu_uses(def, reswizzle);

   return true;
}

static bool
opt_shrink_vectors_instr(nir_builder *b, nir_instr *instr, bool shrink_start)
{
   b->cursor = nir_before_instr(instr);

   switch (instr->type) {
   case nir_instr_type_alu:
      return opt_shrink_vectors_alu(b, nir_instr_as_alu(instr));

   case nir_instr_type_tex:
      return opt_shrink_vectors_tex(b, nir_instr_as_tex(instr));

   case nir_instr_type_intrinsic:
      return opt_shrink_vectors_intrinsic(b, nir_instr_as_intrinsic(instr),
                                          shrink_start);

   case nir_instr_type_load_const:
      return opt_shrink_vectors_load_const(nir_instr_as_load_const(instr));

   case nir_instr_type_undef:
      return opt_shrink_vectors_ssa_undef(nir_instr_as_undef(instr));

   case nir_instr_type_phi:
      return opt_shrink_vectors_phi(b, nir_instr_as_phi(instr));

   default:
      return false;
   }
}

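/* Note the reverse walk below: readers are visited before the defs they
 * consume, so shrinking one instruction can expose further shrinking of the
 * instructions that feed it within a single invocation of the pass.
 */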
bool
nir_opt_shrink_vectors(nir_shader *shader, bool shrink_start)
{
   bool progress = false;

   nir_foreach_function_impl(impl, shader) {
      nir_builder b = nir_builder_create(impl);

      nir_foreach_block_reverse(block, impl) {
         nir_foreach_instr_reverse(instr, block) {
            progress |= opt_shrink_vectors_instr(&b, instr, shrink_start);
         }
      }

      if (progress) {
         nir_metadata_preserve(impl, nir_metadata_control_flow);
      } else {
         nir_metadata_preserve(impl, nir_metadata_all);
      }
   }

   return progress;
}
591