• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2019 Google, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "nir.h"
25 #include "nir_vla.h"
26 
/* Lowering for amul instructions, for drivers that support imul24.
 * This pass will analyze indirect derefs, and convert the corresponding
 * amul instructions to either imul or imul24, depending on the
 * required range.
 *
 * 1) Analyze the uniform variables and build a table of UBOs and SSBOs
 *    that are either too large, or might be too large (unknown size),
 *    for imul24.
 *
 * 2) Loop through all the intrinsics, finding dereferences of large
 *    variables, and recursively replace every amul instruction that
 *    feeds their offset with imul.
 *
 * 3) Finally, loop again through all instructions, replacing any
 *    remaining amul with imul24.  At this point any remaining amul
 *    instructions are not involved in calculating an offset into a
 *    large variable, thanks to the 2nd step, so they can safely be
 *    replaced with imul24.
 *
 * Using two passes over all the instructions lets us handle the case
 * where, due to CSE, an amul is used to calculate an offset into both
 * a large and a small variable.
 */
49 
50 typedef struct {
51    nir_shader *shader;
52 
53    int (*type_size)(const struct glsl_type *, bool);
54 
55    /* Tables of UBOs and SSBOs mapping driver_location/base whether
56     * they are too large to use imul24:
57     */
58    bool *large_ubos;
59    bool *large_ssbos;
60 
61    /* for cases that we cannot determine UBO/SSBO index, track if *any*
62     * UBO/SSBO is too large for imul24:
63     */
64    bool has_large_ubo;
65    bool has_large_ssbo;
66 
67    unsigned max_slot;
68 
69    bool progress;
70 } lower_state;
71 
72 /* Lower 'amul's in offset src of large variables to 'imul': */
73 static bool
lower_large_src(nir_src * src,void * s)74 lower_large_src(nir_src *src, void *s)
75 {
76    lower_state *state = s;
77 
78    assert(src->is_ssa);
79 
80    nir_instr *parent = src->ssa->parent_instr;
81 
82    /* No need to visit instructions we've already visited.. this also
83     * avoids infinite recursion when phi's are involved:
84     */
85    if (parent->pass_flags)
86       return false;
87 
88    nir_foreach_src(parent, lower_large_src, state);
89 
90    if (parent->type == nir_instr_type_alu) {
91       nir_alu_instr *alu = nir_instr_as_alu(parent);
92       if (alu->op == nir_op_amul) {
93          alu->op = nir_op_imul;
94          state->progress = true;
95       }
96    }
97 
98    parent->pass_flags = 1;
99 
100    return true;
101 }
102 
103 static bool
large_ubo(lower_state * state,nir_src src)104 large_ubo(lower_state *state, nir_src src)
105 {
106    if (!nir_src_is_const(src))
107       return state->has_large_ubo;
108    unsigned idx = nir_src_as_uint(src);
109    assert(idx < state->shader->info.num_ubos);
110    return state->large_ubos[idx];
111 }
112 
113 static bool
large_ssbo(lower_state * state,nir_src src)114 large_ssbo(lower_state *state, nir_src src)
115 {
116    if (!nir_src_is_const(src))
117       return state->has_large_ssbo;
118    unsigned idx = nir_src_as_uint(src);
119    assert(idx < state->shader->info.num_ssbos);
120    return state->large_ssbos[idx];
121 }
122 
123 static void
lower_intrinsic(lower_state * state,nir_intrinsic_instr * intr)124 lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr)
125 {
126    switch (intr->intrinsic) {
127    case nir_intrinsic_load_ubo:
128       //# src[] = { buffer_index, offset }.
129       if (large_ubo(state, intr->src[0]))
130          lower_large_src(&intr->src[1], state);
131       return;
132 
133    case nir_intrinsic_load_ssbo:
134       //# src[] = { buffer_index, offset }.
135       if (large_ssbo(state, intr->src[0]))
136          lower_large_src(&intr->src[1], state);
137       return;
138 
139    case nir_intrinsic_store_ssbo:
140       //# src[] = { value, block_index, offset }
141       if (large_ssbo(state, intr->src[1]))
142          lower_large_src(&intr->src[2], state);
143       return;
144 
145    case nir_intrinsic_ssbo_atomic_add:
146    case nir_intrinsic_ssbo_atomic_imin:
147    case nir_intrinsic_ssbo_atomic_umin:
148    case nir_intrinsic_ssbo_atomic_imax:
149    case nir_intrinsic_ssbo_atomic_umax:
150    case nir_intrinsic_ssbo_atomic_and:
151    case nir_intrinsic_ssbo_atomic_or:
152    case nir_intrinsic_ssbo_atomic_xor:
153    case nir_intrinsic_ssbo_atomic_exchange:
154    case nir_intrinsic_ssbo_atomic_comp_swap:
155    case nir_intrinsic_ssbo_atomic_fadd:
156    case nir_intrinsic_ssbo_atomic_fmin:
157    case nir_intrinsic_ssbo_atomic_fmax:
158    case nir_intrinsic_ssbo_atomic_fcomp_swap:
159       /* 0: SSBO index
160        * 1: offset
161        */
162       if (large_ssbo(state, intr->src[0]))
163          lower_large_src(&intr->src[1], state);
164       return;
165 
166    case nir_intrinsic_global_atomic_add:
167    case nir_intrinsic_global_atomic_imin:
168    case nir_intrinsic_global_atomic_umin:
169    case nir_intrinsic_global_atomic_imax:
170    case nir_intrinsic_global_atomic_umax:
171    case nir_intrinsic_global_atomic_and:
172    case nir_intrinsic_global_atomic_or:
173    case nir_intrinsic_global_atomic_xor:
174    case nir_intrinsic_global_atomic_exchange:
175    case nir_intrinsic_global_atomic_comp_swap:
176    case nir_intrinsic_global_atomic_fadd:
177    case nir_intrinsic_global_atomic_fmin:
178    case nir_intrinsic_global_atomic_fmax:
179    case nir_intrinsic_global_atomic_fcomp_swap:
180    case nir_intrinsic_load_global_constant:
181    case nir_intrinsic_load_global:
182       /* just assume we that 24b is not sufficient: */
183       lower_large_src(&intr->src[0], state);
184       return;
185 
186    case nir_intrinsic_store_global:
187       /* just assume we that 24b is not sufficient: */
188       lower_large_src(&intr->src[1], state);
189       return;
190 
191    /* These should all be small enough to unconditionally use imul24: */
192    case nir_intrinsic_shared_atomic_add:
193    case nir_intrinsic_shared_atomic_imin:
194    case nir_intrinsic_shared_atomic_umin:
195    case nir_intrinsic_shared_atomic_imax:
196    case nir_intrinsic_shared_atomic_umax:
197    case nir_intrinsic_shared_atomic_and:
198    case nir_intrinsic_shared_atomic_or:
199    case nir_intrinsic_shared_atomic_xor:
200    case nir_intrinsic_shared_atomic_exchange:
201    case nir_intrinsic_shared_atomic_comp_swap:
202    case nir_intrinsic_shared_atomic_fadd:
203    case nir_intrinsic_shared_atomic_fmin:
204    case nir_intrinsic_shared_atomic_fmax:
205    case nir_intrinsic_shared_atomic_fcomp_swap:
206    case nir_intrinsic_load_uniform:
207    case nir_intrinsic_load_input:
208    case nir_intrinsic_load_output:
209    case nir_intrinsic_store_output:
210    default:
211       return;
212    }
213 }
214 
215 static void
lower_instr(lower_state * state,nir_instr * instr)216 lower_instr(lower_state *state, nir_instr *instr)
217 {
218    if (instr->type == nir_instr_type_intrinsic) {
219       lower_intrinsic(state, nir_instr_as_intrinsic(instr));
220    }
221 }
222 
223 static bool
is_large(lower_state * state,nir_variable * var)224 is_large(lower_state *state, nir_variable *var)
225 {
226    const struct glsl_type *type = glsl_without_array(var->type);
227    unsigned size = state->type_size(type, false);
228 
229    /* if size is not known (ie. VLA) then assume the worst: */
230    if (!size)
231       return true;
232 
233    return size >= (1 << 23);
234 }
235 
236 bool
nir_lower_amul(nir_shader * shader,int (* type_size)(const struct glsl_type *,bool))237 nir_lower_amul(nir_shader *shader,
238                int (*type_size)(const struct glsl_type *, bool))
239 {
240    assert(shader->options->has_imul24);
241    assert(type_size);
242 
243    NIR_VLA_FILL(bool, large_ubos, shader->info.num_ubos, 0);
244    NIR_VLA_FILL(bool, large_ssbos, shader->info.num_ssbos, 0);
245 
246    lower_state state = {
247       .shader = shader,
248       .type_size = type_size,
249       .large_ubos = large_ubos,
250       .large_ssbos = large_ssbos,
251    };
252 
253    /* Figure out which UBOs or SSBOs are large enough to be
254     * disqualified from imul24:
255     */
256    nir_foreach_variable_in_shader (var, shader) {
257       if (var->data.mode == nir_var_mem_ubo) {
258          if (is_large(&state, var)) {
259             state.has_large_ubo = true;
260             unsigned size = MAX2(1, glsl_array_size(var->type));
261             for (unsigned i = 0; i < size; i++)
262                state.large_ubos[var->data.binding + i] = true;
263          }
264       } else if (var->data.mode == nir_var_mem_ssbo) {
265          if (is_large(&state, var)) {
266             state.has_large_ssbo = true;
267             unsigned size = MAX2(1, glsl_array_size(var->type));
268             for (unsigned i = 0; i < size; i++)
269                state.large_ssbos[var->data.binding + i] = true;
270          }
271       }
272    }
273 
274    /* clear pass flags: */
275    nir_foreach_function(function, shader) {
276       nir_function_impl *impl = function->impl;
277       if (!impl)
278          continue;
279 
280       nir_foreach_block(block, impl) {
281          nir_foreach_instr(instr, block) {
282             instr->pass_flags = 0;
283          }
284       }
285    }
286 
287    nir_foreach_function(function, shader) {
288       nir_function_impl *impl = function->impl;
289 
290       if (!impl)
291          continue;
292 
293       nir_foreach_block(block, impl) {
294          nir_foreach_instr(instr, block) {
295             lower_instr(&state, instr);
296          }
297       }
298    }
299 
300    /* At this point, all 'amul's used in calculating an offset into
301     * a large variable have been replaced with 'imul'.  So remaining
302     * 'amul's can be replaced with 'imul24':
303     *
304     * Note the exception for 64b (such as load/store_global where
305     * address size is 64b) as imul24 cannot have 64b bitsize
306     */
307    nir_foreach_function(function, shader) {
308       nir_function_impl *impl = function->impl;
309 
310       if (!impl)
311          continue;
312 
313       nir_foreach_block(block, impl) {
314          nir_foreach_instr(instr, block) {
315             if (instr->type != nir_instr_type_alu)
316                continue;
317 
318             nir_alu_instr *alu = nir_instr_as_alu(instr);
319             if (alu->op != nir_op_amul)
320                continue;
321 
322             if (nir_dest_bit_size(alu->dest.dest) <= 32)
323                alu->op = nir_op_imul24;
324             else
325                alu->op = nir_op_imul;
326 
327             state.progress |= true;
328          }
329       }
330 
331       nir_metadata_preserve(impl, nir_metadata_block_index |
332                                   nir_metadata_dominance);
333 
334    }
335 
336    return state.progress;
337 }
338