/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "nir_deref.h"

#include "util/bitscan.h"
#include "util/list.h"
#include "util/u_math.h"

/* Combine stores of vectors to the same deref into a single store.
 *
 * This per-block pass keeps track of stores of vectors to the same
 * destination and combines them into the last store of the sequence.  Dead
 * stores (or dead parts of a store) found during the process are removed.
 *
 * A pending combination becomes an actual combination in various situations:
 * at the end of the block, when another instruction uses the memory, or due
 * to barriers.
 *
 * Besides vectors, the pass also looks at array derefs of vectors.  A direct
 * array deref works like a write-mask access to the given component.  For an
 * indirect access there's no way to know beforehand which component it will
 * overlap with, so the pending combination is finished -- the indirect store
 * itself remains unmodified.
 */
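
/* For example (illustrative only, not exact NIR syntax, names hypothetical),
 * given two pending stores to the same vec4 variable
 *
 *    store_deref &color, ssa_a, writemask=0x1    // writes .x
 *    store_deref &color, ssa_b, writemask=0xe    // writes .yzw
 *
 * the pass rewrites the second store to use a freshly built vec4 that takes
 * .x from ssa_a and .yzw from ssa_b, widens its writemask to 0xf, and removes
 * the first store because none of its components survive.
 */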

/* Keep track of a group of stores that can be combined.  All stores share the
 * same destination.
 */
struct combined_store {
   struct list_head link;

   nir_component_mask_t write_mask;
   nir_deref_instr *dst;

   /* Latest store added.  It is reused when combining. */
   nir_intrinsic_instr *latest;

   /* Original store for each component.  The number of times a store appears
    * in this array is kept in the store's pass_flags.
    */
   nir_intrinsic_instr *stores[NIR_MAX_VEC_COMPONENTS];
};
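
/* Illustrative example of the bookkeeping above: if a pending combination has
 * seen a store that wrote only .x followed by a store that wrote .yzw of the
 * same vec4, then write_mask is 0xf, stores[0] points at the first store
 * (pass_flags == 1), and stores[1..3] as well as latest point at the second
 * store (pass_flags == 3).
 */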

struct combine_stores_state {
   nir_variable_mode modes;

   /* Pending store combinations. */
   struct list_head pending;

   /* Per function impl state. */
   nir_builder b;
   bool progress;

   /* Allocator and freelist to reuse structs between functions. */
   void *lin_ctx;
   struct list_head freelist;
};

static struct combined_store *
alloc_combined_store(struct combine_stores_state *state)
{
   struct combined_store *result;
   if (list_is_empty(&state->freelist)) {
      result = linear_zalloc_child(state->lin_ctx, sizeof(*result));
   } else {
      result = list_first_entry(&state->freelist,
                                struct combined_store,
                                link);
      list_del(&result->link);
      memset(result, 0, sizeof(*result));
   }
   return result;
}

static void
free_combined_store(struct combine_stores_state *state,
                    struct combined_store *combo)
{
   list_del(&combo->link);
   combo->write_mask = 0;
   list_add(&combo->link, &state->freelist);
}

static void
combine_stores(struct combine_stores_state *state,
               struct combined_store *combo)
{
   assert(combo->latest);
   assert(combo->latest->intrinsic == nir_intrinsic_store_deref);

   /* If the combined writemask is the same as the latest store's, we know
    * there is only one store in the combination, so there is nothing to
    * combine.
    */
   if ((combo->write_mask & nir_intrinsic_write_mask(combo->latest)) ==
       combo->write_mask)
      return;

   state->b.cursor = nir_before_instr(&combo->latest->instr);

   /* Build a new vec, to be used as the source for the combined store.  As it
    * gets built, remove previous stores that are no longer needed.
    */
   nir_ssa_scalar comps[NIR_MAX_VEC_COMPONENTS] = {0};
   unsigned num_components = glsl_get_vector_elements(combo->dst->type);
   unsigned bit_size = combo->latest->src[1].ssa->bit_size;
   for (unsigned i = 0; i < num_components; i++) {
      nir_intrinsic_instr *store = combo->stores[i];
      if (combo->write_mask & (1 << i)) {
         assert(store);
         assert(store->src[1].is_ssa);

         /* If store->num_components == 1 then we are in the deref-of-vec case
          * and store->src[1] is a scalar.  Otherwise, this is a regular vector
          * store and we have to pick off a component.
          */
         comps[i] = nir_get_ssa_scalar(store->src[1].ssa, store->num_components == 1 ? 0 : i);

         assert(store->instr.pass_flags > 0);
         if (--store->instr.pass_flags == 0 && store != combo->latest)
            nir_instr_remove(&store->instr);
      } else {
         comps[i] = nir_get_ssa_scalar(nir_ssa_undef(&state->b, 1, bit_size), 0);
      }
   }
   assert(combo->latest->instr.pass_flags == 0);
   nir_ssa_def *vec = nir_vec_scalars(&state->b, comps, num_components);

   /* Fix the latest store with the combined information. */
   nir_intrinsic_instr *store = combo->latest;

   /* In this case, our store is to an array deref of a vector, so we need to
    * rewrite it to use a deref to the whole vector.
    */
   if (store->num_components == 1) {
      store->num_components = num_components;
      nir_instr_rewrite_src(&store->instr, &store->src[0],
                            nir_src_for_ssa(&combo->dst->dest.ssa));
   }

   assert(store->num_components == num_components);
   nir_intrinsic_set_write_mask(store, combo->write_mask);
   nir_instr_rewrite_src(&store->instr, &store->src[1],
                         nir_src_for_ssa(vec));
   state->progress = true;
}

static void
combine_stores_with_deref(struct combine_stores_state *state,
                          nir_deref_instr *deref)
{
   if (!nir_deref_mode_may_be(deref, state->modes))
      return;

   list_for_each_entry_safe(struct combined_store, combo, &state->pending, link) {
      if (nir_compare_derefs(combo->dst, deref) & nir_derefs_may_alias_bit) {
         combine_stores(state, combo);
         free_combined_store(state, combo);
      }
   }
}

static void
combine_stores_with_modes(struct combine_stores_state *state,
                          nir_variable_mode modes)
{
   if ((state->modes & modes) == 0)
      return;

   list_for_each_entry_safe(struct combined_store, combo, &state->pending, link) {
      if (nir_deref_mode_may_be(combo->dst, modes)) {
         combine_stores(state, combo);
         free_combined_store(state, combo);
      }
   }
}

static struct combined_store *
find_matching_combined_store(struct combine_stores_state *state,
                             nir_deref_instr *deref)
{
   list_for_each_entry(struct combined_store, combo, &state->pending, link) {
      if (nir_compare_derefs(combo->dst, deref) & nir_derefs_equal_bit)
         return combo;
   }
   return NULL;
}

static void
update_combined_store(struct combine_stores_state *state,
                      nir_intrinsic_instr *intrin)
{
   nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
   if (!nir_deref_mode_may_be(dst, state->modes))
      return;

   unsigned vec_mask;
   nir_deref_instr *vec_dst;

   if (glsl_type_is_vector(dst->type)) {
      vec_mask = nir_intrinsic_write_mask(intrin);
      vec_dst = dst;
   } else {
      /* Besides vectors, only direct array derefs of vectors are handled. */
      if (dst->deref_type != nir_deref_type_array ||
          !nir_src_is_const(dst->arr.index) ||
          !glsl_type_is_vector(nir_deref_instr_parent(dst)->type)) {
         combine_stores_with_deref(state, dst);
         return;
      }

      uint64_t index = nir_src_as_uint(dst->arr.index);
      vec_dst = nir_deref_instr_parent(dst);

      if (index >= glsl_get_vector_elements(vec_dst->type)) {
         /* Storing to an invalid index is a no-op. */
         nir_instr_remove(&intrin->instr);
         state->progress = true;
         return;
      }

      vec_mask = 1 << index;
   }

   struct combined_store *combo = find_matching_combined_store(state, vec_dst);
   if (!combo) {
      combo = alloc_combined_store(state);
      combo->dst = vec_dst;
      list_add(&combo->link, &state->pending);
   }

   /* Use pass_flags to reference count the store based on how many
    * components are still used by the combination.
    */
   intrin->instr.pass_flags = util_bitcount(vec_mask);
   combo->latest = intrin;

   /* Update the combined_store, clearing up older overlapping references. */
   combo->write_mask |= vec_mask;
   while (vec_mask) {
      unsigned i = u_bit_scan(&vec_mask);
      nir_intrinsic_instr *prev_store = combo->stores[i];

      if (prev_store) {
         if (--prev_store->instr.pass_flags == 0) {
            nir_instr_remove(&prev_store->instr);
         } else {
            assert(glsl_type_is_vector(
                      nir_src_as_deref(prev_store->src[0])->type));
            nir_component_mask_t prev_mask = nir_intrinsic_write_mask(prev_store);
            nir_intrinsic_set_write_mask(prev_store, prev_mask & ~(1 << i));
         }
         state->progress = true;
      }
      combo->stores[i] = combo->latest;
   }
}

static void
combine_stores_block(struct combine_stores_state *state, nir_block *block)
{
   nir_foreach_instr_safe(instr, block) {
      if (instr->type == nir_instr_type_call) {
         combine_stores_with_modes(state, nir_var_shader_out |
                                          nir_var_shader_temp |
                                          nir_var_function_temp |
                                          nir_var_mem_ssbo |
                                          nir_var_mem_shared |
                                          nir_var_mem_global);
         continue;
      }

      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_store_deref:
         if (nir_intrinsic_access(intrin) & ACCESS_VOLATILE) {
            nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
            /* When we see a volatile store, we go ahead and combine all
             * previous non-volatile stores which touch that address and
             * specifically don't add the volatile store to the list.  This
             * way we guarantee that the volatile store isn't combined with
             * anything and no non-volatile stores are combined across a
             * volatile store.
             */
            combine_stores_with_deref(state, dst);
         } else {
            update_combined_store(state, intrin);
         }
         break;

      case nir_intrinsic_control_barrier:
      case nir_intrinsic_group_memory_barrier:
      case nir_intrinsic_memory_barrier:
         combine_stores_with_modes(state, nir_var_shader_out |
                                          nir_var_mem_ssbo |
                                          nir_var_mem_shared |
                                          nir_var_mem_global);
         break;

      case nir_intrinsic_memory_barrier_buffer:
         combine_stores_with_modes(state, nir_var_mem_ssbo |
                                          nir_var_mem_global);
         break;

      case nir_intrinsic_memory_barrier_shared:
         combine_stores_with_modes(state, nir_var_mem_shared);
         break;

      case nir_intrinsic_memory_barrier_tcs_patch:
         combine_stores_with_modes(state, nir_var_shader_out);
         break;

      case nir_intrinsic_scoped_barrier:
         if (nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE) {
            combine_stores_with_modes(state,
                                      nir_intrinsic_memory_modes(intrin));
         }
         break;

      case nir_intrinsic_emit_vertex:
      case nir_intrinsic_emit_vertex_with_counter:
         combine_stores_with_modes(state, nir_var_shader_out);
         break;

      case nir_intrinsic_report_ray_intersection:
         combine_stores_with_modes(state, nir_var_mem_ssbo |
                                          nir_var_mem_global |
                                          nir_var_shader_call_data |
                                          nir_var_ray_hit_attrib);
         break;

      case nir_intrinsic_ignore_ray_intersection:
      case nir_intrinsic_terminate_ray:
         combine_stores_with_modes(state, nir_var_mem_ssbo |
                                          nir_var_mem_global |
                                          nir_var_shader_call_data);
         break;

      case nir_intrinsic_load_deref: {
         nir_deref_instr *src = nir_src_as_deref(intrin->src[0]);
         combine_stores_with_deref(state, src);
         break;
      }

      case nir_intrinsic_load_deref_block_intel:
      case nir_intrinsic_store_deref_block_intel: {
         /* Combine all the stores that may alias with the whole variable (or
          * cast).
          */
         nir_deref_instr *operand = nir_src_as_deref(intrin->src[0]);
         while (nir_deref_instr_parent(operand))
            operand = nir_deref_instr_parent(operand);
         assert(operand->deref_type == nir_deref_type_var ||
                operand->deref_type == nir_deref_type_cast);

         combine_stores_with_deref(state, operand);
         break;
      }

      case nir_intrinsic_copy_deref:
      case nir_intrinsic_memcpy_deref: {
         nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
         nir_deref_instr *src = nir_src_as_deref(intrin->src[1]);
         combine_stores_with_deref(state, dst);
         combine_stores_with_deref(state, src);
         break;
      }

      case nir_intrinsic_trace_ray:
      case nir_intrinsic_execute_callable:
      case nir_intrinsic_rt_trace_ray:
      case nir_intrinsic_rt_execute_callable: {
         nir_deref_instr *payload =
            nir_src_as_deref(*nir_get_shader_call_payload_src(intrin));
         combine_stores_with_deref(state, payload);
         break;
      }

      case nir_intrinsic_deref_atomic_add:
      case nir_intrinsic_deref_atomic_imin:
      case nir_intrinsic_deref_atomic_umin:
      case nir_intrinsic_deref_atomic_imax:
      case nir_intrinsic_deref_atomic_umax:
      case nir_intrinsic_deref_atomic_and:
      case nir_intrinsic_deref_atomic_or:
      case nir_intrinsic_deref_atomic_xor:
      case nir_intrinsic_deref_atomic_exchange:
      case nir_intrinsic_deref_atomic_comp_swap: {
         nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
         combine_stores_with_deref(state, dst);
         break;
      }

      default:
         break;
      }
   }

   /* At the end of the block, try all the remaining combinations. */
   combine_stores_with_modes(state, state->modes);
}

static bool
combine_stores_impl(struct combine_stores_state *state, nir_function_impl *impl)
{
   state->progress = false;
   nir_builder_init(&state->b, impl);

   nir_foreach_block(block, impl)
      combine_stores_block(state, block);

   if (state->progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return state->progress;
}

bool
nir_opt_combine_stores(nir_shader *shader, nir_variable_mode modes)
{
   void *mem_ctx = ralloc_context(NULL);
   struct combine_stores_state state = {
      .modes   = modes,
      .lin_ctx = linear_zalloc_parent(mem_ctx, 0),
   };

   list_inithead(&state.pending);
   list_inithead(&state.freelist);

   bool progress = false;

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;
      progress |= combine_stores_impl(&state, function->impl);
   }

   ralloc_free(mem_ctx);
   return progress;
}