/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "nir_deref.h"

#include "util/bitscan.h"
#include "util/list.h"
#include "util/u_math.h"
/* Combine stores of vectors to the same deref into a single store.
 *
 * This per-block pass keeps track of stores of vectors to the same
 * destination and combines them into the last store of the sequence. Dead
 * stores (or dead parts of a store) found during the process are removed.
 *
 * A pending combination becomes an actual combination in various situations:
 * at the end of the block, when another instruction uses the memory, or when
 * a barrier is encountered.
 *
 * Besides vectors, the pass also looks at array derefs of vectors. A direct
 * array deref works like a write-mask access to the given component. For an
 * indirect access there's no way to know beforehand which component it will
 * overlap with, so the pending combination is finished -- the indirect store
 * remains unmodified.
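 *
 * As a schematic illustration (not exact NIR syntax, SSA names are
 * placeholders), two masked stores to the same vec2 variable such as
 *
 *    store_deref(var, vec2(ssa_a, undef)) wrmask=x
 *    store_deref(var, vec2(undef, ssa_b)) wrmask=y
 *
 * are rewritten so that the last store writes a freshly built vector covering
 * both components,
 *
 *    store_deref(var, vec2(ssa_a, ssa_b)) wrmask=xy
 *
 * and the earlier store is removed as dead.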
 */

/* Keep track of a group of stores that can be combined. All stores share the
 * same destination.
 */
struct combined_store {
   struct list_head link;

   nir_component_mask_t write_mask;
   nir_deref_instr *dst;

   /* Latest store added. It is reused when combining. */
   nir_intrinsic_instr *latest;

   /* Original store for each component. The number of times a store appears
    * in this array is kept in the store's pass_flags.
    */
   nir_intrinsic_instr *stores[NIR_MAX_VEC_COMPONENTS];
};

struct combine_stores_state {
   nir_variable_mode modes;

   /* Pending store combinations. */
   struct list_head pending;

   /* Per function impl state. */
   nir_builder b;
   bool progress;

   /* Allocator and freelist to reuse structs between functions. */
   void *lin_ctx;
   struct list_head freelist;
};

static struct combined_store *
alloc_combined_store(struct combine_stores_state *state)
{
   struct combined_store *result;
   if (list_is_empty(&state->freelist)) {
      result = linear_zalloc_child(state->lin_ctx, sizeof(*result));
   } else {
      result = list_first_entry(&state->freelist,
                                struct combined_store,
                                link);
      list_del(&result->link);
      memset(result, 0, sizeof(*result));
   }
   return result;
}

static void
free_combined_store(struct combine_stores_state *state,
                    struct combined_store *combo)
{
   list_del(&combo->link);
   combo->write_mask = 0;
   list_add(&combo->link, &state->freelist);
}

static void
combine_stores(struct combine_stores_state *state,
               struct combined_store *combo)
{
   assert(combo->latest);
   assert(combo->latest->intrinsic == nir_intrinsic_store_deref);

   /* If the combined writemask is the same as the latest store, we know there
    * is only one store in the combination, so nothing to combine.
    */
   if ((combo->write_mask & nir_intrinsic_write_mask(combo->latest)) ==
       combo->write_mask)
      return;

   state->b.cursor = nir_before_instr(&combo->latest->instr);

   /* Build a new vec, to be used as the source for the combined store. As it
    * gets built, remove previous stores that are not needed anymore.
    */
   nir_ssa_scalar comps[NIR_MAX_VEC_COMPONENTS] = {0};
   unsigned num_components = glsl_get_vector_elements(combo->dst->type);
   unsigned bit_size = combo->latest->src[1].ssa->bit_size;
   for (unsigned i = 0; i < num_components; i++) {
      nir_intrinsic_instr *store = combo->stores[i];
      if (combo->write_mask & (1 << i)) {
         assert(store);
         assert(store->src[1].is_ssa);

         /* If store->num_components == 1 then we are in the deref-of-vec case
          * and store->src[1] is a scalar. Otherwise, we're a regular vector
          * store and we have to pick off a component.
          */
         comps[i] = nir_get_ssa_scalar(store->src[1].ssa, store->num_components == 1 ? 0 : i);

         assert(store->instr.pass_flags > 0);
         if (--store->instr.pass_flags == 0 && store != combo->latest)
            nir_instr_remove(&store->instr);
      } else {
         comps[i] = nir_get_ssa_scalar(nir_ssa_undef(&state->b, 1, bit_size), 0);
      }
   }
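   /* All components contributed by the latest store were consumed in the
    * loop above, so its reference count must have dropped to zero by now.
    */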
   assert(combo->latest->instr.pass_flags == 0);
   nir_ssa_def *vec = nir_vec_scalars(&state->b, comps, num_components);

   /* Fix the latest store with the combined information. */
   nir_intrinsic_instr *store = combo->latest;

   /* In this case, our store was an array deref of a vector, so we need to
    * rewrite it to use a deref to the whole vector.
    */
   if (store->num_components == 1) {
      store->num_components = num_components;
      nir_instr_rewrite_src(&store->instr, &store->src[0],
                            nir_src_for_ssa(&combo->dst->dest.ssa));
   }

   assert(store->num_components == num_components);
   nir_intrinsic_set_write_mask(store, combo->write_mask);
   nir_instr_rewrite_src(&store->instr, &store->src[1],
                         nir_src_for_ssa(vec));
   state->progress = true;
}

static void
combine_stores_with_deref(struct combine_stores_state *state,
                          nir_deref_instr *deref)
{
   if (!nir_deref_mode_may_be(deref, state->modes))
      return;

   list_for_each_entry_safe(struct combined_store, combo, &state->pending, link) {
      if (nir_compare_derefs(combo->dst, deref) & nir_derefs_may_alias_bit) {
         combine_stores(state, combo);
         free_combined_store(state, combo);
      }
   }
}

static void
combine_stores_with_modes(struct combine_stores_state *state,
                          nir_variable_mode modes)
{
   if ((state->modes & modes) == 0)
      return;

   list_for_each_entry_safe(struct combined_store, combo, &state->pending, link) {
      if (nir_deref_mode_may_be(combo->dst, modes)) {
         combine_stores(state, combo);
         free_combined_store(state, combo);
      }
   }
}

static struct combined_store *
find_matching_combined_store(struct combine_stores_state *state,
                             nir_deref_instr *deref)
{
   list_for_each_entry(struct combined_store, combo, &state->pending, link) {
      if (nir_compare_derefs(combo->dst, deref) & nir_derefs_equal_bit)
         return combo;
   }
   return NULL;
}

static void
update_combined_store(struct combine_stores_state *state,
                      nir_intrinsic_instr *intrin)
{
   nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
   if (!nir_deref_mode_may_be(dst, state->modes))
      return;

   unsigned vec_mask;
   nir_deref_instr *vec_dst;

   if (glsl_type_is_vector(dst->type)) {
      vec_mask = nir_intrinsic_write_mask(intrin);
      vec_dst = dst;
   } else {
      /* Besides vectors, only direct array derefs of vectors are handled. */
      if (dst->deref_type != nir_deref_type_array ||
          !nir_src_is_const(dst->arr.index) ||
          !glsl_type_is_vector(nir_deref_instr_parent(dst)->type)) {
         combine_stores_with_deref(state, dst);
         return;
      }

      uint64_t index = nir_src_as_uint(dst->arr.index);
      vec_dst = nir_deref_instr_parent(dst);

      if (index >= glsl_get_vector_elements(vec_dst->type)) {
         /* Storing to an invalid index is a no-op. */
         nir_instr_remove(&intrin->instr);
         state->progress = true;
         return;
      }

      vec_mask = 1 << index;
   }

   struct combined_store *combo = find_matching_combined_store(state, vec_dst);
   if (!combo) {
      combo = alloc_combined_store(state);
      combo->dst = vec_dst;
      list_add(&combo->link, &state->pending);
   }

   /* Use pass_flags to reference count the store based on how many
    * components are still used by the combination.
    */
   intrin->instr.pass_flags = util_bitcount(vec_mask);
   combo->latest = intrin;

   /* Update the combined_store, clearing up older overlapping references. */
   combo->write_mask |= vec_mask;
   while (vec_mask) {
      unsigned i = u_bit_scan(&vec_mask);
      nir_intrinsic_instr *prev_store = combo->stores[i];

      if (prev_store) {
         if (--prev_store->instr.pass_flags == 0) {
            nir_instr_remove(&prev_store->instr);
         } else {
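            /* The previous store still provides other components, so keep it
             * but drop this component from its write mask; the newer store
             * now owns it.
             */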
            assert(glsl_type_is_vector(
                      nir_src_as_deref(prev_store->src[0])->type));
            nir_component_mask_t prev_mask = nir_intrinsic_write_mask(prev_store);
            nir_intrinsic_set_write_mask(prev_store, prev_mask & ~(1 << i));
         }
         state->progress = true;
      }
      combo->stores[i] = combo->latest;
   }
}

static void
combine_stores_block(struct combine_stores_state *state, nir_block *block)
{
   nir_foreach_instr_safe(instr, block) {
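      /* A call may read or write variables in any of these modes, so any
       * pending combination involving them has to be materialized before it.
       */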
      if (instr->type == nir_instr_type_call) {
         combine_stores_with_modes(state, nir_var_shader_out |
                                          nir_var_shader_temp |
                                          nir_var_function_temp |
                                          nir_var_mem_ssbo |
                                          nir_var_mem_shared |
                                          nir_var_mem_global);
         continue;
      }

      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_store_deref:
         if (nir_intrinsic_access(intrin) & ACCESS_VOLATILE) {
            nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
            /* When we see a volatile store, we go ahead and combine all
             * previous non-volatile stores which touch that address and
             * specifically don't add the volatile store to the list. This
             * way we guarantee that the volatile store isn't combined with
             * anything and no non-volatile stores are combined across a
             * volatile store.
             */
            combine_stores_with_deref(state, dst);
         } else {
            update_combined_store(state, intrin);
         }
         break;

      case nir_intrinsic_control_barrier:
      case nir_intrinsic_group_memory_barrier:
      case nir_intrinsic_memory_barrier:
         combine_stores_with_modes(state, nir_var_shader_out |
                                          nir_var_mem_ssbo |
                                          nir_var_mem_shared |
                                          nir_var_mem_global);
         break;

      case nir_intrinsic_memory_barrier_buffer:
         combine_stores_with_modes(state, nir_var_mem_ssbo |
                                          nir_var_mem_global);
         break;

      case nir_intrinsic_memory_barrier_shared:
         combine_stores_with_modes(state, nir_var_mem_shared);
         break;

      case nir_intrinsic_memory_barrier_tcs_patch:
         combine_stores_with_modes(state, nir_var_shader_out);
         break;

      case nir_intrinsic_scoped_barrier:
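         /* Only a barrier with release semantics orders prior stores, so
          * only then do the affected modes need to be flushed.
          */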
         if (nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE) {
            combine_stores_with_modes(state,
                                      nir_intrinsic_memory_modes(intrin));
         }
         break;

      case nir_intrinsic_emit_vertex:
      case nir_intrinsic_emit_vertex_with_counter:
         combine_stores_with_modes(state, nir_var_shader_out);
         break;

      case nir_intrinsic_report_ray_intersection:
         combine_stores_with_modes(state, nir_var_mem_ssbo |
                                          nir_var_mem_global |
                                          nir_var_shader_call_data |
                                          nir_var_ray_hit_attrib);
         break;

      case nir_intrinsic_ignore_ray_intersection:
      case nir_intrinsic_terminate_ray:
         combine_stores_with_modes(state, nir_var_mem_ssbo |
                                          nir_var_mem_global |
                                          nir_var_shader_call_data);
         break;

      case nir_intrinsic_load_deref: {
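         /* A load may observe any pending store that aliases its source, so
          * finish those combinations before the load.
          */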
         nir_deref_instr *src = nir_src_as_deref(intrin->src[0]);
         combine_stores_with_deref(state, src);
         break;
      }

      case nir_intrinsic_load_deref_block_intel:
      case nir_intrinsic_store_deref_block_intel: {
         /* Combine all the stores that may alias with the whole variable (or
          * cast).
          */
         nir_deref_instr *operand = nir_src_as_deref(intrin->src[0]);
         while (nir_deref_instr_parent(operand))
            operand = nir_deref_instr_parent(operand);
         assert(operand->deref_type == nir_deref_type_var ||
                operand->deref_type == nir_deref_type_cast);

         combine_stores_with_deref(state, operand);
         break;
      }

      case nir_intrinsic_copy_deref:
      case nir_intrinsic_memcpy_deref: {
         nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
         nir_deref_instr *src = nir_src_as_deref(intrin->src[1]);
         combine_stores_with_deref(state, dst);
         combine_stores_with_deref(state, src);
         break;
      }

      case nir_intrinsic_trace_ray:
      case nir_intrinsic_execute_callable:
      case nir_intrinsic_rt_trace_ray:
      case nir_intrinsic_rt_execute_callable: {
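         /* The called shader may read and write the payload, so flush any
          * pending stores that alias it before the call.
          */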
         nir_deref_instr *payload =
            nir_src_as_deref(*nir_get_shader_call_payload_src(intrin));
         combine_stores_with_deref(state, payload);
         break;
      }

      case nir_intrinsic_deref_atomic_add:
      case nir_intrinsic_deref_atomic_imin:
      case nir_intrinsic_deref_atomic_umin:
      case nir_intrinsic_deref_atomic_imax:
      case nir_intrinsic_deref_atomic_umax:
      case nir_intrinsic_deref_atomic_and:
      case nir_intrinsic_deref_atomic_or:
      case nir_intrinsic_deref_atomic_xor:
      case nir_intrinsic_deref_atomic_exchange:
      case nir_intrinsic_deref_atomic_comp_swap: {
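         /* Atomics read and modify the destination, so any pending store
          * that may alias it has to land first.
          */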
         nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
         combine_stores_with_deref(state, dst);
         break;
      }

      default:
         break;
      }
   }

   /* At the end of the block, try all the remaining combinations. */
   combine_stores_with_modes(state, state->modes);
}

static bool
combine_stores_impl(struct combine_stores_state *state, nir_function_impl *impl)
{
   state->progress = false;
   nir_builder_init(&state->b, impl);

   nir_foreach_block(block, impl)
      combine_stores_block(state, block);

   if (state->progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return state->progress;
}

bool
nir_opt_combine_stores(nir_shader *shader, nir_variable_mode modes)
{
   void *mem_ctx = ralloc_context(NULL);
   struct combine_stores_state state = {
      .modes = modes,
      .lin_ctx = linear_zalloc_parent(mem_ctx, 0),
   };

   list_inithead(&state.pending);
   list_inithead(&state.freelist);

   bool progress = false;

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;
      progress |= combine_stores_impl(&state, function->impl);
   }

   ralloc_free(mem_ctx);
   return progress;
}