/* * Copyright © 2020 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include "util/u_dynarray.h" #include "util/u_math.h" #include "nir.h" #include "nir_builder.h" #include "nir_phi_builder.h" static bool move_system_values_to_top(nir_shader *shader) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); bool progress = false; nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; /* These intrinsics not only can't be re-materialized but aren't * preserved when moving to the continuation shader. We have to move * them to the top to ensure they get spilled as needed. */ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_load_shader_record_ptr: case nir_intrinsic_load_btd_local_arg_addr_intel: nir_instr_remove(instr); nir_instr_insert(nir_before_impl(impl), instr); progress = true; break; default: break; } } } if (progress) { nir_metadata_preserve(impl, nir_metadata_control_flow); } else { nir_metadata_preserve(impl, nir_metadata_all); } return progress; } static bool instr_is_shader_call(nir_instr *instr) { if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); return intrin->intrinsic == nir_intrinsic_trace_ray || intrin->intrinsic == nir_intrinsic_report_ray_intersection || intrin->intrinsic == nir_intrinsic_execute_callable; } /* Previously named bitset, it had to be renamed as FreeBSD defines a struct * named bitset in sys/_bitset.h required by pthread_np.h which is included * from src/util/u_thread.h that is indirectly included by this file. */ struct sized_bitset { BITSET_WORD *set; unsigned size; }; static struct sized_bitset bitset_create(void *mem_ctx, unsigned size) { return (struct sized_bitset){ .set = rzalloc_array(mem_ctx, BITSET_WORD, BITSET_WORDS(size)), .size = size, }; } static bool src_is_in_bitset(nir_src *src, void *_set) { struct sized_bitset *set = _set; /* Any SSA values which were added after we generated liveness information * are things generated by this pass and, while most of it is arithmetic * which we could re-materialize, we don't need to because it's only used * for a single load/store and so shouldn't cross any shader calls. 
*/ if (src->ssa->index >= set->size) return false; return BITSET_TEST(set->set, src->ssa->index); } static void add_ssa_def_to_bitset(nir_def *def, struct sized_bitset *set) { if (def->index >= set->size) return; BITSET_SET(set->set, def->index); } static bool can_remat_instr(nir_instr *instr, struct sized_bitset *remat) { /* Set of all values which are trivially re-materializable and we shouldn't * ever spill them. This includes: * * - Undef values * - Constants * - Uniforms (UBO or push constant) * - ALU combinations of any of the above * - Derefs which are either complete or casts of any of the above * * Because this pass rewrites things in-order and phis are always turned * into register writes, we can use "is it SSA?" to answer the question * "can my source be re-materialized?". Register writes happen via * non-rematerializable intrinsics. */ switch (instr->type) { case nir_instr_type_alu: return nir_foreach_src(instr, src_is_in_bitset, remat); case nir_instr_type_deref: return nir_foreach_src(instr, src_is_in_bitset, remat); case nir_instr_type_intrinsic: { nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_load_uniform: case nir_intrinsic_load_ubo: case nir_intrinsic_vulkan_resource_index: case nir_intrinsic_vulkan_resource_reindex: case nir_intrinsic_load_vulkan_descriptor: case nir_intrinsic_load_push_constant: case nir_intrinsic_load_global_constant: /* These intrinsics don't need to be spilled as long as they don't * depend on any spilled values. */ return nir_foreach_src(instr, src_is_in_bitset, remat); case nir_intrinsic_load_scratch_base_ptr: case nir_intrinsic_load_ray_launch_id: case nir_intrinsic_load_topology_id_intel: case nir_intrinsic_load_btd_global_arg_addr_intel: case nir_intrinsic_load_btd_resume_sbt_addr_intel: case nir_intrinsic_load_ray_base_mem_addr_intel: case nir_intrinsic_load_ray_hw_stack_size_intel: case nir_intrinsic_load_ray_sw_stack_size_intel: case nir_intrinsic_load_ray_num_dss_rt_stacks_intel: case nir_intrinsic_load_ray_hit_sbt_addr_intel: case nir_intrinsic_load_ray_hit_sbt_stride_intel: case nir_intrinsic_load_ray_miss_sbt_addr_intel: case nir_intrinsic_load_ray_miss_sbt_stride_intel: case nir_intrinsic_load_callable_sbt_addr_intel: case nir_intrinsic_load_callable_sbt_stride_intel: case nir_intrinsic_load_reloc_const_intel: case nir_intrinsic_load_ray_query_global_intel: case nir_intrinsic_load_ray_launch_size: /* Notably missing from the above list is btd_local_arg_addr_intel. * This is because the resume shader will have a different local * argument pointer because it has a different BSR. Any access of * the original shader's local arguments needs to be preserved so * that pointer has to be saved on the stack. * * TODO: There may be some system values we want to avoid * re-materializing as well but we have to be very careful * to ensure that it's a system value which cannot change * across a shader call. 
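          *
          *       For example, load_ray_launch_id above is safe because it
          *       identifies the invocation and reads back the same value in
          *       the resume shader, whereas btd_local_arg_addr_intel does
          *       not, as explained above.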
*/ return true; case nir_intrinsic_resource_intel: return nir_foreach_src(instr, src_is_in_bitset, remat); default: return false; } } case nir_instr_type_undef: case nir_instr_type_load_const: return true; default: return false; } } static bool can_remat_ssa_def(nir_def *def, struct sized_bitset *remat) { return can_remat_instr(def->parent_instr, remat); } struct add_instr_data { struct util_dynarray *buf; struct sized_bitset *remat; }; static bool add_src_instr(nir_src *src, void *state) { struct add_instr_data *data = state; if (BITSET_TEST(data->remat->set, src->ssa->index)) return true; util_dynarray_foreach(data->buf, nir_instr *, instr_ptr) { if (*instr_ptr == src->ssa->parent_instr) return true; } /* Abort rematerializing an instruction chain if it is too long. */ if (data->buf->size >= data->buf->capacity) return false; util_dynarray_append(data->buf, nir_instr *, src->ssa->parent_instr); return true; } static int compare_instr_indexes(const void *_inst1, const void *_inst2) { const nir_instr *const *inst1 = _inst1; const nir_instr *const *inst2 = _inst2; return (*inst1)->index - (*inst2)->index; } static bool can_remat_chain_ssa_def(nir_def *def, struct sized_bitset *remat, struct util_dynarray *buf) { assert(util_dynarray_num_elements(buf, nir_instr *) == 0); void *mem_ctx = ralloc_context(NULL); /* Add all the instructions involved in build this ssa_def */ util_dynarray_append(buf, nir_instr *, def->parent_instr); unsigned idx = 0; struct add_instr_data data = { .buf = buf, .remat = remat, }; while (idx < util_dynarray_num_elements(buf, nir_instr *)) { nir_instr *instr = *util_dynarray_element(buf, nir_instr *, idx++); if (!nir_foreach_src(instr, add_src_instr, &data)) goto fail; } /* Sort instructions by index */ qsort(util_dynarray_begin(buf), util_dynarray_num_elements(buf, nir_instr *), sizeof(nir_instr *), compare_instr_indexes); /* Create a temporary bitset with all values already * rematerialized/rematerializable. We'll add to this bit set as we go * through values that might not be in that set but that we can * rematerialize. */ struct sized_bitset potential_remat = bitset_create(mem_ctx, remat->size); memcpy(potential_remat.set, remat->set, BITSET_WORDS(remat->size) * sizeof(BITSET_WORD)); util_dynarray_foreach(buf, nir_instr *, instr_ptr) { nir_def *instr_ssa_def = nir_instr_def(*instr_ptr); /* If already in the potential rematerializable, nothing to do. */ if (BITSET_TEST(potential_remat.set, instr_ssa_def->index)) continue; if (!can_remat_instr(*instr_ptr, &potential_remat)) goto fail; /* All the sources are rematerializable and the instruction is also * rematerializable, mark it as rematerializable too. 
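       *
       * A typical chain here (illustrative) looks like
       * vulkan_resource_index -> load_vulkan_descriptor -> load_ubo -> fmul:
       * each link becomes re-materializable once the links feeding it are.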
*/ BITSET_SET(potential_remat.set, instr_ssa_def->index); } ralloc_free(mem_ctx); return true; fail: util_dynarray_clear(buf); ralloc_free(mem_ctx); return false; } static nir_def * remat_ssa_def(nir_builder *b, nir_def *def, struct hash_table *remap_table) { nir_instr *clone = nir_instr_clone_deep(b->shader, def->parent_instr, remap_table); nir_builder_instr_insert(b, clone); return nir_instr_def(clone); } static nir_def * remat_chain_ssa_def(nir_builder *b, struct util_dynarray *buf, struct sized_bitset *remat, nir_def ***fill_defs, unsigned call_idx, struct hash_table *remap_table) { nir_def *last_def = NULL; util_dynarray_foreach(buf, nir_instr *, instr_ptr) { nir_def *instr_ssa_def = nir_instr_def(*instr_ptr); unsigned ssa_index = instr_ssa_def->index; if (fill_defs[ssa_index] != NULL && fill_defs[ssa_index][call_idx] != NULL) continue; /* Clone the instruction we want to rematerialize */ nir_def *clone_ssa_def = remat_ssa_def(b, instr_ssa_def, remap_table); if (fill_defs[ssa_index] == NULL) { fill_defs[ssa_index] = rzalloc_array(fill_defs, nir_def *, remat->size); } /* Add the new ssa_def to the list fill_defs and flag it as * rematerialized */ fill_defs[ssa_index][call_idx] = last_def = clone_ssa_def; BITSET_SET(remat->set, ssa_index); _mesa_hash_table_insert(remap_table, instr_ssa_def, last_def); } return last_def; } struct pbv_array { struct nir_phi_builder_value **arr; unsigned len; }; static struct nir_phi_builder_value * get_phi_builder_value_for_def(nir_def *def, struct pbv_array *pbv_arr) { if (def->index >= pbv_arr->len) return NULL; return pbv_arr->arr[def->index]; } static nir_def * get_phi_builder_def_for_src(nir_src *src, struct pbv_array *pbv_arr, nir_block *block) { struct nir_phi_builder_value *pbv = get_phi_builder_value_for_def(src->ssa, pbv_arr); if (pbv == NULL) return NULL; return nir_phi_builder_value_get_block_def(pbv, block); } static bool rewrite_instr_src_from_phi_builder(nir_src *src, void *_pbv_arr) { nir_block *block; if (nir_src_parent_instr(src)->type == nir_instr_type_phi) { nir_phi_src *phi_src = exec_node_data(nir_phi_src, src, src); block = phi_src->pred; } else { block = nir_src_parent_instr(src)->block; } nir_def *new_def = get_phi_builder_def_for_src(src, _pbv_arr, block); if (new_def != NULL) nir_src_rewrite(src, new_def); return true; } static nir_def * spill_fill(nir_builder *before, nir_builder *after, nir_def *def, unsigned value_id, unsigned call_idx, unsigned offset, unsigned stack_alignment) { const unsigned comp_size = def->bit_size / 8; nir_store_stack(before, def, .base = offset, .call_idx = call_idx, .align_mul = MIN2(comp_size, stack_alignment), .value_id = value_id, .write_mask = BITFIELD_MASK(def->num_components)); return nir_load_stack(after, def->num_components, def->bit_size, .base = offset, .call_idx = call_idx, .value_id = value_id, .align_mul = MIN2(comp_size, stack_alignment)); } static bool add_src_to_call_live_bitset(nir_src *src, void *state) { BITSET_WORD *call_live = state; BITSET_SET(call_live, src->ssa->index); return true; } static void spill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls, const nir_lower_shader_calls_options *options) { /* TODO: If a SSA def is filled more than once, we probably want to just * spill it at the LCM of the fill sites so we avoid unnecessary * extra spills * * TODO: If a SSA def is defined outside a loop but live through some call * inside the loop, we probably want to spill outside the loop. 
We * may also want to fill outside the loop if it's not used in the * loop. * * TODO: Right now, we only re-materialize things if their immediate * sources are things which we filled. We probably want to expand * that to re-materialize things whose sources are things we can * re-materialize from things we filled. We may want some DAG depth * heuristic on this. */ /* This happens per-shader rather than per-impl because we mess with * nir_shader::scratch_size. */ nir_function_impl *impl = nir_shader_get_entrypoint(shader); nir_metadata_require(impl, nir_metadata_live_defs | nir_metadata_dominance | nir_metadata_block_index | nir_metadata_instr_index); void *mem_ctx = ralloc_context(shader); const unsigned num_ssa_defs = impl->ssa_alloc; const unsigned live_words = BITSET_WORDS(num_ssa_defs); struct sized_bitset trivial_remat = bitset_create(mem_ctx, num_ssa_defs); /* Array of all live SSA defs which are spill candidates */ nir_def **spill_defs = rzalloc_array(mem_ctx, nir_def *, num_ssa_defs); /* For each spill candidate, an array of every time it's defined by a fill, * indexed by call instruction index. */ nir_def ***fill_defs = rzalloc_array(mem_ctx, nir_def **, num_ssa_defs); /* For each call instruction, the liveness set at the call */ const BITSET_WORD **call_live = rzalloc_array(mem_ctx, const BITSET_WORD *, num_calls); /* For each call instruction, the block index of the block it lives in */ uint32_t *call_block_indices = rzalloc_array(mem_ctx, uint32_t, num_calls); /* Remap table when rebuilding instructions out of fill operations */ struct hash_table *trivial_remap_table = _mesa_pointer_hash_table_create(mem_ctx); /* Walk the call instructions and fetch the liveness set and block index * for each one. We need to do this before we start modifying the shader * so that liveness doesn't complain that it's been invalidated. Don't * worry, we'll be very careful with our live sets. :-) */ unsigned call_idx = 0; nir_foreach_block(block, impl) { nir_foreach_instr(instr, block) { if (!instr_is_shader_call(instr)) continue; call_block_indices[call_idx] = block->index; /* The objective here is to preserve values around shader call * instructions. Therefore, we use the live set after the * instruction as the set of things we want to preserve. Because * none of our shader call intrinsics return anything, we don't have * to worry about spilling over a return value. * * TODO: This isn't quite true for report_intersection. */ call_live[call_idx] = nir_get_live_defs(nir_after_instr(instr), mem_ctx); call_idx++; } } /* If a should_remat_callback is given, call it on each of the live values * for each call site. If it returns true we need to rematerialize that * instruction (instead of spill/fill). Therefore we need to add the * sources as live values so that we can rematerialize on top of those * spilled/filled sources. 
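    *
    * E.g. if the callback asks for a load to be re-emitted after the call
    * rather than spilled, the values feeding that load are forced live here
    * so they get spilled/filled and the load can be rebuilt from them.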
*/ if (options->should_remat_callback) { BITSET_WORD **updated_call_live = rzalloc_array(mem_ctx, BITSET_WORD *, num_calls); nir_foreach_block(block, impl) { nir_foreach_instr(instr, block) { nir_def *def = nir_instr_def(instr); if (def == NULL) continue; for (unsigned c = 0; c < num_calls; c++) { if (!BITSET_TEST(call_live[c], def->index)) continue; if (!options->should_remat_callback(def->parent_instr, options->should_remat_data)) continue; if (updated_call_live[c] == NULL) { const unsigned bitset_words = BITSET_WORDS(impl->ssa_alloc); updated_call_live[c] = ralloc_array(mem_ctx, BITSET_WORD, bitset_words); memcpy(updated_call_live[c], call_live[c], bitset_words * sizeof(BITSET_WORD)); } nir_foreach_src(instr, add_src_to_call_live_bitset, updated_call_live[c]); } } } for (unsigned c = 0; c < num_calls; c++) { if (updated_call_live[c] != NULL) call_live[c] = updated_call_live[c]; } } nir_builder before, after; before = nir_builder_create(impl); after = nir_builder_create(impl); call_idx = 0; unsigned max_scratch_size = shader->scratch_size; nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { nir_def *def = nir_instr_def(instr); if (def != NULL) { if (can_remat_ssa_def(def, &trivial_remat)) { add_ssa_def_to_bitset(def, &trivial_remat); _mesa_hash_table_insert(trivial_remap_table, def, def); } else { spill_defs[def->index] = def; } } if (!instr_is_shader_call(instr)) continue; const BITSET_WORD *live = call_live[call_idx]; struct hash_table *remap_table = _mesa_hash_table_clone(trivial_remap_table, mem_ctx); /* Make a copy of trivial_remat that we'll update as we crawl through * the live SSA defs and unspill them. */ struct sized_bitset remat = bitset_create(mem_ctx, num_ssa_defs); memcpy(remat.set, trivial_remat.set, live_words * sizeof(BITSET_WORD)); /* Before the two builders are always separated by the call * instruction, it won't break anything to have two of them. */ before.cursor = nir_before_instr(instr); after.cursor = nir_after_instr(instr); /* Array used to hold all the values needed to rematerialize a live * value. The capacity is used to determine when we should abort testing * a remat chain. In practice, shaders can have chains with more than * 10k elements while only chains with less than 16 have realistic * chances. There also isn't any performance benefit in rematerializing * extremely long chains. */ nir_instr *remat_chain_instrs[16]; struct util_dynarray remat_chain; util_dynarray_init_from_stack(&remat_chain, remat_chain_instrs, sizeof(remat_chain_instrs)); unsigned offset = shader->scratch_size; for (unsigned w = 0; w < live_words; w++) { BITSET_WORD spill_mask = live[w] & ~trivial_remat.set[w]; while (spill_mask) { int i = u_bit_scan(&spill_mask); assert(i >= 0); unsigned index = w * BITSET_WORDBITS + i; assert(index < num_ssa_defs); def = spill_defs[index]; nir_def *original_def = def, *new_def; if (can_remat_ssa_def(def, &remat)) { /* If this SSA def is re-materializable or based on other * things we've already spilled, re-materialize it rather * than spilling and filling. Anything which is trivially * re-materializable won't even get here because we take * those into account in spill_mask above. 
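             *
             * In other words, spill_mask = live & ~trivial_remat, so only
             * values that are live across the call and not trivially
             * re-materializable reach this point.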
*/ new_def = remat_ssa_def(&after, def, remap_table); } else if (can_remat_chain_ssa_def(def, &remat, &remat_chain)) { new_def = remat_chain_ssa_def(&after, &remat_chain, &remat, fill_defs, call_idx, remap_table); util_dynarray_clear(&remat_chain); } else { bool is_bool = def->bit_size == 1; if (is_bool) def = nir_b2b32(&before, def); const unsigned comp_size = def->bit_size / 8; offset = ALIGN(offset, comp_size); new_def = spill_fill(&before, &after, def, index, call_idx, offset, options->stack_alignment); if (is_bool) new_def = nir_b2b1(&after, new_def); offset += def->num_components * comp_size; } /* Mark this SSA def as available in the remat set so that, if * some other SSA def we need is computed based on it, we can * just re-compute instead of fetching from memory. */ BITSET_SET(remat.set, index); /* For now, we just make a note of this new SSA def. We'll * fix things up with the phi builder as a second pass. */ if (fill_defs[index] == NULL) { fill_defs[index] = rzalloc_array(fill_defs, nir_def *, num_calls); } fill_defs[index][call_idx] = new_def; _mesa_hash_table_insert(remap_table, original_def, new_def); } } nir_builder *b = &before; offset = ALIGN(offset, options->stack_alignment); max_scratch_size = MAX2(max_scratch_size, offset); /* First thing on the called shader's stack is the resume address * followed by a pointer to the payload. */ nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr); /* Lower to generic intrinsics with information about the stack & resume shader. */ switch (call->intrinsic) { case nir_intrinsic_trace_ray: { nir_rt_trace_ray(b, call->src[0].ssa, call->src[1].ssa, call->src[2].ssa, call->src[3].ssa, call->src[4].ssa, call->src[5].ssa, call->src[6].ssa, call->src[7].ssa, call->src[8].ssa, call->src[9].ssa, call->src[10].ssa, .call_idx = call_idx, .stack_size = offset); break; } case nir_intrinsic_report_ray_intersection: unreachable("Any-hit shaders must be inlined"); case nir_intrinsic_execute_callable: { nir_rt_execute_callable(b, call->src[0].ssa, call->src[1].ssa, .call_idx = call_idx, .stack_size = offset); break; } default: unreachable("Invalid shader call instruction"); } nir_rt_resume(b, .call_idx = call_idx, .stack_size = offset); nir_instr_remove(&call->instr); call_idx++; } } assert(call_idx == num_calls); shader->scratch_size = max_scratch_size; struct nir_phi_builder *pb = nir_phi_builder_create(impl); struct pbv_array pbv_arr = { .arr = rzalloc_array(mem_ctx, struct nir_phi_builder_value *, num_ssa_defs), .len = num_ssa_defs, }; const unsigned block_words = BITSET_WORDS(impl->num_blocks); BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words); /* Go through and set up phi builder values for each spillable value which * we ever needed to spill at any point. */ for (unsigned index = 0; index < num_ssa_defs; index++) { if (fill_defs[index] == NULL) continue; nir_def *def = spill_defs[index]; memset(def_blocks, 0, block_words * sizeof(BITSET_WORD)); BITSET_SET(def_blocks, def->parent_instr->block->index); for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) { if (fill_defs[index][call_idx] != NULL) BITSET_SET(def_blocks, call_block_indices[call_idx]); } pbv_arr.arr[index] = nir_phi_builder_add_value(pb, def->num_components, def->bit_size, def_blocks); } /* Walk the shader one more time and rewrite SSA defs as needed using the * phi builder. 
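    *
    * At this point a spilled value has several definitions: the original one
    * plus one per call site where it was filled or re-materialized. The phi
    * builder inserts whatever phis are needed so each use sees the
    * definition that reaches it.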
*/ nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { nir_def *def = nir_instr_def(instr); if (def != NULL) { struct nir_phi_builder_value *pbv = get_phi_builder_value_for_def(def, &pbv_arr); if (pbv != NULL) nir_phi_builder_value_set_block_def(pbv, block, def); } if (instr->type == nir_instr_type_phi) continue; nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &pbv_arr); if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); if (resume->intrinsic != nir_intrinsic_rt_resume) continue; call_idx = nir_intrinsic_call_idx(resume); /* Technically, this is the wrong place to add the fill defs to the * phi builder values because we haven't seen any of the load_scratch * instructions for this call yet. However, we know based on how we * emitted them that no value ever gets used until after the load * instruction has been emitted so this should be safe. If we ever * fail validation due this it likely means a bug in our spilling * code and not the phi re-construction code here. */ for (unsigned index = 0; index < num_ssa_defs; index++) { if (fill_defs[index] && fill_defs[index][call_idx]) { nir_phi_builder_value_set_block_def(pbv_arr.arr[index], block, fill_defs[index][call_idx]); } } } nir_if *following_if = nir_block_get_following_if(block); if (following_if) { nir_def *new_def = get_phi_builder_def_for_src(&following_if->condition, &pbv_arr, block); if (new_def != NULL) nir_src_rewrite(&following_if->condition, new_def); } /* Handle phi sources that source from this block. We have to do this * as a separate pass because the phi builder assumes that uses and * defs are processed in an order that respects dominance. When we have * loops, a phi source may be a back-edge so we have to handle it as if * it were one of the last instructions in the predecessor block. */ nir_foreach_phi_src_leaving_block(block, rewrite_instr_src_from_phi_builder, &pbv_arr); } nir_phi_builder_finish(pb); ralloc_free(mem_ctx); nir_metadata_preserve(impl, nir_metadata_control_flow); } static nir_instr * find_resume_instr(nir_function_impl *impl, unsigned call_idx) { nir_foreach_block(block, impl) { nir_foreach_instr(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); if (resume->intrinsic != nir_intrinsic_rt_resume) continue; if (nir_intrinsic_call_idx(resume) == call_idx) return &resume->instr; } } unreachable("Couldn't find resume instruction"); } /* Walk the CF tree and duplicate the contents of every loop, one half runs on * resume and the other half is for any post-resume loop iterations. We are * careful in our duplication to ensure that resume_instr is in the resume * half of the loop though a copy of resume_instr will remain in the other * half as well in case the same shader call happens twice. */ static bool duplicate_loop_bodies(nir_function_impl *impl, nir_instr *resume_instr) { nir_def *resume_reg = NULL; for (nir_cf_node *node = resume_instr->block->cf_node.parent; node->type != nir_cf_node_function; node = node->parent) { if (node->type != nir_cf_node_loop) continue; nir_loop *loop = nir_cf_node_as_loop(node); assert(!nir_loop_has_continue_construct(loop)); nir_builder b = nir_builder_create(impl); if (resume_reg == NULL) { /* We only create resume_reg if we encounter a loop. This way we can * avoid re-validating the shader and calling ssa_to_reg_intrinsics in * the case where it's just if-ladders. 
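       *
       * Roughly, each enclosing loop ends up as
       *
       *    loop {
       *       if (resume_reg) {
       *          // original body, containing resume_instr
       *       } else {
       *          // clone, for iterations after the resume
       *       }
       *    }
       *
       * with resume_reg initialized to true and cleared right after
       * resume_instr.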
*/ resume_reg = nir_decl_reg(&b, 1, 1, 0); /* Initialize resume to true at the start of the shader, right after * the register is declared at the start. */ b.cursor = nir_after_instr(resume_reg->parent_instr); nir_store_reg(&b, nir_imm_true(&b), resume_reg); /* Set resume to false right after the resume instruction */ b.cursor = nir_after_instr(resume_instr); nir_store_reg(&b, nir_imm_false(&b), resume_reg); } /* Before we go any further, make sure that everything which exits the * loop or continues around to the top of the loop does so through * registers. We're about to duplicate the loop body and we'll have * serious trouble if we don't do this. */ nir_convert_loop_to_lcssa(loop); nir_lower_phis_to_regs_block(nir_loop_first_block(loop)); nir_lower_phis_to_regs_block( nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node))); nir_cf_list cf_list; nir_cf_list_extract(&cf_list, &loop->body); nir_if *_if = nir_if_create(impl->function->shader); b.cursor = nir_after_cf_list(&loop->body); _if->condition = nir_src_for_ssa(nir_load_reg(&b, resume_reg)); nir_cf_node_insert(nir_after_cf_list(&loop->body), &_if->cf_node); nir_cf_list clone; nir_cf_list_clone(&clone, &cf_list, &loop->cf_node, NULL); /* Insert the clone in the else and the original in the then so that * the resume_instr remains valid even after the duplication. */ nir_cf_reinsert(&cf_list, nir_before_cf_list(&_if->then_list)); nir_cf_reinsert(&clone, nir_before_cf_list(&_if->else_list)); } if (resume_reg != NULL) nir_metadata_preserve(impl, nir_metadata_none); return resume_reg != NULL; } static bool cf_node_contains_block(nir_cf_node *node, nir_block *block) { for (nir_cf_node *n = &block->cf_node; n != NULL; n = n->parent) { if (n == node) return true; } return false; } static void rewrite_phis_to_pred(nir_block *block, nir_block *pred) { nir_foreach_phi(phi, block) { ASSERTED bool found = false; nir_foreach_phi_src(phi_src, phi) { if (phi_src->pred == pred) { found = true; nir_def_rewrite_uses(&phi->def, phi_src->src.ssa); break; } } assert(found); } } static bool cursor_is_after_jump(nir_cursor cursor) { switch (cursor.option) { case nir_cursor_before_instr: case nir_cursor_before_block: return false; case nir_cursor_after_instr: return cursor.instr->type == nir_instr_type_jump; case nir_cursor_after_block: return nir_block_ends_in_jump(cursor.block); ; } unreachable("Invalid cursor option"); } /** Flattens if ladders leading up to a resume * * Given a resume_instr, this function flattens any if ladders leading to the * resume instruction and deletes any code that cannot be encountered on a * direct path to the resume instruction. This way we get, for the most part, * straight-line control-flow up to the resume instruction. * * While we do this flattening, we also move any code which is in the remat * set up to the top of the function or to the top of the resume portion of * the current loop. We don't worry about control-flow as we do this because * phis will never be in the remat set (see can_remat_instr) and so nothing * control-dependent will ever need to be re-materialized. It is possible * that this algorithm will preserve too many instructions by moving them to * the top but we leave that for DCE to clean up. Any code not in the remat * set is deleted because it's either unused in the continuation or else * unspilled from a previous continuation and the unspill code is after the * resume instruction. 
 *
 * If, for instance, we have something like this:
 *
 *    // block 0
 *    if (cond1) {
 *       // block 1
 *    } else {
 *       // block 2
 *       if (cond2) {
 *          // block 3
 *          resume;
 *          if (cond3) {
 *             // block 4
 *          }
 *       } else {
 *          // block 5
 *       }
 *    }
 *
 * then we know, because we know the resume instruction had to be encountered,
 * that cond1 = false and cond2 = true and we lower as follows:
 *
 *    // block 0
 *    // block 2
 *    // block 3
 *    resume;
 *    if (cond3) {
 *       // block 4
 *    }
 *
 * As you can see, the code in blocks 1 and 5 was removed because there is no
 * path from the start of the shader to the resume instruction which executes
 * blocks 1 or 5. Any remat code from blocks 0, 2, and 3 is preserved and
 * moved to the top. If the resume instruction is inside a loop then we know
 * a priori that it is of the form
 *
 *    loop {
 *       if (resume) {
 *          // Contents containing resume_instr
 *       } else {
 *          // Second copy of contents
 *       }
 *    }
 *
 * In this case, we only descend into the first half of the loop. The second
 * half is left alone as that portion is only ever executed after the resume
 * instruction.
 */
static bool
flatten_resume_if_ladder(nir_builder *b,
                         nir_cf_node *parent_node,
                         struct exec_list *child_list,
                         bool child_list_contains_cursor,
                         nir_instr *resume_instr,
                         struct sized_bitset *remat)
{
   nir_cf_list cf_list;

   /* If our child list contains the cursor instruction then we start out
    * before the cursor instruction. We need to know this so that we can skip
    * moving instructions which are already before the cursor.
    */
   bool before_cursor = child_list_contains_cursor;

   nir_cf_node *resume_node = NULL;
   foreach_list_typed_safe(nir_cf_node, child, node, child_list) {
      switch (child->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(child);
         if (b->cursor.option == nir_cursor_before_block &&
             b->cursor.block == block) {
            assert(before_cursor);
            before_cursor = false;
         }
         nir_foreach_instr_safe(instr, block) {
            if ((b->cursor.option == nir_cursor_before_instr ||
                 b->cursor.option == nir_cursor_after_instr) &&
                b->cursor.instr == instr) {
               assert(nir_cf_node_is_first(&block->cf_node));
               assert(before_cursor);
               before_cursor = false;
               continue;
            }

            if (instr == resume_instr)
               goto found_resume;

            if (!before_cursor && can_remat_instr(instr, remat)) {
               nir_instr_remove(instr);
               nir_instr_insert(b->cursor, instr);
               b->cursor = nir_after_instr(instr);

               nir_def *def = nir_instr_def(instr);
               BITSET_SET(remat->set, def->index);
            }
         }
         if (b->cursor.option == nir_cursor_after_block &&
             b->cursor.block == block) {
            assert(before_cursor);
            before_cursor = false;
         }
         break;
      }

      case nir_cf_node_if: {
         assert(!before_cursor);
         nir_if *_if = nir_cf_node_as_if(child);
         if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->then_list,
                                      false, resume_instr, remat)) {
            resume_node = child;
            rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
                                 nir_if_last_then_block(_if));
            goto found_resume;
         }

         if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->else_list,
                                      false, resume_instr, remat)) {
            resume_node = child;
            rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
                                 nir_if_last_else_block(_if));
            goto found_resume;
         }
         break;
      }

      case nir_cf_node_loop: {
         assert(!before_cursor);
         nir_loop *loop = nir_cf_node_as_loop(child);
         assert(!nir_loop_has_continue_construct(loop));

         if (cf_node_contains_block(&loop->cf_node, resume_instr->block)) {
            /* Thanks to our loop body duplication pass, every level of loop
             * containing the resume instruction contains exactly three nodes:
             * two blocks and an if. We don't want to lower away this if
             * because it's the resume selection if.
The resume half is * always the then_list so that's what we want to flatten. */ nir_block *header = nir_loop_first_block(loop); nir_if *_if = nir_cf_node_as_if(nir_cf_node_next(&header->cf_node)); /* We want to place anything re-materialized from inside the loop * at the top of the resume half of the loop. */ nir_builder bl = nir_builder_at(nir_before_cf_list(&_if->then_list)); ASSERTED bool found = flatten_resume_if_ladder(&bl, &_if->cf_node, &_if->then_list, true, resume_instr, remat); assert(found); resume_node = child; goto found_resume; } else { ASSERTED bool found = flatten_resume_if_ladder(b, &loop->cf_node, &loop->body, false, resume_instr, remat); assert(!found); } break; } case nir_cf_node_function: unreachable("Unsupported CF node type"); } } assert(!before_cursor); /* If we got here, we didn't find the resume node or instruction. */ return false; found_resume: /* If we got here then we found either the resume node or the resume * instruction in this CF list. */ if (resume_node) { /* If the resume instruction is buried in side one of our children CF * nodes, resume_node now points to that child. */ if (resume_node->type == nir_cf_node_if) { /* Thanks to the recursive call, all of the interesting contents of * resume_node have been copied before the cursor. We just need to * copy the stuff after resume_node. */ nir_cf_extract(&cf_list, nir_after_cf_node(resume_node), nir_after_cf_list(child_list)); } else { /* The loop contains its own cursor and still has useful stuff in it. * We want to move everything after and including the loop to before * the cursor. */ assert(resume_node->type == nir_cf_node_loop); nir_cf_extract(&cf_list, nir_before_cf_node(resume_node), nir_after_cf_list(child_list)); } } else { /* If we found the resume instruction in one of our blocks, grab * everything after it in the entire list (not just the one block), and * place it before the cursor instr. */ nir_cf_extract(&cf_list, nir_after_instr(resume_instr), nir_after_cf_list(child_list)); } /* If the resume instruction is in the first block of the child_list, * and the cursor is still before that block, the nir_cf_extract() may * extract the block object pointed by the cursor, and instead create * a new one for the code before the resume. In such case the cursor * will be broken, as it will point to a block which is no longer * in a function. * * Luckily, in both cases when this is possible, the intended cursor * position is right before the child_list, so we can fix the cursor here. */ if (child_list_contains_cursor && b->cursor.option == nir_cursor_before_block && b->cursor.block->cf_node.parent == NULL) b->cursor = nir_before_cf_list(child_list); if (cursor_is_after_jump(b->cursor)) { /* If the resume instruction is in a loop, it's possible cf_list ends * in a break or continue instruction, in which case we don't want to * insert anything. It's also possible we have an early return if * someone hasn't lowered those yet. In either case, nothing after that * point executes in this context so we can delete it. */ nir_cf_delete(&cf_list); } else { b->cursor = nir_cf_reinsert(&cf_list, b->cursor); } if (!resume_node) { /* We want the resume to be the first "interesting" instruction */ nir_instr_remove(resume_instr); nir_instr_insert(nir_before_impl(b->impl), resume_instr); } /* We've copied everything interesting out of this CF list to before the * cursor. Delete everything else. 
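    *
    * If the cursor lives in this child_list, everything ahead of it is the
    * prologue we just built and must be kept, so below we only extract and
    * delete from the cursor onwards; otherwise the whole remaining list is
    * deleted.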
*/ if (child_list_contains_cursor) { nir_cf_extract(&cf_list, b->cursor, nir_after_cf_list(child_list)); } else { nir_cf_list_extract(&cf_list, child_list); } nir_cf_delete(&cf_list); return true; } typedef bool (*wrap_instr_callback)(nir_instr *instr); static bool wrap_instr(nir_builder *b, nir_instr *instr, void *data) { wrap_instr_callback callback = data; if (!callback(instr)) return false; b->cursor = nir_before_instr(instr); nir_if *_if = nir_push_if(b, nir_imm_true(b)); nir_pop_if(b, NULL); nir_cf_list cf_list; nir_cf_extract(&cf_list, nir_before_instr(instr), nir_after_instr(instr)); nir_cf_reinsert(&cf_list, nir_before_block(nir_if_first_then_block(_if))); return true; } /* This pass wraps jump instructions in a dummy if block so that when * flatten_resume_if_ladder() does its job, it doesn't move a jump instruction * directly in front of another instruction which the NIR control flow helpers * do not allow. */ static bool wrap_instrs(nir_shader *shader, wrap_instr_callback callback) { return nir_shader_instructions_pass(shader, wrap_instr, nir_metadata_none, callback); } static bool instr_is_jump(nir_instr *instr) { return instr->type == nir_instr_type_jump; } static nir_instr * lower_resume(nir_shader *shader, int call_idx) { wrap_instrs(shader, instr_is_jump); nir_function_impl *impl = nir_shader_get_entrypoint(shader); nir_instr *resume_instr = find_resume_instr(impl, call_idx); if (duplicate_loop_bodies(impl, resume_instr)) { nir_validate_shader(shader, "after duplicate_loop_bodies in " "nir_lower_shader_calls"); /* If we duplicated the bodies of any loops, run reg_intrinsics_to_ssa to * get rid of all those pesky registers we just added. */ NIR_PASS_V(shader, nir_lower_reg_intrinsics_to_ssa); } /* Re-index nir_def::index. We don't care about actual liveness in * this pass but, so we can use the same helpers as the spilling pass, we * need to make sure that live_index is something sane. It's used * constantly for determining if an SSA value has been added since the * start of the pass. */ nir_index_ssa_defs(impl); void *mem_ctx = ralloc_context(shader); /* Used to track which things may have been assumed to be re-materialized * by the spilling pass and which we shouldn't delete. */ struct sized_bitset remat = bitset_create(mem_ctx, impl->ssa_alloc); /* Create a nop instruction to use as a cursor as we extract and re-insert * stuff into the CFG. */ nir_builder b = nir_builder_at(nir_before_impl(impl)); ASSERTED bool found = flatten_resume_if_ladder(&b, &impl->cf_node, &impl->body, true, resume_instr, &remat); assert(found); ralloc_free(mem_ctx); nir_metadata_preserve(impl, nir_metadata_none); nir_validate_shader(shader, "after flatten_resume_if_ladder in " "nir_lower_shader_calls"); return resume_instr; } static void replace_resume_with_halt(nir_shader *shader, nir_instr *keep) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); nir_builder b = nir_builder_create(impl); nir_foreach_block_safe(block, impl) { nir_foreach_instr_safe(instr, block) { if (instr == keep) continue; if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); if (resume->intrinsic != nir_intrinsic_rt_resume) continue; /* If this is some other resume, then we've kicked off a ray or * bindless thread and we don't want to go any further in this * shader. Insert a halt so that NIR will delete any instructions * dominated by this call instruction including the scratch_load * instructions we inserted. 
*/ nir_cf_list cf_list; nir_cf_extract(&cf_list, nir_after_instr(&resume->instr), nir_after_block(block)); nir_cf_delete(&cf_list); b.cursor = nir_instr_remove(&resume->instr); nir_jump(&b, nir_jump_halt); break; } } } struct lower_scratch_state { nir_address_format address_format; }; static bool lower_stack_instr_to_scratch(struct nir_builder *b, nir_instr *instr, void *data) { struct lower_scratch_state *state = data; if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *stack = nir_instr_as_intrinsic(instr); switch (stack->intrinsic) { case nir_intrinsic_load_stack: { b->cursor = nir_instr_remove(instr); nir_def *data, *old_data = nir_instr_def(instr); if (state->address_format == nir_address_format_64bit_global) { nir_def *addr = nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), nir_intrinsic_base(stack)); data = nir_build_load_global(b, stack->def.num_components, stack->def.bit_size, addr, .align_mul = nir_intrinsic_align_mul(stack), .align_offset = nir_intrinsic_align_offset(stack)); } else { assert(state->address_format == nir_address_format_32bit_offset); data = nir_load_scratch(b, old_data->num_components, old_data->bit_size, nir_imm_int(b, nir_intrinsic_base(stack)), .align_mul = nir_intrinsic_align_mul(stack), .align_offset = nir_intrinsic_align_offset(stack)); } nir_def_rewrite_uses(old_data, data); break; } case nir_intrinsic_store_stack: { b->cursor = nir_instr_remove(instr); nir_def *data = stack->src[0].ssa; if (state->address_format == nir_address_format_64bit_global) { nir_def *addr = nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), nir_intrinsic_base(stack)); nir_store_global(b, addr, nir_intrinsic_align_mul(stack), data, nir_component_mask(data->num_components)); } else { assert(state->address_format == nir_address_format_32bit_offset); nir_store_scratch(b, data, nir_imm_int(b, nir_intrinsic_base(stack)), .align_mul = nir_intrinsic_align_mul(stack), .write_mask = BITFIELD_MASK(data->num_components)); } break; } default: return false; } return true; } static bool nir_lower_stack_to_scratch(nir_shader *shader, nir_address_format address_format) { struct lower_scratch_state state = { .address_format = address_format, }; return nir_shader_instructions_pass(shader, lower_stack_instr_to_scratch, nir_metadata_control_flow, &state); } static bool opt_remove_respills_instr(struct nir_builder *b, nir_intrinsic_instr *store_intrin, void *data) { if (store_intrin->intrinsic != nir_intrinsic_store_stack) return false; nir_instr *value_instr = store_intrin->src[0].ssa->parent_instr; if (value_instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *load_intrin = nir_instr_as_intrinsic(value_instr); if (load_intrin->intrinsic != nir_intrinsic_load_stack) return false; if (nir_intrinsic_base(load_intrin) != nir_intrinsic_base(store_intrin)) return false; nir_instr_remove(&store_intrin->instr); return true; } /* After shader split, look at stack load/store operations. If we're loading * and storing the same value at the same location, we can drop the store * instruction. 
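 *
 * E.g. (illustrative NIR):
 *
 *    vec4 32 %v = load_stack (base=32, ...)
 *    store_stack (%v) (base=32, ...)   <- removable, %v already lives there
 *
 * Note that the match is done purely on the base offset of the load feeding
 * the store.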
 */
static bool
nir_opt_remove_respills(nir_shader *shader)
{
   return nir_shader_intrinsics_pass(shader, opt_remove_respills_instr,
                                     nir_metadata_control_flow, NULL);
}

static void
add_use_mask(struct hash_table_u64 *offset_to_mask,
             unsigned offset, unsigned mask)
{
   uintptr_t old_mask = (uintptr_t)
      _mesa_hash_table_u64_search(offset_to_mask, offset);

   _mesa_hash_table_u64_insert(offset_to_mask, offset,
                               (void *)(uintptr_t)(old_mask | mask));
}

/* When splitting the shaders, we might have inserted stores & loads of vec4s,
 * because a live value is a 4-component vector. But sometimes, only some
 * components of that vec4 will be used after the scratch load. This pass
 * removes the unused components of scratch load/stores.
 */
static bool
nir_opt_trim_stack_values(nir_shader *shader)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   struct hash_table_u64 *value_id_to_mask = _mesa_hash_table_u64_create(NULL);
   bool progress = false;

   /* Find all the loads and how their value is being used */
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_stack)
            continue;

         const unsigned value_id = nir_intrinsic_value_id(intrin);

         const unsigned mask =
            nir_def_components_read(nir_instr_def(instr));
         add_use_mask(value_id_to_mask, value_id, mask);
      }
   }

   /* For each store, if it stores more than is being used, trim it.
    * Otherwise, remove it from the hash table.
    */
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_stack)
            continue;

         const unsigned value_id = nir_intrinsic_value_id(intrin);
         const unsigned write_mask = nir_intrinsic_write_mask(intrin);

         const unsigned read_mask = (uintptr_t)
            _mesa_hash_table_u64_search(value_id_to_mask, value_id);

         /* Already removed from the table, nothing to do */
         if (read_mask == 0)
            continue;

         /* Matching read/write mask, nothing to do, remove from the table. */
         if (write_mask == read_mask) {
            _mesa_hash_table_u64_remove(value_id_to_mask, value_id);
            continue;
         }

         nir_builder b = nir_builder_at(nir_before_instr(instr));

         nir_def *value = nir_channels(&b, intrin->src[0].ssa, read_mask);
         nir_src_rewrite(&intrin->src[0], value);

         intrin->num_components = util_bitcount(read_mask);
         nir_intrinsic_set_write_mask(intrin,
                                      (1u << intrin->num_components) - 1);

         progress = true;
      }
   }

   /* For each load remaining in the hash table (only the ones we changed the
    * number of components of), apply trimming/reswizzling.
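    *
    * E.g. (illustrative) if only .x and .z of a vec4 were read, the store
    * above was trimmed to 2 components, so the matching loads shrink to
    * 2 components here and their users are reswizzled with
    * swiz_map = { 0, -, 1, - }.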
    */
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_stack)
            continue;

         const unsigned value_id = nir_intrinsic_value_id(intrin);

         unsigned read_mask = (uintptr_t)
            _mesa_hash_table_u64_search(value_id_to_mask, value_id);
         if (read_mask == 0)
            continue;

         unsigned swiz_map[NIR_MAX_VEC_COMPONENTS] = { 0, };
         unsigned swiz_count = 0;
         u_foreach_bit(idx, read_mask)
            swiz_map[idx] = swiz_count++;

         nir_def *def = nir_instr_def(instr);

         nir_foreach_use_safe(use_src, def) {
            if (nir_src_parent_instr(use_src)->type == nir_instr_type_alu) {
               nir_alu_instr *alu =
                  nir_instr_as_alu(nir_src_parent_instr(use_src));
               nir_alu_src *alu_src = exec_node_data(nir_alu_src, use_src, src);

               unsigned count = alu->def.num_components;
               for (unsigned idx = 0; idx < count; ++idx)
                  alu_src->swizzle[idx] = swiz_map[alu_src->swizzle[idx]];
            } else if (nir_src_parent_instr(use_src)->type ==
                       nir_instr_type_intrinsic) {
               nir_intrinsic_instr *use_intrin =
                  nir_instr_as_intrinsic(nir_src_parent_instr(use_src));
               assert(nir_intrinsic_has_write_mask(use_intrin));
               unsigned write_mask = nir_intrinsic_write_mask(use_intrin);
               unsigned new_write_mask = 0;
               u_foreach_bit(idx, write_mask)
                  new_write_mask |= 1 << swiz_map[idx];
               nir_intrinsic_set_write_mask(use_intrin, new_write_mask);
            } else {
               unreachable("invalid instruction type");
            }
         }

         intrin->def.num_components = intrin->num_components = swiz_count;

         progress = true;
      }
   }

   nir_metadata_preserve(impl,
                         progress ? (nir_metadata_control_flow |
                                     nir_metadata_loop_analysis)
                                  : nir_metadata_all);

   _mesa_hash_table_u64_destroy(value_id_to_mask);

   return progress;
}

struct scratch_item {
   unsigned old_offset;
   unsigned new_offset;
   unsigned bit_size;
   unsigned num_components;
   unsigned value;
   unsigned call_idx;
};

static int
sort_scratch_item_by_size_and_value_id(const void *_item1, const void *_item2)
{
   const struct scratch_item *item1 = _item1;
   const struct scratch_item *item2 = _item2;

   /* By ascending value_id */
   if (item1->bit_size == item2->bit_size)
      return (int)item1->value - (int)item2->value;

   /* By descending size */
   return (int)item2->bit_size - (int)item1->bit_size;
}

static bool
nir_opt_sort_and_pack_stack(nir_shader *shader,
                            unsigned start_call_scratch,
                            unsigned stack_alignment,
                            unsigned num_calls)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   void *mem_ctx = ralloc_context(NULL);

   struct hash_table_u64 *value_id_to_item =
      _mesa_hash_table_u64_create(mem_ctx);
   struct util_dynarray ops;
   util_dynarray_init(&ops, mem_ctx);

   for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) {
      _mesa_hash_table_u64_clear(value_id_to_item);
      util_dynarray_clear(&ops);

      /* Find all the stack loads and their offsets.
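       *
       * These items are then sorted by descending bit size (and ascending
       * value_id within a size class) and re-packed tightly from
       * start_call_scratch upwards, which keeps each item naturally aligned
       * with minimal padding.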
*/ nir_foreach_block_safe(block, impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); if (intrin->intrinsic != nir_intrinsic_load_stack) continue; if (nir_intrinsic_call_idx(intrin) != call_idx) continue; const unsigned value_id = nir_intrinsic_value_id(intrin); nir_def *def = nir_instr_def(instr); assert(_mesa_hash_table_u64_search(value_id_to_item, value_id) == NULL); struct scratch_item item = { .old_offset = nir_intrinsic_base(intrin), .bit_size = def->bit_size, .num_components = def->num_components, .value = value_id, }; util_dynarray_append(&ops, struct scratch_item, item); _mesa_hash_table_u64_insert(value_id_to_item, value_id, (void *)(uintptr_t) true); } } /* Sort scratch item by component size. */ if (util_dynarray_num_elements(&ops, struct scratch_item)) { qsort(util_dynarray_begin(&ops), util_dynarray_num_elements(&ops, struct scratch_item), sizeof(struct scratch_item), sort_scratch_item_by_size_and_value_id); } /* Reorder things on the stack */ _mesa_hash_table_u64_clear(value_id_to_item); unsigned scratch_size = start_call_scratch; util_dynarray_foreach(&ops, struct scratch_item, item) { item->new_offset = ALIGN(scratch_size, item->bit_size / 8); scratch_size = item->new_offset + (item->bit_size * item->num_components) / 8; _mesa_hash_table_u64_insert(value_id_to_item, item->value, item); } shader->scratch_size = ALIGN(scratch_size, stack_alignment); /* Update offsets in the instructions */ nir_foreach_block_safe(block, impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_load_stack: case nir_intrinsic_store_stack: { if (nir_intrinsic_call_idx(intrin) != call_idx) continue; struct scratch_item *item = _mesa_hash_table_u64_search(value_id_to_item, nir_intrinsic_value_id(intrin)); assert(item); nir_intrinsic_set_base(intrin, item->new_offset); break; } case nir_intrinsic_rt_trace_ray: case nir_intrinsic_rt_execute_callable: case nir_intrinsic_rt_resume: if (nir_intrinsic_call_idx(intrin) != call_idx) continue; nir_intrinsic_set_stack_size(intrin, shader->scratch_size); break; default: break; } } } } ralloc_free(mem_ctx); nir_shader_preserve_all_metadata(shader); return true; } static unsigned nir_block_loop_depth(nir_block *block) { nir_cf_node *node = &block->cf_node; unsigned loop_depth = 0; while (node != NULL) { if (node->type == nir_cf_node_loop) loop_depth++; node = node->parent; } return loop_depth; } /* Find the last block dominating all the uses of a SSA value. */ static nir_block * find_last_dominant_use_block(nir_function_impl *impl, nir_def *value) { nir_block *old_block = value->parent_instr->block; unsigned old_block_loop_depth = nir_block_loop_depth(old_block); nir_foreach_block_reverse_safe(block, impl) { bool fits = true; /* Store on the current block of the value */ if (block == old_block) return block; /* Don't move instructions deeper into loops, this would generate more * memory traffic. 
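       *
       * Sinking a load into a loop would make it execute on every iteration
       * instead of once, so only blocks at the same or shallower loop depth
       * than the value's current block are considered.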
*/ unsigned block_loop_depth = nir_block_loop_depth(block); if (block_loop_depth > old_block_loop_depth) continue; nir_foreach_if_use(src, value) { nir_block *block_before_if = nir_cf_node_as_block(nir_cf_node_prev(&nir_src_parent_if(src)->cf_node)); if (!nir_block_dominates(block, block_before_if)) { fits = false; break; } } if (!fits) continue; nir_foreach_use(src, value) { if (nir_src_parent_instr(src)->type == nir_instr_type_phi && block == nir_src_parent_instr(src)->block) { fits = false; break; } if (!nir_block_dominates(block, nir_src_parent_instr(src)->block)) { fits = false; break; } } if (!fits) continue; return block; } unreachable("Cannot find block"); } /* Put the scratch loads in the branches where they're needed. */ static bool nir_opt_stack_loads(nir_shader *shader) { bool progress = false; nir_foreach_function_impl(impl, shader) { nir_metadata_require(impl, nir_metadata_control_flow); bool func_progress = false; nir_foreach_block_safe(block, impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); if (intrin->intrinsic != nir_intrinsic_load_stack) continue; nir_def *value = &intrin->def; nir_block *new_block = find_last_dominant_use_block(impl, value); if (new_block == block) continue; /* Move the scratch load in the new block, after the phis. */ nir_instr_remove(instr); nir_instr_insert(nir_before_block_after_phis(new_block), instr); func_progress = true; } } nir_metadata_preserve(impl, func_progress ? (nir_metadata_control_flow | nir_metadata_loop_analysis) : nir_metadata_all); progress |= func_progress; } return progress; } static bool split_stack_components_instr(struct nir_builder *b, nir_intrinsic_instr *intrin, void *data) { if (intrin->intrinsic != nir_intrinsic_load_stack && intrin->intrinsic != nir_intrinsic_store_stack) return false; if (intrin->intrinsic == nir_intrinsic_load_stack && intrin->def.num_components == 1) return false; if (intrin->intrinsic == nir_intrinsic_store_stack && intrin->src[0].ssa->num_components == 1) return false; b->cursor = nir_before_instr(&intrin->instr); unsigned align_mul = nir_intrinsic_align_mul(intrin); unsigned align_offset = nir_intrinsic_align_offset(intrin); if (intrin->intrinsic == nir_intrinsic_load_stack) { nir_def *components[NIR_MAX_VEC_COMPONENTS] = { 0, }; for (unsigned c = 0; c < intrin->def.num_components; c++) { unsigned offset = c * intrin->def.bit_size / 8; components[c] = nir_load_stack(b, 1, intrin->def.bit_size, .base = nir_intrinsic_base(intrin) + offset, .call_idx = nir_intrinsic_call_idx(intrin), .value_id = nir_intrinsic_value_id(intrin), .align_mul = align_mul, .align_offset = (align_offset + offset) % align_mul); } nir_def_rewrite_uses(&intrin->def, nir_vec(b, components, intrin->def.num_components)); } else { assert(intrin->intrinsic == nir_intrinsic_store_stack); for (unsigned c = 0; c < intrin->src[0].ssa->num_components; c++) { unsigned offset = c * intrin->src[0].ssa->bit_size / 8; nir_store_stack(b, nir_channel(b, intrin->src[0].ssa, c), .base = nir_intrinsic_base(intrin) + offset, .call_idx = nir_intrinsic_call_idx(intrin), .align_mul = align_mul, .align_offset = (align_offset + offset) % align_mul, .value_id = nir_intrinsic_value_id(intrin), .write_mask = 0x1); } } nir_instr_remove(&intrin->instr); return true; } /* Break the load_stack/store_stack intrinsics into single compoments. This * helps the vectorizer to pack components. 
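 *
 * E.g. (illustrative) a 3-component, 32-bit store_stack at base 16 becomes
 * three scalar store_stack ops at bases 16, 20 and 24, which the load/store
 * vectorizer can then recombine with neighbouring values into better-packed
 * accesses.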
*/ static bool nir_split_stack_components(nir_shader *shader) { return nir_shader_intrinsics_pass(shader, split_stack_components_instr, nir_metadata_control_flow, NULL); } struct stack_op_vectorizer_state { nir_should_vectorize_mem_func driver_callback; void *driver_data; }; static bool should_vectorize(unsigned align_mul, unsigned align_offset, unsigned bit_size, unsigned num_components, int64_t hole_size, nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data) { /* We only care about those intrinsics */ if ((low->intrinsic != nir_intrinsic_load_stack && low->intrinsic != nir_intrinsic_store_stack) || (high->intrinsic != nir_intrinsic_load_stack && high->intrinsic != nir_intrinsic_store_stack)) return false; struct stack_op_vectorizer_state *state = data; return state->driver_callback(align_mul, align_offset, bit_size, num_components, hole_size, low, high, state->driver_data); } /** Lower shader call instructions to split shaders. * * Shader calls can be split into an initial shader and a series of "resume" * shaders. When the shader is first invoked, it is the initial shader which * is executed. At any point in the initial shader or any one of the resume * shaders, a shader call operation may be performed. The possible shader call * operations are: * * - trace_ray * - report_ray_intersection * - execute_callable * * When a shader call operation is performed, we push all live values to the * stack,call rt_trace_ray/rt_execute_callable and then kill the shader. Once * the operation we invoked is complete, a callee shader will return execution * to the respective resume shader. The resume shader pops the contents off * the stack and picks up where the calling shader left off. * * Stack management is assumed to be done after this pass. Call * instructions and their resumes get annotated with stack information that * should be enough for the backend to implement proper stack management. */ bool nir_lower_shader_calls(nir_shader *shader, const nir_lower_shader_calls_options *options, nir_shader ***resume_shaders_out, uint32_t *num_resume_shaders_out, void *mem_ctx) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); int num_calls = 0; nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { if (instr_is_shader_call(instr)) num_calls++; } } if (num_calls == 0) { nir_shader_preserve_all_metadata(shader); *num_resume_shaders_out = 0; return false; } /* Some intrinsics not only can't be re-materialized but aren't preserved * when moving to the continuation shader. We have to move them to the top * to ensure they get spilled as needed. */ { bool progress = false; NIR_PASS(progress, shader, move_system_values_to_top); if (progress) NIR_PASS(progress, shader, nir_opt_cse); } /* Deref chains contain metadata information that is needed by other passes * after this one. If we don't rematerialize the derefs in the blocks where * they're used here, the following lowerings will insert phis which can * prevent other passes from chasing deref chains. Additionally, derefs need * to be rematerialized after shader call instructions to avoid spilling. 
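    *
    * Wrapping each shader call in a trivial if (wrap_instrs with
    * instr_is_shader_call below) is what gives the call its own block
    * boundary, so the deref rematerialization that follows can, roughly
    * speaking, re-create derefs on the post-call side instead of leaving
    * them live across the call.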
*/ { bool progress = false; NIR_PASS(progress, shader, wrap_instrs, instr_is_shader_call); nir_rematerialize_derefs_in_use_blocks_impl(impl); if (progress) NIR_PASS(_, shader, nir_opt_dead_cf); } /* Save the start point of the call stack in scratch */ unsigned start_call_scratch = shader->scratch_size; NIR_PASS_V(shader, spill_ssa_defs_and_lower_shader_calls, num_calls, options); NIR_PASS_V(shader, nir_opt_remove_phis); NIR_PASS_V(shader, nir_opt_trim_stack_values); NIR_PASS_V(shader, nir_opt_sort_and_pack_stack, start_call_scratch, options->stack_alignment, num_calls); /* Make N copies of our shader */ nir_shader **resume_shaders = ralloc_array(mem_ctx, nir_shader *, num_calls); for (unsigned i = 0; i < num_calls; i++) { resume_shaders[i] = nir_shader_clone(mem_ctx, shader); /* Give them a recognizable name */ resume_shaders[i]->info.name = ralloc_asprintf(mem_ctx, "%s%sresume_%u", shader->info.name ? shader->info.name : "", shader->info.name ? "-" : "", i); } replace_resume_with_halt(shader, NULL); nir_opt_dce(shader); nir_opt_dead_cf(shader); for (unsigned i = 0; i < num_calls; i++) { nir_instr *resume_instr = lower_resume(resume_shaders[i], i); replace_resume_with_halt(resume_shaders[i], resume_instr); /* Remove CF after halt before nir_opt_if(). */ nir_opt_dead_cf(resume_shaders[i]); /* Remove the dummy blocks added by flatten_resume_if_ladder() */ nir_opt_if(resume_shaders[i], nir_opt_if_optimize_phi_true_false); nir_opt_dce(resume_shaders[i]); nir_opt_dead_cf(resume_shaders[i]); nir_opt_remove_phis(resume_shaders[i]); } for (unsigned i = 0; i < num_calls; i++) NIR_PASS_V(resume_shaders[i], nir_opt_remove_respills); if (options->localized_loads) { /* Once loads have been combined we can try to put them closer to where * they're needed. */ for (unsigned i = 0; i < num_calls; i++) NIR_PASS_V(resume_shaders[i], nir_opt_stack_loads); } struct stack_op_vectorizer_state vectorizer_state = { .driver_callback = options->vectorizer_callback, .driver_data = options->vectorizer_data, }; nir_load_store_vectorize_options vect_opts = { .modes = nir_var_shader_temp, .callback = should_vectorize, .cb_data = &vectorizer_state, }; if (options->vectorizer_callback != NULL) { NIR_PASS_V(shader, nir_split_stack_components); NIR_PASS_V(shader, nir_opt_load_store_vectorize, &vect_opts); } NIR_PASS_V(shader, nir_lower_stack_to_scratch, options->address_format); nir_opt_cse(shader); for (unsigned i = 0; i < num_calls; i++) { if (options->vectorizer_callback != NULL) { NIR_PASS_V(resume_shaders[i], nir_split_stack_components); NIR_PASS_V(resume_shaders[i], nir_opt_load_store_vectorize, &vect_opts); } NIR_PASS_V(resume_shaders[i], nir_lower_stack_to_scratch, options->address_format); nir_opt_cse(resume_shaders[i]); } *resume_shaders_out = resume_shaders; *num_resume_shaders_out = num_calls; return true; }
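
/* Example driver usage (illustrative only; the option values below are
 * assumptions, not requirements of this pass):
 *
 *    nir_shader **resume_shaders = NULL;
 *    uint32_t num_resume_shaders = 0;
 *    const nir_lower_shader_calls_options opts = {
 *       .address_format = nir_address_format_32bit_offset,
 *       .stack_alignment = 16,
 *    };
 *    NIR_PASS(_, shader, nir_lower_shader_calls, &opts,
 *             &resume_shaders, &num_resume_shaders, mem_ctx);
 */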