1 /*
2 * Copyright (c) 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_nir_rt.h"
25 #include "brw_nir_rt_builder.h"
26
27 static nir_ssa_def *
build_leaf_is_procedural(nir_builder * b,struct brw_nir_rt_mem_hit_defs * hit)28 build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
29 {
30 switch (b->shader->info.stage) {
31 case MESA_SHADER_ANY_HIT:
32 /* Any-hit shaders are always compiled into intersection shaders for
33 * procedural geometry. If we got here in an any-hit shader, it's for
34 * triangles.
35 */
36 return nir_imm_false(b);
37
38 case MESA_SHADER_INTERSECTION:
39 return nir_imm_true(b);
40
41 default:
42 return nir_ieq(b, hit->leaf_type,
43 nir_imm_int(b, BRW_RT_BVH_NODE_TYPE_PROCEDURAL));
44 }
45 }
46
/* Lower every ray-tracing system value and BTD stack intrinsic in @impl to
 * explicit global-memory loads from the RT dispatch globals, the hotzone,
 * and the MemRay/MemHit structures, per the scheme described in the comment
 * on brw_nir_lower_rt_intrinsics() below.
 */
static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
                         const struct intel_device_info *devinfo)
{
   nir_builder build;
   nir_builder_init(&build, impl);
   nir_builder *b = &build;

   /* All the shared loads below are emitted once at the very top of the
    * shader so every lowered intrinsic can reuse them.
    */
   b->cursor = nir_before_block(nir_start_block(b->impl));

   struct brw_nir_rt_globals_defs globals;
   brw_nir_rt_load_globals(b, &globals);

   /* The hotzone is a 4-dword per-thread record: channel 0 holds the SW
    * stack offset (see stack_base_offset below), channels 1-3 hold the
    * ray launch ID (see the load_ray_launch_id case).
    */
   nir_ssa_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
   nir_ssa_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);

   /* Pre-load the ray/hit structures each stage may need.  Stages not
    * listed leave these zero-initialized and must not hit the cases that
    * read them.
    */
   gl_shader_stage stage = b->shader->info.stage;
   struct brw_nir_rt_mem_ray_defs world_ray_in = {};
   struct brw_nir_rt_mem_ray_defs object_ray_in = {};
   struct brw_nir_rt_mem_hit_defs hit_in = {};
   switch (stage) {
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_INTERSECTION:
      brw_nir_rt_load_mem_hit(b, &hit_in,
                              stage == MESA_SHADER_CLOSEST_HIT);
      brw_nir_rt_load_mem_ray(b, &object_ray_in,
                              BRW_RT_BVH_LEVEL_OBJECT);
      FALLTHROUGH;

   case MESA_SHADER_MISS:
      /* Hit stages also need the world-space ray, hence the fallthrough. */
      brw_nir_rt_load_mem_ray(b, &world_ray_in,
                              BRW_RT_BVH_LEVEL_WORLD);
      break;

   default:
      break;
   }

   /* Current top of this thread's software stack: base address plus the
    * byte offset stored in hotzone channel 0.  Both may be rewritten by
    * the rt_resume case below.
    */
   nir_ssa_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
   nir_ssa_def *stack_base_offset = nir_channel(b, hotzone, 0);
   nir_ssa_def *stack_base_addr =
      nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
   ASSERTED bool seen_scratch_base_ptr_load = false;
   ASSERTED bool found_resume = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         /* New code replacing this intrinsic is emitted right after it;
          * the intrinsic itself is removed at the bottom of the loop.
          */
         b->cursor = nir_after_instr(&intrin->instr);

         /* Most cases just compute a replacement value in sysval; cases
          * with side effects (stack push/resume) handle removal themselves
          * and leave sysval NULL.
          */
         nir_ssa_def *sysval = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_scratch_base_ptr:
            assert(nir_intrinsic_base(intrin) == 1);
            seen_scratch_base_ptr_load = true;
            sysval = stack_base_addr;
            break;

         case nir_intrinsic_btd_stack_push_intel: {
            /* Advance the stack offset in the hotzone so a resumed child
             * sees its own stack region.
             */
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               nir_ssa_def *child_stack_offset =
                  nir_iadd_imm(b, stack_base_offset, stack_size);
               nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
            }
            nir_instr_remove(instr);
            break;
         }

         case nir_intrinsic_rt_resume:
            /* This is the first "interesting" instruction */
            assert(block == nir_start_block(impl));
            assert(!seen_scratch_base_ptr_load);
            found_resume = true;

            /* Pop the stack: wind the offset back by this resume's stack
             * size, persist it to the hotzone, and recompute the base
             * address used by later load_scratch_base_ptr lowerings.
             */
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               stack_base_offset =
                  nir_iadd_imm(b, stack_base_offset, -stack_size);
               nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
               stack_base_addr = nir_iadd(b, thread_stack_base_addr,
                                          nir_u2u64(b, stack_base_offset));
            }
            nir_instr_remove(instr);
            break;

         case nir_intrinsic_load_uniform: {
            /* We don't want to lower this in the launch trampoline. */
            if (stage == MESA_SHADER_COMPUTE)
               break;

            /* Push constants live at a fixed offset from the BTD global
             * argument pointer.
             */
            sysval = brw_nir_load_global_const(b, intrin,
                        nir_load_btd_global_arg_addr_intel(b),
                        BRW_RT_PUSH_CONST_OFFSET);

            break;
         }

         case nir_intrinsic_load_ray_launch_id:
            /* Launch ID lives in hotzone channels 1-3 (mask 0xe). */
            sysval = nir_channels(b, hotzone, 0xe);
            break;

         case nir_intrinsic_load_ray_launch_size:
            sysval = globals.launch_size;
            break;

         case nir_intrinsic_load_ray_world_origin:
            sysval = world_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_world_direction:
            sysval = world_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_object_origin:
            sysval = object_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_object_direction:
            sysval = object_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_t_min:
            /* It shouldn't matter which we pull this from */
            sysval = world_ray_in.t_near;
            break;

         case nir_intrinsic_load_ray_t_max:
            /* Miss shaders have no hit record; use the ray's far bound. */
            if (stage == MESA_SHADER_MISS)
               sysval = world_ray_in.t_far;
            else
               sysval = hit_in.t;
            break;

         case nir_intrinsic_load_primitive_id: {
            /* It's in dw[3] for procedural and dw[2] for quad
             *
             * TODO: We really need some helpers here.
             */
            nir_ssa_def *offset =
               nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                            nir_iadd_imm(b, hit_in.prim_leaf_index, 12),
                            nir_imm_int(b, 8));
            sysval = nir_load_global(b, nir_iadd(b, hit_in.prim_leaf_ptr,
                                                    nir_u2u64(b, offset)),
                                     4, /* align */ 1, 32);
            break;
         }

         case nir_intrinsic_load_instance_id: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_index;
            break;
         }

         case nir_intrinsic_load_ray_object_to_world: {
            /* One matrix column per intrinsic, selected by column index. */
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_world_to_object: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_hit_kind: {
            /* Triangles report front/back facing; procedural geometry
             * reports whatever the intersection shader stored.
             */
            nir_ssa_def *tri_hit_kind =
               nir_bcsel(b, hit_in.front_face,
                            nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
                            nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
            sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                               hit_in.aabb_hit_kind, tri_hit_kind);
            break;
         }

         case nir_intrinsic_load_ray_flags:
            sysval = nir_u2u32(b, world_ray_in.ray_flags);
            break;

         case nir_intrinsic_load_ray_geometry_index: {
            /* Geometry index occupies the low 29 bits of the dword at
             * prim_leaf_ptr + 4.
             */
            nir_ssa_def *geometry_index_dw =
               nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                               1, 32);
            sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
            break;
         }

         case nir_intrinsic_load_ray_instance_custom_index: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_id;
            break;
         }

         case nir_intrinsic_load_shader_record_ptr:
            /* We can't handle this intrinsic in resume shaders because the
             * handle we get there won't be from the original SBT. The shader
             * call lowering/splitting pass should have ensured that this
             * value was spilled from the initial shader and unspilled in any
             * resume shaders that need it.
             */
            assert(!found_resume);
            sysval = nir_load_btd_local_arg_addr_intel(b);
            break;

         case nir_intrinsic_load_ray_base_mem_addr_intel:
            sysval = globals.base_mem_addr;
            break;

         case nir_intrinsic_load_ray_hw_stack_size_intel:
            /* Globals store stack sizes in 64-byte units; convert to bytes. */
            sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_sw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
            sysval = globals.num_dss_rt_stacks;
            break;

         case nir_intrinsic_load_ray_hit_sbt_addr_intel:
            sysval = globals.hit_sbt_addr;
            break;

         case nir_intrinsic_load_ray_hit_sbt_stride_intel:
            sysval = globals.hit_sbt_stride;
            break;

         case nir_intrinsic_load_ray_miss_sbt_addr_intel:
            sysval = globals.miss_sbt_addr;
            break;

         case nir_intrinsic_load_ray_miss_sbt_stride_intel:
            sysval = globals.miss_sbt_stride;
            break;

         case nir_intrinsic_load_callable_sbt_addr_intel:
            sysval = globals.call_sbt_addr;
            break;

         case nir_intrinsic_load_callable_sbt_stride_intel:
            sysval = globals.call_sbt_stride;
            break;

         case nir_intrinsic_load_btd_resume_sbt_addr_intel:
            /* The resume SBT address is patched in at upload time via two
             * 32-bit relocation constants.
             */
            sysval = nir_pack_64_2x32_split(b,
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
            break;

         case nir_intrinsic_load_leaf_procedural_intel:
            sysval = build_leaf_is_procedural(b, &hit_in);
            break;

         case nir_intrinsic_load_leaf_opaque_intel: {
            if (stage == MESA_SHADER_INTERSECTION) {
               /* In intersection shaders, the opaque bit is passed to us in
                * the front_face bit.
                */
               sysval = hit_in.front_face;
            } else {
               /* Otherwise read the opaque flag (bit 30) from the dword at
                * prim_leaf_ptr + 4.
                */
               nir_ssa_def *flags_dw =
                  nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                                  1, 32);
               sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
            }
            break;
         }

         default:
            continue;
         }

         if (sysval) {
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                     sysval);
            nir_instr_remove(&intrin->instr);
         }
      }
   }

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);
}
342
343 /** Lower ray-tracing system values and intrinsics
344 *
345 * In most 3D shader stages, intrinsics are a fairly thin wrapper around
346 * hardware functionality and system values represent magic bits that come
347 * into the shader from FF hardware. Ray-tracing, however, looks a bit more
348 * like the OpenGL 1.0 world where the underlying hardware is simple and most
349 * of the API implementation is software.
350 *
351 * In particular, most things that are treated as system values (or built-ins
352 * in SPIR-V) don't get magically dropped into registers for us. Instead, we
353 * have to fetch them from the relevant data structures shared with the
354 * ray-tracing hardware. Most come from either the RT_DISPATCH_GLOBALS or
355 * from one of the MemHit data structures. Some, such as primitive_id require
356 * us to fetch the leaf address from the MemHit struct and then manually read
357 * the data out of the BVH. Instead of trying to emit all this code deep in
358 * the back-end where we can't effectively optimize it, we lower it all to
359 * global memory access in NIR.
360 *
361 * Once this pass is complete, the only real system values left are the two
362 * argument pointer system values for BTD dispatch: btd_local_arg_addr and
363 * btd_global_arg_addr.
364 */
365 void
brw_nir_lower_rt_intrinsics(nir_shader * nir,const struct intel_device_info * devinfo)366 brw_nir_lower_rt_intrinsics(nir_shader *nir,
367 const struct intel_device_info *devinfo)
368 {
369 nir_foreach_function(function, nir) {
370 if (function->impl)
371 lower_rt_intrinsics_impl(function->impl, devinfo);
372 }
373 }
374