/*
 * Copyright (c) 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"

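/* Build a boolean which is true when the hit being queried is on procedural
 * (AABB) geometry rather than triangles. For some stages the answer is known
 * statically; otherwise we read the leaf type out of the MemHit.
 */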
static nir_def *
build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
{
   switch (b->shader->info.stage) {
   case MESA_SHADER_ANY_HIT:
      /* Any-hit shaders are always compiled into intersection shaders for
       * procedural geometry. If we got here in an any-hit shader, it's for
       * triangles.
       */
      return nir_imm_false(b);

   case MESA_SHADER_INTERSECTION:
      return nir_imm_true(b);

   default:
      return nir_ieq_imm(b, hit->leaf_type,
                         BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
   }
}

static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
                         const struct intel_device_info *devinfo)
{
   bool progress = false;

   nir_builder build = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &build;

   struct brw_nir_rt_globals_defs globals;
   brw_nir_rt_load_globals(b, &globals);

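   /* The 16B SW hotzone is per-thread scratch data: component 0 holds the
    * current SW stack offset and components 1-3 hold the ray launch ID
    * (see the load_ray_launch_id case below).
    */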
   nir_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
   nir_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);

   gl_shader_stage stage = b->shader->info.stage;
   struct brw_nir_rt_mem_ray_defs world_ray_in = {};
   struct brw_nir_rt_mem_ray_defs object_ray_in = {};
   struct brw_nir_rt_mem_hit_defs hit_in = {};
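   /* Pre-load the MemHit and MemRay structures for the stages that can see
    * them: any-hit, closest-hit, and intersection shaders get both the hit
    * and the object-space ray, while miss shaders only get the world ray.
    */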
   switch (stage) {
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_INTERSECTION:
      brw_nir_rt_load_mem_hit(b, &hit_in,
                              stage == MESA_SHADER_CLOSEST_HIT);
      brw_nir_rt_load_mem_ray(b, &object_ray_in,
                              BRW_RT_BVH_LEVEL_OBJECT);
      FALLTHROUGH;

   case MESA_SHADER_MISS:
      brw_nir_rt_load_mem_ray(b, &world_ray_in,
                              BRW_RT_BVH_LEVEL_WORLD);
      break;

   default:
      break;
   }

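   /* The shader's scratch stack base is the per-thread SW stack address
    * plus the byte offset currently stored in hotzone component 0; the
    * btd_stack_push and rt_resume cases below adjust that offset as
    * shaders call and resume.
    */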
   nir_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
   nir_def *stack_base_offset = nir_channel(b, hotzone, 0);
   nir_def *stack_base_addr =
      nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
   ASSERTED bool seen_scratch_base_ptr_load = false;
   ASSERTED bool found_resume = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         b->cursor = nir_after_instr(&intrin->instr);

         nir_def *sysval = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_scratch_base_ptr:
            assert(nir_intrinsic_base(intrin) == 1);
            seen_scratch_base_ptr_load = true;
            sysval = stack_base_addr;
            break;

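         /* Push a stack frame: advance the stack offset stored in the
          * hotzone so a BTD-dispatched child shader starts its scratch
          * above our frame.
          */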
         case nir_intrinsic_btd_stack_push_intel: {
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               nir_def *child_stack_offset =
                  nir_iadd_imm(b, stack_base_offset, stack_size);
               nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
            }
            nir_instr_remove(instr);
            break;
         }

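         /* Resuming after a shader call: pop our frame by moving the
          * hotzone stack offset back down and recomputing the scratch
          * base address.
          */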
         case nir_intrinsic_rt_resume:
            /* This is the first "interesting" instruction */
            assert(block == nir_start_block(impl));
            assert(!seen_scratch_base_ptr_load);
            found_resume = true;

            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               stack_base_offset =
                  nir_iadd_imm(b, stack_base_offset, -stack_size);
               nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
               stack_base_addr = nir_iadd(b, thread_stack_base_addr,
                                          nir_u2u64(b, stack_base_offset));
            }
            nir_instr_remove(instr);
            break;

         case nir_intrinsic_load_uniform: {
            /* We don't want to lower this in the launch trampoline. */
            if (stage == MESA_SHADER_COMPUTE)
               break;

            sysval = brw_nir_load_global_const(b, intrin,
                        nir_load_btd_global_arg_addr_intel(b),
                        BRW_RT_PUSH_CONST_OFFSET);

            break;
         }

         case nir_intrinsic_load_ray_launch_id:
            sysval = nir_channels(b, hotzone, 0xe);
            break;

         case nir_intrinsic_load_ray_launch_size:
            sysval = globals.launch_size;
            break;

         case nir_intrinsic_load_ray_world_origin:
            sysval = world_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_world_direction:
            sysval = world_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_object_origin:
            sysval = object_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_object_direction:
            sysval = object_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_t_min:
            /* It shouldn't matter which we pull this from */
            sysval = world_ray_in.t_near;
            break;

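         /* In a miss shader, RayTmax is the world ray's final t_far; in the
          * hit stages it's the t value of the hit being considered.
          */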
         case nir_intrinsic_load_ray_t_max:
            if (stage == MESA_SHADER_MISS)
               sysval = world_ray_in.t_far;
            else
               sysval = hit_in.t;
            break;

         case nir_intrinsic_load_primitive_id:
            sysval = brw_nir_rt_load_primitive_id_from_hit(b,
                        build_leaf_is_procedural(b, &hit_in),
                        &hit_in);
            break;

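         /* Note the naming flip in the instance leaf: its instance_index
          * field backs InstanceId here, while its instance_id field backs
          * InstanceCustomIndex (see load_ray_instance_custom_index below).
          */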
         case nir_intrinsic_load_instance_id: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_index;
            break;
         }

         case nir_intrinsic_load_ray_object_to_world: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_world_to_object: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_hit_kind: {
            nir_def *tri_hit_kind =
               nir_bcsel(b, hit_in.front_face,
                         nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
                         nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
            sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                               hit_in.aabb_hit_kind, tri_hit_kind);
            break;
         }

         case nir_intrinsic_load_ray_flags:
            /* We need to fetch the original ray flags we stored in the
             * leaf pointer, because the actual ray flags we get here
             * will include any flags passed on the pipeline at creation
             * time, and the spec for IncomingRayFlagsKHR says:
             *
             *    Setting pipeline flags on the raytracing pipeline must not
             *    cause any corresponding flags to be set in variables with
             *    this decoration.
             */
            sysval = nir_u2u32(b, world_ray_in.inst_leaf_ptr);
            break;

         case nir_intrinsic_load_cull_mask:
            sysval = nir_u2u32(b, world_ray_in.ray_mask);
            break;

         case nir_intrinsic_load_ray_geometry_index: {
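            /* The geometry index lives in the low 29 bits of the dword at
             * byte offset 4 of the primitive leaf header (bit 30 of the
             * same dword is the opaque flag, read in the
             * load_leaf_opaque_intel case below).
             */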
            nir_def *geometry_index_dw =
               nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                               1, 32);
            sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
            break;
         }

         case nir_intrinsic_load_ray_instance_custom_index: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_id;
            break;
         }

         case nir_intrinsic_load_shader_record_ptr:
            /* We can't handle this intrinsic in resume shaders because the
             * handle we get there won't be from the original SBT. The shader
             * call lowering/splitting pass should have ensured that this
             * value was spilled from the initial shader and unspilled in any
             * resume shaders that need it.
             */
            assert(!found_resume);
            sysval = nir_load_btd_local_arg_addr_intel(b);
            break;

         case nir_intrinsic_load_ray_base_mem_addr_intel:
            sysval = globals.base_mem_addr;
            break;

         case nir_intrinsic_load_ray_hw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_sw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
            sysval = globals.num_dss_rt_stacks;
            break;

         case nir_intrinsic_load_ray_hit_sbt_addr_intel:
            sysval = globals.hit_sbt_addr;
            break;

         case nir_intrinsic_load_ray_hit_sbt_stride_intel:
            sysval = globals.hit_sbt_stride;
            break;

         case nir_intrinsic_load_ray_miss_sbt_addr_intel:
            sysval = globals.miss_sbt_addr;
            break;

         case nir_intrinsic_load_ray_miss_sbt_stride_intel:
            sysval = globals.miss_sbt_stride;
            break;

         case nir_intrinsic_load_callable_sbt_addr_intel:
            sysval = globals.call_sbt_addr;
            break;

         case nir_intrinsic_load_callable_sbt_stride_intel:
            sysval = globals.call_sbt_stride;
            break;

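         /* The resume SBT address isn't known at compile time, so it's
          * assembled from two 32-bit relocation constants that get patched
          * into the binary when the pipeline is uploaded.
          */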
         case nir_intrinsic_load_btd_resume_sbt_addr_intel:
            sysval = nir_pack_64_2x32_split(b,
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
            break;

         case nir_intrinsic_load_leaf_procedural_intel:
            sysval = build_leaf_is_procedural(b, &hit_in);
            break;

         case nir_intrinsic_load_ray_triangle_vertex_positions: {
            struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
            brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos,
                                                         hit_in.prim_leaf_ptr);
            sysval = pos.positions[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_leaf_opaque_intel: {
            if (stage == MESA_SHADER_INTERSECTION) {
               /* In intersection shaders, the opaque bit is passed to us in
                * the front_face bit.
                */
               sysval = hit_in.front_face;
            } else {
               nir_def *flags_dw =
                  nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4),
                                  4, 1, 32);
               sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
            }
            break;
         }

         default:
            continue;
         }

         progress = true;

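         /* If we built a replacement value, rewrite all uses and delete the
          * intrinsic. Cases with only side effects (stack push/resume)
          * removed themselves above.
          */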
         if (sysval) {
            nir_def_rewrite_uses(&intrin->def,
                                 sysval);
            nir_instr_remove(&intrin->instr);
         }
      }
   }

   nir_metadata_preserve(impl,
                         progress ?
                         nir_metadata_none :
                         (nir_metadata_block_index |
                          nir_metadata_dominance));
}

/** Lower ray-tracing system values and intrinsics
 *
 * In most 3D shader stages, intrinsics are a fairly thin wrapper around
 * hardware functionality and system values represent magic bits that come
 * into the shader from FF hardware. Ray-tracing, however, looks a bit more
 * like the OpenGL 1.0 world where the underlying hardware is simple and most
 * of the API implementation is software.
 *
 * In particular, most things that are treated as system values (or built-ins
 * in SPIR-V) don't get magically dropped into registers for us. Instead, we
 * have to fetch them from the relevant data structures shared with the
 * ray-tracing hardware. Most come from either the RT_DISPATCH_GLOBALS or
 * from one of the MemHit data structures. Some, such as primitive_id, require
 * us to fetch the leaf address from the MemHit struct and then manually read
 * the data out of the BVH. Instead of trying to emit all this code deep in
 * the back-end where we can't effectively optimize it, we lower it all to
 * global memory access in NIR.
 *
 * Once this pass is complete, the only real system values left are the two
 * argument pointer system values for BTD dispatch: btd_local_arg_addr and
 * btd_global_arg_addr.
 */
void
brw_nir_lower_rt_intrinsics(nir_shader *nir,
                            const struct intel_device_info *devinfo)
{
   nir_foreach_function_impl(impl, nir) {
      lower_rt_intrinsics_impl(impl, devinfo);
   }
}