1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #pragma once
25
26 #include <stdint.h>
27
28 #include "compiler/shader_enums.h"
29 #include "util/macros.h"
30
31 #ifdef __cplusplus
32 extern "C" {
33 #endif
34
/** Size of a shader group handle in the SBT
 *
 * Vulkan defines shaderGroupHandleSize = 32.
 */
#define BRW_RT_SBT_HANDLE_SIZE         32

/** Size of RT_DISPATCH_GLOBALS (see gen_rt.xml) */
#define BRW_RT_DISPATCH_GLOBALS_SIZE   80

/** Offset, past the RT dispatch globals, at which "push" constants live */
#define BRW_RT_PUSH_CONST_OFFSET       128

/** Stride of the resume SBT */
#define BRW_BTD_RESUME_SBT_STRIDE      8
46
/* Vulkan always uses exactly two levels of BVH: world and object.  The API
 * calls these the top and bottom levels respectively.
 */
enum brw_rt_bvh_level {
   BRW_RT_BVH_LEVEL_WORLD  = 0,
   BRW_RT_BVH_LEVEL_OBJECT = 1,
};

/* Number of BVH levels enumerated by brw_rt_bvh_level */
#define BRW_RT_MAX_BVH_LEVELS 2
55
/* BVH node types.  Note that the numbering is not contiguous: the value 2
 * is not assigned to any node type here.
 */
enum brw_rt_bvh_node_type {
   BRW_RT_BVH_NODE_TYPE_INTERNAL   = 0,
   BRW_RT_BVH_NODE_TYPE_INSTANCE   = 1,
   BRW_RT_BVH_NODE_TYPE_PROCEDURAL = 3,
   BRW_RT_BVH_NODE_TYPE_QUAD       = 4,
};
62
/** HitKind values reported for hits on triangle geometry
 *
 * The values here must stay in sync with the SPIR-V enum.
 */
enum brw_rt_hit_kind {
   BRW_RT_HIT_KIND_FRONT_FACE = 0xfe,
   BRW_RT_HIT_KIND_BACK_FACE  = 0xff,
};
71
/** Ray flags
 *
 * A bitfield with one bit per flag.  The bit assignments must stay in sync
 * with the SPIR-V RayFlags enum.
 */
enum brw_rt_ray_flags {
   BRW_RT_RAY_FLAG_FORCE_OPAQUE                = 1 << 0,
   BRW_RT_RAY_FLAG_FORCE_NON_OPAQUE            = 1 << 1,
   BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT      = 1 << 2,
   BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER     = 1 << 3,
   BRW_RT_RAY_FLAG_CULL_BACK_FACING_TRIANGLES  = 1 << 4,
   BRW_RT_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 1 << 5,
   BRW_RT_RAY_FLAG_CULL_OPAQUE                 = 1 << 6,
   BRW_RT_RAY_FLAG_CULL_NON_OPAQUE             = 1 << 7,
   BRW_RT_RAY_FLAG_SKIP_TRIANGLES              = 1 << 8,
   BRW_RT_RAY_FLAG_SKIP_AABBS                  = 1 << 9,
};
88
/** Description of how the RT scratch memory area is carved up
 *
 * All offsets, strides and sizes are expressed in bytes.
 */
struct brw_rt_scratch_layout {
   /** How many stack IDs each DSS has */
   uint32_t stack_ids_per_dss;

   /** Byte offset at which the hardware MemRay stack begins */
   uint32_t ray_stack_start;

   /** Byte stride of the hardware MemRay stack */
   uint32_t ray_stack_stride;

   /** Byte offset at which the SW stacks begin */
   uint64_t sw_stack_start;

   /** Byte size of the SW stack of one shader invocation */
   uint32_t sw_stack_size;

   /** Byte size of the whole RT scratch memory area */
   uint64_t total_size;
};
108
/** Parameters passed to the raygen trampoline shader
 *
 * This struct is carefully constructed to be 32B and must be passed to the
 * raygen trampoline shader as inline constant data.
 */
struct brw_rt_raygen_trampoline_params {
   /** GPU address of the RT_DISPATCH_GLOBALS */
   uint64_t rt_disp_globals_addr;

   /** GPU address of the raygen shader's BINDLESS_SHADER_RECORD */
   uint64_t raygen_bsr_addr;

   /** 1 for an indirect dispatch, 0 otherwise */
   uint8_t is_indirect;

   /** Integer log2 of the local group size, one entry per dimension
    *
    * Ray-tracing shaders have no notion of local vs. global workgroup size;
    * all they have is a single 3D launch size.  The raygen trampoline shader
    * is always dispatched with a local workgroup size equal to the SIMD
    * width, but the shape of that local workgroup is chosen at dispatch time
    * from the shape of the launch and handed to the trampoline through this
    * field.  (A Z dimension on the local workgroup would be pointless for a
    * 2D launch.)
    *
    * The integer log2 is stored rather than the size itself because
    * non-power-of-two sizes are of no use and shifts are cheaper than
    * division.
    */
   uint8_t local_group_size_log2[3];

   /** Padding which brings the struct up to 32B */
   uint32_t pad[3];
};
141
/** Size of the "hot zone" in bytes
 *
 * The hot zone is a SW-defined data structure.  It is a single uvec4 which
 * holds two pieces of information:
 *
 *  - hotzone.x: Stack offset (in bytes)
 *
 *    The offset (in bytes) into the per-thread scratch space at which the
 *    stack of the current shader starts.  The calling shader increments it
 *    prior to any shader call type instructions and the resume shader
 *    decrements it again as part of completing the return operation.
 *
 *  - hotzone.yzw: The launch ID associated with the current thread
 *
 *    Inside a bindless shader, the only information available is the DSS ID
 *    from the hardware EU and a per-DSS stack ID.  In particular, the
 *    three-dimensional launch ID is lost the moment we leave the raygen
 *    trampoline.
 */
#define BRW_RT_SIZEOF_HOTZONE 16
162
/* Sizes (in bytes) of the memory based ray-tracing data structures.
 *
 * From the BSpec "Address Computation for Memory Based Data Structures:
 * Ray and TraversalStack (Async Ray Tracing)":
 *
 *    sizeof(Ray) = 64B, sizeof(HitInfo) = 32B, sizeof(TravStack) = 32B.
 */
#define BRW_RT_SIZEOF_RAY          64
#define BRW_RT_SIZEOF_HIT_INFO     32
#define BRW_RT_SIZEOF_TRAV_STACK   32
171
/* From the BSpec:
 *
 *    syncStackSize = (maxBVHLevels % 2 == 1) ?
 *       (sizeof(HitInfo) * 2 +
 *          (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels + 32B) :
 *       (sizeof(HitInfo) * 2 +
 *          (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels);
 *
 * The select is only there to align the result to 64B: 32B get added when
 * the number of BVH levels is odd and nothing otherwise.
 */
#define BRW_RT_SIZEOF_RAY_QUERY \
   (2 * BRW_RT_SIZEOF_HIT_INFO + \
    BRW_RT_MAX_BVH_LEVELS * (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) + \
    ((BRW_RT_MAX_BVH_LEVELS % 2) ? 32 : 0))

/* Same sum as BRW_RT_SIZEOF_RAY_QUERY but without the alignment term */
#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
   (2 * BRW_RT_SIZEOF_HIT_INFO + \
    BRW_RT_MAX_BVH_LEVELS * (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK))

/* Two HitInfos plus one Ray and one TravStack per BVH level */
#define BRW_RT_SIZEOF_HW_STACK \
   (2 * BRW_RT_SIZEOF_HIT_INFO + \
    BRW_RT_MAX_BVH_LEVELS * BRW_RT_SIZEOF_RAY + \
    BRW_RT_MAX_BVH_LEVELS * BRW_RT_SIZEOF_TRAV_STACK)
195
/* Mesa-defined region holding the hit attribute data.  It is placed right
 * after the HW stack.
 */
#define BRW_RT_SIZEOF_HIT_ATTRIB_DATA     64
#define BRW_RT_OFFSETOF_HIT_ATTRIB_DATA   BRW_RT_SIZEOF_HW_STACK

/* End of the hit attribute region, passed through ALIGN_POT with 64 */
#define BRW_RT_ASYNC_STACK_STRIDE \
   ALIGN_POT(BRW_RT_OFFSETOF_HIT_ATTRIB_DATA + \
             BRW_RT_SIZEOF_HIT_ATTRIB_DATA, \
             64)
203
204 static inline void
brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout * layout,const struct intel_device_info * devinfo,uint32_t stack_ids_per_dss,uint32_t sw_stack_size)205 brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
206 const struct intel_device_info *devinfo,
207 uint32_t stack_ids_per_dss,
208 uint32_t sw_stack_size)
209 {
210 layout->stack_ids_per_dss = stack_ids_per_dss;
211
212 const uint32_t dss_count = intel_device_info_dual_subslice_id_bound(devinfo);
213 const uint32_t num_stack_ids = dss_count * stack_ids_per_dss;
214
215 uint64_t size = 0;
216
217 /* The first thing in our scratch area is an array of "hot zones" which
218 * store the stack offset as well as the launch IDs for each active
219 * invocation.
220 */
221 size += BRW_RT_SIZEOF_HOTZONE * num_stack_ids;
222
223 /* Next, we place the HW ray stacks */
224 assert(size % 64 == 0); /* Cache-line aligned */
225 assert(size < UINT32_MAX);
226 layout->ray_stack_start = size;
227 layout->ray_stack_stride = BRW_RT_ASYNC_STACK_STRIDE;
228 size += num_stack_ids * layout->ray_stack_stride;
229
230 /* Finally, we place the SW stacks for the individual ray-tracing shader
231 * invocations. We align these to 64B to ensure that we don't have any
232 * shared cache lines which could hurt performance.
233 */
234 assert(size % 64 == 0);
235 layout->sw_stack_start = size;
236 layout->sw_stack_size = ALIGN(sw_stack_size, 64);
237
238 /* Currently it's always the case that sw_stack_size is a power of
239 * two, but power-of-two SW stack sizes are prone to causing
240 * collisions in the hashing function used by the L3 to map memory
241 * addresses to banks, which can cause stack accesses from most
242 * DSSes to bottleneck on a single L3 bank. Fix it by padding the
243 * SW stack by a single cacheline if it was a power of two.
244 */
245 if (layout->sw_stack_size > 64 &&
246 util_is_power_of_two_nonzero(layout->sw_stack_size))
247 layout->sw_stack_size += 64;
248
249 size += num_stack_ids * layout->sw_stack_size;
250
251 layout->total_size = size;
252 }
253
254 static inline uint32_t
brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info * devinfo)255 brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
256 {
257 /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
258 * which includes all the threads.
259 */
260 uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
261 uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
262 return max_eu_id * max_simd_size * BRW_RT_SIZEOF_RAY_QUERY;
263 }
264
265 static inline uint32_t
brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info * devinfo)266 brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
267 {
268 /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
269 * which includes all the threads.
270 */
271 uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
272 uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
273 return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
274 }
275
/** Size of the shadow stacks for a given number of ray queries
 *
 * The result is one shadow stack per query (none at all when there is at
 * most one query) plus 4 bytes of Ctrl + Level data per query.
 */
static inline uint32_t
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
                                      uint32_t ray_queries)
{
   /* A lone query does not need a shadow stack: we can write directly into
    * the HW buffer.
    */
   const uint32_t shadowed_queries = ray_queries > 1 ? ray_queries : 0;

   /* Ctrl + Level data */
   const uint32_t ctrl_level_size = ray_queries * 4;

   return shadowed_queries * brw_rt_ray_queries_shadow_stack_size(devinfo) +
          ctrl_level_size;
}
286
287 #ifdef __cplusplus
288 }
289 #endif
290