/*
 * Copyright © 2022 Konstantin Seurer
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef VK_BVH_BUILD_HELPERS_H
#define VK_BVH_BUILD_HELPERS_H

#include "vk_bvh.h"

#define VK_FORMAT_UNDEFINED 0
#define VK_FORMAT_R4G4_UNORM_PACK8 1
#define VK_FORMAT_R4G4B4A4_UNORM_PACK16 2
#define VK_FORMAT_B4G4R4A4_UNORM_PACK16 3
#define VK_FORMAT_R5G6B5_UNORM_PACK16 4
#define VK_FORMAT_B5G6R5_UNORM_PACK16 5
#define VK_FORMAT_R5G5B5A1_UNORM_PACK16 6
#define VK_FORMAT_B5G5R5A1_UNORM_PACK16 7
#define VK_FORMAT_A1R5G5B5_UNORM_PACK16 8
#define VK_FORMAT_R8_UNORM 9
#define VK_FORMAT_R8_SNORM 10
#define VK_FORMAT_R8_USCALED 11
#define VK_FORMAT_R8_SSCALED 12
#define VK_FORMAT_R8_UINT 13
#define VK_FORMAT_R8_SINT 14
#define VK_FORMAT_R8_SRGB 15
#define VK_FORMAT_R8G8_UNORM 16
#define VK_FORMAT_R8G8_SNORM 17
#define VK_FORMAT_R8G8_USCALED 18
#define VK_FORMAT_R8G8_SSCALED 19
#define VK_FORMAT_R8G8_UINT 20
#define VK_FORMAT_R8G8_SINT 21
#define VK_FORMAT_R8G8_SRGB 22
#define VK_FORMAT_R8G8B8_UNORM 23
#define VK_FORMAT_R8G8B8_SNORM 24
#define VK_FORMAT_R8G8B8_USCALED 25
#define VK_FORMAT_R8G8B8_SSCALED 26
#define VK_FORMAT_R8G8B8_UINT 27
#define VK_FORMAT_R8G8B8_SINT 28
#define VK_FORMAT_R8G8B8_SRGB 29
#define VK_FORMAT_B8G8R8_UNORM 30
#define VK_FORMAT_B8G8R8_SNORM 31
#define VK_FORMAT_B8G8R8_USCALED 32
#define VK_FORMAT_B8G8R8_SSCALED 33
#define VK_FORMAT_B8G8R8_UINT 34
#define VK_FORMAT_B8G8R8_SINT 35
#define VK_FORMAT_B8G8R8_SRGB 36
#define VK_FORMAT_R8G8B8A8_UNORM 37
#define VK_FORMAT_R8G8B8A8_SNORM 38
#define VK_FORMAT_R8G8B8A8_USCALED 39
#define VK_FORMAT_R8G8B8A8_SSCALED 40
#define VK_FORMAT_R8G8B8A8_UINT 41
#define VK_FORMAT_R8G8B8A8_SINT 42
#define VK_FORMAT_R8G8B8A8_SRGB 43
#define VK_FORMAT_B8G8R8A8_UNORM 44
#define VK_FORMAT_B8G8R8A8_SNORM 45
#define VK_FORMAT_B8G8R8A8_USCALED 46
#define VK_FORMAT_B8G8R8A8_SSCALED 47
#define VK_FORMAT_B8G8R8A8_UINT 48
#define VK_FORMAT_B8G8R8A8_SINT 49
#define VK_FORMAT_B8G8R8A8_SRGB 50
#define VK_FORMAT_A8B8G8R8_UNORM_PACK32 51
#define VK_FORMAT_A8B8G8R8_SNORM_PACK32 52
#define VK_FORMAT_A8B8G8R8_USCALED_PACK32 53
#define VK_FORMAT_A8B8G8R8_SSCALED_PACK32 54
#define VK_FORMAT_A8B8G8R8_UINT_PACK32 55
#define VK_FORMAT_A8B8G8R8_SINT_PACK32 56
#define VK_FORMAT_A8B8G8R8_SRGB_PACK32 57
#define VK_FORMAT_A2R10G10B10_UNORM_PACK32 58
#define VK_FORMAT_A2R10G10B10_SNORM_PACK32 59
#define VK_FORMAT_A2R10G10B10_USCALED_PACK32 60
#define VK_FORMAT_A2R10G10B10_SSCALED_PACK32 61
#define VK_FORMAT_A2R10G10B10_UINT_PACK32 62
#define VK_FORMAT_A2R10G10B10_SINT_PACK32 63
#define VK_FORMAT_A2B10G10R10_UNORM_PACK32 64
#define VK_FORMAT_A2B10G10R10_SNORM_PACK32 65
#define VK_FORMAT_A2B10G10R10_USCALED_PACK32 66
#define VK_FORMAT_A2B10G10R10_SSCALED_PACK32 67
#define VK_FORMAT_A2B10G10R10_UINT_PACK32 68
#define VK_FORMAT_A2B10G10R10_SINT_PACK32 69
#define VK_FORMAT_R16_UNORM 70
#define VK_FORMAT_R16_SNORM 71
#define VK_FORMAT_R16_USCALED 72
#define VK_FORMAT_R16_SSCALED 73
#define VK_FORMAT_R16_UINT 74
#define VK_FORMAT_R16_SINT 75
#define VK_FORMAT_R16_SFLOAT 76
#define VK_FORMAT_R16G16_UNORM 77
#define VK_FORMAT_R16G16_SNORM 78
#define VK_FORMAT_R16G16_USCALED 79
#define VK_FORMAT_R16G16_SSCALED 80
#define VK_FORMAT_R16G16_UINT 81
#define VK_FORMAT_R16G16_SINT 82
#define VK_FORMAT_R16G16_SFLOAT 83
#define VK_FORMAT_R16G16B16_UNORM 84
#define VK_FORMAT_R16G16B16_SNORM 85
#define VK_FORMAT_R16G16B16_USCALED 86
#define VK_FORMAT_R16G16B16_SSCALED 87
#define VK_FORMAT_R16G16B16_UINT 88
#define VK_FORMAT_R16G16B16_SINT 89
#define VK_FORMAT_R16G16B16_SFLOAT 90
#define VK_FORMAT_R16G16B16A16_UNORM 91
#define VK_FORMAT_R16G16B16A16_SNORM 92
#define VK_FORMAT_R16G16B16A16_USCALED 93
#define VK_FORMAT_R16G16B16A16_SSCALED 94
#define VK_FORMAT_R16G16B16A16_UINT 95
#define VK_FORMAT_R16G16B16A16_SINT 96
#define VK_FORMAT_R16G16B16A16_SFLOAT 97
#define VK_FORMAT_R32_UINT 98
#define VK_FORMAT_R32_SINT 99
#define VK_FORMAT_R32_SFLOAT 100
#define VK_FORMAT_R32G32_UINT 101
#define VK_FORMAT_R32G32_SINT 102
#define VK_FORMAT_R32G32_SFLOAT 103
#define VK_FORMAT_R32G32B32_UINT 104
#define VK_FORMAT_R32G32B32_SINT 105
#define VK_FORMAT_R32G32B32_SFLOAT 106
#define VK_FORMAT_R32G32B32A32_UINT 107
#define VK_FORMAT_R32G32B32A32_SINT 108
#define VK_FORMAT_R32G32B32A32_SFLOAT 109
#define VK_FORMAT_R64_UINT 110
#define VK_FORMAT_R64_SINT 111
#define VK_FORMAT_R64_SFLOAT 112
#define VK_FORMAT_R64G64_UINT 113
#define VK_FORMAT_R64G64_SINT 114
#define VK_FORMAT_R64G64_SFLOAT 115
#define VK_FORMAT_R64G64B64_UINT 116
#define VK_FORMAT_R64G64B64_SINT 117
#define VK_FORMAT_R64G64B64_SFLOAT 118
#define VK_FORMAT_R64G64B64A64_UINT 119
#define VK_FORMAT_R64G64B64A64_SINT 120
#define VK_FORMAT_R64G64B64A64_SFLOAT 121

#define VK_INDEX_TYPE_UINT16 0
#define VK_INDEX_TYPE_UINT32 1
#define VK_INDEX_TYPE_NONE_KHR 1000165000
#define VK_INDEX_TYPE_UINT8_EXT 1000265000

#define VK_GEOMETRY_TYPE_TRIANGLES_KHR 0
#define VK_GEOMETRY_TYPE_AABBS_KHR 1
#define VK_GEOMETRY_TYPE_INSTANCES_KHR 2

#define VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR 1
#define VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR 2
#define VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR 4
#define VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR 8

#define TYPE(type, align)                                                                \
   layout(buffer_reference, buffer_reference_align = align, scalar) buffer type##_ref    \
   {                                                                                     \
      type value;                                                                        \
   };

#define REF(type) type##_ref
#define VOID_REF uint64_t
#define NULL 0
#define DEREF(var) var.value

#define SIZEOF(type) uint32_t(uint64_t(REF(type)(uint64_t(0)) + 1))

#define OFFSET(ptr, offset) (uint64_t(ptr) + offset)

#define INFINITY (1.0 / 0.0)
#define NAN (0.0 / 0.0)

#define INDEX(type, ptr, index) REF(type)(OFFSET(ptr, (index)*SIZEOF(type)))
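/* These macros emulate C-style pointer arithmetic on top of buffer references:
 * SIZEOF(type) derives the element size by advancing a null type##_ref by one
 * element and converting the resulting address back to an integer, while
 * OFFSET/INDEX build references at a byte offset or element index, e.g.
 * DEREF(INDEX(uint32_t, ptr, i)) reads the i-th uint32_t at the device
 * address held in ptr. */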

TYPE(int8_t, 1);
TYPE(uint8_t, 1);
TYPE(int16_t, 2);
TYPE(uint16_t, 2);
TYPE(int32_t, 4);
TYPE(uint32_t, 4);
TYPE(int64_t, 8);
TYPE(uint64_t, 8);

TYPE(float, 4);

TYPE(vec2, 4);
TYPE(vec3, 4);
TYPE(vec4, 4);

TYPE(uvec4, 16);

TYPE(VOID_REF, 8);

/* copied from u_math.h */
uint32_t
align(uint32_t value, uint32_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}
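/* Rounds value up to the next multiple of alignment; the mask trick requires
 * alignment to be a power of two, e.g. align(13, 8) == 16. */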

int32_t
to_emulated_float(float f)
{
   int32_t bits = floatBitsToInt(f);
   return f < 0 ? -2147483648 - bits : bits;
}

float
from_emulated_float(int32_t bits)
{
   return intBitsToFloat(bits < 0 ? -2147483648 - bits : bits);
}
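/* The "emulated float" encoding remaps a float's bit pattern so that signed
 * 32-bit integer ordering matches the float's numeric ordering: non-negative
 * values keep their bit pattern, negative values are mapped to negative
 * integers in the same order. Presumably this allows plain integer
 * atomicMin/atomicMax to be used on float data where float atomics are not
 * available; from_emulated_float is the exact inverse. */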

TYPE(vk_aabb, 4);

struct key_id_pair {
   uint32_t id;
   uint32_t key;
};
TYPE(key_id_pair, 4);

TYPE(vk_accel_struct_serialization_header, 8);

TYPE(vk_ir_header, 4);
TYPE(vk_ir_node, 4);
TYPE(vk_ir_box_node, 4);
TYPE(vk_ir_triangle_node, 4);
TYPE(vk_ir_aabb_node, 4);
TYPE(vk_ir_instance_node, 8);

TYPE(vk_global_sync_data, 4);

uint32_t
ir_id_to_offset(uint32_t id)
{
   return id & (~3u);
}

uint32_t
ir_id_to_type(uint32_t id)
{
   return id & 3u;
}

uint32_t
pack_ir_node_id(uint32_t offset, uint32_t type)
{
   return offset | type;
}
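/* IR node IDs pack a byte offset and a 2-bit node type into one uint32_t.
 * This relies on node offsets being at least 4-byte aligned, so the two low
 * bits are always free to hold the type. */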

float
aabb_surface_area(vk_aabb aabb)
{
   vec3 diagonal = aabb.max - aabb.min;
   return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z;
}
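/* Full surface area of the box, 2 * (xy + yz + xz); presumably used as the
 * cost metric for SAH-style build heuristics. */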

/* Just a wrapper for 3 uints. */
struct triangle_indices {
   uint32_t index[3];
};

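/* Loads the three indices of triangle number global_id from an index buffer
 * in the given VK_INDEX_TYPE_* format. For VK_INDEX_TYPE_NONE_KHR
 * (non-indexed geometry) the vertex indices are simply consecutive. */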
triangle_indices
load_indices(VOID_REF indices, uint32_t index_format, uint32_t global_id)
{
   triangle_indices result;

   uint32_t index_base = global_id * 3;

   switch (index_format) {
   case VK_INDEX_TYPE_UINT16: {
      result.index[0] = DEREF(INDEX(uint16_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint16_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint16_t, indices, index_base + 2));
      break;
   }
   case VK_INDEX_TYPE_UINT32: {
      result.index[0] = DEREF(INDEX(uint32_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint32_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint32_t, indices, index_base + 2));
      break;
   }
   case VK_INDEX_TYPE_NONE_KHR: {
      result.index[0] = index_base + 0;
      result.index[1] = index_base + 1;
      result.index[2] = index_base + 2;
      break;
   }
   case VK_INDEX_TYPE_UINT8_EXT: {
      result.index[0] = DEREF(INDEX(uint8_t, indices, index_base + 0));
      result.index[1] = DEREF(INDEX(uint8_t, indices, index_base + 1));
      result.index[2] = DEREF(INDEX(uint8_t, indices, index_base + 2));
      break;
   }
   }

   return result;
}

/* Just a wrapper for 3 vec4s. */
struct triangle_vertices {
   vec4 vertex[3];
};

TYPE(float16_t, 2);

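/* Loads and converts the three vertex positions referenced by indices into
 * vec4s for the vertex formats this build code handles. Components that are
 * not read default to (0, 0, 0, 1), and SNORM values are clamped to -1.0 per
 * the usual SNORM conversion rules. */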
triangle_vertices
load_vertices(VOID_REF vertices, triangle_indices indices, uint32_t vertex_format, uint32_t stride)
{
   triangle_vertices result;

   for (uint32_t i = 0; i < 3; i++) {
      VOID_REF vertex_ptr = OFFSET(vertices, indices.index[i] * stride);
      vec4 vertex = vec4(0.0, 0.0, 0.0, 1.0);

      switch (vertex_format) {
      case VK_FORMAT_R32G32_SFLOAT:
         vertex.x = DEREF(INDEX(float, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float, vertex_ptr, 1));
         break;
      case VK_FORMAT_R32G32B32_SFLOAT:
      case VK_FORMAT_R32G32B32A32_SFLOAT:
         vertex.x = DEREF(INDEX(float, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float, vertex_ptr, 1));
         vertex.z = DEREF(INDEX(float, vertex_ptr, 2));
         break;
      case VK_FORMAT_R16G16_SFLOAT:
         vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1));
         break;
      case VK_FORMAT_R16G16B16_SFLOAT:
      case VK_FORMAT_R16G16B16A16_SFLOAT:
         vertex.x = DEREF(INDEX(float16_t, vertex_ptr, 0));
         vertex.y = DEREF(INDEX(float16_t, vertex_ptr, 1));
         vertex.z = DEREF(INDEX(float16_t, vertex_ptr, 2));
         break;
      case VK_FORMAT_R16G16_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF));
         vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF));
         break;
      case VK_FORMAT_R16G16B16A16_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 0)) / float(0x7FFF));
         vertex.y = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 1)) / float(0x7FFF));
         vertex.z = max(-1.0, DEREF(INDEX(int16_t, vertex_ptr, 2)) / float(0x7FFF));
         break;
      case VK_FORMAT_R8G8_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F));
         vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F));
         break;
      case VK_FORMAT_R8G8B8A8_SNORM:
         vertex.x = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 0)) / float(0x7F));
         vertex.y = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 1)) / float(0x7F));
         vertex.z = max(-1.0, DEREF(INDEX(int8_t, vertex_ptr, 2)) / float(0x7F));
         break;
      case VK_FORMAT_R16G16_UNORM:
         vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF);
         vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF);
         break;
      case VK_FORMAT_R16G16B16A16_UNORM:
         vertex.x = DEREF(INDEX(uint16_t, vertex_ptr, 0)) / float(0xFFFF);
         vertex.y = DEREF(INDEX(uint16_t, vertex_ptr, 1)) / float(0xFFFF);
         vertex.z = DEREF(INDEX(uint16_t, vertex_ptr, 2)) / float(0xFFFF);
         break;
      case VK_FORMAT_R8G8_UNORM:
         vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF);
         vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF);
         break;
      case VK_FORMAT_R8G8B8A8_UNORM:
         vertex.x = DEREF(INDEX(uint8_t, vertex_ptr, 0)) / float(0xFF);
         vertex.y = DEREF(INDEX(uint8_t, vertex_ptr, 1)) / float(0xFF);
         vertex.z = DEREF(INDEX(uint8_t, vertex_ptr, 2)) / float(0xFF);
         break;
      case VK_FORMAT_A2B10G10R10_UNORM_PACK32: {
         uint32_t data = DEREF(REF(uint32_t)(vertex_ptr));
         vertex.x = float(data & 0x3FF) / 0x3FF;
         vertex.y = float((data >> 10) & 0x3FF) / 0x3FF;
         vertex.z = float((data >> 20) & 0x3FF) / 0x3FF;
         break;
      }
      }

      result.vertex[i] = vertex;
   }

   return result;
}

/** Compute ceiling of integer quotient of A divided by B.
 *  From macros.h */
#define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B))
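/* e.g. DIV_ROUND_UP(10, 4) == 3; used below to turn a task count into the
 * number of workgroups needed to process it. */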

#ifdef USE_GLOBAL_SYNC

/* There might be more invocations available than tasks to do.
 * In that case, the fetched task index is greater than the
 * counter offset for the next phase. To avoid out-of-bounds
 * accesses, phases are skipped until the task index is
 * in bounds again. */
uint32_t num_tasks_to_skip = 0;
uint32_t phase_index = 0;
bool should_skip = false;
shared uint32_t global_task_index;

shared uint32_t shared_phase_index;

uint32_t
task_count(REF(vk_ir_header) header)
{
   uint32_t phase_index = DEREF(header).sync_data.phase_index;
   return DEREF(header).sync_data.task_counts[phase_index & 1];
}

/* Sets the task count for the next phase. */
void
set_next_task_count(REF(vk_ir_header) header, uint32_t new_count)
{
   uint32_t phase_index = DEREF(header).sync_data.phase_index;
   DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count;
}

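/* task_counts[] is double-buffered: task_count() reads the entry for the
 * current phase (phase_index & 1) while set_next_task_count() writes the
 * entry for the next phase ((phase_index + 1) & 1), so a phase can set up the
 * amount of work for its successor while the current count is still in use. */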

/*
 * This function has two main objectives:
 * Firstly, it partitions pending work among free invocations.
 * Secondly, it guarantees global synchronization between different phases.
 *
 * After every call to fetch_task, a new task index is returned.
 * fetch_task will also set num_tasks_to_skip. Use should_execute_phase
 * to determine if the current phase should be executed or skipped.
 *
 * Since tasks are assigned per-workgroup, there is a possibility of the task index being
 * greater than the total task count.
 */
uint32_t
fetch_task(REF(vk_ir_header) header, bool did_work)
{
   /* Perform a memory + control barrier for all buffer writes for the entire workgroup.
    * This guarantees that once the workgroup leaves the PHASE loop, all invocations have finished
    * and their results are written to memory. */
   controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer,
                  gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
   if (gl_LocalInvocationIndex == 0) {
      if (did_work)
         atomicAdd(DEREF(header).sync_data.task_done_counter, 1);
      global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1);

      do {
         /* Perform a memory barrier to refresh the current phase's end counter, in case
          * another workgroup changed it. */
         memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                       gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);

         /* The first invocation of the first workgroup in a new phase is responsible for initiating
          * the switch to a new phase. It is only possible to switch to a new phase if all tasks of
          * the previous phase have been completed. Switching to a new phase and incrementing the
          * phase end counter in turn notifies all invocations for that phase that it is safe to
          * execute.
          */
         if (global_task_index == DEREF(header).sync_data.current_phase_end_counter &&
             DEREF(header).sync_data.task_done_counter ==
                DEREF(header).sync_data.current_phase_end_counter) {
            if (DEREF(header).sync_data.next_phase_exit_flag != 0) {
               DEREF(header).sync_data.phase_index = TASK_INDEX_INVALID;
               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
            } else {
               atomicAdd(DEREF(header).sync_data.phase_index, 1);
               DEREF(header).sync_data.current_phase_start_counter =
                  DEREF(header).sync_data.current_phase_end_counter;
               /* Ensure the changes to the phase index and start/end counters are visible to other
                * workgroups waiting in the loop. */
               memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
                             gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
               atomicAdd(DEREF(header).sync_data.current_phase_end_counter,
                         DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x));
            }
            break;
         }

         /* If other invocations have finished all nodes, break out; there is no more work to do. */
         if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID) {
            break;
         }
      } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter);

      shared_phase_index = DEREF(header).sync_data.phase_index;
   }

   barrier();
   if (DEREF(header).sync_data.phase_index == TASK_INDEX_INVALID)
      return TASK_INDEX_INVALID;

   num_tasks_to_skip = shared_phase_index - phase_index;

   uint32_t local_task_index = global_task_index - DEREF(header).sync_data.current_phase_start_counter;
   return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x;
}

bool
should_execute_phase()
{
   if (num_tasks_to_skip > 0) {
      /* Skip to the next phase. */
      ++phase_index;
      --num_tasks_to_skip;
      return false;
   }
   return true;
}

#define PHASE(header)                                                                    \
   for (; task_index != TASK_INDEX_INVALID && should_execute_phase(); task_index = fetch_task(header, true))
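/* Intended usage (sketch only; the surrounding shader is expected to declare
 * task_index and provide the header reference, the names here are
 * illustrative):
 *
 *    uint32_t task_index = fetch_task(header, false);
 *    PHASE(header) {
 *       ...work for the first phase, indexed by task_index...
 *    }
 *    PHASE(header) {
 *       ...work for the second phase...
 *    }
 *
 * Each PHASE loop keeps fetching tasks until its phase is exhausted, and
 * should_execute_phase() makes a workgroup fall through PHASE blocks that
 * belong to phases it has already moved past. */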
#endif

#endif