• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//
2// Copyright (C) 2009-2021 Intel Corporation
3//
4// SPDX-License-Identifier: MIT
5//
6//
7
8module new_sah_builder;
9
// Kernel declarations for the kernels taken from "bvh_build_BFS.cl".
// Each `kernel` line binds the name used by `dispatch` / `dispatch_indirect`
// in the metakernels below to the entry point named in kernelFunction.
10kernel_module bfs_kernels ("bvh_build_BFS.cl")
11{
12    links lsc_intrinsics;
13
    // Single-build BFS passes.  new_sah_build dispatches the "initial" pair once
    // after the begin kernel and the "indexed" pair inside its scheduler loop.
14    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial      <  kernelFunction="BFS_pass1_initial"  >   ;
15    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed      <  kernelFunction="BFS_pass1_indexed"  >   ;
16    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial      <  kernelFunction="BFS_pass2_initial"  >   ;
17    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed      <  kernelFunction="BFS_pass2_indexed"  >   ;
18
    // DFS and scheduler are dispatched by both metakernels in this file;
    // Kickoff, Amplify and begin are dispatched only by new_sah_build.
19    kernel opencl_build_kernel_BinnedSAH_DFS                    <  kernelFunction="DFS"        >;
    // Disabled declaration: its only use in the visible source is the commented-out
    // "old implementation" at the end of new_sah_build_batchable.
20    // kernel opencl_build_kernel_BinnedSAH_BuildQNodes            <  kernelFunction="build_qnodes" >;
21    kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff    <  kernelFunction="build_qnodes_pc_kickoff" >;
22    kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify    <  kernelFunction="build_qnodes_pc_amplify" >;
23    kernel opencl_build_kernel_BinnedSAH_begin                  <  kernelFunction = "begin" >;
24    kernel opencl_build_kernel_BinnedSAH_scheduler              <  kernelFunction = "scheduler" >;
25
    // Batched counterparts of the four BFS passes (used by new_sah_build_batchable).
26    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch   < kernelFunction="BFS_pass1_initial_batchable"  >;
27    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch   < kernelFunction="BFS_pass1_indexed_batchable"  >;
28    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch   < kernelFunction="BFS_pass2_initial_batchable"  >;
29    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch   < kernelFunction="BFS_pass2_indexed_batchable"  >;
30
    // Batched setup: categorization pass (first dispatch of new_sah_build_batchable)
    // and the per-block begin kernel of its outer build loop.
31    kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >;
32    kernel opencl_build_kernel_BinnedSAH_begin_batched     < kernelFunction="begin_batchable"   >;
33
    // Kernels of the batched qnode build phase.
34    kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched      < kernelFunction="build_qnodes_init_scheduler_batched" >;
35    kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched               < kernelFunction="build_qnodes_begin_batchable" >;
36    kernel opencl_build_kernel_BinnedSAH_qnode_scheduler                   < kernelFunction="build_qnodes_scheduler" >;
37    kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch         < kernelFunction="build_qnodes_pc_amplify_batched" >;
38
    // Note the spelling "BuildQnodes" (lower-case n) here; the dispatch in
    // new_sah_build_batchable uses the same spelling.
39    kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >;
40
41}
42
// Stand-alone kernel declarations taken from "bvh_build_DFS.cl".
// The first two are dispatched by new_sah_build (small-build shortcuts); the
// "_batch" pair is dispatched by new_sah_build_batchable.
43kernel opencl_build_kernel_DFS_single_wg             < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" >
44kernel opencl_build_kernel_DFS_trivial               < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial"  >
45kernel opencl_build_kernel_DFS_single_wg_batch       < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" >
46kernel opencl_build_kernel_DFS_trivial_batch         < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable"   >
47
// Kernel behind the single_pass_binsah metakernel (same name); bound to the
// "DFS" entry point of bvh_build_DFS.cl.
48kernel single_pass_binsah                            < source="bvh_build_DFS.cl", kernelFunction="DFS"                           >
49
50
// Primitive-count thresholds used by new_sah_build to pick a build path:
//   count <= DFS_MIN_PRIMREFS -> DFS_trivial kernel
//   count <= DFS_MAX_PRIMREFS -> DFS_single_wg kernel
//   otherwise                 -> full BFS/DFS build
51const DFS_MIN_PRIMREFS  = 6;
52const DFS_MAX_PRIMREFS  = 256;
// NOTE(review): BFS_WG_SIZE_SHIFT is not referenced in the visible source.
// new_sah_build hard-codes the same total shift of 9 (a shift by 8 followed by a
// shift by 1) together with a +511 round-up, so keep the two in sync.
53const BFS_WG_SIZE_SHIFT = 9;
54
55
56
// Layout of the memory pointed to by p_scheduler.  The metakernels in this file
// read and write it directly and also pass p_scheduler to the kernels they dispatch.
//
// Field order is significant: adjacent dword pairs are fetched with a single
// load_qword (first field -> .lo, second field -> .hi) and individual fields are
// addressed with offsetof().
//
// In the qnode phase of new_sah_build_batchable the first qword and the
// batched_build_* qword are read again; the comments there describe the latter as
// lo = number of builds processed, hi = maximum global root buffer entries.
57struct Scheduler
58{
59    dword num_bfs_wgs;               // .lo of the qword loaded from the struct base in the build loops
60    dword num_dfs_wgs;               // .hi of that qword; also polled with semaphore_wait until it reads 0
61
62    dword scheduler_postsync;        // semaphore dword: written by `postsync store_dword`, polled by semaphore_wait
63    dword _pad1;                     // never named in the metakernels; new_sah_build zeroes it with scheduler_postsync via store_qword
64
65    dword num_trivial_builds;        // .lo of the qword loaded in new_sah_build_batchable; X size for DFS_trivial_batch
66    dword num_single_builds;         // .hi of that qword; X size for DFS_single_wg_batch
67
68    dword batched_build_wg_count;    // .lo of the qword loaded after begin_batched; X size for BFS_pass1_initial_batch
69    dword batched_build_loop_mask;   // .hi of that qword; non-zero makes the batched outer build loop run again
70
71};
72
73
// Argument block of the new_sah_build metakernel.  The p_* members are 64-bit
// addresses; apart from p_num_primitives, p_scheduler and p_root_buffer_counters
// (which the metakernel dereferences itself) they are only forwarded to kernels.
74struct SAHBuildArgs
75{
76    qword p_num_primitives;          // address the primitive count is read from (load_dword)
77    qword p_qnode_child_buffer;      // forwarded to begin and to the BuildQNodes Kickoff/Amplify kernels
78    qword p_scheduler;               // address of the Scheduler struct
79    qword p_sah_globals;             // forwarded to the BFS/DFS/scheduler/qnode kernels
80    qword p_globals;                 // forwarded to DFS_trivial, DFS_single_wg and begin
81    qword p_primref_buffer;          // forwarded to DFS_trivial, DFS_single_wg and begin
82    qword p_primref_index_buffers;   // forwarded to DFS_trivial, DFS_single_wg and begin
83    qword p_bvh_base;                // forwarded to DFS_trivial, DFS_single_wg and begin
84    qword p_bvh2;                    // forwarded to begin only
85    qword p_root_buffer_counters;    // qnode loop: qword +0 is read as "produced"; +8 / +16 receive the consumed / produced range
86    dword sah_build_flags;           // forwarded to DFS_trivial, DFS_single_wg, begin and the BuildQNodes kernels
87    dword leaf_size;                 // forwarded to begin only
88    dword leaf_type;                 // forwarded to begin only
89    dword max_internal_nodes;        // not referenced by new_sah_build in the visible source
90};
91
92
// Metakernel: run the single_pass_binsah kernel (the "DFS" entry point of
// bvh_build_DFS.cl) as one (1,1,1) dispatch, forwarding the five arguments
// unchanged and in declaration order.  No synchronization is issued here.
93metakernel single_pass_binsah(
94    qword build_globals,
95    qword bvh_buffer,
96    qword build_primref_buffer,
97    qword build_primref_index_buffers,
98    dword alloc_backpointers )
99{
100
101    dispatch single_pass_binsah(1, 1, 1) args(
102        build_globals,
103        bvh_buffer,
104        build_primref_buffer,
105        build_primref_index_buffers,
106        alloc_backpointers
107    );
108
109}
110
111
112
// Metakernel: build one BVH with the binned-SAH builder, choosing the path from the
// primitive count read at build_args.p_num_primitives:
//
//   count <= DFS_MIN_PRIMREFS : one DFS_trivial workgroup, then done
//   count <= DFS_MAX_PRIMREFS : one DFS_single_wg workgroup, then done
//   otherwise                 : begin kernel, initial BFS pass1/pass2, then a scheduler
//                               loop that issues DFS and indexed BFS dispatches until the
//                               scheduler reports zero BFS workgroups, followed by the
//                               producer/consumer qnode build
//
// Synchronization on the full path uses Scheduler.scheduler_postsync as a semaphore:
// each dispatch stores a value on completion (postsync) and the following
// semaphore_wait spins until that value is observed.
//
// Register reuse: REG0 is the primitive count up to the initial dispatches, the
// {num_bfs_wgs, num_dfs_wgs} pair inside the scheduler loop, and the counter address
// inside the qnode loop.
113metakernel new_sah_build( SAHBuildArgs build_args )
114{
115    define REG_num_prims    REG0;
116
    // Path selection
117    {
118        define C_MIN_PRIMREFS           REG1;
119        define C_MAX_PRIMREFS           REG2;
120        define REG_dispatch_trivial     REG3;
121        define REG_dispatch_single_wg   REG4;
122
123        REG_num_prims  = load_dword( build_args.p_num_primitives );
124        C_MIN_PRIMREFS = DFS_MIN_PRIMREFS;
125        C_MAX_PRIMREFS = DFS_MAX_PRIMREFS;
126
127        REG_dispatch_trivial   = REG_num_prims <= C_MIN_PRIMREFS;
128        REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS;
129
        // The trivial test comes first, so the single-WG branch only sees counts above DFS_MIN_PRIMREFS.
130        goto l_dispatch_trivial   if(REG_dispatch_trivial.lo);
131        goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
132        goto l_full_build;
133    }
134
135l_dispatch_trivial:
136    {
137        dispatch opencl_build_kernel_DFS_trivial    (1,1,1)
138            args( build_args.p_globals,
139                  build_args.p_bvh_base,
140                  build_args.p_primref_buffer,
141                  build_args.p_primref_index_buffers,
142                  build_args.sah_build_flags
143                  );
144
145        control( wait_idle );
146        goto l_done;
147    }
148
149l_dispatch_single_wg:
150    {
151        dispatch opencl_build_kernel_DFS_single_wg    (1,1,1)
152            args( build_args.p_globals,
153                  build_args.p_bvh_base,
154                  build_args.p_primref_buffer,
155                  build_args.p_primref_index_buffers,
156                  build_args.sah_build_flags
157                  );
158
159        control( wait_idle );
160        goto l_done;
161    }
162
163
164l_full_build:
165
166
167    {
168        define p_scheduler                  build_args.p_scheduler;
        // NOTE(review): unlike p_scheduler_postsync below (and unlike the same define in
        // new_sah_build_batchable) this expression is not parenthesized, and it is later
        // used as `*p_num_dfs_wgs` -- confirm that `define` binds the whole expression.
169        define p_num_dfs_wgs                build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs);
170        define p_scheduler_postsync         (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
171        define C_0    REG1;
172        define C_8    REG2;
173        C_8 = 8;
174        C_0 = 0;
175
176
177        //
178        //  Init pass
179        //
180        store_dword( p_scheduler_postsync, C_0.lo );
181
182        // compute number of BFS WGs from prim-count
183        // NOTE:  This code uses a hardcoded WG size of 512 for BFS
184        //    If the BFS wg size ever changes, it needs to be touched
185        //    This is necessary because DG2 shifter only supports POW2 shifts
186        {
187            define REG_scheduler_postsync    REG3;
188            define C_511    REG4;
189            define C_1      REG5;
190
191            REG_scheduler_postsync = p_scheduler_postsync;
192            C_511 = 511;
193            C_1   = 1;
194
            // This qword store also zeroes Scheduler._pad1, and it repeats the dword store above.
195            store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
196
            // (num_prims + 511) >> 9, done as >>8 then >>1: primitive count divided by 512, rounded up
197            REG_num_prims = REG_num_prims + C_511;
198            REG_num_prims = REG_num_prims >> C_8;
199            REG_num_prims = REG_num_prims >> C_1;
200
            // Y and Z are set only here; every later dispatch_indirect in this metakernel changes X alone.
201            DISPATCHDIM_X = REG_num_prims.lo;
202            DISPATCHDIM_Y = 1;
203            DISPATCHDIM_Z = 1;
204
205            control( cs_store_fence ); // commit the semaphore write
206
207            // launch scheduler init kernel
208            dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1)
209                args(
210                    build_args.p_scheduler,
211                    build_args.leaf_size,
212                    build_args.leaf_type,
213                    build_args.p_primref_index_buffers,
214                    build_args.p_primref_buffer,
215                    build_args.p_bvh2,
216                    build_args.p_bvh_base,
217                    build_args.p_globals,
218                    build_args.p_sah_globals,
219                    build_args.p_qnode_child_buffer,
220                    build_args.sah_build_flags
221                )
222                postsync store_dword( p_scheduler_postsync, 1 );
223
224            // wait on init kernel
225            semaphore_wait while( *p_scheduler_postsync != 1 );
226
227            // launch BFS1 pass1
            //   (uses the DISPATCHDIM_X computed from the primitive count above)
228            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial
229                args( build_args.p_scheduler,
230                      build_args.p_sah_globals)
231                postsync store_dword( p_scheduler_postsync, 0 );
232
233            // wait on BFS pass1
234            semaphore_wait while( *p_scheduler_postsync != 0 );
235
236            // launch BFS pass2
237            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial
238                args( build_args.p_scheduler,
239                      build_args.p_sah_globals )
240                postsync store_dword( p_scheduler_postsync, 1 );
241        }
242
243        // after BFS pass 2 we drop into a scheduling loop
        //   semaphore values inside the loop: 1 = BFS pass2 finished, 0 = scheduler finished,
        //   2 = indexed BFS pass1 finished
244
245        l_build_loop:
246        {
247            semaphore_wait while( *p_scheduler_postsync != 1 );
248
249            {
250                dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
251                    args( build_args.p_scheduler, build_args.p_sah_globals )
252                    postsync store_dword( p_scheduler_postsync, 0 );
253
254                // wait on the scheduler
255                semaphore_wait while( *p_scheduler_postsync != 0 );
256            }
257
258            // load and process the scheduler results
            //   REG0 is reused here; the primitive count it held is no longer needed
259            define REG_wg_counts    REG0;
260            define REG_num_bfs_wgs  REG0.lo;
261            define REG_num_dfs_wgs  REG0.hi;
262            define REG_loop_break   REG1;
263            define REG_p_scheduler  REG2;
264            {
265                REG_p_scheduler = p_scheduler;
                // one qword read covers Scheduler.num_bfs_wgs (lo) and Scheduler.num_dfs_wgs (hi)
266                REG_wg_counts    = load_qword( REG_p_scheduler );
267
268                define C_MASK_LO REG3 ;
269                C_MASK_LO = 0xffffffff;
270
                // loop_break = (num_bfs_wgs == 0)
271                REG_loop_break = REG_wg_counts  & C_MASK_LO;
272                REG_loop_break = REG_loop_break == 0;
273            }
274
275            // dispatch new DFS WGs
            //   issued before the exit test, so the last round of DFS work is launched even when
            //   no BFS work remains; no postsync here -- completion is covered by the wait_idle
            //   at l_build_qnodes
276            DISPATCHDIM_X = REG_num_dfs_wgs;
277            dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
278                args( p_scheduler,
279                      build_args.p_sah_globals );
280
281            // jump out if there are no bfs WGs
282            goto l_build_qnodes if (REG_loop_break);
283
284            // dispatch new BFS1 WGs
285            DISPATCHDIM_X = REG_num_bfs_wgs;
286            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed
287                args( p_scheduler,
288                      build_args.p_sah_globals )
289                postsync store_dword( p_scheduler_postsync, 2 );
290
291           semaphore_wait while( *p_scheduler_postsync != 2 );
292
293           // dispatch new BFS2 WGs
           //   (DISPATCHDIM_X is still REG_num_bfs_wgs; the stored 1 re-arms the wait at the loop head)
294           dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed
295               args( p_scheduler,
296                     build_args.p_sah_globals )
297               postsync store_dword( p_scheduler_postsync, 1 );
298
299            //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore
300
301            // wait until all upcoming DFS WGs have finished launching
302            //   so that the scheduler can refill the launch array
303                // TODO_OPT:  Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
304            semaphore_wait while( *p_num_dfs_wgs != 0 );
305
306
307            goto l_build_loop;
308        }
309    }
310
    // Reached only from the scheduler loop above, so DISPATCHDIM_Y/Z are still 1.
311l_build_qnodes:
312
313    control( wait_idle );
314
315    // P/C qnode build
316
317    dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1)
318        args( build_args.p_sah_globals,
319              build_args.p_qnode_child_buffer,
320              build_args.sah_build_flags );
321
322    {
        // NOTE(review): p_pc_counters is defined but never used; REG_addr is loaded from
        // build_args.p_root_buffer_counters directly.
323        define p_pc_counters ( build_args.p_root_buffer_counters );
324
325        define REG_addr      REG0;
326        define REG_produced  REG1;
327        define REG_consumed  REG2;
328        define REG_have_work REG3;
329        define REG_wg_count  REG4;
330        define C_8 REG5;
331        define C_16 REG6;
332        define C_1 REG7;
333        C_1 = 1;
334        C_8 =  8;
335        C_16 = 16;
336        REG_addr =  build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address
337
338        REG_consumed = 0;
339
        // Each iteration reads the "produced" counter at +0, exits when it equals what has
        // already been consumed, otherwise publishes the [consumed, produced) range at
        // +8 / +16 and dispatches Amplify.  REG_addr is back at the base address at the end
        // of every iteration that dispatches.
340        l_qnode_loop:
341
342            control( wait_idle ); // wait for previous pass
343
344            // load counters and compute number of wgs to respawn
345            REG_produced  = load_qword( REG_addr ); REG_addr = REG_addr + C_8;
346            REG_wg_count  = REG_produced - REG_consumed;
347            REG_have_work = REG_wg_count > 0;
348
349            goto l_done if not(REG_have_work.lo);
350
351            // save REG_consumed as a starting position in p_qnode_child_buffer
352            store_qword(REG_addr, REG_consumed); REG_addr = REG_addr + C_8;
353
354            // save REG_produced as ending position in p_qnode_child_buffer
355            store_qword(REG_addr, REG_produced); REG_addr = REG_addr - C_16;
356
357            REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration
358
359            // calculate amount of workgroups to schedule
            //   (wg_count + 1) >> 1, i.e. half the number of new entries, rounded up
360            REG_wg_count = REG_wg_count + C_1;
361            REG_wg_count = REG_wg_count >> C_1;
362
363            DISPATCHDIM_X = REG_wg_count.lo;
364
365            control( cs_store_fence ); // commit the stores
366
367            dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify
368                    args( build_args.p_sah_globals,
369                          build_args.p_qnode_child_buffer,
370                          build_args.sah_build_flags);
371
372            goto l_qnode_loop;
373    }
374
375l_done:
376}
377
378
379
380
381
382
383
384
385
// Argument block of the new_sah_build_batchable metakernel.
386struct SAHBuildArgsBatchable
387{
388    qword p_globals_ptrs;      // forwarded to the categorization kernel only
389    qword p_scheduler;         // address of the Scheduler struct; dereferenced by the metakernel and forwarded to kernels
390    qword p_buffers_info;      // forwarded to the categorization kernel only
391    qword p_sah_globals;       // forwarded to most kernels dispatched by the metakernel
392
393    dword num_max_qnode_global_root_buffer_entries;   // forwarded to qnode_init_scheduler_batched
394    dword num_builds;          // total builds in the batch; the metakernel subtracts the trivial and single-WG counts from it
395
396};
397
398
// Metakernel: build a batch of BVHs with the binned-SAH builder.
//
// Phases, in order:
//   1. categorization kernel (2 workgroups), synchronized through scheduler_postsync
//   2. indirect dispatch of the trivial builds, then of the single-WG builds; the counts
//      come from Scheduler.num_trivial_builds / num_single_builds
//   3. if num_builds - trivial - single != 0: outer loop of begin_batched + initial BFS
//      pass1/pass2 + inner scheduler loop, repeated while the hi dword of the
//      batched_build_wg_count qword (batched_build_loop_mask) is non-zero
//   4. qnode build phase (scheduler-driven Amplify / TryToFillGRB loop)
//   5. l_done: final wait_idle
//
// The scheduler_postsync values are the inverse of those used by new_sah_build:
// here the inner loop waits for 0 at its head and the scheduler kernel stores 1.
399metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args )
400{
401    define p_scheduler                  build_args.p_scheduler;
402    define p_scheduler_postsync         (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
403    define p_num_dfs_wgs                (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs));
404
405    // initialize scheduler semaphore
406    REG0.lo = 0;
407    store_dword( p_scheduler_postsync, REG0.lo );
408
409
410    // dispatch categorization pass
411    dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1)
412        args(
413              build_args.p_scheduler,
414              build_args.p_globals_ptrs,
415              build_args.p_buffers_info,
416              build_args.p_sah_globals,
417              build_args.num_builds
418          )
419          postsync store_dword( p_scheduler_postsync, 1 );
420
421    // wait on the categorization pass
422    semaphore_wait while( *p_scheduler_postsync != 1 );
423
424
425    //  dispatch the trivial and single-WG passes
426    {
        // REG0.lo = num_trivial_builds, REG0.hi = num_single_builds (adjacent dwords, one qword load)
427        REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) );
        // Y and Z are set only here; every later dispatch_indirect changes X alone.
428        DISPATCHDIM_X = REG0.lo;
429        DISPATCHDIM_Y = 1;
430        DISPATCHDIM_Z = 1;
431
432        // dispatch trivial builds
433
434        dispatch_indirect opencl_build_kernel_DFS_trivial_batch
435            args( build_args.p_sah_globals );
436
437        control( wait_idle );
438
439        // dispatch single-wg builds
440
441        DISPATCHDIM_X = REG0.hi;
442        dispatch_indirect opencl_build_kernel_DFS_single_wg_batch
443            args( build_args.p_sah_globals, build_args.p_scheduler );
444    }
445
446    // compute the number of builds not covered by the trivial passes
447    // skip the builder loop if all builds are satisfied by trivial passes
448    {
        // REG5 = num_builds - (num_trivial_builds + num_single_builds)
449        REG1 = REG0.lo;
450        REG2 = REG0.hi;
451        REG3 = build_args.num_builds;
452        REG5 = REG2 + REG1;
453        REG5 = REG3 - REG5;
454        REG4 = REG5 == 0 ;
455
456        goto l_done if (REG4.lo);
457    }
458
459    // REG5 (number of non-trivial builds) will be used to launch build_qnodes kernel after the build loop
    // NOTE(review): this looks stale.  REG_num_nontrivial is never referenced, and REG5 is
    // reloaded from the scheduler in the qnode phase before its only later use; only the
    // commented-out "old implementation" at the end consumes the value computed here.
460    define REG_num_nontrivial REG5;
461
462l_build_outer_loop:
463    {
464
465        // configure the scheduler to initiate a new block of builds
466
        // On the first pass scheduler_postsync is 1 (categorization); on later passes it is
        // 1 from the scheduler kernel of the inner loop.  begin_batched resets it to 0.
467        dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1)
468            args( build_args.p_scheduler, build_args.p_sah_globals )
469            postsync store_dword( p_scheduler_postsync, 0 );
470
471        // wait on init kernel
472        semaphore_wait while( *p_scheduler_postsync != 0 );
473
474
475        // read results produced by scheduler init kernel
476        //   lo == BFS wg count.  hi == all ones if we need to loop again
477        //
        // REG4 must survive the inner loop (which uses REG0..REG3); its hi half is tested at
        // l_continue_outer_loop.
478        REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
479        REG4 = load_qword( REG0 );
480
481        // launch BFS1 pass1
482        DISPATCHDIM_X = REG4.lo;
483        dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch
484            args( build_args.p_scheduler,
485                    build_args.p_sah_globals)
486            postsync store_dword( p_scheduler_postsync, 1 );
487
488        // wait on BFS pass1
489        semaphore_wait while( *p_scheduler_postsync != 1 );
490
491        // launch BFS pass2
492        dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch
493            args( build_args.p_scheduler,
494                    build_args.p_sah_globals )
495            postsync store_dword( p_scheduler_postsync, 0 );
496
        // Inner scheduling loop.  Semaphore values: 0 = BFS pass2 finished,
        // 1 = scheduler finished, 2 = indexed BFS pass1 finished.
497        l_build_loop:
498            {
499                semaphore_wait while( *p_scheduler_postsync != 0 );
500
501                {
502                    dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
503                        args( build_args.p_scheduler, build_args.p_sah_globals )
504                        postsync store_dword( p_scheduler_postsync, 1 );
505
506                    // wait on the scheduler
507                    semaphore_wait while( *p_scheduler_postsync != 1 );
508                }
509
510                // load and process the scheduler results
511                define REG_wg_counts    REG0;
512                define REG_num_bfs_wgs  REG0.lo;
513                define REG_num_dfs_wgs  REG0.hi;
514                define REG_loop_break   REG1;
515                define REG_p_scheduler  REG2;
516                {
517                    REG_p_scheduler = p_scheduler;
                    // one qword read covers Scheduler.num_bfs_wgs (lo) and Scheduler.num_dfs_wgs (hi)
518                    REG_wg_counts    = load_qword( REG_p_scheduler );
519
520                    define C_MASK_LO REG3 ;
521                    C_MASK_LO = 0xffffffff;
522
                    // loop_break = (num_bfs_wgs == 0)
523                    REG_loop_break = REG_wg_counts  & C_MASK_LO;
524                    REG_loop_break = REG_loop_break == 0;
525                }
526
527                // dispatch new DFS WGs
                //   issued before the exit test; no postsync -- completion is covered by the
                //   wait_idle at the start of the qnode phase
528                DISPATCHDIM_X = REG_num_dfs_wgs;
529                dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
530                    args( p_scheduler,
531                          build_args.p_sah_globals );
532
533                // jump out if there are no bfs WGs
534                goto l_continue_outer_loop if (REG_loop_break);
535
536                // dispatch new BFS1 WGs
537                DISPATCHDIM_X = REG_num_bfs_wgs;
538                dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch
539                    args( p_scheduler,
540                          build_args.p_sah_globals )
541                    postsync store_dword( p_scheduler_postsync, 2 );
542
543               semaphore_wait while( *p_scheduler_postsync != 2 );
544
545                // dispatch new BFS2 WGs
                //   (DISPATCHDIM_X is still REG_num_bfs_wgs; the stored 0 re-arms the wait at the loop head)
546                dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch
547                    args( p_scheduler,
548                          build_args.p_sah_globals )
549                    postsync store_dword( p_scheduler_postsync, 0 );
550
551                //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore
552
553                // wait until all upcoming DFS WGs have finished launching
554                //   so that the scheduler can refill the launch array
555                // TODO_OPT:  Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
556                semaphore_wait while( *p_num_dfs_wgs != 0 );
557
558                goto l_build_loop;
559            }
560
561
562        l_continue_outer_loop:
563
564
            // REG4.hi is batched_build_loop_mask as read after begin_batched: non-zero => start another block
565            goto l_build_outer_loop if(REG4.hi);
566
567    }
568
569////////
570//
571// Qnode build phase
572//
573////////
574
575    //  Wait for all outstanding DFS dispatches to complete, then build the QNodes
576    control( wait_idle );
577
    // These defines reuse REG1..REG4 from the build phase; none of the old values are needed any more.
578    define REG_wg_counts   REG1;
579    define REG_p_scheduler REG2;
580    define REG_have_work   REG3;
581    define REG_GRB_NUM_MAX_ENTRIES    REG4;
582
583    // init scheduler for qnode phase
584    dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1)
585        args( build_args.p_scheduler,
586              build_args.num_builds,
587              build_args.num_max_qnode_global_root_buffer_entries);
588
589    REG_p_scheduler = p_scheduler;
590
591    control( wait_idle );
592
    // first qword of the Scheduler struct; its lo half is the workgroup count for qnode_begin_batched
593    REG_wg_counts   = load_qword( REG_p_scheduler );
594
595    DISPATCHDIM_X = REG_wg_counts.lo;
596
597    // configure the scheduler to initiate a new block of builds
598    dispatch_indirect opencl_build_kernel_BinnedSAH_qnode_begin_batched
599        args( build_args.p_scheduler,
600              build_args.p_sah_globals);
601
602    // read results produced by init scheduler kernel
603    //   lo == num of builds processed.  hi == num of maximum global root buffer entries
604    //
    // NOTE(review): no wait separates the dispatch above from this load.  That is fine if
    // qnode_begin_batched never writes this qword -- TODO confirm against the kernel.
605    REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
606    REG5 = load_qword( REG0 );
607
    // zero-extend the 32-bit maximum so it can be compared against a 64-bit register below
608    REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi;
609    REG_GRB_NUM_MAX_ENTRIES.hi = 0;
610
611l_qnode_loop:
612    {
613        control( wait_idle ); // wait for previous pass
614
615        dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1) args( build_args.p_scheduler );
616
617        control( wait_idle );
618
        // NOTE(review): the exit test looks at the whole qword, but only the lo half is used
        // as the dispatch size below.
619        REG_wg_counts   = load_qword( REG_p_scheduler );
620        REG_have_work = REG_wg_counts > 0;
621
622        goto l_done if not(REG_have_work.lo);
623
624        DISPATCHDIM_X = REG_wg_counts.lo;
625
626        dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch
627                args( build_args.p_sah_globals,
628                      build_args.p_scheduler );
629
630        control( wait_idle );
631
        // Take the hi dword of the reloaded qword as a zero-extended value and compare it
        // with the root-buffer maximum: TryToFillGRB runs only while it is below the maximum.
632        REG_wg_counts   = load_qword( REG_p_scheduler ); // reload values
633        REG_wg_counts.lo = REG_wg_counts.hi;
634        REG_wg_counts.hi = 0;
635
636        REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES;
637
638        goto l_qnode_loop if not(REG_have_work.lo);
639
640        DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled
641
642        dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched
643                args( build_args.p_sah_globals,
644                      build_args.p_scheduler );
645
646        goto l_qnode_loop;
647    }
648
649////////
650//
651// Old implementation - TODO: maybe add switch between two implementations?
652//
653////////
654    //  Wait for all outstanding DFS dispatches to complete, then build the QNodes
655    //DISPATCHDIM_X = REG5.lo;
656
657    //dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes
658    //    args( build_args.p_sah_globals, build_args.p_scheduler );
659
660
    // Reached both from the early exit (all builds were trivial or single-WG) and from the
    // qnode loop; the wait_idle covers whatever dispatches are still outstanding.
661l_done:
662
663    control( wait_idle );
664
665}
666