1 /*
2  * Copyright 2017 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can
5  * be found in the LICENSE file.
6  *
7  */
8 
9 //
10 //
11 //
12 
13 #include <stddef.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <float.h>
17 #include <stdio.h>
18 
19 #include "common/cl/assert_cl.h"
20 
21 #include "context.h"
22 #include "handle.h"
23 #include "grid.h"
24 #include "path.h"
25 #include "path_builder.h"
26 
27 #include "config_cl.h"
28 #include "export_cl_12.h"
29 #include "runtime_cl_12.h"
30 #include "path_builder_cl_12.h"
31 
32 //
33 // OpenCL 1.2 devices support mapping of buffers into the host address
34 // space.
35 //
36 // Mapped buffers must be aligned on a MIN_DATA_TYPE_ALIGN_SIZE byte
37 // boundary (e.g. 128 bytes).  This complicates coordinating sharing
38 // of data between the host and the device.
39 //
40 // Some OpenCL 2.0 devices support fine-grained shared virtual memory
41 // pointers with byte-addressing and allow simpler coordination
42 // strategies at the cost of maintaining cache coherency.
43 //
44 // The path builder is focused on moving bulk path data from the host
45 // into the device-managed "block" memory pool and arranging it into a
46 // SIMT/SIMD-friendly data structure that can be efficiently read by
47 // the rasterizer.
48 //
49 // Note that one simplifying assumption is that the maximum length of
50 // a *single* path can't be larger than what fits in the single extent
51 // (which is split into M subbuffers).  This would be a very long path
52 // and a legitimate size limitation.
53 //
54 // For some systems, it may be appropriate to never pull path data
55 // into the device-managed block pool and instead present the path
56 // data to the device in a temporarily allocated memory "zone" of
57 // paths that can be discarded all at once.
58 //
59 // For other systems, it may be appropriate to simply copy the path
60 // data from host to device.
61 //
62 // But the majority of OpenCL (and VK, MTL, DX12) devices we'll be
63 // targeting support basic map/unmap functionality similar to OpenCL
64 // 1.2.  Furthermore, not all OpenCL 2.0 devices support fine-grained
65 // sharing of memory, so some still require a map/unmap step... but
66 // note that they all support byte-aligned mapping and subbuffers.
67 //
68 // The general strategy that this particular CL_12 implementation uses
69 // is to allocate a large mappable bulk-data path buffer and an
70 // auxiliary mappable command buffer.
71 //
72 // The buffers are split into a reasonable number of properly aligned
73 // subbuffers to enable simultaneous host and device access.
74 //
75 
76 //
77 // Blocks:
78 //   1 extent
79 //   M mapped subbuffers (configurable) to allow for concurrency
80 //
81 // Commands:
82 //   1 extent
83 //   M mapped subbuffers (configurable) to allow for concurrency
84 //
85 // Spans:
86 //   M hi/lo structures
87 //
88 // { cl_sub, void*, event, base }
89 //
90 // - size of sub buffer
91 // - remaining
92 //
93 // - counts
94 //
95 
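//
// Illustrative layout (made-up counts, not the configured ones): if the
// block extent held 32 blocks split into M = 4 subbuffers of 8 blocks
// each, the ring would look like:
//
//   subbuf 0     subbuf 1     subbuf 2      subbuf 3
//   [ 0 ..  7 ]  [ 8 .. 15 ]  [ 16 .. 23 ]  [ 24 .. 31 ]
//
// The command extent is split the same way, so a { blocks, cmds }
// subbuffer pair can be written by the host while the device consumes
// the others.
//
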
96 //
97 // For any kernel launch, at most one path will be discontiguous and
98 // defined across two sub-buffers.
99 //
100 // Nodes are updated locally until full and then stored so they will
101 // never be incomplete.  Headers are stored locally until the path is
102 // ended so they will never be incomplete.
103 //
104 // A line, quad or cubic acquires 4/6/8 segments which may be spread
105 // across one or more contiguous blocks.
106 //
107 // If a flush() occurs then the remaining columns of multi-segment
108 // paths are initialized with zero-length line, quad, cubic elements.
109 //
110 // Every block's command word has a type and a count acquired from a
111 // rolling counter.
112 //
113 // The kernel is passed two spans of blocks { base, count } to
114 // process.  The grid must process (lo.count + hi.count) blocks.
115 //
116 
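//
// Sketch (illustration only, not part of the build): one way the work
// item index in [0, lo.count + hi.count) could be folded back onto the
// two spans described above.  The struct and function names here are
// hypothetical -- the real indexing lives in the PATHS_COPY kernel.
//
#if 0
struct skc_span_example { skc_uint base; skc_uint count; };

static
skc_uint
skc_span_example_block(struct skc_span_example const lo,
                       struct skc_span_example const hi,
                       skc_uint                const ii)
{
  // the first lo.count work items read from the lo span and the
  // remaining hi.count work items read from the hi span
  return (ii < lo.count) ? (lo.base + ii) : (hi.base + (ii - lo.count));
}
#endif
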
117 struct skc_subbuffer_blocks
118 {
119   cl_mem   device;
120   void *   host;
121 };
122 
123 struct skc_subbuffer_cmds
124 {
125   cl_mem   device;
126   void *   host;
127   cl_event map;
128 };
129 
130 //
131 // ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer )
132 //
133 
134 typedef skc_uint skc_ringdex_t;
135 
136 union skc_ringdex_expand
137 {
138   div_t      qr;
139 
140   struct {
141 #ifndef SKC_DIV_REM_BEFORE_QUOT // define when offsetof(div_t,quot) != 0
142     skc_uint subbuf;
143     skc_uint block;
144 #else
145     skc_uint block;
146     skc_uint subbuf;
147 #endif
148   };
149 };
150 
151 //
152 // this record is executed by the grid
153 //
154 
155 struct skc_release_record
156 {
157   struct skc_path_builder_impl * impl; // back pointer to impl
158 
159   skc_grid_t                     grid; // pointer to scheduled grid
160 
161   skc_uint                       from; // inclusive starting index   : [from,to)
162   skc_uint                       to;   // non-inclusive ending index : [from,to)
163 };
164 
165 //
166 //
167 //
168 
169 struct skc_path_builder_impl
170 {
171   struct skc_path_builder       * path_builder;
172 
173   struct skc_runtime            * runtime;
174 
175   cl_command_queue                cq;
176 
177   struct {
178     cl_kernel                     alloc;
179     cl_kernel                     copy;
180   } kernels;
181 
182   //
183   // FIXME -- make this pointer to constant config
184   //
185   // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
186   struct {
187     skc_uint                      subbufs;  // how many subbufs in the buffer?
188 
189     struct {
190       skc_uint                    buffer;   // how many blocks in the buffer?
191       skc_uint                    subbuf;   // how many blocks in a   subbuf?
192     } blocks_per;
193   } ring;
194   //
195   // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^
196   //
197 
198   struct {
199     cl_mem                        buffer;   // backing buffer for blocks
200     struct skc_subbuffer_blocks * subbufs;  // array of structures
201   } blocks;
202 
203   struct {
204     cl_mem                        buffer;   // backing buffer for commands
205     struct skc_subbuffer_cmds   * subbufs;  // array of structures
206   } cmds;
207 
208   struct {
209     struct skc_release_record   * records;  // max release records is equal to max subbufs
210     skc_path_t                  * paths;    // max paths is less than or equal to max commands
211   } release;
212 
213   cl_mem                          reads;    // each kernel only requires one word to store the block pool "base"
214 
215   struct {
216     skc_uint                      rolling;  // rolling counter used by cmds to map to block pool alloc
217     skc_ringdex_t                 from;
218     skc_ringdex_t                 to;
219   } prev;
220 
221   struct {
222     skc_ringdex_t                 from;
223     skc_ringdex_t                 to;
224   } curr;
225 
226   struct {
227     struct skc_path_head        * head;     // pointer to local path header -- not written until path end
228     struct skc_path_node        * node;     // pointer to local node -- may alias head until head is full
229 
230     struct {
231       skc_uint                    rolling;  // rolling counter of wip node -- valid after one node is allocated
232       union skc_tagged_block_id * next;     // next slot in node -- may initially point to head.ids
233       skc_uint                    rem;      // how many id slots left in node block
234     } ids;
235 
236     struct {
237       skc_uint                    rem;      // how many subblocks left in block?
238       skc_uint                    rolling;  // rolling counter of block of subblocks
239       float                     * next;     // next subblock in current subblock block
240       skc_uint                    idx;      // index of next subblock
241     } subblocks;
242 
243     struct {
244       skc_uint                    one;      // .block = 1
245       skc_uint                    next;     // rolling counter used by cmds to map to block pool alloc
246     } rolling;
247 
248     skc_ringdex_t                 to;       // ringdex of _next_available_ command/block in ring -- FIXME -- should be current
249   } wip;
250 };
251 
252 //
253 // FIXME -- move to a pow2 subbuffer size and dispense with division
254 // and modulo operations
255 //
256 
257 static
258 union skc_ringdex_expand
259 skc_ringdex_expand(struct skc_path_builder_impl * const impl,
260                    skc_ringdex_t                  const ringdex)
261 {
262   return (union skc_ringdex_expand){
263     .qr = div(ringdex,impl->ring.blocks_per.subbuf)
264   };
265 }
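
//
// Worked example (illustration only, made-up count): with 8 blocks per
// subbuffer, ringdex 19 expands to subbuf 2, block 3 -- exactly the
// quot/rem pair reported by div().
//
#if 0
static
void
skc_ringdex_expand_example(void)
{
  div_t const qr = div(19,8);

  // qr.quot == 2  -> subbuf
  // qr.rem  == 3  -> block
  printf("subbuf = %d, block = %d\n",qr.quot,qr.rem);
}
#endif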
266 
267 static
268 void
269 skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl)
270 {
271   //
272   // FIXME - which is faster?
273   //
274 #if 1
275   impl->wip.to  = (impl->wip.to + 1) % impl->ring.blocks_per.buffer;
276 #else
277   impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer - 1) ? -1 : impl->wip.to;
278 #endif
279 
280   // this path is too long -- for now assert() and die
281   assert(impl->wip.to != impl->curr.from);
282 }
283 
284 static
285 skc_ringdex_t
286 skc_ringdex_span(struct skc_path_builder_impl * const impl,
287                  skc_ringdex_t                  const from,
288                  skc_ringdex_t                  const to)
289 {
290   return (to - from) % impl->ring.blocks_per.buffer;
291 }
292 
293 static
294 void
295 skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl)
296 {
297   union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
298 
299   // nothing to do if this is the first block in the subbuf
300   if (to.block == 0)
301     return;
302 
303   skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs;
304 
305   // otherwise increment and mod
306   impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf;
307 }
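
//
// Worked example (made-up counts): with 4 subbuffers of 8 blocks each,
// wip.to == 19 expands to { subbuf 2, block 3 }, so the round-up above
// moves wip.to to 24 -- the first block of subbuf 3.  A wip.to already
// sitting on a subbuffer boundary (block == 0) is left unchanged.
//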
308 
309 static
310 skc_bool
311 skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl)
312 {
313   return impl->curr.from == impl->curr.to;
314 }
315 
316 static
317 skc_bool
318 skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl)
319 {
320   return impl->prev.from == impl->prev.to;
321 }
322 
323 static
324 skc_uint
325 skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl,
326                           skc_uint                       const to_block)
327 {
328   // no blocks acquired OR this is last block in subbuf
329   return !((impl->wip.to == impl->curr.to) || (to_block == 0));
330 }
331 
332 //
333 //
334 //
335 
336 static
337 struct skc_release_record *
338 skc_release_curr(struct skc_path_builder_impl * const impl)
339 {
340   union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);
341 
342   return impl->release.records + curr_from.subbuf;
343 }
344 
345 //
346 // FIXME -- get rid of all distant config references -- grab them all at creation time
347 //
348 
349 static
350 void
351 skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl)
352 {
353   // init header counters // { handle, blocks, nodes, prims }
354   impl->wip.head->header = (union skc_path_header){
355     .handle = 0,
356     .blocks = 0,
357     .nodes  = 0,
358     .prims  = 0
359   };
360 
361   // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS
362   impl->wip.head->bounds  = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN };
363 
364   // point wip ids at local head node
365   impl->wip.ids.next      = impl->wip.head->tag_ids; // point to local head node
366   impl->wip.ids.rem       = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere
367 
368   // start with no subblocks
369   impl->wip.subblocks.rem = 0;
370 }
371 
372 //
373 //
374 //
375 
376 static
377 void
378 skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl)
379 {
380 #if 1
381   //
382   // FIXME -- a Duff's device might be optimal here but would have to
383   // be customized per device since nodes could be 16-128+ words
384   //
385   while (impl->wip.ids.rem > 0)
386     {
387       impl->wip.ids.rem      -= 1;
388       impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID;
389       impl->wip.ids.next     += 1;
390     }
391 #else
392   memset(&impl->wip.ids.next->u32,
393          SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF
394          sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem);
395 
396   impl->wip.ids.next += impl->wip.ids.rem;
397   impl->wip.ids.rem   = 0;
398 #endif
399 }
400 
401 //
402 //
403 //
404 
405 static
406 void
407 skc_zero_float(skc_float * p, skc_uint rem)
408 {
409   memset(p,0,sizeof(*p)*rem);
410 }
411 
412 static
413 void
414 skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder)
415 {
416   //
417   // FIXME -- it might be more performant to zero the remaining
418   // columns in a subblock -- a subblock at a time -- instead of the
419   // same column across all the subblocks
420   //
421 #if 0
422   while (path_builder->line.rem > 0)
423     {
424       --path_builder->line.rem;
425 
426       *path_builder->line.coords[0]++ = 0.0f;
427       *path_builder->line.coords[1]++ = 0.0f;
428       *path_builder->line.coords[2]++ = 0.0f;
429       *path_builder->line.coords[3]++ = 0.0f;
430     }
431 
432   while (path_builder->quad.rem > 0)
433     {
434       --path_builder->quad.rem;
435 
436       *path_builder->quad.coords[0]++ = 0.0f;
437       *path_builder->quad.coords[1]++ = 0.0f;
438       *path_builder->quad.coords[2]++ = 0.0f;
439       *path_builder->quad.coords[3]++ = 0.0f;
440       *path_builder->quad.coords[4]++ = 0.0f;
441       *path_builder->quad.coords[5]++ = 0.0f;
442     }
443 
444   while (path_builder->cubic.rem > 0)
445     {
446       --path_builder->cubic.rem;
447 
448       *path_builder->cubic.coords[0]++ = 0.0f;
449       *path_builder->cubic.coords[1]++ = 0.0f;
450       *path_builder->cubic.coords[2]++ = 0.0f;
451       *path_builder->cubic.coords[3]++ = 0.0f;
452       *path_builder->cubic.coords[4]++ = 0.0f;
453       *path_builder->cubic.coords[5]++ = 0.0f;
454       *path_builder->cubic.coords[6]++ = 0.0f;
455       *path_builder->cubic.coords[7]++ = 0.0f;
456     }
457 #else
458   if (path_builder->line.rem > 0)
459     {
460       skc_zero_float(path_builder->line.coords[0],path_builder->line.rem);
461       skc_zero_float(path_builder->line.coords[1],path_builder->line.rem);
462       skc_zero_float(path_builder->line.coords[2],path_builder->line.rem);
463       skc_zero_float(path_builder->line.coords[3],path_builder->line.rem);
464 
465       path_builder->line.rem = 0;
466     }
467 
468   if (path_builder->quad.rem > 0)
469     {
470       skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem);
471       skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem);
472       skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem);
473       skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem);
474       skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem);
475       skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem);
476 
477       path_builder->quad.rem = 0;
478     }
479 
480   if (path_builder->cubic.rem > 0)
481     {
482       skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem);
483       skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem);
484       skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem);
485       skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem);
486       skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem);
487       skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem);
488       skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem);
489       skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem);
490 
491       path_builder->cubic.rem = 0;
492     }
493 #endif
494 }
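
//
// Note: the zero-fill above is the flush() behavior described near the
// top of this file -- the remaining columns of partially filled
// line/quad/cubic subblocks are padded with zero-length elements so the
// subblocks reach the device fully initialized.
//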
495 
496 //
497 //
498 //
499 
500 static
501 void
502 skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl,
503                             skc_uint                             from,
504                             skc_uint                             to)
505 {
506   // to might be out of range
507   to = to % impl->ring.subbufs;
508 
509 #if 0
510   fprintf(stderr,"unmap: [%2u,%2u)\n",from,to);
511 #endif
512 
513   while (from != to) // 'to' might be out of range
514     {
515       // bring 'from' back in range
516       from = from % impl->ring.subbufs;
517 
518       struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
519       struct skc_subbuffer_cmds   * const cmds   = impl->cmds  .subbufs + from;
520 
521       cl(EnqueueUnmapMemObject(impl->cq,
522                                blocks->device,
523                                blocks->host,
524                                0,NULL,NULL));
525 
526       cl(EnqueueUnmapMemObject(impl->cq,
527                                cmds->device,
528                                cmds->host,
529                                0,NULL,NULL));
530 
531       // advance 'from' and bring it back in range
532       from = (from + 1) % impl->ring.subbufs;
533     }
534 }
535 
536 //
537 // FIXME -- reuse this in create()
538 //
539 
540 static
541 void
542 skc_path_builder_impl_map(struct skc_path_builder_impl * const impl,
543                           skc_uint                             from,
544                           skc_uint                             to)
545 {
546   // to might be out of range
547   to = to % impl->ring.subbufs;
548 
549 #if 0
550   fprintf(stderr,"  map: [%2u,%2u)\n",from,to);
551 #endif
552 
553   while (from != to)
554     {
555       cl_int cl_err;
556 
557       struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
558       struct skc_subbuffer_cmds   * const cmds   = impl->cmds  .subbufs + from;
559 
560       blocks->host = clEnqueueMapBuffer(impl->cq,
561                                         blocks->device,
562                                         CL_FALSE,
563                                         CL_MAP_WRITE_INVALIDATE_REGION,
564                                         0,impl->runtime->config->paths_copy.block.subbuf,
565                                         0,NULL,NULL,
566                                         &cl_err); cl_ok(cl_err);
567 
568       cl(ReleaseEvent(cmds->map));
569 
570       cmds->host   = clEnqueueMapBuffer(impl->cq,
571                                         cmds->device,
572                                         CL_FALSE,
573                                         CL_MAP_WRITE_INVALIDATE_REGION,
574                                         0,impl->runtime->config->paths_copy.command.subbuf,
575                                         0,NULL,&cmds->map,
576                                         &cl_err); cl_ok(cl_err);
577 
578       // advance 'from' and bring it back in range
579       from = (from + 1) % impl->ring.subbufs;
580     }
581   //
582   // FIXME -- when we switch to out of order queues we'll need a barrier here
583   //
584 }
585 
586 //
587 //
588 //
589 
590 static
591 void
592 skc_path_builder_release_dispose(struct skc_release_record    * const release,
593                                  struct skc_path_builder_impl * const impl)
594 {
595   struct skc_runtime * runtime = impl->runtime;
596 
597   if (release->from <= release->to) // no wrap
598     {
599       skc_path_t const * paths = impl->release.paths + release->from;
600       skc_uint           count = release->to         - release->from;
601 
602       skc_grid_deps_unmap(runtime->deps,paths,count);
603       skc_runtime_path_device_release(runtime,paths,count);
604     }
605   else // from > to implies wrap
606     {
607       skc_path_t const * paths_lo = impl->release.paths + release->from;
608       skc_uint           count_lo = impl->ring.blocks_per.buffer - release->from;
609 
610       skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo);
611       skc_runtime_path_device_release(runtime,paths_lo,count_lo);
612 
613       skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to);
614       skc_runtime_path_device_release(runtime,impl->release.paths,release->to);
615     }
616 
617   release->to = release->from;
618 }
619 
620 static
621 void
622 skc_path_builder_grid_pfn_dispose(skc_grid_t const grid)
623 {
624   struct skc_release_record    * const release = skc_grid_get_data(grid);
625   struct skc_path_builder_impl * const impl    = release->impl;
626 
627   skc_path_builder_release_dispose(release,impl);
628 }
629 
630 static
631 void
632 // skc_path_builder_complete(struct skc_release_record * const release)
633 skc_path_builder_complete(skc_grid_t grid)
634 {
635   //
636   // notify deps that this grid is complete enough for other grids to
637   // proceed
638   //
639   // the path builder still has some cleanup to do before all its
640   // resources can be reused
641   //
642   skc_grid_complete(grid);
643 }
644 
645 static
646 void
647 skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid)
648 {
649   SKC_CL_CB(status);
650 
651   struct skc_release_record * const release = skc_grid_get_data(grid);
652 
653   SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid);
654 }
655 
656 //
657 //
658 //
659 
660 static
661 void
662 skc_path_builder_grid_pfn_waiting(skc_grid_t const grid)
663 {
664   struct skc_release_record    * const release = skc_grid_get_data(grid);
665   struct skc_path_builder_impl * const impl    = release->impl;
666 
667   // 1. flush incomplete subblocks of path elements
668   // 2. unmap subbuffer on cq.unmap
669   // 3. flush cq.unmap
670   // 4. launch kernel on cq.kernel but wait for unmap completion
671   // 5. flush cq.kernel
672   // 6. remap relevant subbuffers on cq.map but wait for kernel completion
673   // 7. flush cq.map
674 
675   //
676   // FIXME -- can be smarter about flushing if the wip paths are not
677   // in the same subbuf as curr.to
678   //
679   // THIS IS IMPORTANT TO FIX
680   //
681 
682   // flush incomplete subblocks
683   skc_path_builder_finalize_subblocks(impl->path_builder);
684 
685   //
686   // get range of subbufs that need to be unmapped
687   //
688   // note that impl->prev subbufs have already been unmapped
689   //
690   union skc_ringdex_expand       curr_from  = skc_ringdex_expand(impl,impl->curr.from);
691   union skc_ringdex_expand       curr_to    = skc_ringdex_expand(impl,impl->curr.to);
692   skc_uint                 const is_partial = curr_to.block > 0;
693   skc_uint                 const unmap_to   = curr_to.subbuf + is_partial;
694 
695   //
696   // unmap all subbufs in range [from,to)
697   //
698   skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to);
699 
700   //
701   // launch kernels
702   //
703   skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to);
704   skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to);
705   skc_uint const pb_cmds      = pb_prev_span + pb_curr_span;
706 
707   //
708   // 1) allocate blocks from pool
709   //
710 
711   //
712   // FIXME -- pack integers into struct/vector
713   //
714   cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw)));
715   cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads)));
716   cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf)));
717   cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds)));
718 
719   skc_device_enqueue_kernel(impl->runtime->device,
720                             SKC_DEVICE_KERNEL_ID_PATHS_ALLOC,
721                             impl->cq,
722                             impl->kernels.alloc,
723                             1,
724                             0,NULL,NULL);
725 
726   //
727   // 2) copy blocks from unmapped device-accessible memory
728   //
729 
730   //
731   // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7
732   //
733   cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw)));
734 
735   cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw)));
736   cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw)));
737   cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask)));
738 
739   cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads)));
740   cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf)));
741 
742   cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer)));
743   cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer)));
744 
745   cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer)));
746   cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling)));
747 
748   cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from)));
749   cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span)));
750   cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from)));
751 
752   cl_event complete;
753 
754   skc_device_enqueue_kernel(impl->runtime->device,
755                             SKC_DEVICE_KERNEL_ID_PATHS_COPY,
756                             impl->cq,
757                             impl->kernels.copy,
758                             pb_cmds,
759                             0,NULL,&complete);
760 
761   // set a callback on completion
762   cl(SetEventCallback(complete,CL_COMPLETE,
763                       skc_path_builder_paths_copy_cb,
764                       grid));
765 
766   // immediately release
767   cl(ReleaseEvent(complete));
768 
769   //
770   // remap as many subbuffers as possible after the kernel completes
771   //
772   // note that remaps are async and enqueued on the same command queue
773   // as the kernel launch
774   //
775   // we can't remap subbuffers that are in the possibly empty range
776   //
777   // cases:
778   //
779   //   - curr.to == wip.to which means no blocks have been acquired
780   //   - curr.to points to first block in (next) subbuf
781   //   - otherwise, wip acquired blocks in the curr.to subbuf
782   //
783   // check for these first 2 cases!
784   //
785   union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from);
786   skc_uint                 const no_wip    = impl->curr.to == impl->wip.to;
787   skc_uint                       map_to    = curr_to.subbuf + (is_partial && no_wip);
788 
789   // remap all subbufs in range [from,to)
790   skc_path_builder_impl_map(impl,prev_from.subbuf,map_to);
791 
792   // flush command queue
793   cl(Flush(impl->cq));
794 
795   // save rolling
796   impl->prev.rolling = impl->wip.rolling.next;
797 
798   // update prev and curr
799   if (no_wip)
800     {
801       //
802       // if there was no wip then round up to the next subbuf
803       //
804       skc_ringdex_wip_to_subbuf_inc(impl);
805 
806       //
807       // update prev/curr with with incremented wip
808       //
809       impl->prev.from = impl->prev.to = impl->wip.to;
810       impl->curr.from = impl->curr.to = impl->wip.to;
811     }
812   else
813     {
814       //
815       // update prev with wip partials
816       //
817       impl->prev.from    = impl->curr.to;
818       impl->prev.to      = impl->wip .to;
819 
820       //
821       // start curr on a new subbuf boundary
822       //
823       skc_ringdex_wip_to_subbuf_inc(impl);
824 
825       impl->curr.from    = impl->wip.to;
826       impl->curr.to      = impl->wip.to;
827     }
828 }
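
//
// Worked example for the unmap/map ranges above (illustrative values):
// if curr.from expands to subbuf 1 and curr.to expands to
// { subbuf 3, block 5 }, then is_partial == 1 and subbufs [1,4) are
// unmapped.  The remap starts at prev.from's subbuffer; if no wip
// blocks were acquired beyond curr.to, the partially filled subbuf 3 is
// abandoned and may be remapped (map_to == 4), otherwise the host is
// still writing into it and the remap stops at map_to == 3.
//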
829 
830 //
831 //
832 //
833 
834 static
835 void
836 skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl,
837                                         skc_uint                       const subbuf)
838 {
839   //
840   // FIXME -- move to a power-of-two subbuf size and kickstart path
841   // copies as early as possible
842   //
843   // FIXME -- the subbufs "self-clock" (flow control) the kernel
844   // launches and accounting.  Combine all the subbuffers and release
845   // records into a single indexable struct instead of 3.
846   //
847   struct skc_subbuffer_cmds * const sc        = impl->cmds.subbufs    + subbuf;
848   struct skc_release_record * const release   = impl->release.records + subbuf;
849   struct skc_scheduler      * const scheduler = impl->runtime->scheduler;
850 
851   // can't proceed until the paths have been released
852   SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to);
853 
854   // throw in a scheduler yield ... FIXME -- get rid of
855   skc_scheduler_yield(scheduler);
856 
857   // can't proceed until the subbuffer is mapped
858   cl(WaitForEvents(1,&sc->map));
859 }
860 
861 //
862 //
863 //
864 
865 static
866 union skc_ringdex_expand
867 skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl)
868 {
869   // break ringdex into components
870   union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
871 
872   // does wip ringdex point to a new subbuffer?
873   if (to.block == 0)
874     {
875       // potentially spin/block waiting for subbuffer
876       skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf);
877     }
878 
879   // post increment wip.to
880   skc_ringdex_wip_to_block_inc(impl);
881 
882   return to;
883 }
884 
885 //
886 //
887 //
888 
889 static
890 skc_uint
891 skc_rolling_block(skc_uint const rolling, skc_uint const tag)
892 {
893   return rolling | tag;
894 }
895 
896 static
897 skc_uint
898 skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag)
899 {
900   return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag;
901 }
902 
903 static
904 void
905 skc_rolling_inc(struct skc_path_builder_impl * const impl)
906 {
907   impl->wip.rolling.next += impl->wip.rolling.one;
908 }
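
//
// Note on the encoding above: skc_rolling_subblock() packs the tag into
// the low SKC_TAGGED_BLOCK_ID_BITS_TAG bits and the subblock index just
// above it, leaving the rolling counter in the high bits.  The
// increment, wip.rolling.one, is initialized in create() to
// SKC_BLOCK_ID_TAG_COUNT * block.subblocks, which appears chosen so
// that advancing the counter never disturbs the tag and subblock
// fields.
//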
909 
910 //
911 //
912 //
913 
914 static
915 void *
916 skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl,
917                                   skc_uint                       const rolling,
918                                   skc_cmd_paths_copy_tag         const tag)
919 {
920   // bump blocks count
921   impl->wip.head->header.blocks += 1;
922 
923   // acquire a block
924   union skc_ringdex_expand    const to          = skc_path_builder_impl_acquire_block(impl);
925 
926   // make a pointer
927   union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host;
928 
929   // store command for block
930   cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag);
931 
932 #if 0
933   // store command for block
934   cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag);
935 
936   // increment rolling
937   skc_rolling_inc(impl);
938 #endif
939 
940   // return pointer to block
941   float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host;
942 
943   // FIXME -- make it easier to get config constant
944   return blocks_subbuf + (to.block * impl->runtime->config->block.words);
945 }
946 
947 //
948 //
949 //
950 
951 static
952 void
953 skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl)
954 {
955   // store command to subbuf and get pointer to blocks subbuf
956   void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling,
957                                                          SKC_CMD_PATHS_COPY_TAG_NODE);
958 
959   // copy node to blocks subbuf -- write-only
960   memcpy(block,impl->wip.node,impl->runtime->config->block.bytes);
961 }
962 
963 static
964 void
965 skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl)
966 {
967   // store command to subbuf and get pointer to blocks subbuf
968   void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
969                                                          SKC_CMD_PATHS_COPY_TAG_HEAD);
970 
971   // copy head to blocks subbuf -- write-only
972   memcpy(block,impl->wip.head,impl->runtime->config->block.bytes);
973 
974   // increment rolling
975   skc_rolling_inc(impl);
976 
977   // the 'to' index is non-inclusive so assign wip.to after flush_head
978   impl->curr.to = impl->wip.to;
979 }
980 
981 //
982 //
983 //
984 
985 static
986 void
987 skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl)
988 {
989   // update final block id in node
990   impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT);
991 
992   // if wip.ids is not the header then flush now full wip node
993   if (impl->wip.head->header.nodes > 0)
994     skc_path_builder_impl_flush_node(impl);
995 
996   // bump node count
997   impl->wip.head->header.nodes += 1;
998 
999   // save current rolling
1000   impl->wip.ids.rolling = impl->wip.rolling.next;
1001 
1002   // increment rolling
1003   skc_rolling_inc(impl);
1004 
1005   // update wip.ids.*
1006   impl->wip.ids.next = impl->wip.node->tag_ids;
1007   impl->wip.ids.rem  = impl->runtime->config->block.words;
1008 }
1009 
1010 static
1011 void
1012 skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl)
1013 {
1014   impl->wip.subblocks.rem     = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure
1015   impl->wip.subblocks.rolling = impl->wip.rolling.next;
1016   impl->wip.subblocks.next    = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
1017                                                                   SKC_CMD_PATHS_COPY_TAG_SEGS);
1018   impl->wip.subblocks.idx     = 0;
1019 
1020   // increment rolling
1021   skc_rolling_inc(impl);
1022 }
1023 
1024 //
1025 //
1026 //
1027 
1028 static
1029 void
1030 skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl,
1031                                         skc_block_id_tag                     tag,
1032                                         skc_uint                             vertices,
1033                                         float * *                            subblocks)
1034 {
1035   //
1036   // FIRST TAG RECORDS THE ELEMENT TYPE
1037   //
1038   while (true)
1039     {
1040       // if only one block id left in node then acquire new node block
1041       // and append its block id as with a next tag
1042       if (impl->wip.ids.rem == 1)
1043         skc_path_builder_impl_new_node_block(impl);
1044 
1045       // if zero subblocks left then acquire a new subblock block and
1046       // append its block id
1047       if (impl->wip.subblocks.rem == 0)
1048         skc_path_builder_impl_new_segs_block(impl);
1049 
1050       // save first command -- tag and subblocks may have been updated
1051       impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag);
1052 
1053       // increment node block subblock pointer
1054       impl->wip.ids.next += 1;
1055       impl->wip.ids.rem  -= 1;
1056 
1057       // how many vertices can we store
1058       skc_uint rem = min(vertices,impl->wip.subblocks.rem);
1059 
1060       // decrement vertices
1061       vertices                -= rem;
1062       impl->wip.subblocks.rem -= rem;
1063       impl->wip.subblocks.idx += rem;
1064 
1065       // assign subblocks
1066       do {
1067         *subblocks++              = impl->wip.subblocks.next;
1068         impl->wip.subblocks.next += impl->runtime->config->subblock.words;
1069         // FIXME -- move constants closer to structure
1070       } while (--rem > 0);
1071 
1072       // anything left to do?
1073       if (vertices == 0)
1074         break;
1075 
1076       // any tag after this will be a caboose command
1077       tag = SKC_BLOCK_ID_TAG_PATH_NEXT;
1078     }
1079 }
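
//
// Worked example of the loop above (made-up counts): suppose a cubic
// needs 8 coordinate subblocks but only 3 remain in the current segs
// block.  The first pass stores an id tagged SKC_BLOCK_ID_TAG_PATH_CUBIC
// covering those 3 subblocks; on the next pass a fresh segs block is
// acquired and a second id tagged SKC_BLOCK_ID_TAG_PATH_NEXT covers the
// remaining 5.  A single primitive can therefore contribute more than
// one id to the node.
//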
1080 
1081 //
1082 //
1083 //
1084 
1085 static
1086 void
1087 skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path)
1088 {
1089   // finalize incomplete active subblocks -- we don't care about any
1090   // remaining unused subblocks in block
1091   skc_path_builder_finalize_subblocks(impl->path_builder);
1092 
1093   // mark remaining wips.ids in the head or node as invalid
1094   skc_path_builder_impl_finalize_node(impl);
1095 
1096   // flush node if rem > 0 and node is not actually head
1097   if (impl->wip.head->header.nodes >= 1)
1098     skc_path_builder_impl_flush_node(impl);
1099 
1100   // acquire path host id
1101   *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN
1102 
1103   // save path host handle
1104   impl->wip.head->header.handle = *path;
1105 
1106   // flush head -- acquires a block and bumps head->header.blocks
1107   skc_path_builder_impl_flush_head(impl);
1108 
1109   // get current release
1110   struct skc_release_record * const release = skc_release_curr(impl);
1111 
1112   // acquire grid if null
1113   if (release->grid == NULL)
1114     {
1115       release->grid =
1116         SKC_GRID_DEPS_ATTACH(impl->runtime->deps,
1117                              &release->grid, // NULL on start/force
1118                              release,        // data payload
1119                              skc_path_builder_grid_pfn_waiting,
1120                              NULL,           // no execute pfn
1121                              skc_path_builder_grid_pfn_dispose);
1122     }
1123 
1124   // update grid map
1125   skc_grid_map(release->grid,*path);
1126 
1127   // update path release
1128   impl->release.paths[release->to] = *path;
1129 
1130   // increment release.to
1131   release->to = (release->to + 1) % impl->ring.blocks_per.buffer;
1132 
1133   // add guard bit
1134   *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH;
1135 
1136 #if 1
1137   //
1138   // eager kernel launch?
1139   //
1140   {
1141     union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from);
1142     union skc_ringdex_expand const curr_to   = skc_ringdex_expand(impl,impl->curr.to);
1143 
1144     if (curr_from.subbuf != curr_to.subbuf)
1145       {
1146         skc_grid_start(release->grid);
1147         // skc_scheduler_yield(impl->runtime->scheduler);
1148       }
1149   }
1150 #endif
1151 }
1152 
1153 //
1154 // FIXME -- clean up accessing of CONFIG constants in these 3 routines
1155 //
1156 
1157 static
1158 void
1159 skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl)
1160 {
1161   // acquire subblock pointers
1162   skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4,
1163                                           impl->path_builder->line.coords);
1164 
1165   // increment line count
1166   impl->wip.head->header.prims += 1;
1167 
1168   // update rem_count_xxx count
1169   impl->path_builder->line.rem = impl->runtime->config->subblock.words;
1170 }
1171 
1172 static
1173 void
1174 skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl)
1175 {
1176   // acquire subblock pointers
1177   skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6,
1178                                           impl->path_builder->quad.coords);
1179 
1180   // increment quad count
1181   impl->wip.head->header.prims += 1;
1182 
1183   // update rem_count_xxx count
1184   impl->path_builder->quad.rem = impl->runtime->config->subblock.words;
1185 }
1186 
1187 static
1188 void
1189 skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl)
1190 {
1191   // acquire subblock pointers
1192   skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8,
1193                                           impl->path_builder->cubic.coords);
1194 
1195   // increment cubic count
1196   impl->wip.head->header.prims += 1;
1197 
1198   // update rem_count_xxx count
1199   impl->path_builder->cubic.rem = impl->runtime->config->subblock.words;
1200 }
1201 
1202 //
1203 //
1204 //
1205 
1206 static
1207 void
1208 skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl)
1209 {
1210   // decrement reference count
1211   if (--impl->path_builder->refcount != 0)
1212     return;
1213 
1214   //
1215   // otherwise, dispose of everything
1216   //
1217   struct skc_runtime * const runtime = impl->runtime;
1218 
1219   // free path builder
1220   skc_runtime_host_perm_free(impl->runtime,impl->path_builder);
1221 
1222   // release cq
1223   skc_runtime_release_cq_in_order(runtime,impl->cq);
1224 
1225   // release kernels
1226   cl(ReleaseKernel(impl->kernels.alloc));
1227   cl(ReleaseKernel(impl->kernels.copy));
1228 
1229   // free blocks extents
1230   cl(ReleaseMemObject(impl->blocks.buffer));
1231   skc_runtime_host_perm_free(runtime,impl->blocks.subbufs);
1232 
1233   cl(ReleaseMemObject(impl->cmds.buffer));
1234   skc_runtime_host_perm_free(runtime,impl->cmds.subbufs);
1235 
1236   // free records
1237   skc_runtime_host_perm_free(runtime,impl->release.records);
1238   skc_runtime_host_perm_free(runtime,impl->release.paths);
1239 
1240   // release staging head and node
1241   skc_runtime_host_perm_free(runtime,impl->wip.head);
1242   skc_runtime_host_perm_free(runtime,impl->wip.node);
1243 
1244   // release reads scratch array
1245   cl(ReleaseMemObject(impl->reads));
1246 
1247   // for all subbuffers
1248   //   unmap   subbuffer
1249   //   release subbuffer
1250   // printf("%s not releasing subbuffers\n",__func__);
1251 
1252   skc_runtime_host_perm_free(impl->runtime,impl);
1253 }
1254 
1255 //
1256 //
1257 //
1258 
1259 skc_err
1260 skc_path_builder_cl_12_create(struct skc_context        * const context,
1261                               struct skc_path_builder * * const path_builder)
1262 {
1263   //
1264   // retain the context
1265   // skc_context_retain(context);
1266   //
1267   struct skc_runtime * const runtime = context->runtime;
1268 
1269   // allocate path builder
1270   (*path_builder)             = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder));
1271 
1272   // init state
1273   SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY);
1274 
1275   (*path_builder)->context    = context;
1276 
1277   // save opaque impl-specific pointers
1278   (*path_builder)->begin      = skc_path_builder_pfn_begin;
1279   (*path_builder)->end        = skc_path_builder_pfn_end;
1280   (*path_builder)->new_line   = skc_path_builder_pfn_new_line;
1281   (*path_builder)->new_quad   = skc_path_builder_pfn_new_quad;
1282   (*path_builder)->new_cubic  = skc_path_builder_pfn_new_cubic;
1283   (*path_builder)->release    = skc_path_builder_pfn_release;
1284 
1285   // initialize path builder counts
1286   (*path_builder)->line.rem   = 0;
1287   (*path_builder)->quad.rem   = 0;
1288   (*path_builder)->cubic.rem  = 0;
1289 
1290   (*path_builder)->refcount   = 1;
1291 
1292   struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));
1293 
1294   (*path_builder)->impl       = impl;
1295 
1296   //
1297   // init impl
1298   //
1299   impl->path_builder  = *path_builder;
1300   impl->runtime       = runtime;
1301 
1302   impl->cq            = skc_runtime_acquire_cq_in_order(runtime);
1303 
1304   impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC);
1305   impl->kernels.copy  = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY);
1306 
1307   //
1308   // FIXME -- let these config constants remain constant and in place
1309   //
1310   struct skc_config const * const config = runtime->config;
1311 
1312   impl->ring.subbufs           = config->paths_copy.buffer.count;
1313   impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count;
1314   impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count;
1315   //
1316   // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1317   //
1318 
1319   cl_int cl_err;
1320 
1321   // allocate large device-side extent for path data
1322   impl->blocks.buffer   = clCreateBuffer(runtime->cl.context,
1323                                          CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
1324                                          config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere
1325                                          NULL,&cl_err); cl_ok(cl_err);
1326 
1327   // allocate small host-side array of pointers to mapped subbufs
1328   impl->blocks.subbufs  = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
1329                                                       impl->ring.subbufs *
1330                                                       sizeof(*impl->blocks.subbufs));
1331 
1332   // allocate large device-side extent for path copy commands
1333   impl->cmds.buffer     = clCreateBuffer(runtime->cl.context,
1334                                          CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
1335                                          config->paths_copy.command.buffer,
1336                                          NULL,&cl_err); cl_ok(cl_err);
1337 
1338   // allocate small host-side array of pointers to mapped subbufs
1339   impl->cmds.subbufs    = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
1340                                                       impl->ring.subbufs *
1341                                                       sizeof(*impl->cmds.subbufs));
1342 
1343   // allocate small host-side array of intervals of path handles
1344   impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
1345                                                       impl->ring.subbufs *
1346                                                       sizeof(*impl->release.records));
1347 
1348   // allocate large host-side array that is max # of path handles in flight
1349   impl->release.paths   = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
1350                                                       impl->ring.blocks_per.buffer *
1351                                                       sizeof(*impl->release.paths));
1352 
1353   // small scratch used by kernels
1354   impl->reads           = clCreateBuffer(runtime->cl.context,
1355                                          CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
1356                                          sizeof(skc_uint) * impl->ring.subbufs,
1357                                          NULL,&cl_err); cl_ok(cl_err);
1358 
1359   // initialize release record with impl backpointer
1360   for (skc_uint ii=0; ii<impl->ring.subbufs; ii++)
1361     {
1362       struct skc_release_record * record = impl->release.records + ii;
1363 
1364       record->impl = impl;
1365       record->grid = NULL;
1366       record->from = record->to = ii * impl->ring.blocks_per.subbuf;
1367     }
1368 
1369   //
1370   // allocate and map subbuffers -- we always check the command
1371   // subbuffer's map/unmap events before touching it or its associated
1372   // block subbuffer.
1373   //
1374   struct skc_subbuffer_blocks * sb = impl->blocks.subbufs;
1375   struct skc_subbuffer_cmds   * sc = impl->cmds  .subbufs;
1376 
1377   cl_buffer_region              rb = { 0, config->paths_copy.block.subbuf   };
1378   cl_buffer_region              rc = { 0, config->paths_copy.command.subbuf };
1379 
1380   // for each subbuffer
1381   for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++)
1382     {
1383       sb->device = clCreateSubBuffer(impl->blocks.buffer,
1384                                      CL_MEM_HOST_WRITE_ONLY,
1385                                      CL_BUFFER_CREATE_TYPE_REGION,
1386                                      &rb,
1387                                      &cl_err); cl_ok(cl_err);
1388 
1389       sb->host   = clEnqueueMapBuffer(impl->cq,
1390                                       sb->device,
1391                                       CL_FALSE,
1392                                       CL_MAP_WRITE_INVALIDATE_REGION,
1393                                       0,rb.size,
1394                                       0,NULL,NULL,
1395                                       &cl_err); cl_ok(cl_err);
1396 
1397       sc->device = clCreateSubBuffer(impl->cmds.buffer,
1398                                      CL_MEM_HOST_WRITE_ONLY,
1399                                      CL_BUFFER_CREATE_TYPE_REGION,
1400                                      &rc,
1401                                      &cl_err); cl_ok(cl_err);
1402 
1403       sc->host   = clEnqueueMapBuffer(impl->cq,
1404                                       sc->device,
1405                                       CL_FALSE,
1406                                       CL_MAP_WRITE_INVALIDATE_REGION,
1407                                       0,rc.size,
1408                                       0,NULL,&sc->map,
1409                                       &cl_err); cl_ok(cl_err);
1410       sb        += 1;
1411       sc        += 1;
1412 
1413       rb.origin += rb.size;
1414       rc.origin += rc.size;
1415     }
1416 
1417   //
1418   // initialize remaining members
1419   //
1420   impl->prev.from        = 0;
1421   impl->prev.to          = 0;
1422   impl->prev.rolling     = 0;
1423 
1424   impl->curr.from        = 0;
1425   impl->curr.to          = 0;
1426 
1427   impl->wip.to           = 0;
1428 
1429   impl->wip.head         = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
1430   impl->wip.node         = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
1431 
1432   impl->wip.rolling.one  = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks;
1433   impl->wip.rolling.next = 0;
1434 
1435   // for now, completely initialize builder before returning
1436   cl(Finish(impl->cq));
1437 
1438   return SKC_ERR_SUCCESS;
1439 }
1440 
1441 //
1442 //
1443 //
1444