1 /*
2 * Copyright 2017 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can
5 * be found in the LICENSE file.
6 *
7 */
8
9 //
10 //
11 //
12
13 #include <stddef.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <float.h>
17 #include <stdio.h>
18
19 #include "common/cl/assert_cl.h"
20
21 #include "context.h"
22 #include "handle.h"
23 #include "grid.h"
24 #include "path.h"
25 #include "path_builder.h"
26
27 #include "config_cl.h"
28 #include "export_cl_12.h"
29 #include "runtime_cl_12.h"
30 #include "path_builder_cl_12.h"
31
32 //
33 // OpenCL 1.2 devices support mapping of buffers into the host address
34 // space.
35 //
36 // Mapped buffers must be aligned on a MIN_DATA_TYPE_ALIGN_SIZE
37 // byte boundary (e.g. 128 bytes). This complicates coordinating
38 // the sharing of data between the host and the device.
39 //
40 // Some OpenCL 2.0 devices support fine-grained shared virtual memory
41 // pointers with byte-addressing and allow simpler coordination
42 // strategies at the cost of maintaining cache coherency.
43 //
44 // The path builder is focused on moving bulk path data from the host
45 // into the device-managed "block" memory pool and arranging it into a
46 // SIMT/SIMD-friendly data structure that can be efficiently read by
47 // the rasterizer.
48 //
49 // Note that one simplifying assumption is that the maximum length of
50 // a *single* path can't be larger than what fits in the single extent
51 // (which is split into M subbuffers). This would be a very long path
52 // and a legitimate size limitation.
53 //
54 // For some systems, it may be appropriate to never pull path data
55 // into the device-managed block pool and instead present the path
56 // data to the device in a temporarily allocated memory
57 // "zone" of paths that can be discarded all at once.
58 //
59 // For other systems, it may be appropriate to simply copy the path
60 // data from host to device.
61 //
62 // But the majority of OpenCL (and VK, MTL, DX12) devices we'll be
63 // targeting support basic map/unmap functionality similar to OpenCL
64 // 1.2. Furthermore, not all OpenCL 2.0 devices support fine-grained
65 // sharing of memory and still require a map/unmap step... but note
66 // that they all support byte-aligned mapping and subbuffers.
67 //
68 // The general strategy that this particular CL_12 implementation uses
69 // is to allocate a large mappable bulk-data path buffer and an
70 // auxiliary mappable command buffer.
71 //
72 // The buffers are split into a reasonable number of properly aligned
73 // subbuffers to enable simultaneous host and device access.
74 //
75
76 //
77 // Blocks:
78 // 1 extent
79 // M mapped subbuffers (configurable) to allow for concurrency
80 //
81 // Commands:
82 // 1 extent
83 // M mapped subbuffers (configurable) to allow for concurrency
84 //
85 // Spans:
86 // M hi/lo structures
87 //
88 // { cl_sub, void*, event, base }
89 //
90 // - size of sub buffer
91 // - remaining
92 //
93 // - counts
94 //
95
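//
// For illustration only -- hypothetical sizes, not values taken from
// the actual config -- with M = 4 subbuffers of 256 blocks each:
//
//   blocks extent : [ subbuf 0 | subbuf 1 | subbuf 2 | subbuf 3 ]
//   cmds   extent : [ subbuf 0 | subbuf 1 | subbuf 2 | subbuf 3 ]
//
// A ringdex then walks the 4 * 256 = 1024 block/command slots as a
// ring so the host can fill one subbuffer while the device consumes
// previously unmapped subbuffers.
//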
96 //
97 // For any kernel launch, at most one path will be discontiguous and
98 // defined across two sub-buffers.
99 //
100 // Nodes are updated locally until full and then stored so they will
101 // never be incomplete. Headers are stored locally until the path is
102 // ended so they will never be incomplete.
103 //
104 // A line, quad or cubic acquires 4/6/8 segments which may be spread
105 // across one or more contiguous blocks.
106 //
107 // If a flush() occurs then the remaining columns of multi-segment
108 // paths are initialized with zero-length line, quad, cubic elements.
109 //
110 // Every block's command word has a type and a count acquired from a
111 // rolling counter.
112 //
113 // The kernel is passed two spans of blocks { base, count } to
114 // process. The grid must process (lo.count + hi.count) blocks.
115 //
116
117 struct skc_subbuffer_blocks
118 {
119 cl_mem device;
120 void * host;
121 };
122
123 struct skc_subbuffer_cmds
124 {
125 cl_mem device;
126 void * host;
127 cl_event map;
128 };
129
130 //
131 // ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer )
132 //
133
134 typedef skc_uint skc_ringdex_t;
135
136 union skc_ringdex_expand
137 {
138 div_t qr;
139
140 struct {
141 #ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0
142 skc_uint subbuf;
143 skc_uint block;
144 #else
145 skc_uint block;
146 skc_uint subbuf;
147 #endif
148 };
149 };
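//
// A sketch of how the overlay behaves, assuming the common layout
// where div_t is { int quot; int rem; } and a hypothetical
// blocks-per-subbuf of 256:
//
//   union skc_ringdex_expand e = { .qr = div(600,256) };
//
//   e.subbuf == 2    // div_t.quot : 600 / 256
//   e.block  == 88   // div_t.rem  : 600 % 256
//
// On a platform whose div_t stores .rem before .quot,
// SKC_DIV_REM_BEFORE_QUOT must be defined so the named fields still
// line up.
//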
150
151 //
152 // this record is executed by the grid
153 //
154
155 struct skc_release_record
156 {
157 struct skc_path_builder_impl * impl; // back pointer to impl
158
159 skc_grid_t grid; // pointer to scheduled grid
160
161 skc_uint from; // inclusive starting index : [from,to)
162 skc_uint to; // non-inclusive ending index : [from,to)
163 };
164
165 //
166 //
167 //
168
169 struct skc_path_builder_impl
170 {
171 struct skc_path_builder * path_builder;
172
173 struct skc_runtime * runtime;
174
175 cl_command_queue cq;
176
177 struct {
178 cl_kernel alloc;
179 cl_kernel copy;
180 } kernels;
181
182 //
183 // FIXME -- make this pointer to constant config
184 //
185 // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
186 struct {
187 skc_uint subbufs; // how many subbufs in the buffer?
188
189 struct {
190 skc_uint buffer; // how many blocks in the buffer?
191 skc_uint subbuf; // how many blocks in a subbuf?
192 } blocks_per;
193 } ring;
194 //
195 // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^
196 //
197
198 struct {
199 cl_mem buffer; // backing buffer for blocks
200 struct skc_subbuffer_blocks * subbufs; // array of structures
201 } blocks;
202
203 struct {
204 cl_mem buffer; // backing buffer for commands
205 struct skc_subbuffer_cmds * subbufs; // array of structures
206 } cmds;
207
208 struct {
209 struct skc_release_record * records; // max release records is equal to max subbufs
210 skc_path_t * paths; // max paths is less than or equal to max commands
211 } release;
212
213 cl_mem reads; // each kernel only requires one word to store the block pool "base"
214
215 struct {
216 skc_uint rolling; // rolling counter used by cmds to map to block pool alloc
217 skc_ringdex_t from;
218 skc_ringdex_t to;
219 } prev;
220
221 struct {
222 skc_ringdex_t from;
223 skc_ringdex_t to;
224 } curr;
225
226 struct {
227 struct skc_path_head * head; // pointer to local path header -- not written until path end
228 struct skc_path_node * node; // pointer to local node -- may alias head until head is full
229
230 struct {
231 skc_uint rolling; // rolling counter of wip node -- valid after one node is allocated
232 union skc_tagged_block_id * next; // next slot in node -- may initially point to head.ids
233 skc_uint rem; // how many id slots left in node block
234 } ids;
235
236 struct {
237 skc_uint rem; // how many subblocks left in block?
238 skc_uint rolling; // rolling counter of block of subblocks
239 float * next; // next subblock in current subblock block
240 skc_uint idx; // index of next subblock
241 } subblocks;
242
243 struct {
244 skc_uint one; // .block = 1
245 skc_uint next; // rolling counter used by cmds to map to block pool alloc
246 } rolling;
247
248 skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current
249 } wip;
250 };
251
252 //
253 // FIXME -- move to a pow2 subbuffer size and dispense with division
254 // and modulo operations
255 //
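//
// A minimal sketch of the pow2 variant -- SKC_RINGDEX_SUBBUF_LOG2 is
// a hypothetical constant, not something defined in the config today:
//
//   subbuf = ringdex >> SKC_RINGDEX_SUBBUF_LOG2;
//   block  = ringdex &  ((1u << SKC_RINGDEX_SUBBUF_LOG2) - 1);
//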
256
257 static
258 union skc_ringdex_expand
259 skc_ringdex_expand(struct skc_path_builder_impl * const impl,
260 skc_ringdex_t const ringdex)
261 {
262 return (union skc_ringdex_expand){
263 .qr = div(ringdex,impl->ring.blocks_per.subbuf)
264 };
265 }
266
267 static
268 void
269 skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl)
270 {
271 //
272 // FIXME - which is faster?
273 //
274 #if 1
275 impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer;
276 #else
277 impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer - 1) ? -1 : impl->wip.to;
278 #endif
279
280 // this path is too long -- for now assert() and die
281 assert(impl->wip.to != impl->curr.from);
282 }
283
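//
// note: the modulo span below relies on unsigned wraparound, so it
// is exact when 'to' has not wrapped numerically below 'from', or in
// general when blocks_per.buffer is a power of two -- one more
// reason for the pow2 FIXME above
//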
284 static
285 skc_ringdex_t
286 skc_ringdex_span(struct skc_path_builder_impl * const impl,
287 skc_ringdex_t const from,
288 skc_ringdex_t const to)
289 {
290 return (to - from) % impl->ring.blocks_per.buffer;
291 }
292
293 static
294 void
295 skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl)
296 {
297 union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
298
299 // nothing to do if this is the first block in the subbuf
300 if (to.block == 0)
301 return;
302
303 skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs;
304
305 // otherwise increment and mod
306 impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf;
307 }
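//
// for example (hypothetical sizes): with 256 blocks per subbuf and 4
// subbufs, a wip.to of 600 (subbuf 2, block 88) rounds up to 768 --
// the first block of subbuf 3 -- while a wip.to of 512 (block 0) is
// already on a subbuf boundary and is left unchanged
//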
308
309 static
310 skc_bool
311 skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl)
312 {
313 return impl->curr.from == impl->curr.to;
314 }
315
316 static
317 skc_bool
318 skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl)
319 {
320 return impl->prev.from == impl->prev.to;
321 }
322
323 static
324 skc_uint
325 skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl,
326 skc_uint const to_block)
327 {
328 // no blocks acquired OR this is last block in subbuf
329 return !((impl->wip.to == impl->curr.to) || (to_block == 0));
330 }
331
332 //
333 //
334 //
335
336 static
337 struct skc_release_record *
338 skc_release_curr(struct skc_path_builder_impl * const impl)
339 {
340 union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);
341
342 return impl->release.records + curr_from.subbuf;
343 }
344
345 //
346 // FIXME -- get rid of all distant config references -- grab them all at creation time
347 //
348
349 static
350 void
351 skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl)
352 {
353 // init header counters // { handle, blocks, nodes, prims }
354 impl->wip.head->header = (union skc_path_header){
355 .handle = 0,
356 .blocks = 0,
357 .nodes = 0,
358 .prims = 0
359 };
360
361 // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS
362 impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN };
363
364 // point wip ids at local head node
365 impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node
366 impl->wip.ids.rem = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere
367
368 // start with no subblocks
369 impl->wip.subblocks.rem = 0;
370 }
371
372 //
373 //
374 //
375
376 static
377 void
378 skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl)
379 {
380 #if 1
381 //
382 // FIXME -- a Duff's device might be optimal here but would have to
383 // be customized per device since nodes could be 16-128+ words
384 //
385 while (impl->wip.ids.rem > 0)
386 {
387 impl->wip.ids.rem -= 1;
388 impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID;
389 impl->wip.ids.next += 1;
390 }
391 #else
392 memset(&impl->wip.ids.next->u32,
393 SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF
394 sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem);
395
396 impl->wip.ids.next += impl->wip.ids.rem;
397 impl->wip.ids.rem = 0;
398 #endif
399 }
400
401 //
402 //
403 //
404
405 static
406 void
407 skc_zero_float(skc_float * p, skc_uint rem)
408 {
409 memset(p,0,sizeof(*p)*rem);
410 }
411
412 static
413 void
414 skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder)
415 {
416 //
417 // FIXME -- it might be more performant to zero the remaining
418 // columns in a subblock -- a subblock at a time -- instead of the
419 // same column across all the subblocks
420 //
421 #if 0
422 while (path_builder->line.rem > 0)
423 {
424 --path_builder->line.rem;
425
426 *path_builder->line.coords[0]++ = 0.0f;
427 *path_builder->line.coords[1]++ = 0.0f;
428 *path_builder->line.coords[2]++ = 0.0f;
429 *path_builder->line.coords[3]++ = 0.0f;
430 }
431
432 while (path_builder->quad.rem > 0)
433 {
434 --path_builder->quad.rem;
435
436 *path_builder->quad.coords[0]++ = 0.0f;
437 *path_builder->quad.coords[1]++ = 0.0f;
438 *path_builder->quad.coords[2]++ = 0.0f;
439 *path_builder->quad.coords[3]++ = 0.0f;
440 *path_builder->quad.coords[4]++ = 0.0f;
441 *path_builder->quad.coords[5]++ = 0.0f;
442 }
443
444 while (path_builder->cubic.rem > 0)
445 {
446 --path_builder->cubic.rem;
447
448 *path_builder->cubic.coords[0]++ = 0.0f;
449 *path_builder->cubic.coords[1]++ = 0.0f;
450 *path_builder->cubic.coords[2]++ = 0.0f;
451 *path_builder->cubic.coords[3]++ = 0.0f;
452 *path_builder->cubic.coords[4]++ = 0.0f;
453 *path_builder->cubic.coords[5]++ = 0.0f;
454 *path_builder->cubic.coords[6]++ = 0.0f;
455 *path_builder->cubic.coords[7]++ = 0.0f;
456 }
457 #else
458 if (path_builder->line.rem > 0)
459 {
460 skc_zero_float(path_builder->line.coords[0],path_builder->line.rem);
461 skc_zero_float(path_builder->line.coords[1],path_builder->line.rem);
462 skc_zero_float(path_builder->line.coords[2],path_builder->line.rem);
463 skc_zero_float(path_builder->line.coords[3],path_builder->line.rem);
464
465 path_builder->line.rem = 0;
466 }
467
468 if (path_builder->quad.rem > 0)
469 {
470 skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem);
471 skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem);
472 skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem);
473 skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem);
474 skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem);
475 skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem);
476
477 path_builder->quad.rem = 0;
478 }
479
480 if (path_builder->cubic.rem > 0)
481 {
482 skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem);
483 skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem);
484 skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem);
485 skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem);
486 skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem);
487 skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem);
488 skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem);
489 skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem);
490
491 path_builder->cubic.rem = 0;
492 }
493 #endif
494 }
495
496 //
497 //
498 //
499
500 static
501 void
502 skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl,
503 skc_uint from,
504 skc_uint to)
505 {
506 // to might be out of range
507 to = to % impl->ring.subbufs;
508
509 #if 0
510 fprintf(stderr,"unmap: [%2u,%2u)\n",from,to);
511 #endif
512
513 while (from != to) // 'to' might be out of range
514 {
515 // bring 'from' back in range
516 from = from % impl->ring.subbufs;
517
518 struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
519 struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from;
520
521 cl(EnqueueUnmapMemObject(impl->cq,
522 blocks->device,
523 blocks->host,
524 0,NULL,NULL));
525
526 cl(EnqueueUnmapMemObject(impl->cq,
527 cmds->device,
528 cmds->host,
529 0,NULL,NULL));
530
531 // increment 'from' and wrap it back into range
532 from = (from + 1) % impl->ring.subbufs;
533 }
534 }
535
536 //
537 // FIXME -- reuse this in create()
538 //
539
540 static
541 void
542 skc_path_builder_impl_map(struct skc_path_builder_impl * const impl,
543 skc_uint from,
544 skc_uint to)
545 {
546 // to might be out of range
547 to = to % impl->ring.subbufs;
548
549 #if 0
550 fprintf(stderr," map: [%2u,%2u)\n",from,to);
551 #endif
552
553 while (from != to)
554 {
555 cl_int cl_err;
556
557 struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
558 struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from;
559
560 blocks->host = clEnqueueMapBuffer(impl->cq,
561 blocks->device,
562 CL_FALSE,
563 CL_MAP_WRITE_INVALIDATE_REGION,
564 0,impl->runtime->config->paths_copy.block.subbuf,
565 0,NULL,NULL,
566 &cl_err); cl_ok(cl_err);
567
568 cl(ReleaseEvent(cmds->map));
569
570 cmds->host = clEnqueueMapBuffer(impl->cq,
571 cmds->device,
572 CL_FALSE,
573 CL_MAP_WRITE_INVALIDATE_REGION,
574 0,impl->runtime->config->paths_copy.command.subbuf,
575 0,NULL,&cmds->map,
576 &cl_err); cl_ok(cl_err);
577
578 // increment 'from' and wrap it back into range
579 from = (from + 1) % impl->ring.subbufs;
580 }
581 //
582 // FIXME -- when we switch to out of order queues we'll need a barrier here
583 //
584 }
585
586 //
587 //
588 //
589
590 static
591 void
592 skc_path_builder_release_dispose(struct skc_release_record * const release,
593 struct skc_path_builder_impl * const impl)
594 {
595 struct skc_runtime * runtime = impl->runtime;
596
597 if (release->from <= release->to) // no wrap
598 {
599 skc_path_t const * paths = impl->release.paths + release->from;
600 skc_uint count = release->to - release->from;
601
602 skc_grid_deps_unmap(runtime->deps,paths,count);
603 skc_runtime_path_device_release(runtime,paths,count);
604 }
605 else // from > to implies wrap
606 {
607 skc_path_t const * paths_lo = impl->release.paths + release->from;
608 skc_uint count_lo = impl->ring.blocks_per.buffer - release->from;
609
610 skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo);
611 skc_runtime_path_device_release(runtime,paths_lo,count_lo);
612
613 skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to);
614 skc_runtime_path_device_release(runtime,impl->release.paths,release->to);
615 }
616
617 release->to = release->from;
618 }
619
620 static
621 void
622 skc_path_builder_grid_pfn_dispose(skc_grid_t const grid)
623 {
624 struct skc_release_record * const release = skc_grid_get_data(grid);
625 struct skc_path_builder_impl * const impl = release->impl;
626
627 skc_path_builder_release_dispose(release,impl);
628 }
629
630 static
631 void
632 // skc_path_builder_complete(struct skc_release_record * const release)
633 skc_path_builder_complete(skc_grid_t grid)
634 {
635 //
636 // notify deps that this grid is complete enough for other grids to
637 // proceed
638 //
639 // the path builder still has some cleanup to do before all its
640 // resources can be reused
641 //
642 skc_grid_complete(grid);
643 }
644
645 static
646 void
647 skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid)
648 {
649 SKC_CL_CB(status);
650
651 struct skc_release_record * const release = skc_grid_get_data(grid);
652
653 SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid);
654 }
655
656 //
657 //
658 //
659
660 static
661 void
662 skc_path_builder_grid_pfn_waiting(skc_grid_t const grid)
663 {
664 struct skc_release_record * const release = skc_grid_get_data(grid);
665 struct skc_path_builder_impl * const impl = release->impl;
666
667 // 1. flush incomplete subblocks of path elements
668 // 2. unmap subbuffer on cq.unmap
669 // 3. flush cq.unmap
670 // 4. launch kernel on cq.kernel but wait for unmap completion
671 // 5. flush cq.kernel
672 // 6. remap relevant subbuffers on cq.map but wait for kernel completion
673 // 7. flush cq.map
674
675 //
676 // FIXME -- can be smarter about flushing if the wip paths are not
677 // in the same subbuf as curr.to
678 //
679 // THIS IS IMPORTANT TO FIX
680 //
681
682 // flush incomplete subblocks
683 skc_path_builder_finalize_subblocks(impl->path_builder);
684
685 //
686 // get range of subbufs that need to be unmapped
687 //
688 // note that impl->prev subbufs have already been unmapped
689 //
690 union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);
691 union skc_ringdex_expand curr_to = skc_ringdex_expand(impl,impl->curr.to);
692 skc_uint const is_partial = curr_to.block > 0;
693 skc_uint const unmap_to = curr_to.subbuf + is_partial;
694
695 //
696 // unmap all subbufs in range [from,to)
697 //
698 skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to);
699
700 //
701 // launch kernels
702 //
703 skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to);
704 skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to);
705 skc_uint const pb_cmds = pb_prev_span + pb_curr_span;
706
707 //
708 // 1) allocate blocks from pool
709 //
710
711 //
712 // FIXME -- pack integers into struct/vector
713 //
714 cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw)));
715 cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads)));
716 cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf)));
717 cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds)));
718
719 skc_device_enqueue_kernel(impl->runtime->device,
720 SKC_DEVICE_KERNEL_ID_PATHS_ALLOC,
721 impl->cq,
722 impl->kernels.alloc,
723 1,
724 0,NULL,NULL);
725
726 //
727 // 2) copy blocks from unmapped device-accessible memory
728 //
729
730 //
731 // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7
732 //
733 cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw)));
734
735 cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw)));
736 cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw)));
737 cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask)));
738
739 cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads)));
740 cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf)));
741
742 cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer)));
743 cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer)));
744
745 cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer)));
746 cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling)));
747
748 cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from)));
749 cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span)));
750 cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from)));
751
752 cl_event complete;
753
754 skc_device_enqueue_kernel(impl->runtime->device,
755 SKC_DEVICE_KERNEL_ID_PATHS_COPY,
756 impl->cq,
757 impl->kernels.copy,
758 pb_cmds,
759 0,NULL,&complete);
760
761 // set a callback on completion
762 cl(SetEventCallback(complete,CL_COMPLETE,
763 skc_path_builder_paths_copy_cb,
764 grid));
765
766 // immediately release
767 cl(ReleaseEvent(complete));
768
769 //
770 // remap as many subbuffers as possible after the kernel completes
771 //
772 // note that remaps are async and enqueued on the same command queue
773 // as the kernel launch
774 //
775 // we can't remap subbuffers that are in the possibly empty range
776 //
777 // cases:
778 //
779 // - curr.to == wip.to which means no blocks have been acquired
780 // - curr.to points to first block in (next) subbuf
781 // - otherwise, wip acquired blocks in the curr.to subbuf
782 //
783 // check for these first 2 cases!
784 //
785 union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from);
786 skc_uint const no_wip = impl->curr.to == impl->wip.to;
787 skc_uint map_to = curr_to.subbuf + (is_partial && no_wip);
788
789 // remap all subbufs in range [from,to)
790 skc_path_builder_impl_map(impl,prev_from.subbuf,map_to);
791
792 // flush command queue
793 cl(Flush(impl->cq));
794
795 // save rolling
796 impl->prev.rolling = impl->wip.rolling.next;
797
798 // update prev and curr
799 if (no_wip)
800 {
801 //
802 // if there was no wip then round up to the next subbuf
803 //
804 skc_ringdex_wip_to_subbuf_inc(impl);
805
806 //
807 // update prev/curr with the incremented wip
808 //
809 impl->prev.from = impl->prev.to = impl->wip.to;
810 impl->curr.from = impl->curr.to = impl->wip.to;
811 }
812 else
813 {
814 //
815 // update prev with wip partials
816 //
817 impl->prev.from = impl->curr.to;
818 impl->prev.to = impl->wip .to;
819
820 //
821 // start curr on a new subbuf boundary
822 //
823 skc_ringdex_wip_to_subbuf_inc(impl);
824
825 impl->curr.from = impl->wip.to;
826 impl->curr.to = impl->wip.to;
827 }
828 }
829
830 //
831 //
832 //
833
834 static
835 void
836 skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl,
837 skc_uint const subbuf)
838 {
839 //
840 // FIXME -- move to a power-of-two subbuf size and kickstart path
841 // copies as early as possible
842 //
843 // FIXME -- the subbufs "self-clock" (flow control) the kernel
844 // launches and accounting. Combine all the subbuffers and release
845 // records into a single indexable struct instead of 3.
846 //
847 struct skc_subbuffer_cmds * const sc = impl->cmds.subbufs + subbuf;
848 struct skc_release_record * const release = impl->release.records + subbuf;
849 struct skc_scheduler * const scheduler = impl->runtime->scheduler;
850
851 // can't proceed until the paths have been released
852 SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to);
853
854 // throw in a scheduler yield ... FIXME -- get rid of
855 skc_scheduler_yield(scheduler);
856
857 // can't proceed until the subbuffer is mapped
858 cl(WaitForEvents(1,&sc->map));
859 }
860
861 //
862 //
863 //
864
865 static
866 union skc_ringdex_expand
867 skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl)
868 {
869 // break ringdex into components
870 union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
871
872 // does wip ringdex point to a new subbuffer?
873 if (to.block == 0)
874 {
875 // potentially spin/block waiting for subbuffer
876 skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf);
877 }
878
879 // post increment wip.to
880 skc_ringdex_wip_to_block_inc(impl);
881
882 return to;
883 }
884
885 //
886 //
887 //
888
889 static
890 skc_uint
891 skc_rolling_block(skc_uint const rolling, skc_uint const tag)
892 {
893 return rolling | tag;
894 }
895
896 static
897 skc_uint
898 skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag)
899 {
900 return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag;
901 }
902
903 static
904 void
905 skc_rolling_inc(struct skc_path_builder_impl * const impl)
906 {
907 impl->wip.rolling.next += impl->wip.rolling.one;
908 }
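//
// a sketch of the tagged word these helpers assemble -- field widths
// are illustrative and come from the tagged block id configuration:
//
//   [ rolling counter ... | subblock index | tag ]
//
//   skc_rolling_block()    : rolling | tag
//   skc_rolling_subblock() : rolling | (subblock << TAG bits) | tag
//
// wip.rolling.one is (tag count * subblocks per block), which --
// assuming the tag count fills the tag field -- steps the rolling
// counter past the subblock and tag fields on every increment.
//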
909
910 //
911 //
912 //
913
914 static
915 void *
916 skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl,
917 skc_uint const rolling,
918 skc_cmd_paths_copy_tag const tag)
919 {
920 // bump blocks count
921 impl->wip.head->header.blocks += 1;
922
923 // acquire a block
924 union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl);
925
926 // make a pointer
927 union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host;
928
929 // store command for block
930 cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag);
931
932 #if 0
933 // store command for block
934 cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag);
935
936 // increment rolling
937 skc_rolling_inc(impl);
938 #endif
939
940 // return pointer to block
941 float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host;
942
943 // FIXME -- make it easier to get config constant
944 return blocks_subbuf + (to.block * impl->runtime->config->block.words);
945 }
946
947 //
948 //
949 //
950
951 static
952 void
953 skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl)
954 {
955 // store command to subbuf and get pointer to blocks subbuf
956 void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling,
957 SKC_CMD_PATHS_COPY_TAG_NODE);
958
959 // copy node to blocks subbuf -- write-only
960 memcpy(block,impl->wip.node,impl->runtime->config->block.bytes);
961 }
962
963 static
964 void
965 skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl)
966 {
967 // store command to subbuf and get pointer to blocks subbuf
968 void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
969 SKC_CMD_PATHS_COPY_TAG_HEAD);
970
971 // copy head to blocks subbuf -- write-only
972 memcpy(block,impl->wip.head,impl->runtime->config->block.bytes);
973
974 // increment rolling
975 skc_rolling_inc(impl);
976
977 // the 'to' index is non-inclusive so assign wip.to after flush_head
978 impl->curr.to = impl->wip.to;
979 }
980
981 //
982 //
983 //
984
985 static
986 void
987 skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl)
988 {
989 // update final block id in node
990 impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT);
991
992 // if wip.ids is not the header then flush the now-full wip node
993 if (impl->wip.head->header.nodes > 0)
994 skc_path_builder_impl_flush_node(impl);
995
996 // bump node count
997 impl->wip.head->header.nodes += 1;
998
999 // save current rolling
1000 impl->wip.ids.rolling = impl->wip.rolling.next;
1001
1002 // increment rolling
1003 skc_rolling_inc(impl);
1004
1005 // update wip.ids.*
1006 impl->wip.ids.next = impl->wip.node->tag_ids;
1007 impl->wip.ids.rem = impl->runtime->config->block.words;
1008 }
1009
1010 static
1011 void
1012 skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl)
1013 {
1014 impl->wip.subblocks.rem = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure
1015 impl->wip.subblocks.rolling = impl->wip.rolling.next;
1016 impl->wip.subblocks.next = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
1017 SKC_CMD_PATHS_COPY_TAG_SEGS);
1018 impl->wip.subblocks.idx = 0;
1019
1020 // increment rolling
1021 skc_rolling_inc(impl);
1022 }
1023
1024 //
1025 //
1026 //
1027
1028 static
1029 void
1030 skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl,
1031 skc_block_id_tag tag,
1032 skc_uint vertices,
1033 float * * subblocks)
1034 {
1035 //
1036 // FIRST TAG RECORDS THE ELEMENT TYPE
1037 //
1038 while (true)
1039 {
1040 // if only one block id left in node then acquire new node block
1041 // and append its block id with a 'next' tag
1042 if (impl->wip.ids.rem == 1)
1043 skc_path_builder_impl_new_node_block(impl);
1044
1045 // if zero subblocks left then acquire a new subblock block and
1046 // append its block id
1047 if (impl->wip.subblocks.rem == 0)
1048 skc_path_builder_impl_new_segs_block(impl);
1049
1050 // save first command -- tag and subblocks may have been updated
1051 impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag);
1052
1053 // increment node block subblock pointer
1054 impl->wip.ids.next += 1;
1055 impl->wip.ids.rem -= 1;
1056
1057 // how many vertices can we store
1058 skc_uint rem = min(vertices,impl->wip.subblocks.rem);
1059
1060 // decrement vertices
1061 vertices -= rem;
1062 impl->wip.subblocks.rem -= rem;
1063 impl->wip.subblocks.idx += rem;
1064
1065 // assign subblocks
1066 do {
1067 *subblocks++ = impl->wip.subblocks.next;
1068 impl->wip.subblocks.next += impl->runtime->config->subblock.words;
1069 // FIXME -- move constants closer to structure
1070 } while (--rem > 0);
1071
1072 // anything left to do?
1073 if (vertices == 0)
1074 break;
1075
1076 // any tag after this will be a caboose command
1077 tag = SKC_BLOCK_ID_TAG_PATH_NEXT;
1078 }
1079 }
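//
// a worked example (hypothetical sizes): a cubic needs 8 subblocks.
// if only 3 remain in the current segment block, the loop above
// stores one tagged id (cubic tag) covering those 3, then acquires a
// new segment block -- and a new node block first if only one id
// slot was left -- and stores a second id, now tagged
// SKC_BLOCK_ID_TAG_PATH_NEXT, for the remaining 5 subblocks.
//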
1080
1081 //
1082 //
1083 //
1084
1085 static
1086 void
1087 skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path)
1088 {
1089 // finalize incomplete active subblocks -- we don't care about any
1090 // remaining unused subblocks in block
1091 skc_path_builder_finalize_subblocks(impl->path_builder);
1092
1093 // mark remaining wips.ids in the head or node as invalid
1094 skc_path_builder_impl_finalize_node(impl);
1095
1096 // flush the node if it is not actually the head
1097 if (impl->wip.head->header.nodes >= 1)
1098 skc_path_builder_impl_flush_node(impl);
1099
1100 // acquire path host id
1101 *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN
1102
1103 // save path host handle
1104 impl->wip.head->header.handle = *path;
1105
1106 // flush head -- acquires a block and bumps head->header.blocks
1107 skc_path_builder_impl_flush_head(impl);
1108
1109 // get current release
1110 struct skc_release_record * const release = skc_release_curr(impl);
1111
1112 // acquire grid if null
1113 if (release->grid == NULL)
1114 {
1115 release->grid =
1116 SKC_GRID_DEPS_ATTACH(impl->runtime->deps,
1117 &release->grid, // NULL on start/force
1118 release, // data payload
1119 skc_path_builder_grid_pfn_waiting,
1120 NULL, // no execute pfn
1121 skc_path_builder_grid_pfn_dispose);
1122 }
1123
1124 // update grid map
1125 skc_grid_map(release->grid,*path);
1126
1127 // update path release
1128 impl->release.paths[release->to] = *path;
1129
1130 // increment release.to
1131 release->to = (release->to + 1) % impl->ring.blocks_per.buffer;
1132
1133 // add guard bit
1134 *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH;
1135
1136 #if 1
1137 //
1138 // eager kernel launch?
1139 //
1140 {
1141 union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from);
1142 union skc_ringdex_expand const curr_to = skc_ringdex_expand(impl,impl->curr.to);
1143
1144 if (curr_from.subbuf != curr_to.subbuf)
1145 {
1146 skc_grid_start(release->grid);
1147 // skc_scheduler_yield(impl->runtime->scheduler);
1148 }
1149 }
1150 #endif
1151 }
1152
1153 //
1154 // FIXME -- clean up accessing of CONFIG constants in these 3 routines
1155 //
1156
1157 static
1158 void
1159 skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl)
1160 {
1161 // acquire subblock pointers
1162 skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4,
1163 impl->path_builder->line.coords);
1164
1165 // increment line count
1166 impl->wip.head->header.prims += 1;
1167
1168 // update remaining coordinate count
1169 impl->path_builder->line.rem = impl->runtime->config->subblock.words;
1170 }
1171
1172 static
1173 void
1174 skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl)
1175 {
1176 // acquire subblock pointers
1177 skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6,
1178 impl->path_builder->quad.coords);
1179
1180 // increment prim count
1181 impl->wip.head->header.prims += 1;
1182
1183 // update remaining coordinate count
1184 impl->path_builder->quad.rem = impl->runtime->config->subblock.words;
1185 }
1186
1187 static
1188 void
1189 skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl)
1190 {
1191 // acquire subblock pointers
1192 skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8,
1193 impl->path_builder->cubic.coords);
1194
1195 // increment prim count
1196 impl->wip.head->header.prims += 1;
1197
1198 // update remaining coordinate count
1199 impl->path_builder->cubic.rem = impl->runtime->config->subblock.words;
1200 }
1201
1202 //
1203 //
1204 //
1205
1206 static
1207 void
1208 skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl)
1209 {
1210 // decrement reference count
1211 if (--impl->path_builder->refcount != 0)
1212 return;
1213
1214 //
1215 // otherwise, dispose of everything
1216 //
1217 struct skc_runtime * const runtime = impl->runtime;
1218
1219 // free path builder
1220 skc_runtime_host_perm_free(impl->runtime,impl->path_builder);
1221
1222 // release cq
1223 skc_runtime_release_cq_in_order(runtime,impl->cq);
1224
1225 // release kernels
1226 cl(ReleaseKernel(impl->kernels.alloc));
1227 cl(ReleaseKernel(impl->kernels.copy));
1228
1229 // free blocks extents
1230 cl(ReleaseMemObject(impl->blocks.buffer));
1231 skc_runtime_host_perm_free(runtime,impl->blocks.subbufs);
1232
1233 cl(ReleaseMemObject(impl->cmds.buffer));
1234 skc_runtime_host_perm_free(runtime,impl->cmds.subbufs);
1235
1236 // free records
1237 skc_runtime_host_perm_free(runtime,impl->release.records);
1238 skc_runtime_host_perm_free(runtime,impl->release.paths);
1239
1240 // release staging head and node
1241 skc_runtime_host_perm_free(runtime,impl->wip.head);
1242 skc_runtime_host_perm_free(runtime,impl->wip.node);
1243
1244 // release reads scratch array
1245 cl(ReleaseMemObject(impl->reads));
1246
1247 // for all subbuffers
1248 // unmap subbuffer
1249 // release subbuffer
1250 // printf("%s not releasing subbuffers\n",__func__);
1251
1252 skc_runtime_host_perm_free(impl->runtime,impl);
1253 }
1254
1255 //
1256 //
1257 //
1258
1259 skc_err
1260 skc_path_builder_cl_12_create(struct skc_context * const context,
1261 struct skc_path_builder * * const path_builder)
1262 {
1263 //
1264 // retain the context
1265 // skc_context_retain(context);
1266 //
1267 struct skc_runtime * const runtime = context->runtime;
1268
1269 // allocate path builder
1270 (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder));
1271
1272 // init state
1273 SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY);
1274
1275 (*path_builder)->context = context;
1276
1277 // save opaque impl-specific pointers
1278 (*path_builder)->begin = skc_path_builder_pfn_begin;
1279 (*path_builder)->end = skc_path_builder_pfn_end;
1280 (*path_builder)->new_line = skc_path_builder_pfn_new_line;
1281 (*path_builder)->new_quad = skc_path_builder_pfn_new_quad;
1282 (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic;
1283 (*path_builder)->release = skc_path_builder_pfn_release;
1284
1285 // initialize path builder counts
1286 (*path_builder)->line.rem = 0;
1287 (*path_builder)->quad.rem = 0;
1288 (*path_builder)->cubic.rem = 0;
1289
1290 (*path_builder)->refcount = 1;
1291
1292 struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));
1293
1294 (*path_builder)->impl = impl;
1295
1296 //
1297 // init impl
1298 //
1299 impl->path_builder = *path_builder;
1300 impl->runtime = runtime;
1301
1302 impl->cq = skc_runtime_acquire_cq_in_order(runtime);
1303
1304 impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC);
1305 impl->kernels.copy = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY);
1306
1307 //
1308 // FIXME -- let these config constants remain constant and in place
1309 //
1310 struct skc_config const * const config = runtime->config;
1311
1312 impl->ring.subbufs = config->paths_copy.buffer.count;
1313 impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count;
1314 impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count;
1315 //
1316 // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1317 //
1318
1319 cl_int cl_err;
1320
1321 // allocate large device-side extent for path data
1322 impl->blocks.buffer = clCreateBuffer(runtime->cl.context,
1323 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
1324 config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere
1325 NULL,&cl_err); cl_ok(cl_err);
1326
1327 // allocate small host-side array of pointers to mapped subbufs
1328 impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
1329 impl->ring.subbufs *
1330 sizeof(*impl->blocks.subbufs));
1331
1332 // allocate large device-side extent for path copy commands
1333 impl->cmds.buffer = clCreateBuffer(runtime->cl.context,
1334 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
1335 config->paths_copy.command.buffer,
1336 NULL,&cl_err); cl_ok(cl_err);
1337
1338 // allocate small host-side array of pointers to mapped subbufs
1339 impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
1340 impl->ring.subbufs *
1341 sizeof(*impl->cmds.subbufs));
1342
1343 // allocate small host-side array of intervals of path handles
1344 impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
1345 impl->ring.subbufs *
1346 sizeof(*impl->release.records));
1347
1348 // allocate large host-side array that is max # of path handles in flight
1349 impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
1350 impl->ring.blocks_per.buffer *
1351 sizeof(*impl->release.paths));
1352
1353 // small scratch used by kernels
1354 impl->reads = clCreateBuffer(runtime->cl.context,
1355 CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
1356 sizeof(skc_uint) * impl->ring.subbufs,
1357 NULL,&cl_err); cl_ok(cl_err);
1358
1359 // initialize release record with impl backpointer
1360 for (skc_uint ii=0; ii<impl->ring.subbufs; ii++)
1361 {
1362 struct skc_release_record * record = impl->release.records + ii;
1363
1364 record->impl = impl;
1365 record->grid = NULL;
1366 record->from = record->to = ii * impl->ring.blocks_per.subbuf;
1367 }
1368
1369 //
1370 // allocate and map subbuffers -- we always check the command
1371 // subbuffer's map/unmap events before touching it or its associated
1372 // block subbuffer.
1373 //
1374 struct skc_subbuffer_blocks * sb = impl->blocks.subbufs;
1375 struct skc_subbuffer_cmds * sc = impl->cmds .subbufs;
1376
1377 cl_buffer_region rb = { 0, config->paths_copy.block.subbuf };
1378 cl_buffer_region rc = { 0, config->paths_copy.command.subbuf };
1379
1380 // for each subbuffer
1381 for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++)
1382 {
1383 sb->device = clCreateSubBuffer(impl->blocks.buffer,
1384 CL_MEM_HOST_WRITE_ONLY,
1385 CL_BUFFER_CREATE_TYPE_REGION,
1386 &rb,
1387 &cl_err); cl_ok(cl_err);
1388
1389 sb->host = clEnqueueMapBuffer(impl->cq,
1390 sb->device,
1391 CL_FALSE,
1392 CL_MAP_WRITE_INVALIDATE_REGION,
1393 0,rb.size,
1394 0,NULL,NULL,
1395 &cl_err); cl_ok(cl_err);
1396
1397 sc->device = clCreateSubBuffer(impl->cmds.buffer,
1398 CL_MEM_HOST_WRITE_ONLY,
1399 CL_BUFFER_CREATE_TYPE_REGION,
1400 &rc,
1401 &cl_err); cl_ok(cl_err);
1402
1403 sc->host = clEnqueueMapBuffer(impl->cq,
1404 sc->device,
1405 CL_FALSE,
1406 CL_MAP_WRITE_INVALIDATE_REGION,
1407 0,rc.size,
1408 0,NULL,&sc->map,
1409 &cl_err); cl_ok(cl_err);
1410 sb += 1;
1411 sc += 1;
1412
1413 rb.origin += rb.size;
1414 rc.origin += rc.size;
1415 }
1416
1417 //
1418 // initialize remaining members
1419 //
1420 impl->prev.from = 0;
1421 impl->prev.to = 0;
1422 impl->prev.rolling = 0;
1423
1424 impl->curr.from = 0;
1425 impl->curr.to = 0;
1426
1427 impl->wip.to = 0;
1428
1429 impl->wip.head = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
1430 impl->wip.node = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
1431
1432 impl->wip.rolling.one = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks;
1433 impl->wip.rolling.next = 0;
1434
1435 // for now, completely initialize builder before returning
1436 cl(Finish(impl->cq));
1437
1438 return SKC_ERR_SUCCESS;
1439 }
1440
1441 //
1442 //
1443 //
1444