/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include "common/cl/assert_cl.h"

#include "tile.h"
#include "raster.h"
#include "macros.h"

#include "config_cl.h"
#include "runtime_cl_12.h"

#include "kernel_cl_12.h"
#include "device_cl_12.h"

//
//
//

#include "hs/cl/hs_cl.h"
#include "hs/cl/intel/gen8/u64/hs_target.h"
#include "hs/cl/intel/gen8/u64/hs_config.h"

//
//
//

#define SKC_KERNEL_SPIRV  0
#define SKC_KERNEL_BINARY 1
#define SKC_KERNEL_SRC    0

//
//
//

#if SKC_KERNEL_SPIRV

#include "inl/block_pool_init.pre.spv.inl"
#include "inl/paths_copy.pre.spv.inl"
#include "inl/fills_expand.pre.spv.inl"
#include "inl/rasterize.pre.spv.inl"
#include "inl/segment_ttrk.pre.spv.inl"
#include "inl/rasters_alloc.pre.spv.inl"
#include "inl/prefix.pre.spv.inl"
#include "inl/place.pre.spv.inl"
#include "inl/segment_ttck.pre.spv.inl"
#include "inl/render.pre.spv.inl"
#include "inl/paths_reclaim.pre.spv.inl"
#include "inl/rasters_reclaim.pre.spv.inl"

#elif SKC_KERNEL_BINARY

#include "inl/block_pool_init.pre.bin.inl"
#include "inl/paths_copy.pre.bin.inl"
#include "inl/fills_expand.pre.bin.inl"
#include "inl/rasterize.pre.bin.inl"
#include "inl/segment_ttrk.pre.bin.inl"
#include "inl/rasters_alloc.pre.bin.inl"
#include "inl/prefix.pre.bin.inl"
#include "inl/place.pre.bin.inl"
#include "inl/segment_ttck.pre.bin.inl"
#include "inl/render.pre.bin.inl"
#include "inl/paths_reclaim.pre.bin.inl"
#include "inl/rasters_reclaim.pre.bin.inl"

#elif SKC_KERNEL_SRC

#include "inl/block_pool_init.pre.src.inl"
#include "inl/paths_copy.pre.src.inl"
#include "inl/fills_expand.pre.src.inl"
#include "inl/rasterize.pre.src.inl"
#include "inl/segment_ttrk.pre.src.inl"
#include "inl/rasters_alloc.pre.src.inl"
#include "inl/prefix.pre.src.inl"
#include "inl/place.pre.src.inl"
#include "inl/segment_ttck.pre.src.inl"
#include "inl/render.pre.src.inl"
#include "inl/paths_reclaim.pre.src.inl"
#include "inl/rasters_reclaim.pre.src.inl"

#endif

//
// FIXME -- THE CONFIG INITIALIZATION IS ONLY HERE TEMPORARILY
//
// FIXME -- move these to log2 values where appropriate
//

static
struct skc_config const config =
{
  .suballocator = {
    .host   = {
      .size    = 1024 * 1024, // words
      .subbufs = 1024         // must be <= (1 << (8 * sizeof(skc_subbuf_id_t)))
    },
    .device = {
      .size    = 128 * 1024 * 1024,
      .subbufs = 1024         // must be <= (1 << (8 * sizeof(skc_subbuf_id_t)))
    }
  },

  .scheduler = {
    .size = 4096 // 128 // FIXME -- this is just for testing -- way too big -- schedulees should bring their own state
  },

  .subblock = {
    .words = SKC_DEVICE_SUBBLOCK_WORDS,                   // words per subblock -- pow2
    .bytes = SKC_DEVICE_SUBBLOCK_WORDS * sizeof(skc_uint) // bytes per subblock -- pow2
  },

  .block = {
    .words     = SKC_DEVICE_BLOCK_WORDS,                            // words per block     -- pow2
    .bytes     = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint),         // bytes per block     -- pow2
    .subblocks = SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS // subblocks per block -- block.bytes >= subblock.bytes
  },

  .block_pool = {
    .pool_size = 524288, // blocks in pool -- 128 MB
    .ring_pow2 = 524288, // blocks in pool rounded up pow2
    .ring_mask = 524288 - 1
  },

  .cq_pool = {
#ifndef NDEBUG
    .cq_props = CL_QUEUE_PROFILING_ENABLE,
#else
    .cq_props = 0,
#endif
    .size     = 8
  },

  .handle_pool = {
    .size  = 262144, // large fraction of block pool size (for now, 1:2)
    .width = SKC_RECLAIM_ARRAY_SIZE,
    .recs  = 256     // too many? too few?
  },

  .tile = {
    .width  = SKC_TILE_WIDTH,                  // tile width  in pixels
    .height = SKC_TILE_HEIGHT,                 // tile height in pixels
    .ratio  = SKC_TILE_HEIGHT / SKC_TILE_WIDTH // subblocks per TTPB
  },

  .paths_copy = {

    .buffer = {
      .count = 16   // # of subbufs in buffer
    },

    .subbuf = {
      .count = 1024 // # of blocks/commands in subbuf
    },

    .block = {
      .subbuf = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024,     // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN
      .buffer = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024 * 16 // block.bytes * subbuf.blocks * subbuf.count
    },

    .command = {
      .subbuf = sizeof(skc_uint) * 1024,     // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN
      .buffer = sizeof(skc_uint) * 1024 * 16 // sizeof(skc_uint) * subbuf.blocks * subbuf.count
    },

    // skc_uint paths_lowat;
  },

  .raster_cohort = {
    .path_ids = {
      .elem_count = 8192,
      .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
    },

    .transforms = {
      .elem_count = 8192,
      .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
    },

    .clips = {
      .elem_count = 8192,
      .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
    },

    .fill = {
      .elem_count = 8192,
      .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
    },

    .raster_ids = {
      .elem_count = 8192,
      .snap_count = (1<<SKC_TTRK_HI_BITS_COHORT) // 256
    },

    .expand = {
      .cmds = 1024*128,
    },

    .rasterize = {
      .keys = 1024*1024
    }
  },

  .composition = {
    .cmds = {
      .elem_count = 1024*16,
      .snap_count = 1024
    },
    .raster_ids = {
      .elem_count = 1024*1024
    },
    .keys = {
      .elem_count = 1024*1024,
    }
  },
};
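
//
// A hedged compile-time sanity check (not part of the original source):
// the comments above assume pow2 subblock/block sizes, which a C11
// toolchain could verify along these lines.
//

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
_Static_assert((SKC_DEVICE_SUBBLOCK_WORDS & (SKC_DEVICE_SUBBLOCK_WORDS - 1)) == 0,
               "subblock words must be a power of two");
_Static_assert((SKC_DEVICE_BLOCK_WORDS & (SKC_DEVICE_BLOCK_WORDS - 1)) == 0,
               "block words must be a power of two");
_Static_assert(SKC_DEVICE_BLOCK_WORDS >= SKC_DEVICE_SUBBLOCK_WORDS,
               "a block must hold at least one subblock");
#endif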

//
//
//

static char const cl_build_options_optimized[] =
  "-cl-std=CL1.2 "
  "-cl-single-precision-constant "
  "-cl-denorms-are-zero "
  "-cl-mad-enable "
  "-cl-no-signed-zeros "
  "-cl-fast-relaxed-math "
  "-cl-kernel-arg-info ";

static char const cl_build_options_debug[] =
  "-cl-std=CL1.2 -cl-kernel-arg-info -g"; // -s c:/users/allanmac/home/google/skia_internal/src/compute/skc";

// #define SKC_BUILD_OPTIONS cl_build_options_debug
#define SKC_BUILD_OPTIONS cl_build_options_optimized

//
//
//

struct skc_program_source
{
  char const * name;
  char const * options;
  char const * src;
  size_t const srclen;
};

//
// THIS IS A RELATIVELY COMPACT WAY OF DECLARING EACH PROGRAM SOURCE
// AND ITS BUILD OPTIONS
//

union skc_program_sources
{
  struct {
    struct skc_program_source block_pool_init;
    struct skc_program_source paths_copy;
    struct skc_program_source fills_expand;
    struct skc_program_source rasterize;
    struct skc_program_source segment_ttrk;
    struct skc_program_source rasters_alloc;
    struct skc_program_source prefix;
    struct skc_program_source place;
    struct skc_program_source segment_ttck;
    struct skc_program_source render;
    struct skc_program_source paths_reclaim;
    struct skc_program_source rasters_reclaim;
  };
  struct skc_program_source sources[];
};

typedef size_t * (*skc_grid_shaper)(size_t    const work_size,
                                    cl_uint * const work_dim,
                                    size_t  * const global_work_size,
                                    size_t  * const local_work_size);

struct skc_program_kernel
{
  char const *         name;
  skc_grid_shaper      shaper;
  skc_device_kernel_id id;
};

union skc_program_kernels
{
  struct {
    struct skc_program_kernel block_pool_init[2];
    struct skc_program_kernel paths_copy     [2];
    struct skc_program_kernel fills_expand   [1];
    struct skc_program_kernel rasterize      [6];
    struct skc_program_kernel segment_ttrk   [1];
    struct skc_program_kernel rasters_alloc  [1];
    struct skc_program_kernel prefix         [1];
    struct skc_program_kernel place          [1];
    struct skc_program_kernel segment_ttck   [1];
    struct skc_program_kernel render         [1];
    struct skc_program_kernel paths_reclaim  [1];
    struct skc_program_kernel rasters_reclaim[1];
  };
  struct skc_program_kernel kernels[];
};
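
//
// Editorial note (not in the original source): the two unions above
// overlay their named members with a flat array so a program or kernel
// can be referenced either by name (e.g. program_kernels.rasterize) or
// by index (e.g. program_kernels.kernels[type], as the enqueue path at
// the bottom of this file does).  This assumes the declaration order
// above matches the skc_device_kernel_id enumeration.
//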

//
//
//

#if   SKC_KERNEL_SPIRV  // PROGRAM IS SPIR-V
#define SKC_KERNEL_SUFFIX(n)  n ## _pre_spv
#elif SKC_KERNEL_BINARY // PROGRAM IS BINARY
#define SKC_KERNEL_SUFFIX(n)  n ## _pre_ir
#elif SKC_KERNEL_SRC    // PROGRAM IS SOURCE CODE
#define SKC_KERNEL_SUFFIX(n)  n ## _pre_cl
#else
#error "SKC_KERNEL_???"
#endif

//
//
//

#define SKC_PROGRAM_SOURCE_EXPAND(k,s,o)  .k = { SKC_STRINGIFY(k), o, s, sizeof(s) }
#define SKC_PROGRAM_SOURCE(k,o)           SKC_PROGRAM_SOURCE_EXPAND(k,SKC_KERNEL_SUFFIX(k),o)
#define SKC_PROGRAM_KERNEL(k)             "skc_kernel_" SKC_STRINGIFY(k), SKC_CONCAT(skc_device_shaper_,k)
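
//
// Illustrative expansion (not in the original source): with
// SKC_KERNEL_BINARY selected above,
//
//   SKC_PROGRAM_SOURCE(render,SKC_BUILD_OPTIONS)
//
// expands to the designated initializer
//
//   .render = { "render", SKC_BUILD_OPTIONS, render_pre_ir, sizeof(render_pre_ir) }
//
// and SKC_PROGRAM_KERNEL(render) expands to
//
//   "skc_kernel_render", skc_device_shaper_render
//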

//
//
//

static
size_t *
skc_device_shaper_block_pool_init_ids(size_t    const work_size,
                                      cl_uint * const work_dim,
                                      size_t  * const work_global,
                                      size_t  * const work_local)
{
  work_dim   [0] = 1;
  work_global[0] = work_size;

  return NULL; // let runtime figure out local work size
}

static
size_t *
skc_device_shaper_block_pool_init_atomics(size_t    const work_size,
                                          cl_uint * const work_dim,
                                          size_t  * const work_global,
                                          size_t  * const work_local)
{
  work_dim   [0] = 1;
  work_global[0] = 2;

  return NULL; // let runtime figure out local work size
}

static
size_t *
skc_device_shaper_paths_alloc(size_t    const work_size,
                              cl_uint * const work_dim,
                              size_t  * const work_global,
                              size_t  * const work_local)
{
  work_dim   [0] = 1;
  work_global[0] = 1;

  return NULL; // let runtime figure out local work size
}

static
size_t *
skc_device_shaper_paths_copy(size_t    const work_size,
                             cl_uint * const work_dim,
                             size_t  * const work_global,
                             size_t  * const work_local)
{
  work_dim   [0] = 1;
  work_global[0] = SKC_PATHS_COPY_SUBGROUP_SIZE * work_size;
#if 0
  work_local [0] = SKC_PATHS_COPY_SUBGROUP_SIZE;

  return work_local;
#else
  return NULL; // let runtime figure out local work size
#endif
}

static
size_t *
skc_device_shaper_fills_expand(size_t    const work_size,
                               cl_uint * const work_dim,
                               size_t  * const work_global,
                               size_t  * const work_local)
{
  work_dim   [0] = 1;
  work_global[0] = SKC_FILLS_EXPAND_SUBGROUP_SIZE * work_size;
  work_local [0] = SKC_FILLS_EXPAND_SUBGROUP_SIZE;

  return work_local;
}

static
size_t *
skc_device_shaper_rasterize(size_t    const work_size,
                            cl_uint * const work_dim,
                            size_t  * const work_global,
                            size_t  * const work_local)
{
  work_dim   [0] = 1;
  work_global[0] = SKC_RASTERIZE_SUBGROUP_SIZE * work_size;
  work_local [0] = SKC_RASTERIZE_SUBGROUP_SIZE;

  return work_local;
}

static
size_t *
skc_device_shaper_rasterize_all(size_t    const work_size,
                                cl_uint * const work_dim,
                                size_t  * const work_global,
                                size_t  * const work_local)
{
  return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
}

static
size_t *
skc_device_shaper_rasterize_lines(size_t    const work_size,
                                  cl_uint * const work_dim,
                                  size_t  * const work_global,
                                  size_t  * const work_local)
{
  return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
}

static
size_t *
skc_device_shaper_rasterize_quads(size_t    const work_size,
                                  cl_uint * const work_dim,
                                  size_t  * const work_global,
                                  size_t  * const work_local)
{
  return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
}

static
size_t *
skc_device_shaper_rasterize_cubics(size_t    const work_size,
                                   cl_uint * const work_dim,
                                   size_t  * const work_global,
                                   size_t  * const work_local)
{
  return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
}

static
size_t *
skc_device_shaper_rasterize_rat_quads(size_t    const work_size,
                                      cl_uint * const work_dim,
                                      size_t  * const work_global,
                                      size_t  * const work_local)
{
  return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
}

static
size_t *
skc_device_shaper_rasterize_rat_cubics(size_t    const work_size,
                                       cl_uint * const work_dim,
                                       size_t  * const work_global,
                                       size_t  * const work_local)
{
  return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
}

static
size_t *
skc_device_shaper_rasters_alloc(size_t    const work_size,
                                cl_uint * const work_dim,
                                size_t  * const work_global,
                                size_t  * const work_local)
{
  // round up to whole groups
  size_t gs = SKC_ROUND_UP(work_size,SKC_RASTERS_ALLOC_GROUP_SIZE);

  work_dim   [0] = 1;
  work_global[0] = gs;
  work_local [0] = SKC_RASTERS_ALLOC_GROUP_SIZE;

  return work_local;
}

static
size_t *
skc_device_shaper_segment_ttrk(size_t    const work_size,
                               cl_uint * const work_dim,
                               size_t  * const work_global,
                               size_t  * const work_local)
{
  // work_size is number of keys -- round up to a whole slab
  size_t keys_ru = SKC_ROUND_UP(work_size,HS_SLAB_WIDTH*HS_SLAB_HEIGHT);

  work_dim   [0] = 1;
  work_global[0] = keys_ru / HS_SLAB_HEIGHT;
  work_local [0] = HS_SLAB_WIDTH; // or just return NULL

  return work_local;
}
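
//
// Worked example (illustrative only -- the slab dimensions below are
// hypothetical): with an 8-lane-wide, 16-row slab, 1000 keys round up
// to keys_ru = 1024, giving work_global[0] = 1024 / 16 = 64 work-items
// at a local size of HS_SLAB_WIDTH = 8.  skc_device_shaper_segment_ttck
// below uses the same shape.
//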

static
size_t *
skc_device_shaper_segment_ttck(size_t    const work_size,
                               cl_uint * const work_dim,
                               size_t  * const work_global,
                               size_t  * const work_local)
{
  // work_size is number of keys -- round up to a whole slab
  size_t keys_ru = SKC_ROUND_UP(work_size,HS_SLAB_WIDTH*HS_SLAB_HEIGHT);

  work_dim   [0] = 1;
  work_global[0] = keys_ru / HS_SLAB_HEIGHT;
  work_local [0] = HS_SLAB_WIDTH; // or just return NULL

  return work_local;
}

static
size_t *
skc_device_shaper_prefix(size_t    const work_size,
                         cl_uint * const work_dim,
                         size_t  * const work_global,
                         size_t  * const work_local)
{
  work_dim   [0] = 1;
  work_global[0] = SKC_PREFIX_SUBGROUP_SIZE * work_size;
  work_local [0] = SKC_PREFIX_SUBGROUP_SIZE;

  return work_local;
}

static
size_t *
skc_device_shaper_place(size_t    const work_size,
                        cl_uint * const work_dim,
                        size_t  * const work_global,
                        size_t  * const work_local)
{
  work_dim   [0] = 1;
  work_global[0] = SKC_PLACE_SUBGROUP_SIZE * work_size;
  work_local [0] = SKC_PLACE_SUBGROUP_SIZE;

  return work_local;
}

static
size_t *
skc_device_shaper_render(size_t    const work_size,
                         cl_uint * const work_dim,
                         size_t  * const work_global,
                         size_t  * const work_local)
{
  work_dim   [0] = 1;
  work_global[0] = SKC_RENDER_SUBGROUP_SIZE * work_size;
  work_local [0] = SKC_RENDER_SUBGROUP_SIZE;

  return work_local;
}

static
size_t *
skc_device_shaper_paths_reclaim(size_t    const work_size,
                                cl_uint * const work_dim,
                                size_t  * const work_global,
                                size_t  * const work_local)
{
  assert(work_size == SKC_RECLAIM_ARRAY_SIZE);

  work_dim   [0] = 1;
  work_global[0] = SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE;

  return NULL; // let runtime figure out local work size
}

static
size_t *
skc_device_shaper_rasters_reclaim(size_t    const work_size,
                                  cl_uint * const work_dim,
                                  size_t  * const work_global,
                                  size_t  * const work_local)
{
  assert(work_size == SKC_RECLAIM_ARRAY_SIZE);

  work_dim   [0] = 1;
  work_global[0] = SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE;

  return NULL; // let runtime figure out local work size
}

//
//
//

static union skc_program_sources const program_sources = {
  SKC_PROGRAM_SOURCE(block_pool_init,SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(paths_copy,     SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(fills_expand,   SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(rasterize,      SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(segment_ttrk,   SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(rasters_alloc,  SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(prefix,         SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(place,          SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(segment_ttck,   SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(render,         SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(paths_reclaim,  SKC_BUILD_OPTIONS),
  SKC_PROGRAM_SOURCE(rasters_reclaim,SKC_BUILD_OPTIONS)
};

static union skc_program_kernels const program_kernels = {

  .block_pool_init = { { SKC_PROGRAM_KERNEL(block_pool_init_ids),     SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS     },
                       { SKC_PROGRAM_KERNEL(block_pool_init_atomics), SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS } },

  .paths_copy      = { { SKC_PROGRAM_KERNEL(paths_alloc), SKC_DEVICE_KERNEL_ID_PATHS_ALLOC },
                       { SKC_PROGRAM_KERNEL(paths_copy),  SKC_DEVICE_KERNEL_ID_PATHS_COPY  } },

  .fills_expand    = { { SKC_PROGRAM_KERNEL(fills_expand), SKC_DEVICE_KERNEL_ID_FILLS_EXPAND } },

  .rasterize       = { { SKC_PROGRAM_KERNEL(rasterize_all),        SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL        },
                       { SKC_PROGRAM_KERNEL(rasterize_lines),      SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES      },
                       { SKC_PROGRAM_KERNEL(rasterize_quads),      SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS      },
                       { SKC_PROGRAM_KERNEL(rasterize_cubics),     SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS     },
                       { SKC_PROGRAM_KERNEL(rasterize_rat_quads),  SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_QUADS  },
                       { SKC_PROGRAM_KERNEL(rasterize_rat_cubics), SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_CUBICS } },

  .segment_ttrk    = { { SKC_PROGRAM_KERNEL(segment_ttrk), SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK } },

  .rasters_alloc   = { { SKC_PROGRAM_KERNEL(rasters_alloc), SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC } },

  .prefix          = { { SKC_PROGRAM_KERNEL(prefix), SKC_DEVICE_KERNEL_ID_PREFIX } },

  .place           = { { SKC_PROGRAM_KERNEL(place), SKC_DEVICE_KERNEL_ID_PLACE } },

  .segment_ttck    = { { SKC_PROGRAM_KERNEL(segment_ttck), SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK } },

  .render          = { { SKC_PROGRAM_KERNEL(render), SKC_DEVICE_KERNEL_ID_RENDER } },

  .paths_reclaim   = { { SKC_PROGRAM_KERNEL(paths_reclaim), SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM } },

  .rasters_reclaim = { { SKC_PROGRAM_KERNEL(rasters_reclaim), SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM } }
};

//
//
//

struct skc_device
{
  //
  // FIXME -- an OpenCL 2.1+ device would clone these kernels in a
  // multithreaded system.
  //
  // Not having the ability to clone kernels (while keeping their
  // already-set "sticky" args) was an oversight in previous versions
  // of OpenCL.
  //
  // For now, we can probably get away with just a single kernel
  // instance as long as its args are set and the kernel is launched
  // before its arguments are stomped on.
  //
  cl_kernel kernels [SKC_DEVICE_KERNEL_ID_COUNT];
  size_t    reqd_szs[SKC_DEVICE_KERNEL_ID_COUNT][3];
};
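
//
// A hedged sketch (not part of the original source) of the per-thread
// cloning the comment above alludes to: on an OpenCL 2.1+ runtime each
// thread could take a private copy of a kernel object before setting
// its args.  The helper name is hypothetical.
//
#if 0
static
cl_kernel
skc_device_clone_kernel(struct skc_device  * const device,
                        skc_device_kernel_id const type)
{
  cl_int    cl_err;
  cl_kernel clone = clCloneKernel(device->kernels[type],&cl_err); cl_ok(cl_err);

  return clone; // caller releases with clReleaseKernel()
}
#endif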

//
// CREATE KERNELS
//

static
void
skc_device_create_kernels(struct skc_runtime        * const runtime,
                          struct skc_program_kernel const * const kernels,
                          skc_uint                    const kernel_count,
                          cl_program                        program)
{
  for (skc_uint ii=0; ii<kernel_count; ii++)
    {
      cl_int cl_err;

      char const *   name = kernels[ii].name;
      skc_uint const id   = kernels[ii].id;

      fprintf(stderr,"\t\"%s\"\n",name);

      // create the kernel
      runtime->device->kernels[id] = clCreateKernel(program,name,&cl_err); cl_ok(cl_err);

      //
      // release program now
      //
      // FIXME -- if/when we multithread then we need to clone kernels
      // (>=2.1) or keep programs around (<=2.0)
      //

      // get workgroup size
      cl(GetKernelWorkGroupInfo(runtime->device->kernels[id],
                                runtime->cl.device_id,
                                CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
                                sizeof(runtime->device->reqd_szs[0]),
                                runtime->device->reqd_szs[id],
                                NULL));

      //
      // GEN9+ PROBING
      //
#define SKC_TARGET_GEN9
#ifdef SKC_TARGET_GEN9

#define CL_DEVICE_SUB_GROUP_SIZES_INTEL        0x4108
#define CL_KERNEL_SPILL_MEM_SIZE_INTEL         0x4109
#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A

      cl_ulong spill_mem_size;

      cl(GetKernelWorkGroupInfo(runtime->device->kernels[id],
                                runtime->cl.device_id,
                                CL_KERNEL_SPILL_MEM_SIZE_INTEL,
                                sizeof(spill_mem_size),
                                &spill_mem_size,
                                NULL));

      fprintf(stderr,"\t\tspill mem size: %lu bytes\n",
              (unsigned long)spill_mem_size);

      cl_ulong local_mem_size;

      cl(GetKernelWorkGroupInfo(runtime->device->kernels[id],
                                runtime->cl.device_id,
                                CL_KERNEL_LOCAL_MEM_SIZE,
                                sizeof(local_mem_size),
                                &local_mem_size,
                                NULL));

      fprintf(stderr,"\t\tlocal mem size: %lu bytes\n",
              (unsigned long)local_mem_size);
#endif
    }
}

static
void
skc_device_build_program(struct skc_runtime        * const runtime,
                         struct skc_program_source const * const source,
                         struct skc_program_kernel const * const kernels,
                         skc_uint                    const kernel_count)
{
  cl_program program;

  fprintf(stderr,"%-20s: ",source->name);

  cl_int cl_err;

#if SKC_KERNEL_SPIRV    // PROGRAM IS SPIR-V

  fprintf(stderr,"Creating (SPIR-V) ... ");

  program = clCreateProgramWithIL(runtime->cl.context,
                                  source->src,
                                  source->srclen,
                                  &cl_err);

#elif SKC_KERNEL_BINARY // PROGRAM IS BINARY

  fprintf(stderr,"Creating (Binary) ... ");

  cl_int status;
  program = clCreateProgramWithBinary(runtime->cl.context,
                                      1,
                                      &runtime->cl.device_id,
                                      &source->srclen,
                                      (unsigned char const *[]){ source->src },
                                      &status,
                                      &cl_err);

#elif SKC_KERNEL_SRC    // PROGRAM IS SOURCE CODE

  fprintf(stderr,"Creating (Source) ... ");

  program = clCreateProgramWithSource(runtime->cl.context,
                                      1,
                                      (char const *[]){ source->src },
                                      &source->srclen,
                                      &cl_err);
#else

#error "SKC_KERNEL_???"

#endif

  cl_ok(cl_err);

  fprintf(stderr,"Building ... ");

  // build the program
  cl(BuildProgram(program,
                  1,
                  &runtime->cl.device_id,
                  source->options, // build options are ignored by binary
                  NULL,
                  NULL));

  fprintf(stderr,"Done\n");

  // build the kernels
  skc_device_create_kernels(runtime,kernels,kernel_count,program);

  // we're done with program for now
  // can always recover it from a kernel instance
  cl(ReleaseProgram(program));
}

//
// RELEASE KERNELS
//

static
void
skc_device_release_kernels(struct skc_device * const device)
{
  for (skc_int ii=0; ii<SKC_COUNT_OF(device->kernels); ii++)
    cl(ReleaseKernel(device->kernels[ii]));
}

cl_kernel
skc_device_acquire_kernel(struct skc_device  * const device,
                          skc_device_kernel_id const type)
{
  cl_kernel kernel = device->kernels[type];

  cl(RetainKernel(kernel));

  return kernel;
}

void
skc_device_release_kernel(struct skc_device * const device,
                          cl_kernel                 kernel)
{
  cl(ReleaseKernel(kernel));
}

//
// INITIALIZE KERNEL ARGS
//
// FIXME
//
// pre-assign any kernel arguments that are never going to change --
// for example, the block pool
//
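
//
// A hedged sketch (not part of the original source) of what that
// pre-assignment might look like -- the argument indices and the
// bp_blocks/bp_ids buffer parameters are hypothetical:
//
#if 0
static
void
skc_device_init_kernel_args(struct skc_runtime * const runtime,
                            skc_device_kernel_id const type,
                            cl_uint              const arg_base,
                            cl_mem               const bp_blocks,
                            cl_mem               const bp_ids)
{
  cl_kernel kernel = runtime->device->kernels[type];

  // sticky args only need to be set once per kernel instance
  cl(SetKernelArg(kernel,arg_base+0,sizeof(bp_blocks),&bp_blocks));
  cl(SetKernelArg(kernel,arg_base+1,sizeof(bp_ids),   &bp_ids));
}
#endif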

//
//
//

#define SKC_DEVICE_BUILD_PROGRAM(p) \
  skc_device_build_program(runtime,&program_sources.p,program_kernels.p,SKC_COUNT_OF(program_kernels.p))

void
skc_device_create(struct skc_runtime * const runtime)
{
  struct skc_device * const device = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*device));

  // hang device off of runtime
  runtime->device = device;

  // hang config off of runtime
  runtime->config = &config;

  // create kernels
  SKC_DEVICE_BUILD_PROGRAM(block_pool_init);
  SKC_DEVICE_BUILD_PROGRAM(paths_copy);
  SKC_DEVICE_BUILD_PROGRAM(fills_expand);
  SKC_DEVICE_BUILD_PROGRAM(rasterize);
  SKC_DEVICE_BUILD_PROGRAM(segment_ttrk);
  SKC_DEVICE_BUILD_PROGRAM(rasters_alloc);
  SKC_DEVICE_BUILD_PROGRAM(prefix);
  SKC_DEVICE_BUILD_PROGRAM(place);
  SKC_DEVICE_BUILD_PROGRAM(segment_ttck);
  SKC_DEVICE_BUILD_PROGRAM(render);
  SKC_DEVICE_BUILD_PROGRAM(paths_reclaim);
  SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim);

  // create HotSort instance
  runtime->hs = hs_cl_create(&hs_intel_gen8_u64,
                             runtime->cl.context,
                             runtime->cl.device_id);
}

void
skc_device_dispose(struct skc_runtime * const runtime)
{
  //
  // FIXME -- dispose of programs, kernels, etc.
  //

  skc_runtime_host_perm_free(runtime,runtime->device);

  // dispose of hotsort etc.
}

//
// FIXME -- just pass the device type
//

void
skc_device_enqueue_kernel(struct skc_device  * const device,
                          skc_device_kernel_id const type,
                          cl_command_queue           cq,
                          cl_kernel                  kernel,
                          size_t               const work_size,
                          cl_uint                    num_events_in_wait_list,
                          cl_event const     * const event_wait_list,
                          cl_event           * const event)
{
  if (work_size == 0)
    return;

  cl_uint work_dim   [1];
  size_t  work_global[3];
  size_t  work_local [3];

  size_t * work_local_ptr = program_kernels.kernels[type].shaper(work_size,
                                                                 work_dim,
                                                                 work_global,
                                                                 work_local);
  cl(EnqueueNDRangeKernel(cq,
                          kernel, // device->kernels[type]
                          work_dim[0],
                          NULL,
                          work_global,
                          work_local_ptr,
                          num_events_in_wait_list,
                          event_wait_list,
                          event));
}
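
//
// A hedged usage sketch (not part of the original source) showing how
// the acquire/enqueue/release entry points above fit together; the
// queue, work size and kernel id here are placeholders:
//
#if 0
void
skc_device_example_launch(struct skc_runtime * const runtime,
                          cl_command_queue           cq,
                          size_t               const work_size)
{
  cl_kernel kernel = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_FILLS_EXPAND);

  // ... set the kernel's args here ...

  skc_device_enqueue_kernel(runtime->device,
                            SKC_DEVICE_KERNEL_ID_FILLS_EXPAND,
                            cq,
                            kernel,
                            work_size,
                            0,NULL,NULL);

  skc_device_release_kernel(runtime->device,kernel);
}
#endif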

//
//
//