• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright 2017 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can
5 * be found in the LICENSE file.
6 *
7 */
8
9//
10//
11//
12
13#include "tile.h"
14#include "common.h"
15#include "atomic_cl.h"
16#include "block_pool_cl.h"
17#include "raster_builder_cl_12.h"
18#include "kernel_cl_12.h"
19
20// #define SKC_ARCH_AVX2
21// #define SKC_RASTERIZE_SIMD_USES_SMEM
22
23#define PRINTF_ENABLE       0
24#define PRINTF_BLOCK_COUNT  0
25
26//
27// NOTE:
28//
29// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
30// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
31//
32// NOTE:
33//
34// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS.  THEY WILL BE MOVED ASAP.
35//
36//
37
38#if 0 // SKC_ARCH_AVX2
39
40// #define SKC_RASTERIZE_SUBGROUP_SIZE              1
41// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2           3
42// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP   1
43
44// #define SKC_TTXB_WORDS                           8
45
46// #define SKC_RASTERIZE_FLOAT                      float8
47// #define SKC_RASTERIZE_UINT                       uint8
48// #define SKC_RASTERIZE_INT                        int8
49// #define SKC_RASTERIZE_PREDICATE                  int8
50
51// #define SKC_RASTERIZE_BIN_BLOCK                  uint16
52// #define SKC_RASTERIZE_BIN                        uint8
53
54// #define SKC_RASTERIZE_POOL                       uint8
55// #define SKC_RASTERIZE_POOL_SCALE                 6
56
57// #define SKC_RASTERIZE_TILE_HASH_X_BITS           1
58// #define SKC_RASTERIZE_TILE_HASH_Y_BITS           2
59
60// #define SKC_RASTERIZE_VECTOR_EXPAND()            SKC_EXPAND_8()
61
62#endif
63
64//
65// SIMT
66//
67
//
// Both the block id vector and the TTSK key vector are exactly one
// subgroup wide.  The key mask is (size - 1), which is only a valid
// modulo mask when the subgroup size is a power of two.
//
#define SKC_RASTERIZE_BLOCK_ID_V_SIZE        SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_SIZE            SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_MASK            (SKC_RASTERIZE_TTSK_V_SIZE - 1)

//
// Elements processed per subgroup: lanes times per-lane vector width.
//

#define SKC_RASTERIZE_VECTOR_SIZE            (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP     (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)

//
// Packed 16:16 YX sentinel keys.
//

#define SKC_RASTERIZE_YX_INIT                0x7FFF7FFF  // { +32767, +32767 }
#define SKC_RASTERIZE_YX_INVALID             0x80008000  // { -32768, -32768 }

//
// Tile hash: X and Y hash bits are concatenated, giving
// 2^(X_BITS + Y_BITS) bins.
//

#define SKC_RASTERIZE_TILE_HASH_X_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
#define SKC_RASTERIZE_TILE_HASH_Y_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BITS         (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT    (1 << SKC_RASTERIZE_TILE_HASH_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_BITS     (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
#define SKC_RASTERIZE_TILE_HASH_BIN_MASK     SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)
96
97//
98// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
99//
100// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
101//
102// Lerp in two fma/mad ops:
103//
104//    t * b + ((-t) * a + a)
105//
106// Note: OpenCL documents mix() as being implemented as:
107//
108//    a + (b - a) * t
109//
// But this may be a native instruction on some devices. For example,
// on GEN9 there is an LRP "linear interpolation" opcode but it
// doesn't appear to support half floats.
113//
114// Feel free to toggle this option and then benchmark and inspect the
115// generated code.  We really want the double FMA to be generated when
116// there isn't support for a LERP/MIX operation.
117//
118
//
// SKC_LERP(a,b,t): interpolate from a (t == 0) to b (t == 1).
//
// Enabled branch: two nested mad() ops, t * b + ((-t) * a + a).
// Disabled branch: the built-in mix().
//
#if 1
#define SKC_LERP(a,b,t)      mad(t,b,mad(-(t),a,a))
#else
#define SKC_LERP(a,b,t)      mix(a,b,t)
#endif
124
125//
126// There is no integer MAD in OpenCL with "don't care" overflow
127// semantics.
128//
129// FIXME -- verify if the platform needs explicit MAD operations even
130// if a "--fastmath" option is available at compile time.  It might
131// make sense to explicitly use MAD calls if the platform requires it.
132//
133
//
// SKC_MAD_UINT(a,b,c): integer multiply-add, a * b + c.
//
// Enabled branch: plain multiply and add.
// Disabled branch: the built-in mad_sat(), which saturates instead of
// wrapping.
//
#if 1
#define SKC_MAD_UINT(a,b,c)  ((a) * (b) + (c))
#else
#define SKC_MAD_UINT(a,b,c)  mad_sat(a,b,c)
#endif
139
140//
141//
142//
143
//
// Word index of the calling lane's element inside subblock `id`:
// the subblock's first word plus the lane index.
//
// The argument is parenthesized so that an expression argument such
// as `a + b` is scaled as a whole.
//
#define SKC_RASTERIZE_SEGMENT(id) ((id) * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())
145
146//
147//
148//
149
150union skc_bp_elem
151{
152  skc_uint              u32;
153  skc_tagged_block_id_t tag_id;
154  skc_float             coord;
155};
156
157//
158//
159//
160
161struct skc_subgroup_smem
162{
163  //
164  // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member
165  //
166#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )
167  struct {
168    union {
169
170      skc_uint                winner;
171
172      struct {
173        skc_uint              scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
174      } aN;
175
176      struct {
177        SKC_RASTERIZE_UINT    scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
178      } vN;
179    };
180  } subgroup;
181#endif
182
183  //
184  // work-in-progress TTSB blocks and associated YX keys
185  //
186  union {
187    struct {
188      // FIXME -- some typedefs are valid here
189      skc_uint                ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS];
190      skc_uint                yx   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
191      skc_uint                id   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
192      skc_uint                count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
193    } aN;
194#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
195    struct {
196      SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
197      SKC_RASTERIZE_BIN       yx;
198      SKC_RASTERIZE_BIN       id;
199      SKC_RASTERIZE_BIN       count;
200    } vN;
201#endif
202  } bin;
203};
204
205//
206//
207//
208
//
// Lane index of the caller within its subgroup.  A subgroup of one
// (SIMD) has a single lane, so the index is the constant 0.
//
#if ( SKC_RASTERIZE_SUBGROUP_SIZE != 1 )
#define skc_subgroup_lane()  get_sub_group_local_id()
#else
#define skc_subgroup_lane()  0
#endif
214
215//
216//
217//
218
//
// Perspective divide: scales xp and yp in place by
//
//   1 / (x * tv->w0 + y * tv->w1 + 1)
//
// `tv` must be a pointer to a struct with `w0` and `w1` members and
// `xp`/`yp` must be modifiable lvalues.  All arguments are
// parenthesized so expression arguments expand safely.
//
// The expansion is a bare compound statement (not do/while) to stay
// compatible with existing call sites.
//
#define SKC_PROJECT(tv,x,y,xp,yp)                                           \
  {                                                                         \
    float const d = native_recip(fma((x),(tv)->w0,fma((y),(tv)->w1,1.0f))); \
    (xp) *= d;                                                              \
    (yp) *= d;                                                              \
  }
225
226//
227// replenish block ids
228//
229// note that you can't overrun the block id pool since it's a ring
230//
231
232static
233void
234skc_blocks_replenish(skc_uint                           * const blocks_next,
235                     skc_block_id_v_t                   * const blocks,
236                     __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
237                     skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
238                     __global skc_block_id_t   const    * const bp_ids)
239{
240  //
241  // get a new vector of block ids -- this is kind of a narrow
242  // allocation but subblocks help stretch out the pool.
243  //
244  // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids
245  //
246  skc_uint bp_idx = 0;
247
248  if (skc_subgroup_lane() == 0)
249    {
250      bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,
251                                                    SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads
252#if 0
253      printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);
254#endif
255    }
256
257  bp_idx       = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;
258  *blocks      = bp_ids[bp_idx];
259  *blocks_next = 0;
260}
261
262//
263//
264//
265
266static
267skc_block_id_t
268skc_blocks_get_next(skc_uint                           * const blocks_next,
269                    skc_block_id_v_t                   * const blocks,
270                    __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
271                    skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
272                    __global skc_block_id_t   const    * const bp_ids)
273{
274  // replenish?
275  if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
276    {
277      skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
278    }
279
280#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
281  //
282  // SIMT
283  //
284  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
285
286#else
287  //
288  // SIMD
289  //
290  skc_block_id_t id = blocks->s0;
291
292  skc_shuffle_down_1(*blocks);
293
294#endif
295
296  *blocks_next += 1;
297
298  return id;
299}
300
301//
302// subblock allocator
303//
304
305#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
306
307static
308skc_block_id_t
309skc_subblocks_get_next(skc_block_id_t                     * const subblocks,
310                       skc_uint                           * const blocks_next,
311                       skc_block_id_v_t                   * const blocks,
312                       __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
313                       skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
314                       __global skc_block_id_t   const    * const bp_ids)
315{
316  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
317    {
318      *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
319    }
320
321  skc_block_id_t const sb_id = *subblocks;
322
323  *subblocks += 1;
324
325#if 0
326  if (get_sub_group_local_id() == 0)
327    printf("= %u\n",sb_id);
328#endif
329
330  return sb_id;
331}
332
333
//
// Parameter list / argument list fragments for functions that need
// both the subblock cursor and the block id vector.
//
// `blocks` is declared as a pointer to skc_block_id_v_t because it is
// forwarded to skc_subblocks_get_next(), which takes that type.
//
#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_v_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS()  subblocks, blocks
336
337#else
338
//
// No subblocks: only the block id vector is passed.  `blocks` is
// declared as a pointer to skc_block_id_v_t because it is forwarded
// to skc_blocks_get_next(), which takes that type.
//
#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_v_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS()  blocks
341
342#endif
343
344//
345//
346//
347
348static
349skc_block_id_t
350skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
351                  skc_uint                           * const blocks_next,
352                  __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
353                  skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
354                  __global skc_block_id_t   const    * const bp_ids,
355                  __global SKC_ATOMIC_UINT  volatile * const cohort_atomics,
356                  skc_ttsk_v_t                       * const sk_v,
357                  skc_uint                           * const sk_v_next,
358                  __global skc_ttsk_s_t              * const sk_extent,
359                  skc_uint                             const new_yx)
360{
361#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
362  skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
363                                                       blocks_next,
364                                                       blocks,
365                                                       bp_atomics,
366                                                       bp_mask,
367                                                       bp_ids);
368#else
369  skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
370                                                    blocks,
371                                                    bp_atomics,
372                                                    bp_mask, // pow2 modulo mask for block pool ring
373                                                    bp_ids);
374#endif
375
376  if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
377    {
378      sk_v->lo = new_id;
379      sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
380#if 0
381      printf("@ ( %3u, %3u ) %u\n",
382             (new_yx >> 12) & 0xFFF,
383             (new_yx      ) & 0xFFF,
384             new_id);
385#endif
386    }
387
388  *sk_v_next += 1;
389
390  if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
391    {
392      *sk_v_next = 0;
393
394      skc_uint sk_idx = 0;
395
396      if (skc_subgroup_lane() == 0)
397        {
398          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
399            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
400#if 0
401          printf("+ %u\n",sk_idx);
402#endif
403        }
404
405      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
406
407#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
408      if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
409#endif
410        {
411          sk_extent[sk_idx] = *sk_v;
412#if 0
413          printf("> %u : %v2u\n",sk_idx,*sk_v);
414#endif
415        }
416    }
417
418  return new_id;
419}
420
421//
422//
423//
424
425static
426SKC_RASTERIZE_FLOAT
427skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
428{
429#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
430  //
431  // SIMD
432  //
433  // Note that there isn't a built-in horizontal scan for vectors so
434  // we'll define some here for various widths.
435  //
436  // FIXME -- a scalar version might be faster so put in a
437  // compile-time switch to selection between implementations
438  //
439
440#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
441  return v;
442
443#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
444  // 01
445  //  0 +
446  // --
447  // 01
448  SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
449  return w;
450
451#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
452  // 0123
453  //  012 +
454  // ----
455  // 0123
456  //   01 +
457  // ----
458  // 0123
459  //
460  SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
461  SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
462  return x;
463
464#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
465  // 01234567
466  //  0123456 +
467  // --------
468  // 01234567
469  //   012345 +
470  // --------
471  // 01234567
472  //     0123 +
473  // --------
474  // 01234567
475  //
476  SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
477  SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
478  SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
479  return y;
480
481#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
482  // 0123456789abcdef
483  //  0123456789abcde +
484  // ----------------
485  // 0123456789abcdef
486  //   0123456789abcd +
487  // ----------------
488  // 0123456789abcdef
489  //     0123456789ab +
490  // ----------------
491  // 0123456789abcdef
492  //         01234567 +
493  // ----------------
494  // 0123456789abcdef
495  //
496  SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
497  SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
498  SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
499  SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
500  return z;
501
502#endif
503
504#else
505  //
506  // SIMT
507  //
508
509  return sub_group_scan_inclusive_add(v);
510
511#endif
512}
513
514//
515//
516//
517
518static
519SKC_RASTERIZE_UINT
520skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
521{
522#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
523  //
524  // SIMD
525  //
526  // Note that there isn't a built-in horizontal scan for vectors so
527  // we'll define some here for various widths.
528  //
529  // FIXME -- a scalar version might be faster so put in a
530  // compile-time switch to selection between implementations
531  //
532
533#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
534  return v;
535
536#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
537  // 01
538  //  0 +
539  // --
540  // 01
541  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
542  return w;
543
544#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
545  // 0123
546  //  012 +
547  // ----
548  // 0123
549  //   01 +
550  // ----
551  // 0123
552  //
553  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
554  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
555  return x;
556
557#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
558  // 01234567
559  //  0123456 +
560  // --------
561  // 01234567
562  //   012345 +
563  // --------
564  // 01234567
565  //     0123 +
566  // --------
567  // 01234567
568  //
569  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
570  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
571  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
572  return y;
573
574#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
575  // 0123456789abcdef
576  //  0123456789abcde +
577  // ----------------
578  // 0123456789abcdef
579  //   0123456789abcd +
580  // ----------------
581  // 0123456789abcdef
582  //     0123456789ab +
583  // ----------------
584  // 0123456789abcdef
585  //         01234567 +
586  // ----------------
587  // 0123456789abcdef
588  //
589  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
590  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
591  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
592  SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
593  return z;
594
595#endif
596
597#else
598  //
599  // SIMT
600  //
601
602  return sub_group_scan_inclusive_add(v);
603
604#endif
605}
606
607//
608//
609//
610
611static
612SKC_RASTERIZE_UINT
613skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
614{
615#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
616  //
617  // SIMD
618  //
619  // Note that there isn't a built-in horizontal scan for vectors so
620  // we'll define some here for various widths.
621  //
622  // FIXME -- a scalar version might be faster so put in a
623  // compile-time switch to selection between implementations
624  //
625
626#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
627  return v;
628
629#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
630  // 01
631  // 00 max
632  // --
633  // 01
634  SKC_RASTERIZE_UINT const w = max(v.s00,v);
635  return w;
636
637#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
638  // 0123
639  // 0012 +
640  // ----
641  // 0123
642  // 0101 +
643  // ----
644  // 0123
645  //
646  SKC_RASTERIZE_UINT const w = max(v.s0012,v);
647  SKC_RASTERIZE_UINT const x = max(w.s0101,w);
648  return x;
649
650#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
651  // 01234567
652  // 00123456 +
653  // --------
654  // 01234567
655  // 01012345 +
656  // --------
657  // 01234567
658  // 01230123 +
659  // --------
660  // 01234567
661  //
662  SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
663  SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
664  SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
665  return y;
666
667#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
668  // 0123456789abcdef
669  // 00123456789abcde +
670  // ----------------
671  // 0123456789abcdef
672  // 010123456789abcd +
673  // ----------------
674  // 0123456789abcdef
675  // 01230123456789ab +
676  // ----------------
677  // 0123456789abcdef
678  // 0123456701234567 +
679  // ----------------
680  // 0123456789abcdef
681  //
682  SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
683  SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
684  SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
685  SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
686  return z;
687
688#endif
689
690#else
691  //
692  // SIMT
693  //
694
695  return sub_group_scan_inclusive_max(v);
696
697#endif
698}
699
700//
701//
702//
703
704static
705float
706skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
707{
708#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
709  //
710  // SIMD
711  //
712#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
713  return v;
714#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
715  return v.s1;
716#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
717  return v.s3;
718#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
719  return v.s7;
720#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
721  return v.sf;
722#endif
723
724#else
725  //
726  // SIMT
727  //
728  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
729
730#endif
731}
732
733//
734//
735//
736
737static
738SKC_RASTERIZE_UINT
739skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
740{
741#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
742  //
743  // SIMD
744  //
745#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
746  return v;
747#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
748  return v.s1;
749#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
750  return v.s3;
751#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
752  return v.s7;
753#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
754  return v.sf;
755#endif
756
757#else
758  //
759  // SIMT
760  //
761  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
762
763#endif
764}
765
766//
767//
768//
769
770static
771float
772skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
773{
774#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
775  //
776  // SIMD
777  //
778#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
779  return v;
780#else
781  return v.s0;
782#endif
783
784#else
785  //
786  // SIMT
787  //
788  return sub_group_broadcast(v,0);
789
790#endif
791}
792
793//
794//
795//
796
797static
798SKC_RASTERIZE_FLOAT
799skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
800                      SKC_RASTERIZE_UINT  const i)
801{
802#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
803  //
804  // SIMD
805  //
806#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
807  return v;
808#else
809  return shuffle(v,i);
810#endif
811
812#else
813  //
814  // SIMT
815  //
816  return intel_sub_group_shuffle(v,i);
817
818#endif
819}
820
821//
822//
823//
824
825static
826SKC_RASTERIZE_FLOAT
827skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
828                          SKC_RASTERIZE_FLOAT const c) // current
829{
830#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
831  //
832  // SIMD
833  //
834  // FIXME -- there are alternative formulations here:
835  //
836  // Option 1:
837  //
838  //   select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
839  //
840  // Option 2:
841  //
842  //   p is a scalar
843  //   t    = c.rotate(+1)
844  //   t.s0 = p;
845  //
846  // Option 3: ...
847  //
848#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
849  return p;
850#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
851  return shuffle2(p,c,(uint2)(1,2));
852#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
853  return shuffle2(p,c,(uint4)(3,4,5,6));
854#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
855  return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
856#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
857  return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
858#endif
859
860#else
861  //
862  // SIMT
863  //
864  return intel_sub_group_shuffle_up(p,c,1);
865
866#endif
867}
868
869//
870//
871//
872
873static
874bool
875skc_is_lane_first()
876{
877#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
878  //
879  // SIMD
880  //
881  return true;
882#else
883  //
884  // SIMT
885  //
886  return get_sub_group_local_id() == 0;
887#endif
888}
889
890//
891//
892//
893
894static
895SKC_RASTERIZE_FLOAT
896skc_delta_offset()
897{
898#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
899  //
900  // SIMD
901  //
902#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
903  return 1;
904#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
905  return (SKC_RASTERIZE_FLOAT)( 1, 2 );
906#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
907  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
908#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
909  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
910#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
911  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
912#endif
913
914#else
915  //
916  // SIMT
917  //
918  return 1.0f + get_sub_group_local_id();
919
920#endif
921
922}
923
924//
925//
926//
927
928static
929int
930skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
931{
932#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
933  //
934  // SIMD
935  //
936  return any(p);
937#else
938  //
939  // SIMT
940  //
941  return sub_group_any(p);
942#endif
943}
944
945//
946//
947//
948
949#define SKC_PATH_NODEWORD_IS_LAST(n)  (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)
950
951void
952skc_segment_next(__global union skc_bp_elem * const bp_elems,
953                 skc_uint                   * const nodeword,
954                 skc_block_id_t             * const id)
955{
956  if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
957    {
958      if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
959        {
960          *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
961        }
962
963      skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;
964
965      *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
966    }
967}
968
969//
970//
971//
972
973static
974SKC_RASTERIZE_FLOAT
975skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
976{
977  return native_sqrt(x * x + y * y);
978}
979
980//
981// Wang's Formula (1985)
982//
983
//
// Error tolerance: a fraction of a pixel, scaled by the subpixel
// resolution.
//
#define SKC_WANG_PIXEL_RESL   0.25f // <-- this can be tuned

#define SKC_WANG_EPSILON      (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)

//
// Per-degree constant d * (d - 1) / (8 * epsilon):
// d = 3 for cubics, d = 2 for quadratics.
//
#define SKC_WANG_CUBIC        ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
#define SKC_WANG_QUADRATIC    ((2.0f       ) / (8.0f * SKC_WANG_EPSILON))

//
// Math primitives used by the formulas below.
//
#define SKC_WANG_LENGTH(x,y)  skc_native_length(x,y)
#define SKC_WANG_SQRT(x)      native_sqrt(x)
993
994//
995//
996//
997
998static
999SKC_RASTERIZE_FLOAT
1000skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
1001                        SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
1002                        SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
1003                        SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
1004{
1005  //
1006  // Return the number of evenly spaced (in the parametric sense) line
1007  // segments that are guaranteed to be within "epsilon" error of the
1008  // curve.
1009  //
1010  // We're then going to take multiples of the reciprocal of this
1011  // number so that the segmentation can be distributed across the
1012  // subgroup.
1013  //
1014  // Note, this can probably be slightly optimized per architecture
1015  // but it's probably far from being a hotspot since it's all
1016  // straight-line unpredicated code.
1017  //
1018  // The result is an integer ranging from [1.0,#segments]
1019  //
1020  // Note that even if all of the control points are coincident, the
1021  // max(1.0f) will categorize this as a line of 1 segment.
1022  //
1023  // This is what we want!  We want to convert cubics to lines as
1024  // easily as possible and *then* cull lines that are either
1025  // horizontal or zero length.
1026  //
1027  return max(1.0f,
1028             ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *
1029                                SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),
1030                                                    fabs(t3x - 2.0f * t2x + t1x)),
1031                                                max(fabs(t2y - 2.0f * t1y + t0y),
1032                                                    fabs(t3y - 2.0f * t2y + t1y))))));
1033}
1034
1035static
1036SKC_RASTERIZE_FLOAT
1037skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
1038                            SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
1039                            SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
1040{
1041  return max(1.0f,
1042             ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
1043                                SKC_WANG_LENGTH(t2x - 2.0f * t1x + t0x,
1044                                                t2y - 2.0f * t1y + t0y))));
1045}
1046
1047//
1048// rational curves
1049//
1050
1051static
1052SKC_RASTERIZE_FLOAT
1053skc_wangs_formula_cubic_rat()
1054{
1055  return 0.0f;
1056}
1057
1058static
1059SKC_RASTERIZE_FLOAT
1060skc_wangs_formula_quad_rat()
1061{
1062  return 0.0f;
1063}
1064
1065//
1066// flush any work-in-progress blocks and return unused block ids
1067//
1068
1069static
1070void
1071skc_finalize(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
1072             __global union skc_bp_elem                 * const bp_elems,
1073             __global uint                              * const bp_ids,
1074             skc_uint                                     const bp_mask,
1075             __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
1076             skc_block_id_v_t                           * const blocks,
1077             skc_uint                                     const blocks_next,
1078             skc_ttsk_v_t                               * const sk_v,
1079             skc_uint                                     const sk_v_next,
1080             __global skc_ttsk_s_t                      * const sk_extent,
1081             __local  struct skc_subgroup_smem volatile * const smem)
1082{
1083  //
1084  // flush non-empty bins
1085  //
1086  // FIXME -- accelerate this iteration/search with a subgroup operation
1087  //
1088  for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
1089    {
1090      if (smem->bin.aN.count[ii] > 0)
1091        {
1092          skc_block_id_v_t const id  = smem->bin.aN.id[ii];
1093          skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
1094          skc_uint         const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
1095#if 0
1096          printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
1097#endif
1098          bp_elems[idx].u32 = tts;
1099        }
1100
1101      //
1102      // FIXME -- vectorize with vstoreN()
1103      //
1104    }
1105
1106  //
1107  // return remaining block ids back to the pool
1108  //
1109  skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;
1110
1111  if (blocks_rem > 0)
1112    {
1113      skc_uint bp_idx = 0;
1114
1115      if (skc_subgroup_lane() == 0)
1116        {
1117          bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);
1118
1119#if 0
1120          printf("r-: %8u + %u\n",bp_idx,blocks_rem);
1121#endif
1122        }
1123
1124      bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;
1125
1126      if (skc_subgroup_lane() >= blocks_next)
1127        {
1128          bp_ids[bp_idx] = *blocks;
1129        }
1130    }
1131
1132  //
1133  // flush work-in-progress ryx keys
1134  //
1135  if (sk_v_next > 0)
1136    {
1137      skc_uint sk_idx = 0;
1138
1139      if (skc_subgroup_lane() == 0)
1140        {
1141          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
1142            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
1143#if 0
1144          printf("* %u\n",sk_idx);
1145#endif
1146        }
1147
1148      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
1149
1150      if (skc_subgroup_lane() < sk_v_next)
1151        {
1152          sk_extent[sk_idx] = *sk_v;
1153        }
1154    }
1155}
1156
1157//
1158// If there are lanes that were unable to append to a bin because
1159// their hashes collided with a bin's current ryx key then those bins
1160// must be ejected.
1161//
1162// Note that we do not eject "full" bins because lazily waiting for a
1163// collision results in simpler code.
1164//
    // skc_flush() ejects each bin selected by a colliding lane's hash:
    //
    //   1. if the bin's count is non-zero, its TTSB words are stored to
    //      bp_elems[] starting at (bin id * SKC_DEVICE_SUBBLOCK_WORDS)
    //   2. the bin's words are reset to SKC_TTS_INVALID, its yx is
    //      replaced with a colliding lane's yx, it receives a new id
    //      and its count is zeroed
    //
    // Arguments:
    //
    //   bp_atomics     - SIMT: forwarded to skc_ttsk_v_append()
    //                    SIMD: atomically incremented to claim pool ids
    //   bp_elems       - receives the ejected TTSB words (as .u32)
    //   bp_ids         - SIMT: forwarded to skc_ttsk_v_append()
    //                    SIMD: read to refill the smem id pool
    //   bp_mask        - pow2 modulo mask for the block pool ring
    //   cohort_atomics - SIMT: forwarded to skc_ttsk_v_append()
    //                    SIMD: atomically incremented to claim key slots
    //   subblocks,
    //   blocks         - not named in the body; presumably consumed
    //                    through SKC_SUBBLOCKS_BLOCKS_ARGS() -- TODO confirm
    //   blocks_next,
    //   sk_v,
    //   sk_v_next,
    //   sk_extent      - forwarded to skc_ttsk_v_append() (SIMT path)
    //   smem           - __local bins plus the shared "winner" word
    //   hash           - bin index computed by each lane/component
    //   yx             - tile yx key computed by each lane/component
    //   is_collision   - non-zero for lanes/components whose bin must be
    //                    ejected; passed by value and cleared locally
    //
1165
1166static
1167void
1168skc_flush(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
1169          __global union skc_bp_elem                 * const bp_elems,
1170          __global uint                              * const bp_ids,
1171          skc_uint                                     const bp_mask,
1172          __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
1173          skc_block_id_t                             * const subblocks,
1174          skc_block_id_v_t                           * const blocks,
1175          skc_uint                                   * const blocks_next,
1176          skc_ttsk_v_t                               * const sk_v,
1177          skc_uint                                   * const sk_v_next,
1178          __global skc_ttsk_s_t                      * const sk_extent,
1179          __local  struct skc_subgroup_smem volatile * const smem,
1180          SKC_RASTERIZE_UINT                           const hash,
1181          SKC_RASTERIZE_UINT                           const yx,
1182          SKC_RASTERIZE_PREDICATE                            is_collision) // pass by value -- modified locally in the SIMT loop
1183{
1184#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
1185  //
1186  // SIMD
1187  //
1188
1189  //
1190  // FIXME -- this code is now stale with the changes to the
1191  // subblock/block allocation strategy
1192  //
      // NOTE(review): this branch uses "ttsk_ryx" and "cmd" below, neither
      // of which is a parameter of this function -- presumably leftovers
      // from the older allocation strategy; confirm before enabling.
      //
1193
1194  //
1195  // get local TTSB ID queue count
1196  //
1197  skc_uint ttsb_id_count  = smem->pool.count; // scalar
1198
1199  // init hash bit mask -- one bit per hash value already processed
1200  skc_uint component_mask = 0;
1201
1202  for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)
1203    {
1204      // if no collision continue
1205      if (((int*)&is_collision)[cc] == 0)
1206        continue;
1207
1208      uint const winner        = ((uint*)&hash)[cc];
1209      uint const component_bit = 1u << winner;
1210
1211      // if already processed this hash then continue
1212      if (component_mask & component_bit)
1213        continue;
1214
1215      // update component mask
1216      component_mask |= component_bit;
1217
1218      //
1219      // new winner requires ejecting the old TTSB
1220      //
1221      if (smem->bin.aN.count[winner] > 0)
1222        {
1223          skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
1224
1225          bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
1226        }
1227
1228        //
1229        // ensure there is at least one TTSK and TTSB ID
1230        //
1231        if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)
1232          {
1233            //
1234            // update remaining count
1235            //
1236            ttsb_id_count = 0;
1237
1238            //
1239            // flush accumulated ttsk_ryx keys
1240            //
1241            uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
1242              (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count
1243
1244#if 0
1245            printf("# %u\n",idx);
1246#endif
1247
1248            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
1249              {
1250                ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii);
1251              }
1252
1253            //
1254            // allocate more ttsb ids from pool
1255            //
1256            uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads
1257
1258            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
1259              smem->pool.aN.id[ii] = bp_ids[id + ii];
1260          }
1261
1262      //
1263      // invalidate the winning block
1264      //
1265
1266      //
1267      // update bin with winning yx, new ttsb id and zero count
1268      //
1269      // all lanes are loading/storing from/to the same index
1270      //
1271      smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );
1272      smem->bin.aN.id   [winner] = smem->pool.aN.id[ttsb_id_count];
1273      smem->bin.aN.yx   [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];
1274      smem->bin.aN.count[winner] = 0;
1275
1276      //
1277      // update count
1278      //
1279      ttsb_id_count += 1;
1280    }
1281
1282  //
1283  // save count
1284  //
1285  smem->pool.count = ttsb_id_count;
1286
1287#else
1288  //
1289  // SIMT
1290  //
      // Each pass of this loop ejects exactly one bin ("winner") and
      // retires every colliding lane whose hash equals that bin index.
      // The single smem word "subgroup.winner" is used twice per pass:
      // first to agree on a hash, then to agree on a yx.
      //
1291
1292  do {
1293    //
1294    // only one lane will win! -- all colliding lanes store their hash
        // to the same smem word
1295    //
1296    if (is_collision)
1297      smem->subgroup.winner = hash;
1298
1299    barrier(CLK_LOCAL_MEM_FENCE);
1300
1301    //
1302    // which bin is being ejected?
1303    //
1304    skc_uint const winner = smem->subgroup.winner;
1305
1306    //
1307    // which colliding hash is taking over the bin?
1308    //
1309    SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);
1310
1311    //
1312    // all lanes with the same hash will try to store but only one
1313    // lane will win -- the word now holds a yx instead of a hash
1314    //
1315    if (is_winner)
1316      smem->subgroup.winner = yx;
1317
1318    barrier(CLK_LOCAL_MEM_FENCE);
1319
1320    //
1321    // flush this block to the pool -- each lane stores one word
1322    //
1323    if (smem->bin.aN.count[winner] > 0)
1324      {
1325        skc_block_id_v_t const id  = smem->bin.aN.id[winner];
1326        skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
1327        skc_uint         const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
1328#if 0
1329        printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
1330#endif
1331        bp_elems[idx].u32 = tts;
1332      }
1333
1334    //
1335    // append new ttsk -- the returned id becomes the bin's new id
1336    //
1337    skc_uint       const new_yx = smem->subgroup.winner;
1338    skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
1339                                                    blocks_next,
1340                                                    bp_atomics,
1341                                                    bp_mask, // pow2 modulo mask for block pool ring
1342                                                    bp_ids,
1343                                                    cohort_atomics,
1344                                                    sk_v,
1345                                                    sk_v_next,
1346                                                    sk_extent,
1347                                                    new_yx);
1348
1349#if 0
1350    if (get_sub_group_local_id() == 0) {
1351      printf(">>> %9u\n",new_id);
1352    }
1353#endif
1354
1355    //
1356    // update bin with winning yx, new ttsb id and zero count
1357    //
1358    smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
1359    smem->bin.aN.yx   [winner]                      = new_yx;
1360    smem->bin.aN.id   [winner]                      = new_id;
1361    smem->bin.aN.count[winner]                      = 0;
1362
1363    //
1364    // remove all lanes matching this hash
1365    //
1366    is_collision = is_collision && !is_winner;
1367
1368    //
1369    // exit if nothing left to do
1370    //
1371  } while (sub_group_any(is_collision));
1372
1373#endif
1374}
1375
1376//
1377// scatter scan max
1378//
    // Computes, for every lane/component, the index of the "source"
    // lane/component whose work item it should process next.  The caller
    // uses the result as a shuffle index.
    //
    //   smem : __local scratchpad (only touched by the smem-based variants)
    //   iss  : as passed by skc_sliver(), a running inclusive prefix sum of
    //          per-lane segment counts
    //   ess  : as passed by skc_sliver(), the matching exclusive prefix sum
    //
    // A lane with (iss > 0) and (ess < SKC_RASTERIZE_ELEMS_PER_SUBGROUP)
    // scatters its own index into slot max(ess,0) of a zeroed scratch
    // array.  An inclusive max scan over that array then carries each
    // scattered index to the slots on its right, and the scan result is
    // returned.
    //
1379static
1380SKC_RASTERIZE_UINT
1381skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
1382                     SKC_RASTERIZE_FLOAT                         const iss,
1383                     SKC_RASTERIZE_FLOAT                         const ess)
1384{
1385  //
1386  // prefix sums determine which lanes we're going to work on next
1387  //
      // max(ess,0) keeps the converted index non-negative for lanes whose
      // exclusive sum has already dropped below zero
      //
1388  SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
1389  SKC_RASTERIZE_UINT      const scratch_idx      = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));
1390
1391#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
1392  //
1393  // SIMD
1394  //
1395#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
1396  //
1397  // SIMD APPROACH 1: SIMT'ISH
1398  //
1399
1400  // zero the volatile smem scratchpad using vector syntax
1401  smem->subgroup.vN.scratch[0] = ( 0 );
1402
      // per-component scatter: stores macro argument I (presumably the
      // component index -- confirm against the SKC_EXPAND_*() definitions)
1403#undef  SKC_EXPAND_X
1404#define SKC_EXPAND_X(I,S,C,P,A)                         \
1405  if (is_scratch_store C)                               \
1406    smem->subgroup.aN.scratch[scratch_idx C] = I;
1407
1408  SKC_RASTERIZE_VECTOR_EXPAND();
1409
1410  // propagate lanes to right using max scan
1411  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
1412  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
1413
1414#else
1415  //
1416  // SIMD APPROACH 2: SCALAR'ISH
1417  //
      // same scatter + max scan, but performed directly on the components
      // of a private vector instead of going through smem
      //
1418
1419  SKC_RASTERIZE_UINT source = ( 0 );
1420
1421#undef  SKC_EXPAND_X
1422#define SKC_EXPAND_X(I,S,C,P,A)                 \
1423  if (is_scratch_store C)                       \
1424    ((uint *)&source)[scratch_idx C] = I;
1425
1426  SKC_RASTERIZE_VECTOR_EXPAND();
1427
      // serial inclusive max scan across the components
1428  for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
1429    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
1430#endif
1431
1432#else
1433  //
1434  // SIMT
1435  //
1436
1437  //
1438  // zero the volatile smem scratchpad using vector syntax
1439  //
1440  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );
1441
1442  //
1443  // store source lane at starting lane
1444  //
      // NOTE(review): there is no barrier between the zeroing store, this
      // scatter store and the load below -- presumably this relies on
      // lock-step subgroup execution plus the volatile qualifier; confirm
      //
1445  if (is_scratch_store)
1446    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();
1447
1448  //
1449  // propagate lanes to right using max scan
1450  //
1451  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
1452  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
1453#endif
1454
1455  return source;
1456}
1457
1458//
1459// sliver lines into subpixels
1460//
    // skc_sliver() takes one line segment per lane/component,
    // (l0x,l0y)-(l1x,l1y), and:
    //
    //   1. y-slivers it into segments that are at most 1 pixel high
    //   2. x-slivers each of those into segments that are vertical or at
    //      most 1 pixel wide
    //   3. packs every non-horizontal result into a 32-bit TTS word and
    //      appends it to the smem bin selected by a hash of its tile
    //      coordinates
    //   4. calls skc_flush() whenever active lanes could not be appended
    //
    // Work is redistributed across the subgroup at both slivering levels
    // with skc_scatter_scan_max() followed by subgroup shuffles.
    //
    // All arguments other than smem and l0x/l0y/l1x/l1y are only forwarded
    // unchanged to skc_flush().
    //
1461
1462static
1463void
1464skc_sliver(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
1465           __global union skc_bp_elem                 * const bp_elems,
1466           __global uint                              * const bp_ids,
1467           skc_uint                                     const bp_mask,
1468           __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
1469           skc_block_id_t                             * const subblocks,
1470           skc_block_id_v_t                           * const blocks,
1471           skc_uint                                   * const blocks_next,
1472           skc_ttsk_v_t                               * const sk_v,
1473           skc_uint                                   * const sk_v_next,
1474           __global skc_ttsk_s_t                      * const sk_extent,
1475           __local  struct skc_subgroup_smem volatile * const smem,
1476           SKC_RASTERIZE_FLOAT                          const l0x,
1477           SKC_RASTERIZE_FLOAT                          const l0y,
1478           SKC_RASTERIZE_FLOAT                          const l1x,
1479           SKC_RASTERIZE_FLOAT                          const l1y)
1480{
1481  //
1482  // Y-SLIVERING
1483  // -----------
1484  //
1485  // immediately sliver all multi-pixel lines into 1-pixel high
1486  // lines
1487  //
1488  // note this implicitly squelches horizontal lines
1489  //
1490  // there is another test for horizontal lines after x-slivering
1491  // is complete
1492  //
1493
1494  //
1495  // will we need to flip the sign of y_delta ?
1496  //
      // dy_xor is the float sign bit for lines that run in decreasing y
      //
1497  SKC_RASTERIZE_PREDICATE const y_lt   = (l0y <= l1y);
1498  SKC_RASTERIZE_UINT      const dy_xor = y_lt ? 0 : 0x80000000;
1499
1500  //
1501  // save 1/dy
1502  //
1503  SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);
1504
1505  //
1506  // how many non-horizontal subpixel y-axis slivers are there?
1507  //
1508  SKC_RASTERIZE_FLOAT const y_min   = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
1509  SKC_RASTERIZE_FLOAT const y_max   = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
1510  SKC_RASTERIZE_FLOAT const y_base  = y_lt ? y_min : y_max;
1511  SKC_RASTERIZE_FLOAT       y_segs  = y_max - y_min;
1512
1513  //
1514  // inclusive subgroup scan of y_segs
1515  //
      // y_ess is the matching exclusive sum and y_rem is the total number
      // of slivers still to be produced by the whole subgroup
      //
1516  SKC_RASTERIZE_FLOAT       y_iss   = skc_subgroup_scan_inclusive_add_float(y_segs);
1517  SKC_RASTERIZE_FLOAT       y_ess   = y_iss - y_segs;
1518  float                     y_rem   = skc_subgroup_last_float(y_iss);
1519
1520  //
1521  // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
1522  //
1523  if (y_segs == 0.0f)
1524    y_iss = 0.0f;
1525
1526#if 0
      // NOTE(review): a0x/a0y/a1x/a1y are not declared in this function --
      // this disabled printf presumably predates the l0x/l0y/l1x/l1y names
1527  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
1528#endif
1529
1530  //
1531  // these values don't matter on first iteration
1532  //
1533  SKC_RASTERIZE_FLOAT n1x_prev = 0;
1534  SKC_RASTERIZE_FLOAT n1y_prev = 0;
1535
1536  //
1537  // loop until done
1538  //
1539  while (y_rem > 0.0f)
1540    {
1541      //
1542      // distribute work across lanes
1543      //
1544      SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);
1545
1546      //
1547      // get line at y_source line
1548      //
1549      SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
1550      SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
1551      SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
1552      SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);
1553
1554      //
1555      // every lane will create a 1 pixel tall line "sliver"
1556      //
1557      // FIXME -- this gets expanded on SIMD
1558      //
1559      // if numerator == 1 then this is the first lane
1560      // if numerator == s then this is the last  lane
1561      //
1562      SKC_RASTERIZE_FLOAT     const y_delta    = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
1563      SKC_RASTERIZE_FLOAT     const y_count    = skc_subgroup_shuffle(y_segs,y_source);
1564
1565      SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
1566      SKC_RASTERIZE_PREDICATE const is_y_last  = (y_delta >= y_count);
1567
1568      // toggle y_delta sign -- XOR with the source line's dy_xor flips the float sign bit
          // NOTE(review): calls intel_sub_group_shuffle() directly while every
          // other shuffle here goes through skc_subgroup_shuffle() -- confirm
          // whether that is intentional
1569      SKC_RASTERIZE_FLOAT     const y_offset   = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));
1570
1571      //
1572      // calculate "right" line segment endpoint
1573      //
          // n_t is the parametric position of n1y along the source line and
          // is used to interpolate the matching x
          //
1574      SKC_RASTERIZE_FLOAT       n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
1575      SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
1576      SKC_RASTERIZE_FLOAT       n1x = round(SKC_LERP(m0x,m1x,n_t));
1577
1578      //
1579      // override n1 with the line's own endpoint if this is the last point
1580      //
1581      n1y = select(n1y,m1y,is_y_last);
1582      n1x = select(n1x,m1x,is_y_last);
1583
1584      //
1585      // shuffle up "left" line segment endpoint
1586      //
1587      // NOTE: Intel's shuffle_up is unique with its elegant
1588      // "previous" argument so don't get used to it
1589      //
1590      SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
1591      SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);
1592
1593      //
1594      // override shuffle up if this is the first line segment
1595      //
1596      n0y = select(n0y,m0y,is_y_first);
1597      n0x = select(n0x,m0x,is_y_first);
1598
1599      //
1600      // save previous right endpoint
1601      //
1602      n1x_prev = n1x;
1603      n1y_prev = n1y;
1604
1605      //
1606      // decrement by subgroup size
1607      //
1608      y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
1609      y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
1610      y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
1611
1612#if 0
1613      //
1614      // debug
1615      //
1616      if (n0y != n1y) {
1617        printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
1618      }
1619#endif
1620
1621      //
1622      // X-SLIVERING
1623      // -----------
1624      //
1625      // now sliver 1-pixel high lines into either vertical or
1626      // 1-pixel wide lines
1627      //
1628      // save original direction and work with increasing x
1629      //
1630      SKC_RASTERIZE_PREDICATE const x_lt   = (n0x <= n1x);
1631      SKC_RASTERIZE_UINT      const dx_xor = x_lt ? 0 : 0x80000000;
1632
1633      //
1634      // save 1/dx
1635      //
1636      SKC_RASTERIZE_FLOAT const x_denom  = native_recip(n1x - n0x);
1637
1638      //
1639      // how many subpixel x-axis slivers are there?
1640      //
          // the fmax() with 1.0f gives vertical lines exactly one segment
          //
1641      SKC_RASTERIZE_FLOAT const x_min    = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
1642      SKC_RASTERIZE_FLOAT const x_max    = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
1643      SKC_RASTERIZE_FLOAT const x_base   = x_lt ? x_min : x_max;
1644      SKC_RASTERIZE_FLOAT const x_segs   = fmax(x_max - x_min,1.0f);
1645
1646      //
1647      // inclusive subgroup scan of x_segs
1648      //
1649      SKC_RASTERIZE_FLOAT       x_iss    = skc_subgroup_scan_inclusive_add_float(x_segs);
1650      SKC_RASTERIZE_FLOAT       x_ess    = x_iss - x_segs;
1651      float                     x_rem    = skc_subgroup_last_float(x_iss);
1652
1653      //
1654      // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails
1655      //
1656      //if (x_segs == 0.0f)
1657      // x_iss = 0.0f;
1658
1659      //
1660      // these values don't matter on first iteration
1661      //
1662      SKC_RASTERIZE_FLOAT       p1x_prev = 0;
1663      SKC_RASTERIZE_FLOAT       p1y_prev = 0;
1664
1665      //
1666      // loop until done
1667      //
1668      while (x_rem > 0)
1669        {
1670          //
1671          // distribute work across lanes
1672          //
1673          SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);
1674
1675          //
1676          // get line at x_source lane
1677          //
1678          SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
1679          SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
1680          SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
1681          SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);
1682
1683          //
1684          // every lane will create a 1 pixel wide line "sliver"
1685          //
1686          // FIXME -- this gets expanded on SIMD
1687          //
1688          // if numerator == 1 then this is the first lane
1689          // if numerator == s then this is the last  lane
1690          //
1691          SKC_RASTERIZE_FLOAT     const x_delta    = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
1692          SKC_RASTERIZE_FLOAT     const x_count    = skc_subgroup_shuffle(x_segs,x_source);
1693
1694          SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
1695          SKC_RASTERIZE_PREDICATE const is_x_last  = (x_delta >= x_count);
1696
1697          // toggle x_delta sign -- XOR with the source line's dx_xor flips the float sign bit
1698          SKC_RASTERIZE_FLOAT     const x_offset   = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));
1699
1700          //
1701          // calculate "right" line segment endpoint
1702          //
1703          SKC_RASTERIZE_FLOAT       p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
1704          SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
1705          SKC_RASTERIZE_FLOAT       p1y = round(SKC_LERP(o0y,o1y,p_t));
1706
1707          //
1708          // override p1 with the line's own endpoint if this is the last point
1709          //
1710          p1x = select(p1x,o1x,is_x_last);
1711          p1y = select(p1y,o1y,is_x_last);
1712
1713          //
1714          // shuffle up "left" line segment endpoint
1715          //
1716          // NOTE: Intel's shuffle_up is unique with its elegant
1717          // "previous" argument so don't get used to it
1718          //
1719          SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
1720          SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);
1721
1722          //
1723          // override shuffle up if this is the first line segment
1724          //
1725          p0x = select(p0x,o0x,is_x_first);
1726          p0y = select(p0y,o0y,is_x_first);
1727
1728          //
1729          // save previous right endpoint
1730          //
1731          p1x_prev = p1x;
1732          p1y_prev = p1y;
1733
1734          //
1735          // decrement by subgroup size
1736          //
1737          x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
1738          x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
1739          x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
1740
1741          //
1742          // only non-horizontal subpixel lines are valid
1743          //
1744          SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);
1745
1746          //
1747          // if no lanes are active then continue
1748          //
1749          // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
1750          // IMPACTS PERFORMANCE (+12% ?)
1751          //
1752          // IT SHOULDN'T !!!
1753          //
1754#if 0
1755          if (!skc_subgroup_any(is_active))
1756            continue;
1757#endif
1758
1759          //
1760          // Option 1: use SLM for explicitly managed coalesced stores
1761          //
1762          // 1. which tile does this line belong?
1763          // 2. hash tile coordinates
1764          // 3. lookup hash
1765          // 4. if tile matches then SLM append keys
1766          // 5. if tile doesn't match
1767          //   a. flush
1768          //   b. create new TTSK_RYX
1769          //   c. obtain TTSB block from pool
1770          //   d. goto 3.
1771          //
1772
1773          //
1774          // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
1775          //
1776          // 1. which tile does this line belong?
1777          // 2. hash tile coordinates
1778          // 3. lookup hash
1779          // 4. if tile matches then GMEM append keys
1780          // 5. if tile doesn't match
1781          //   a. flush (and invalidate empty elems)
1782          //   b. create new TTSK_RYX
1783          //   c. obtain TTSB block from pool
1784          //   d. goto 3.
1785          //
1786
1787          //
1788          // The virtual rasterization surface is very large and
1789          // signed: +/- ~64K-256K, depending on the architecture.
1790          //
1791          // Rasters must be clipped to the virtual surface and,
1792          // optionally, clipped even further on a per raster
1793          // basis.
1794          //
1795
1796          //
1797          // Clip to the per-raster clip
1798          //
1799
1800          /*
1801
1802            CLIP HERE
1803
1804          */
1805
1806          //
1807          // Hash the tile coordinates
1808          //
1809          // This table lists nominal values for each architecture.
1810          // We want to choose values that naturally fit the
1811          // "width" of the architecture.
1812          //
1813          //   SIMD   RANGE   BITS  MAX RANGE  MAX BINS  HASH BITS
1814          //   ----  -------  ----  ---------  --------  ---------
1815          //     4   [0,  4]    3    [0,  7]      10      mod(10)  <-- SSE42, ?
1816          //     8   [0,  8]    4    [0, 15]       8         3     <-- GEN*,AVX*
1817          //    16   [0, 16]    5    [0, 31]       6      mod(6)   <-- GEN*,?
1818          //    32   [0, 32]    6    [0, 63]       5      mod(5)   <-- CUDA,PowerVR,Adreno,GEN*
1819          //    64   [0, 64]    7    [0,127]       4         2     <-- AMD Radeon
1820          //
1821          // NOTE: When possible, bias the hash toward using more y
1822          // bits because of:
1823          //
1824          //   1. the 90 degree counter-clockwise rotation that we put
1825          //      in place to offset the render-time clockwise
1826          //      rotation
1827          //
1828          //   2. the likely presence of left-to-right or
1829          //      right-to-left glyphs.
1830          //
1831          // For power-of-two bins, the hash is easy.
1832          //
1833          // For non-power-of-two, we may want to either implement a
1834          // fast mod (compiler should do this for us... hahahaha) or
1835          // drop down to the next power-of-two.
1836          //
1837
1838          //
1839          // FIXME -- this snarl is not good -- can probably reduce
1840          // some of the sign casting but some is there to vectorize a
1841          // scalar
1842          //
1843          SKC_RASTERIZE_INT       const z0y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
1844          SKC_RASTERIZE_INT       const z1y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);
1845
1846          SKC_RASTERIZE_INT       const z0x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
1847          SKC_RASTERIZE_INT       const z1x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);
1848
1849          SKC_RASTERIZE_INT       const min_y  = min(z0y,z1y);
1850          SKC_RASTERIZE_INT       const max_y  = max(z0y,z1y);
1851
1852          SKC_RASTERIZE_INT       const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;
1853
1854          SKC_RASTERIZE_UINT      const ty     = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
1855          SKC_RASTERIZE_INT             dy     = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);
1856
1857          //
1858          // map [+1,+32] to [ 0,+31]
1859          // map [-1,-32] to [-1,-32]
1860          //
              // (~dy >> 31) is -1 when dy >= 0 and 0 when dy < 0, so only
              // non-negative dy is decremented; the result is then moved
              // to the top of the word by the shift of 26
              //
1861          SKC_RASTERIZE_INT             dys    = (dy + (~dy >> 31)) << 26;
1862
1863          SKC_RASTERIZE_INT       const min_x  = min(z0x,z1x);
1864          SKC_RASTERIZE_INT       const max_x  = max(z0x,z1x);
1865          SKC_RASTERIZE_INT       const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;
1866
1867          SKC_RASTERIZE_UINT      const tx     = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
1868          SKC_RASTERIZE_UINT      const sx     = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);
1869
              // TTS word: tx in the low bits, sx from bit 10, ty from bit 16,
              // dys from bit 26
1870          SKC_RASTERIZE_UINT      const tts    = dys | (ty << 16) | (sx << 10) | tx;
1871
              // bin index: masked tile_y bits above masked tile_x bits
1872          SKC_RASTERIZE_UINT      const hash   = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
1873                                                   (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));
1874
              // tile key: low 12 bits of tile_y above low 12 bits of tile_x
1875          SKC_RASTERIZE_UINT      const yx     = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));
1876
1877#if 0
1878          printf("(%3u, %3u)\n",tile_y,tile_x);
1879#endif
1880
1881#if 0
1882          if (is_active)
1883            printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
1884#endif
1885
1886          //
1887          // debug
1888          //
1889#if 0 // PRINTF_ENABLE
1890
1891#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
1892
1893#undef  SKC_EXPAND_X
1894#define SKC_EXPAND_X(I,S,C,P,A)                                         \
1895          if (is_active C)                                              \
1896            printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);
1897
1898          SKC_RASTERIZE_VECTOR_EXPAND();
1899#else
1900          if (is_active)
1901            printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
1902#endif
1903
1904#endif
1905          //
1906          // flush all active lanes
1907          //
              // each pass appends every lane that matches its bin and has
              // room, then ejects colliding bins and retries the rest
              //
1908          while (true)
1909            {
1910              //
1911              // either gather load or vector load+shuffle the yx keys
1912              //
1913#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
1914              SKC_RASTERIZE_BIN       const yx_bin     = smem->bin.vN.yx;
1915              SKC_RASTERIZE_UINT      const yx_cur     = shuffle(yx_bin,hash);
1916#else
1917              SKC_RASTERIZE_UINT      const yx_cur     = smem->bin.aN.yx[hash];
1918#endif
1919
1920              //
1921              // does yx for lane match yx for hash?
1922              //
                  // inactive lanes compare against SKC_RASTERIZE_YX_INVALID
                  // instead of their own yx
                  //
1923              SKC_RASTERIZE_UINT      const active_yx  = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
1924              SKC_RASTERIZE_PREDICATE const is_match   = (yx_cur == active_yx);
1925
1926              //
1927              // OpenCL spec: "When casting a bool to a vector integer
1928              // data type, the vector components will be set to -1
1929              // (i.e. all bits set) if the vector bool value is true
1930              // and 0 otherwise.
1931              //
1932#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
1933              SKC_RASTERIZE_UINT      const h_match    = (SKC_RASTERIZE_UINT)is_match;
1934#else
1935              SKC_RASTERIZE_UINT      const h_match    = abs(is_match); // {-1,0} -> {+1,0}
1936#endif
1937              //
1938              // how many new elements for each matching hash bin?
1939              //
                  // every bin owns a SKC_RASTERIZE_TILE_HASH_BIN_BITS wide
                  // field of h, so one scan counts all bins at once
                  //
1940              SKC_RASTERIZE_UINT      const h_shl      = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
1941              SKC_RASTERIZE_UINT      const h          = h_match << h_shl;
1942
1943              //
1944              // prefix sum all of the bins in parallel
1945              //
1946              SKC_RASTERIZE_UINT      const h_iss      = skc_subgroup_scan_inclusive_add_uint(h);
1947              SKC_RASTERIZE_UINT      const h_total    = skc_subgroup_last_uint(h_iss);
1948
1949              //
1950              // current bin counts
1951              //
1952#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
1953              SKC_RASTERIZE_BIN       const count_bin  = smem->bin.vN.count;
1954              SKC_RASTERIZE_UINT      const count_cur  = shuffle(count_bin,hash);
1955#else
1956              SKC_RASTERIZE_UINT      const count_cur  = smem->bin.aN.count[hash];
1957#endif
1958
1959              //
1960              // calculate where each cache-hit and in-bounds tts should be stored
1961              //
                  // ttsb_index is this lane's slot within its bin and
                  // count_new is the bin's count after all matching lanes
                  //
1962              SKC_RASTERIZE_UINT      const ttsb_index = (h_iss   >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
1963              SKC_RASTERIZE_UINT      const count_new  = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;
1964
1965              //
1966              // which lanes can append to a matching bin?
1967              //
1968              SKC_RASTERIZE_PREDICATE const is_append  = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);
1969
1970              //
1971              // scatter append tts elements to bin blocks
1972              //
1973#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
1974              //
1975              // SIMD
1976              //
1977#undef  SKC_EXPAND_X
1978#define SKC_EXPAND_X(I,S,C,P,A)                                         \
1979              if (is_append C)                                          \
1980                {                                                       \
1981                  smem->bin.aN.ttsb [hash C][ttsb_index C] = tts       C; \
1982                  smem->bin.aN.count[hash C]               = count_new C; \
1983                }
1984
1985              SKC_RASTERIZE_VECTOR_EXPAND();
1986#else
1987              //
1988              // SIMT
1989              //
1990              if (is_append)
1991                {
1992                  smem->bin.aN.ttsb [hash][ttsb_index] = tts;
1993                  smem->bin.aN.count[hash]             = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
1994                }
1995#endif
1996              //
1997              // try to keep predicate updates SIMD-friendly and
1998              // outside of predicated code paths -- this is not
1999              // always how we would normally do things on SIMT but
2000              // either approach is acceptable
2001              //
2002
2003              //
2004              // mask off lanes/components that successfully appended
2005              //
2006              is_active = is_active && !is_append;
2007
2008              //
2009              // are there any active lanes left?
2010              //
2011              if (!skc_subgroup_any(is_active))
2012                break;
2013
2014              //
2015              // There are active lanes that couldn't be appended to a
2016              // bin because their hashes collided with the bin's
2017              // current ryx key, so those bins must be ejected.
2018              //
2019              // Note that we do not eject "full" bins because lazily
2020              // waiting for a collision results in simpler code.
2021              //
2022              skc_flush(bp_atomics,
2023                        bp_elems,
2024                        bp_ids,
2025                        bp_mask,
2026                        cohort_atomics,
2027                        subblocks,
2028                        blocks,
2029                        blocks_next,
2030                        sk_v,
2031                        sk_v_next,
2032                        sk_extent,
2033                        smem,
2034                        hash,
2035                        yx,
2036                        is_active);
2037            }
2038        }
2039    }
2040}
2041
2042//
2043// INITIALIZE SMEM
2044//
2045// Note that SIMD/SIMT have nearly the same syntax.
2046//
2047static
2048void
2049skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
2050{
2051  //
2052  // initialize smem bins
2053  //
2054#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
2055  //
2056  // SIMD
2057  //
2058  smem->bin.vN.yx    = ( SKC_RASTERIZE_YX_INIT );
2059  smem->bin.vN.count = ( 0 );
2060#else
2061  //
2062  // SIMT
2063  //
2064  int idx = skc_subgroup_lane();
2065
2066#if   ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
2067  if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
2068#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
2069  for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
2070#endif
2071    {
2072      smem->bin.aN.yx   [idx] = ( SKC_RASTERIZE_YX_INIT );
2073      smem->bin.aN.count[idx] = ( 0 );
2074    }
2075#endif
2076}
2077
2078//
2079// RASTERIZE CUBIC KERNEL
2080//
2081
2082static
2083void
2084skc_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
2085                     __global union skc_bp_elem                * const bp_elems,
2086                     __global uint                             * const bp_ids,
2087                     skc_uint                                    const bp_mask,
2088
2089                     __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
2090                     __global skc_ttsk_s_t                     * const sk_extent,
2091
2092                     __local struct skc_subgroup_smem volatile * const smem,
2093
2094                     skc_uint                                  * const nodeword,
2095                     skc_block_id_t                            * const id,
2096
2097                     union skc_transform              const    * const tv,
2098                     union skc_path_clip              const    * const cv,
2099                     skc_uint                                    const cohort)
2100{
2101  //
2102  // the initial segment idx and segments-per-block constant determine
2103  // how many block ids will need to be loaded
2104  //
2105  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2106
2107  skc_segment_next(bp_elems,nodeword,id);
2108
2109  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2110
2111  skc_segment_next(bp_elems,nodeword,id);
2112
2113  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2114
2115  skc_segment_next(bp_elems,nodeword,id);
2116
2117  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2118
2119  skc_segment_next(bp_elems,nodeword,id);
2120
2121  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2122
2123  skc_segment_next(bp_elems,nodeword,id);
2124
2125  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2126
2127  skc_segment_next(bp_elems,nodeword,id);
2128
2129  SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2130
2131  skc_segment_next(bp_elems,nodeword,id);
2132
2133  SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2134
2135  //
2136  // apply transform
2137  //
2138  // note that we only care if the end points are rounded to subpixel precision
2139  //
2140  // FIXME -- transformation is currently affine-only support perspective later
2141  //
2142  // the affine transformation requires 8 FMA + 2 ROUND operations
2143  //
2144
2145  SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
2146  SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;
2147
2148  SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
2149  SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;
2150
2151  SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
2152  SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;
2153
2154  SKC_RASTERIZE_FLOAT t3x = c3x * tv->sx  + c3y * tv->shx + tv->tx;
2155  SKC_RASTERIZE_FLOAT t3y = c3x * tv->shy + c3y * tv->sy  + tv->ty;
2156
2157  //
2158  // FIXME -- this is temporary support for projection
2159  //
2160  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);
2161
2162  if (!is_affine)
2163    {
2164      SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
2165      SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
2166      SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
2167      SKC_PROJECT(tv,c3x,c3y,t3x,t3y);
2168    }
2169
2170  b0x = round(b0x);
2171  b0y = round(b0y);
2172
2173  t3x = round(t3x);
2174  t3y = round(t3y);
2175
2176  //
2177  //
2178  //
2179#if PRINTF_ENABLE
2180
2181#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
2182
2183#undef  SKC_EXPAND_X
2184#define SKC_EXPAND_X(I,S,C,P,A)                                         \
2185  printf("{ { %.02f, %.02f }, { %.02f, %.02f },"                        \
2186         "  { %.02f, %.02f }, { %.02f, %.02f } },\n",                   \
2187         b0x C,b0y C,t1x C,t1y C,                                       \
2188         t2x C,t2y C,t3x C,t3y C);
2189
2190  SKC_RASTERIZE_VECTOR_EXPAND();
2191
2192#else
2193
2194  printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
2195         b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
2196
2197#endif
2198
2199#endif
2200
2201  //
2202  // OLD APPROACH
2203  // ------------
2204  //
2205  // The Spinel CUDA rasterizer was significantly more complex and
2206  // performed a few different tasks that are probably best kept
2207  // separate.
2208  //
2209  // The Spinel rasterizer Bezier held 4-element x and y coordinates
2210  // in adjacent lanes. This simplified intermingling of single lane
2211  // 4-coordinate line segments with two-lane cubic Beziers.
2212  //
2213  // After transformation of the input segments, the Spinel rasterizer
2214  // would test cubics for flatness and, if flat, collapse the
2215  // adjacent lanes into a single line lane and an empty lane.
2216  //
2217  // Any lines would then be appended to a line queue.
2218  //
2219  // Any cubics would then be subdivided.
2220  //
2221  // The reclassification process would be repeated.
2222  //
2223  // NEW APPROACH
2224  // ------------
2225  //
2226  // Assume we're only working with cubics in this kernel.
2227  //
2228  // Optimization: if the line segment is a special case -- a cusp,
2229  // has 1+ inflections, or a loop -- it might be beneficial to
2230  // subdivide the control cage 1+ times in order to separate the
2231  // flatter segments the high-velocity region(s).
2232  //
2233  // This means we want to split using [a,b] formulation to _directly_
2234  // subdivide producing a new control cage.
2235  //
2236  // Wang's Formula is still useful even if we subdivide once or twice
2237  // as it's so cheap that it might give some useful hints about where
2238  // the high-velocity sections of curve reside.
2239  //
2240  // But it seems like using Wang's and directly flattening to line
2241  // segments without any subdivision is good enough for the limited
2242  // set of test cases that I've tried.
2243  //
2244  // So... use Wang's Formula to estimate how many line segment are
2245  // required to properly flatten the cubics.
2246  //
2247  // Then use inclusive/exclusive scans to put all the lanes to work:
2248  //
2249  //   1. segmenting cubics to line segments
2250  //
2251  //   2. slivering line segments into 1-pixel high line segments
2252  //
2253  //   3. slivering 1-pixel high line segments into 1-pixel wide line
2254  //      segments
2255  //
2256  // MORE BACKGROUND ON NEW APPROACH
2257  // -------------------------------
2258  //
2259  // Two options for handling line segments:
2260  //
2261  // 1. append the line segments onto an SLM array until enough
2262  //    work has been accrued (Spinel does this)
2263  //
2264  // 2. immediately sliver the potentially multi-pixel line
2265  //    segments into subpixel lines
2266  //
2267  // The advantage of (1) is that it guarantees the slivering
2268  // process will, on average, always be emitting a full subgroup
2269  // of subpixel lines.
2270  //
2271  // The advantage of (2) is that it reduces code complexity and
2272  // leaves more room for SLM tile bins. The difference between Spinel
2273  // and Skia Compute is that Wang's Formula guarantees there will be
2274  // a full subgroup of multi-pixel lines unless this is the final
2275  // iteration of the warp of multi-pixel lines.
2276  //
2277  // Note that wider GPU architectures might benefit from (1) and
2278  // other work accumulation strategies because it will minimize
2279  // partial warp workloads in the final iteration of each stage.  It
2280  // also minimizes the sunk cost of the uniform control logic steps.
2281  //
2282  // So let's implement (2) for now...
2283  //
2284
2285  //
2286  // And... begin!
2287  //
2288  // Estimate how many line segments are in quad/cubic curve.
2289  //
2290  // Wang's Formula will return zero if the control points are
2291  // collinear but we bump it up to 1.0f.
2292  //
2293  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
2294
2295  //
2296  // if there are free registers then precalculate the reciprocal for
2297  // each estimated segments since it will never change
2298  //
2299  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
2300
2301
2302  //
2303  // inclusive add scan of estimated line segments
2304  // exclusive add scan of estimated line segments
2305  // total number       of estimated line segments
2306  //
2307  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
2308  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
2309  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar
2310
2311  //
2312  // Precompute cubic polynomial coefficients from transformed control
2313  // cage so we can shuffle them in on each iteration of the outer
2314  // loop and then evaluate the polynomial in Horner form.
2315  //
2316  //                            |  1  0  0  0 | | c0 |
2317  //                            |             | |    |
2318  //                            | -3  3  0  0 | | c1 |
2319  //   B(t) = [ 1 t^1 t^2 t^3 ] |             | |    |
2320  //                            |  3 -6  3  0 | | c2 |
2321  //                            |             | |    |
2322  //                            | -1  3 -3  1 | | c3 |
2323  //
2324  //
2325  SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x);                // 2 - 1 MAD + MUL
2326  SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y);                // 2 - 1 MAD + MUL
2327
2328  SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x));  // 3 - 2 MAD + MUL
2329  SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y));  // 3 - 2 MAD + MUL
2330
2331  SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
2332  SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB
2333
2334  //
2335  // these values don't matter on the first iteration
2336  //
2337  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
2338  SKC_RASTERIZE_FLOAT l1y_prev  = 0;
2339
2340  //
2341  // allocate and init in-register TTSK keys
2342  //
2343  skc_uint     sk_v_next = 0;
2344  skc_ttsk_v_t sk_v;
2345
2346  sk_v.hi = cohort;
2347
2348  //
2349  // initialize smem
2350  //
2351  skc_smem_init(smem);
2352
2353  //
2354  // initialize blocks / subblocks
2355  //
2356  skc_block_id_v_t blocks;
2357  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
2358
2359#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
2360  skc_block_id_t   subblocks   = 0;
2361#endif
2362
2363  //
2364  // loop until done
2365  //
2366  while (s_rem > 0)
2367    {
2368      //
2369      // distribute work across lanes
2370      //
2371      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
2372
2373      //
2374      // every lane has a fraction to work off of
2375      //
2376      // FIXME -- this gets expanded on SIMD
2377      //
2378      // if delta == 1      then this is the first lane
2379      // if count == s_segs then this is the last  lane
2380      //
2381      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
2382      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);
2383
2384      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
2385      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);
2386
2387      //
2388      // init parametric t
2389      //
2390      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
2391
2392      //
2393      // if last then override to a hard 1.0f
2394      //
2395      s_t    = is_s_last ? 1.0f : s_t;
2396
2397      //
2398      // decrement by subgroup size
2399      //
2400      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
2401      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
2402      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
2403
2404      //
2405      // now every lane knows what to do and the following lines will
2406      // pump out up to SUBGROUP_SIZE line segments
2407      //
2408      // obtain the src vertices through shared or via a shuffle
2409      //
2410
2411      //
2412      // shuffle in the polynomial coefficients their source lane
2413      //
2414      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
2415      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
2416
2417      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
2418      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
2419
2420      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
2421      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
2422
2423      SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
2424      SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);
2425
2426      //
2427      // calculate "right" line segment endpoint using Horner form
2428      //
2429      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
2430      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND
2431
2432      //
2433      // shuffle up "left" line segment endpoint
2434      //
2435      // NOTE: Intel's shuffle_up is unique with its elegant
2436      // "previous" argument so don't get used to it
2437      //
2438      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
2439      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
2440
2441      //
2442      // save previous right endpoint
2443      //
2444      l1x_prev = l1x;
2445      l1y_prev = l1y;
2446
2447      //
2448      // override shuffle up if this is the first line segment
2449      //
2450      l0x = select(l0x,s0x,is_s_first);
2451      l0y = select(l0y,s0y,is_s_first);
2452
2453      //
2454      // sliver lines
2455      //
2456      skc_sliver(bp_atomics,
2457                 bp_elems,
2458                 bp_ids,
2459                 bp_mask,
2460                 cohort_atomics,
2461                 &subblocks,
2462                 &blocks,
2463                 &blocks_next,
2464                 &sk_v,
2465                 &sk_v_next,
2466                 sk_extent,
2467                 smem,
2468                 l0x,l0y,l1x,l1y);
2469    }
2470
2471  //
2472  // - flush work-in-progress blocks
2473  // - return unused block ids
2474  //
2475  skc_finalize(bp_atomics,
2476               bp_elems,
2477               bp_ids,
2478               bp_mask,
2479               cohort_atomics,
2480               &blocks,
2481               blocks_next,
2482               &sk_v,
2483               sk_v_next,
2484               sk_extent,
2485               smem);
2486}
2487
2488//
2489// RASTERIZE QUAD KERNEL
2490//
2491
2492static
2493void
2494skc_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
2495                    __global union skc_bp_elem                * const bp_elems,
2496                    __global uint                             * const bp_ids,
2497                    skc_uint                                    const bp_mask,
2498
2499                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
2500                    __global skc_ttsk_s_t                     * const sk_extent,
2501
2502                    __local struct skc_subgroup_smem volatile * const smem,
2503
2504                    skc_uint                                  * const nodeword,
2505                    skc_block_id_t                            * const id,
2506
2507                    union skc_transform              const    * const tv,
2508                    union skc_path_clip              const    * const cv,
2509                    skc_uint                                    const cohort)
2510{
2511  //
2512  // the initial segment idx and segments-per-block constant determine
2513  // how many block ids will need to be loaded
2514  //
2515  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2516
2517  skc_segment_next(bp_elems,nodeword,id);
2518
2519  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2520
2521  skc_segment_next(bp_elems,nodeword,id);
2522
2523  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2524
2525  skc_segment_next(bp_elems,nodeword,id);
2526
2527  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2528
2529  skc_segment_next(bp_elems,nodeword,id);
2530
2531  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2532
2533  skc_segment_next(bp_elems,nodeword,id);
2534
2535  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2536
2537  //
2538  // apply transform
2539  //
2540  // note that we only care if the end points are rounded to subpixel precision
2541  //
2542  // FIXME -- transformation is currently affine-only support perspective later
2543  //
2544  // the affine transformation requires 8 FMA + 2 ROUND operations
2545  //
2546  SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
2547  SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;
2548
2549  SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
2550  SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;
2551
2552  SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
2553  SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;
2554
2555  //
2556  // FIXME -- this is temporary support for projection
2557  //
2558  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);
2559
2560  if (!is_affine)
2561    {
2562      SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
2563      SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
2564      SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
2565    }
2566
2567  b0x = round(b0x);
2568  b0y = round(b0y);
2569
2570  t2x = round(t2x);
2571  t2y = round(t2y);
2572
2573  //
2574  // Estimate how many line segments are in quad/cubic curve.
2575  //
2576  // Wang's Formula will return zero if the control points are
2577  // collinear but we bump it up to 1.0f.
2578  //
2579  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);
2580
2581  //
2582  // if there are free registers then precalculate the reciprocal for
2583  // each estimated segments since it will never change
2584  //
2585  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
2586
2587
2588  //
2589  // inclusive add scan of estimated line segments
2590  // exclusive add scan of estimated line segments
2591  // total number       of estimated line segments
2592  //
2593  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
2594  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
2595  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar
2596
2597  //
2598  // Precompute quadratic polynomial coefficients from control cage so
2599  // we can shuffle them in on each iteration of the outer loop and
2600  // then evaluate the polynomial in Horner form.
2601  //
2602
2603  //                        |  1  0  0  | | c0 |
2604  //                        |           | |    |
2605  //   B(t) = [ 1 t^1 t^2 ] | -2  2  0  | | c1 |
2606  //                        |           | |    |
2607  //                        |  1 -2  1  | | c2 |
2608  //
2609  //
2610  SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
2611  SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL
2612
2613  SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x);  // 2 - 1 MAD + ADD
2614  SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y);  // 2 - 1 MAD + ADD
2615
2616  //
2617  // these values don't matter on the first iteration
2618  //
2619  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
2620  SKC_RASTERIZE_FLOAT l1y_prev  = 0;
2621
2622  //
2623  // allocate and init in-register TTSK keys
2624  //
2625  skc_uint     sk_v_next = 0;
2626  skc_ttsk_v_t sk_v;
2627
2628  sk_v.hi = cohort;
2629
2630  //
2631  // initialize smem
2632  //
2633  skc_smem_init(smem);
2634
2635  //
2636  // initialize blocks / subblocks
2637  //
2638  skc_block_id_v_t blocks;
2639  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
2640
2641#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
2642  skc_block_id_t   subblocks   = 0;
2643#endif
2644
2645  //
2646  // loop until done
2647  //
2648  while (s_rem > 0)
2649    {
2650      //
2651      // distribute work across lanes
2652      //
2653      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
2654
2655      //
2656      // every lane has a fraction to work off of
2657      //
2658      // FIXME -- this gets expanded on SIMD
2659      //
2660      // if delta == 1      then this is the first lane
2661      // if count == s_segs then this is the last  lane
2662      //
2663      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
2664      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);
2665
2666      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
2667      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);
2668
2669      //
2670      // init parametric t
2671      //
2672      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
2673
2674      //
2675      // if last then override to a hard 1.0f
2676      //
2677      s_t    = is_s_last ? 1.0f : s_t;
2678
2679      //
2680      // decrement by subgroup size
2681      //
2682      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
2683      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
2684      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
2685
2686      //
2687      // now every lane knows what to do and the following lines will
2688      // pump out up to SUBGROUP_SIZE line segments
2689      //
2690      // obtain the src vertices through shared or via a shuffle
2691      //
2692
2693      //
2694      // shuffle in the polynomial coefficients their source lane
2695      //
2696      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
2697      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
2698
2699      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
2700      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
2701
2702      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
2703      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
2704
2705      //
2706      // calculate "right" line segment endpoint using Horner form
2707      //
2708      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
2709      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND
2710
2711      //
2712      // shuffle up "left" line segment endpoint
2713      //
2714      // NOTE: Intel's shuffle_up is unique with its elegant
2715      // "previous" argument so don't get used to it
2716      //
2717      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
2718      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
2719
2720      //
2721      // save previous right endpoint
2722      //
2723      l1x_prev = l1x;
2724      l1y_prev = l1y;
2725
2726      //
2727      // override shuffle up if this is the first line segment
2728      //
2729      l0x = select(l0x,s0x,is_s_first);
2730      l0y = select(l0y,s0y,is_s_first);
2731
2732      //
2733      // sliver lines
2734      //
2735      skc_sliver(bp_atomics,
2736                 bp_elems,
2737                 bp_ids,
2738                 bp_mask,
2739                 cohort_atomics,
2740                 &subblocks,
2741                 &blocks,
2742                 &blocks_next,
2743                 &sk_v,
2744                 &sk_v_next,
2745                 sk_extent,
2746                 smem,
2747                 l0x,l0y,l1x,l1y);
2748    }
2749
2750  //
2751  // - flush work-in-progress blocks
2752  // - return unused block ids
2753  //
2754  skc_finalize(bp_atomics,
2755               bp_elems,
2756               bp_ids,
2757               bp_mask,
2758               cohort_atomics,
2759               &blocks,
2760               blocks_next,
2761               &sk_v,
2762               sk_v_next,
2763               sk_extent,
2764               smem);
2765}
2766
2767//
2768// RASTERIZE LINE KERNEL
2769//
2770
2771static
2772void
2773skc_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
2774                    __global union skc_bp_elem                * const bp_elems,
2775                    __global uint                             * const bp_ids,
2776                    skc_uint                                    const bp_mask,
2777
2778                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
2779                    __global skc_ttsk_s_t                     * const sk_extent,
2780
2781                    __local struct skc_subgroup_smem volatile * const smem,
2782
2783                    skc_uint                                  * const nodeword,
2784                    skc_block_id_t                            * const id,
2785
2786                    union skc_transform              const    * const tv,
2787                    union skc_path_clip              const    * const cv,
2788                    skc_uint                                    const cohort)
2789{
2790  //
2791  // the initial segment idx and segments-per-block constant determine
2792  // how many block ids will need to be loaded
2793  //
2794  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2795
2796  skc_segment_next(bp_elems,nodeword,id);
2797
2798  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2799
2800  skc_segment_next(bp_elems,nodeword,id);
2801
2802  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2803
2804  skc_segment_next(bp_elems,nodeword,id);
2805
2806  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
2807
2808#if 0
2809  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
2810#endif
2811
2812  //
2813  // apply transform
2814  //
2815  // note that we only care if the end points are rounded to subpixel precision
2816  //
2817  // FIXME -- transformation is currently affine-only
2818  // FIXME -- support perspective later
2819  //
2820  // the affine transformation requires 8 FMA + 4 ROUND operations
2821  //
2822  SKC_RASTERIZE_FLOAT l0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
2823  SKC_RASTERIZE_FLOAT l0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;
2824
2825  SKC_RASTERIZE_FLOAT l1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
2826  SKC_RASTERIZE_FLOAT l1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;
2827
2828  //
2829  // FIXME -- this is temporary support for projection
2830  //
2831  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);
2832
2833  if (!is_affine) {
2834    SKC_PROJECT(tv,c0x,c0y,l0x,l0y);
2835    SKC_PROJECT(tv,c1x,c1y,l1x,l1y);
2836  }
2837
2838  l0x = round(l0x);
2839  l0y = round(l0y);
2840
2841  l1x = round(l1x);
2842  l1y = round(l1y);
2843
2844#if 0
2845  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
2846#endif
2847
2848  //
2849  // allocate and init in-register TTSK keys
2850  //
2851  skc_uint     sk_v_next = 0;
2852  skc_ttsk_v_t sk_v;
2853
2854  sk_v.hi = cohort;
2855
2856  //
2857  // initialize smem
2858  //
2859  skc_smem_init(smem);
2860
2861  //
2862  // initialize blocks / subblocks
2863  //
2864  skc_block_id_v_t blocks;
2865  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
2866
2867#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
2868  skc_block_id_t   subblocks   = 0;
2869#endif
2870
2871  //
2872  // sliver lines
2873  //
2874  skc_sliver(bp_atomics,
2875             bp_elems,
2876             bp_ids,
2877             bp_mask,
2878             cohort_atomics,
2879             &subblocks,
2880             &blocks,
2881             &blocks_next,
2882             &sk_v,
2883             &sk_v_next,
2884             sk_extent,
2885             smem,
2886             l0x,l0y,l1x,l1y);
2887
2888  //
2889  // - flush work-in-progress blocks
2890  // - return unused block ids
2891  //
2892  skc_finalize(bp_atomics,
2893               bp_elems,
2894               bp_ids,
2895               bp_mask,
2896               cohort_atomics,
2897               &blocks,
2898               blocks_next,
2899               &sk_v,
2900               sk_v_next,
2901               sk_extent,
2902               smem);
2903}
2904
2905//
2906//
2907//
2908
//
// Rasterization kernel that accepts any path primitive type.
//
// Every subgroup picks up one rasterization command, reads the
// tagged block id stored at the command's node word and, based on
// the tag, forwards to the line, quad or cubic rasterizer.  Rational
// quad and rational cubic tags (and any unrecognized tag) result in
// no work being performed.
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                         __global union skc_bp_elem                * const bp_elems,
                         __global uint                             * const bp_ids,
                         skc_uint                                    const bp_mask,

                         __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                         __global skc_ttsk_s_t                     * const sk_extent,

                         __global float8                  const    * const transforms, // FIXME -- __constant
                         __global float4                  const    * const clips,      // FIXME -- __constant
                         __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                         skc_uint                                    const count)
{
  //
  // This is a subgroup/warp-centric kernel: declare the shared memory
  // block used by this subgroup and work out which subgroup in the
  // grid -- and therefore which command -- this is.
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile                smem[1];

  uint const cmd_index = get_group_id(0);
#else
  __local struct skc_subgroup_smem volatile                workgroup_smem[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = workgroup_smem + get_sub_group_id();

  uint const cmd_index = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

#if 0
  if (get_sub_group_local_id() == 0)
    printf("+cmd_idx = %u\n",cmd_index);
#endif

  //
  // when workgroups are multi-subgroup the final workgroup may
  // contain excess subgroups that have no command to process
  //
  if (cmd_index >= count)
    {
      return;
    }

#if 0
  if (get_sub_group_local_id() == 0)
    printf("-cmd_idx = %u\n",cmd_index);
#endif

  //
  // each subgroup loads exactly one command
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_index];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("[ %u ]< %u, %u, %u, %u >\n",
           cmd_index,
           cmd.nodeword,
           SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
           SKC_CMD_RASTERIZE_GET_CLIP(cmd),
           SKC_CMD_RASTERIZE_GET_COHORT(cmd));
#endif

  //
  // fetch the transform and clip selected by the command -- both
  // loads are uniform across the subgroup
  //
  // transform layout (v8): { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT THE TRANSFORM IS EXPECTED TO BE SCALED UP BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either by initializing a transform stack
  // with the subpixel resolution scaled identity or by scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  skc_uint            const cohort    = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
  union skc_transform const transform = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const clip      = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load

  //
  // read the first block node command word (word-addressed) and split
  // the tagged block id found there into its tag and its id
  //
  skc_uint                    node_word = cmd.nodeword;
  skc_tagged_block_id_t const tagged_id = bp_elems[node_word].tag_id;
  skc_block_id_tag      const tag       = SKC_TAGGED_BLOCK_ID_GET_TAG(tagged_id);
  skc_block_id_t              block_id  = SKC_TAGGED_BLOCK_ID_GET_ID(tagged_id);

  //
  // dispatch on the primitive type
  //
  if (tag == SKC_BLOCK_ID_TAG_PATH_LINE)
    {
      skc_rasterize_lines(bp_atomics,bp_elems,bp_ids,bp_mask,
                          cohort_atomics,
                          sk_extent,
                          smem,
                          &node_word,&block_id,
                          &transform,&clip,cohort);
    }
  else if (tag == SKC_BLOCK_ID_TAG_PATH_QUAD)
    {
      skc_rasterize_quads(bp_atomics,bp_elems,bp_ids,bp_mask,
                          cohort_atomics,
                          sk_extent,
                          smem,
                          &node_word,&block_id,
                          &transform,&clip,cohort);
    }
  else if (tag == SKC_BLOCK_ID_TAG_PATH_CUBIC)
    {
      skc_rasterize_cubics(bp_atomics,bp_elems,bp_ids,bp_mask,
                           cohort_atomics,
                           sk_extent,
                           smem,
                           &node_word,&block_id,
                           &transform,&clip,cohort);
    }

  //
  // SKC_BLOCK_ID_TAG_PATH_RAT_QUAD, SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC
  // and every other tag value: nothing is rasterized
  //
}
3062
3063//
3064//
3065//
3066
//
// Rasterization kernel specialized for line primitives.
//
// Every subgroup picks up one rasterization command, loads the
// transform, clip and cohort it refers to and hands the command's
// first block node over to skc_rasterize_lines().
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                           __global union skc_bp_elem                * const bp_elems,
                           __global uint                             * const bp_ids,
                           skc_uint                                    const bp_mask,

                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                           __global skc_ttsk_s_t                     * const sk_extent,

                           __global float8                  const    * const transforms, // FIXME -- __constant
                           __global float4                  const    * const clips,      // FIXME -- __constant
                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                           skc_uint                                    const count)
{
  //
  // This is a subgroup/warp-centric kernel: declare the shared memory
  // block used by this subgroup and work out which subgroup in the
  // grid -- and therefore which command -- this is.
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile                smem[1];

  uint const cmd_index = get_group_id(0);
#else
  __local struct skc_subgroup_smem volatile                workgroup_smem[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = workgroup_smem + get_sub_group_id();

  uint const cmd_index = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // when workgroups are multi-subgroup the final workgroup may
  // contain excess subgroups that have no command to process
  //
  if (cmd_index >= count)
    {
      return;
    }

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_index);
#endif

  //
  // each subgroup loads exactly one command
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_index];

  //
  // fetch the transform and clip selected by the command -- both
  // loads are uniform across the subgroup
  //
  // transform layout (v8): { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT THE TRANSFORM IS EXPECTED TO BE SCALED UP BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either by initializing a transform stack
  // with the subpixel resolution scaled identity or by scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  skc_uint            const cohort    = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
  union skc_transform const transform = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const clip      = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load

  //
  // read the first block node command word (word-addressed) and pull
  // the block id out of the tagged block id found there
  //
  skc_uint                    node_word = cmd.nodeword;
  skc_tagged_block_id_t const tagged_id = bp_elems[node_word].tag_id;
  skc_block_id_t              block_id  = SKC_TAGGED_BLOCK_ID_GET_ID(tagged_id);

  //
  // rasterize the lines
  //
  skc_rasterize_lines(bp_atomics,bp_elems,bp_ids,bp_mask,
                      cohort_atomics,
                      sk_extent,
                      smem,
                      &node_word,&block_id,
                      &transform,&clip,cohort);
}
3167
3168//
3169//
3170//
3171
3172//
3173//
3174//
3175
//
// Rasterization kernel specialized for quadratic primitives.
//
// Every subgroup picks up one rasterization command, loads the
// transform, clip and cohort it refers to and hands the command's
// first block node over to skc_rasterize_quads().
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                           __global union skc_bp_elem                * const bp_elems,
                           __global uint                             * const bp_ids,
                           skc_uint                                    const bp_mask,

                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                           __global skc_ttsk_s_t                     * const sk_extent,

                           __global float8                  const    * const transforms, // FIXME -- __constant
                           __global float4                  const    * const clips,      // FIXME -- __constant
                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                           skc_uint                                    const count)
{
  //
  // This is a subgroup/warp-centric kernel: declare the shared memory
  // block used by this subgroup and work out which subgroup in the
  // grid -- and therefore which command -- this is.
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile                smem[1];

  uint const cmd_index = get_group_id(0);
#else
  __local struct skc_subgroup_smem volatile                workgroup_smem[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = workgroup_smem + get_sub_group_id();

  uint const cmd_index = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // when workgroups are multi-subgroup the final workgroup may
  // contain excess subgroups that have no command to process
  //
  if (cmd_index >= count)
    {
      return;
    }

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_index);
#endif

  //
  // each subgroup loads exactly one command
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_index];

  //
  // fetch the transform and clip selected by the command -- both
  // loads are uniform across the subgroup
  //
  // transform layout (v8): { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT THE TRANSFORM IS EXPECTED TO BE SCALED UP BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either by initializing a transform stack
  // with the subpixel resolution scaled identity or by scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  skc_uint            const cohort    = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
  union skc_transform const transform = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const clip      = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load

  //
  // read the first block node command word (word-addressed) and pull
  // the block id out of the tagged block id found there
  //
  skc_uint                    node_word = cmd.nodeword;
  skc_tagged_block_id_t const tagged_id = bp_elems[node_word].tag_id;
  skc_block_id_t              block_id  = SKC_TAGGED_BLOCK_ID_GET_ID(tagged_id);

  //
  // rasterize the quads
  //
  skc_rasterize_quads(bp_atomics,bp_elems,bp_ids,bp_mask,
                      cohort_atomics,
                      sk_extent,
                      smem,
                      &node_word,&block_id,
                      &transform,&clip,cohort);
}
3276
3277//
3278//
3279//
3280
//
// Rasterization kernel specialized for cubic primitives.
//
// Every subgroup picks up one rasterization command, loads the
// transform, clip and cohort it refers to and hands the command's
// first block node over to skc_rasterize_cubics().
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                            __global union skc_bp_elem                * const bp_elems,
                            __global uint                             * const bp_ids,
                            skc_uint                                    const bp_mask,

                            __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                            __global skc_ttsk_s_t                     * const sk_extent,

                            __global float8                  const    * const transforms, // FIXME -- __constant
                            __global float4                  const    * const clips,      // FIXME -- __constant
                            __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                            skc_uint                                    const count)
{
  //
  // This is a subgroup/warp-centric kernel: declare the shared memory
  // block used by this subgroup and work out which subgroup in the
  // grid -- and therefore which command -- this is.
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile                smem[1];

  uint const cmd_index = get_group_id(0);
#else
  __local struct skc_subgroup_smem volatile                workgroup_smem[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = workgroup_smem + get_sub_group_id();

  uint const cmd_index = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // when workgroups are multi-subgroup the final workgroup may
  // contain excess subgroups that have no command to process
  //
  if (cmd_index >= count)
    {
      return;
    }

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_index);
#endif

  //
  // each subgroup loads exactly one command
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_index];

  //
  // fetch the transform and clip selected by the command -- both
  // loads are uniform across the subgroup
  //
  // transform layout (v8): { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT THE TRANSFORM IS EXPECTED TO BE SCALED UP BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either by initializing a transform stack
  // with the subpixel resolution scaled identity or by scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  skc_uint            const cohort    = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
  union skc_transform const transform = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const clip      = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load

  //
  // read the first block node command word (word-addressed) and pull
  // the block id out of the tagged block id found there
  //
  skc_uint                    node_word = cmd.nodeword;
  skc_tagged_block_id_t const tagged_id = bp_elems[node_word].tag_id;
  skc_block_id_t              block_id  = SKC_TAGGED_BLOCK_ID_GET_ID(tagged_id);

  //
  // rasterize the cubics
  //
  skc_rasterize_cubics(bp_atomics,bp_elems,bp_ids,bp_mask,
                       cohort_atomics,
                       sk_extent,
                       smem,
                       &node_word,&block_id,
                       &transform,&clip,cohort);
}
3381
3382//
3383//
3384//
3385
//
// Kernel entry point for rational quads.
//
// It takes the same arguments as the other rasterization kernels but
// its body is empty: none of the arguments are read and no work is
// performed.
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                               __global union skc_bp_elem                * const bp_elems,
                               __global uint                             * const bp_ids,
                               skc_uint                                    const bp_mask,

                               __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                               __global skc_ttsk_s_t                     * const sk_extent,

                               __global float8                  const    * const transforms, // FIXME -- __constant
                               __global float4                  const    * const clips,      // FIXME -- __constant
                               __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                               skc_uint                                    const count)
{
  //
  // intentionally empty
  //
}
3404
3405//
3406//
3407//
3408
//
// Kernel entry point for rational cubics.
//
// It takes the same arguments as the other rasterization kernels but
// its body is empty: none of the arguments are read and no work is
// performed.
//
__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                                __global union skc_bp_elem                * const bp_elems,
                                __global uint                             * const bp_ids,
                                skc_uint                                    const bp_mask,

                                __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                                __global skc_ttsk_s_t                     * const sk_extent,

                                __global float8                  const    * const transforms, // FIXME -- __constant
                                __global float4                  const    * const clips,      // FIXME -- __constant
                                __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                                skc_uint                                    const count)
{
  //
  // intentionally empty
  //
}
3427
3428//
3429//
3430//
3431