1/*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can
5 * be found in the LICENSE file.
6 *
7 */
8
9//
10//
11//
12
13#include "tile.h"
14#include "block.h"
15#include "styling_types.h"
16#include "atomic_cl.h"
17#include "kernel_cl_12.h"
18
19//
20//
21//
22
23#define SKC_RENDER_SUBGROUP_MASK  (SKC_RENDER_SUBGROUP_SIZE - 1)
24
25//
26//
27//
28
29#if   ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
30#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_1()
31#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      0
32
33#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 )
34#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_2()
35#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      1
36
37#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 )
38#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_4()
39#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      3
40
41#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 )
42#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_8()
43#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      7
44
45#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16)
46#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_16()
47#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      15
48#endif
49
50//
51// tile state flag bits
52//
53
54typedef enum skc_tile_flags_e {
55
56  // FLUSH
57  SKC_TILE_FLAGS_FLUSH_FINALIZE    = 0x00000001,
58  SKC_TILE_FLAGS_FLUSH_UNWIND      = 0x00000002,
59  SKC_TILE_FLAGS_FLUSH_COMPLETE    = 0x00000004,
60
61  // OPACITY
62  SKC_TILE_FLAGS_SCATTER_SKIP      = 0x00000008,
63
64  //
65  // Note: testing for opacity and skipping scattering is on its way
66  // to becoming a much more programmable option because sometimes we
67  // may be compositing/blending from back-to-front and/or be using
68  // group blend rules that ignore opacity.
69  //
70  // The point is that all of these decisions should be encoded in
71  // styling commands and, as much as possible, removed from the final
72  // group/layer styling traversal render loop.
73  //
74
75} skc_tile_flags_e;
76
77//
78// COVER -- assumes availability of either fp16 or fp32
79//
80
81union skc_tile_cover
82{
83  struct {
84    SKC_RENDER_TILE_COVER             c[SKC_TILE_WIDTH];
85  } aN;
86
87#ifdef SKC_RENDER_TILE_COVER_VECTOR
88  struct {
89    SKC_RENDER_TILE_COVER_VECTOR      c[SKC_RENDER_TILE_COVER_VECTOR_COUNT];
90  } vN;
91#endif
92};
93
94//
95// COLOR -- assumes availability of either fp16 or fp32
96//
97
98union skc_tile_color
99{
100  union {
101    struct {
102      SKC_RENDER_TILE_COLOR           r;
103      SKC_RENDER_TILE_COLOR           g;
104      SKC_RENDER_TILE_COLOR           b;
105      SKC_RENDER_TILE_COLOR           a;
106    } rgba[SKC_TILE_WIDTH];
107  } aN;
108
109#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
110  union {
111    SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH];
112  } iN;
113#endif
114
115#ifdef SKC_RENDER_TILE_COLOR_VECTOR
116  union {
117    SKC_RENDER_TILE_COLOR_VECTOR      rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT];
118  } vN;
119#endif
120
121  struct {
122    union {
123      struct {
124        SKC_RENDER_TILE_COLOR         r;
125        SKC_RENDER_TILE_COLOR         g;
126      };
127      SKC_RENDER_GRADIENT_FLOAT       distance;
128    };
129    union {
130      struct {
131        SKC_RENDER_TILE_COLOR         b;
132        SKC_RENDER_TILE_COLOR         a;
133      };
134      SKC_RENDER_GRADIENT_FLOAT       stoplerp;
135    };
136  } grad[SKC_TILE_WIDTH];
137};
138
139//
140// SHARED MEMORY STATE
141//
142
143#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT)
144
145#define SKC_RENDER_WIDE_AA_BYTES   (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE)
146#define SKC_RENDER_WIDE_AA_WIDTH   (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA))
147
148//
149//
150//
151
152union skc_subgroup_smem
153{
154  //
155  // The tiles are stored in column-major / height-major order
156  //
157  // The final column is a guard column that is OK to write to but
158  // will never be read.  It simplifies the TTSB scatter but could be
159  // predicated if SMEM is really at a premium.
160  //
161#if ( SKC_RENDER_SUBGROUP_SIZE > 1 )
162  struct {
163    SKC_ATOMIC_UINT              area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
164  } atomic;
165#endif
166
167  struct {
168    int                          area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
169  } aN;
170
171  struct { // assumption is that height = subgroup
172    SKC_RENDER_AREA_V            area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE];
173  } vN;
174
175  struct { // assumption is that height = subgroup
176    SKC_RENDER_WIDE_AA           area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE];
177  } wide;
178
179  union skc_styling_cmd          cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT];
180
181  half                           gc  [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2];
182
183#if 0
184  //
185  // SPILL TO GMEM
186  //
187#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0)
188  struct {
189
190#if (SKC_REGS_COLOR_S > 0)
191    union skc_color_r            color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
192#endif
193
194#if (SKC_REGS_COVER_S > 0)
195    float                        cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
196#endif
197
198  } regs;
199#endif
200  //
201  //
202  //
203#endif
204};
205
206//
207//
208//
209
210#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
211
212#define skc_subgroup_lane()  0
213
214#else
215
216#define skc_subgroup_lane()  get_sub_group_local_id()
217
218#endif
219
220//
221//
222//
223
224typedef skc_uint  skc_ttsk_lo_t;
225typedef skc_uint  skc_ttsk_hi_t;
226
227typedef skc_uint  skc_ttpk_lo_t;
228typedef skc_uint  skc_ttpk_hi_t;
229
230typedef skc_uint  skc_ttxk_lo_t;
231typedef skc_uint  skc_ttxk_hi_t;
232
233typedef skc_uint  skc_ttck_lo_t;
234typedef skc_uint  skc_ttck_hi_t;
235
236typedef skc_uint2 skc_ttck_t;
237
238typedef skc_int   skc_ttxb_t;
239
240//
241// TTCK (32-BIT COMPARE) v1:
242//
243//  0                                                           63
244//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
245//  +----------------------+--------+--------+-------+-----+-----+
246//  |          30          |    1   |    1   |   18  |  7  |  7  |
247//
248//
249// TTCK (32-BIT COMPARE) v2:
250//
251//  0                                                           63
252//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
253//  +----------------------+--------+--------+-------+-----+-----+
254//  |          30          |    1   |    1   |   15  |  9  |  8  |
255//
256//
257// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
258//
259//  0                                                           63
260//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
261//  +----------------------+--------+--------+-------+-----+-----+
262//  |          27          |    1   |    1   |   18  |  9  |  8  |
263//
264
265static
266skc_uint
267skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a)
268{
269  return a & SKC_TTCK_LO_MASK_ID;
270}
271
272static
273skc_layer_id
274skc_ttck_get_layer(skc_ttck_t const a)
275{
276  //
277  // FIXME -- a union with a ulong and a shift down and mask is
278  // probably faster on some architectures
279  //
280  skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
281  skc_uint const hi = (a.hi  & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER;
282
283  return lo | hi;
284}
285
286static
287skc_uint
288skc_ttck_hi_get_x(skc_ttck_hi_t const a)
289{
290  return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X);
291}
292
293static
294skc_uint
295skc_ttck_hi_get_y(skc_ttck_hi_t const a)
296{
297  return a >> SKC_TTCK_HI_OFFSET_Y;
298}
299
300static
301skc_bool
302skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b)
303{
304  skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
305  skc_uint const hi = (a.hi ^ b.hi);
306
307  return (lo | hi) == 0;
308}
309
310static
311skc_bool
312skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b)
313{
314  return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0;
315}
316
317static
318skc_bool
319skc_ttck_lo_is_prefix(skc_ttck_lo_t const a)
320{
321  return (a & SKC_TTCK_LO_MASK_PREFIX) != 0;
322}
323
324//
325// TILE TRACE SUBPIXEL
326//
327// The subpixels are encoded with either absolute tile coordinates
328// (32-bits) or packed in delta-encoded form.
329//
330// For 32-bit subpixel packing of a 32x32 tile:
331//
332// A tile X is encoded as:
333//
334//   TX : 10 : unsigned min(x0,x1) tile subpixel coordinate.
335//
336//   SX :  6 : unsigned subpixel span from min to max x with range
337//             [0,32]. The original direction is not captured. It
338//             would be nice to capture dx, but it isn't necessary
339//             right now and could be added later. <--- SPARE VALUES AVAILABLE
340//
341// A tile Y is encoded as:
342//
343//   TY : 10 : unsigned min(y0,y1) tile subpixel coordinate.
344//
345//   DY :  6 : signed subpixel delta y1-y0. The range of delta is
346//             [-32,32] but horizontal lines are not encoded so [1,32]
347//             is mapped to [0,31]. The resulting range [-32,31] fits
348//             in 6 bits.
349//
350// TTS:
351//
352//  0                        31
353//  |  TX |  SX  |  TY |  DY  |
354//  +-----+------+-----+------+
355//  |  10 |   6  |  10 |   6  |
356//
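//
// Illustrative only: with the 10/6/10/6 layout above, the fields are
// assumed to sit at bit offsets TX:0, SX:10, TY:16 and DY:26 (the
// authoritative offsets are the SKC_TTS_OFFSET_* macros defined
// elsewhere).  A hypothetical encoder for one subpixel segment, not
// used by the kernel:
//
#if 0
static skc_uint
skc_tts_example_pack(skc_uint const tx, skc_uint const sx, skc_uint const ty, skc_int const dy)
{
  // dy of 0 never occurs, so a positive dy in [1,32] is stored as dy-1
  skc_uint const dy_bits = (skc_uint)(dy > 0 ? dy - 1 : dy) & 0x3F;

  return tx | (sx << 10) | (ty << 16) | (dy_bits << 26);
}
#endif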
357
358static
359SKC_RENDER_TTS_V_BITFIELD
360skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a)
361{
362  //
363  // extract the whole pixel y coordinate
364  //
365  return SKC_BFE(a,
366                 SKC_TTS_BITS_TY   - SKC_SUBPIXEL_RESL_Y_LOG2,
367                 SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);
368}
369
370static
371SKC_RENDER_TTS_V_BITFIELD
372skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a)
373{
374  //
375  // get the linear array tile index of the pixel
376  //
377  return (((a & SKC_TTS_MASK_TX_PIXEL)
378
379#if   (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2)
380           >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2)
381#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2)
382           << (SKC_TILE_HEIGHT_LOG2     - SKC_SUBPIXEL_RESL_X_LOG2)
383#endif
384
385           ) | skc_tts_get_ty_pixel_v(a));
386}
387
388#if 0
389static
390skc_ttx_v_s32_t
391skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
392{
393  skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY;
394
395  return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31));
396}
397#else
398static
399SKC_RENDER_TTS_V_BITFIELD
400skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
401{
402  SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY;
403
404  return dy - (~a >> 31);
405}
406#endif
407
408static
409SKC_RENDER_TTS_V_BITFIELD
410skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a)
411{
412  return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2);
413}
414
415static
416SKC_RENDER_TTS_V_BITFIELD
417skc_tts_get_sx_v(SKC_RENDER_TTS_V const a)
418{
419  return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX);
420}
421
422//
423//
424//
425
426static
427void
428skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem)
429{
430  //
431  // SIMD / CPU
432  //
433  //      &
434  //
435  // SIMT / GPU
436  //
437  // Note that atomic_init() is likely implemented as a simple
438  // assignment so there is no identifiable performance difference on
439  // current targets.
440  //
441  // If such an architecture appears in the future then we'll probably
442// still want to implement this zeroing operation as below but
443  // follow with an appropriate fence that occurs before any scatter
444  // operations.
445  //
446  // The baroque expansion below improves performance on Intel GEN by,
447  // presumably, achieving the 64-byte per clock SLM write as well as
448  // minimizing the overall number of SEND() block initializations and
449  // launches.
450  //
451  // Intel GENx has a documented 64 byte per cycle SLM write limit.
452  // So having each lane in an 8 lane subgroup zero-write 8 bytes is
453  // probably a safe bet (Later: benchmarking backs this up!).
454  //
455  // Note there is no reason at this time to unroll this loop.
456  //
457  for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++)
458    smem->wide.area[ii][skc_subgroup_lane()] = ( 0 );
459}
460
461//
462// Note this is going to be vectorizable on most architectures.
463//
464// The return of the key translation feature might complicate things.
465//
466
467static
468void
469skc_scatter_ttpb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
470                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
471                 skc_block_id_t                                  const pb_id)
472{
473  skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();
474
475#if   ( SKC_TILE_RATIO == 1 )
476
477  SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset];
478
479#elif ( SKC_TILE_RATIO == 2 )
480
481  SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent);
482
483#else
484
485#error("tile ratio greater than 2 not supported")
486
487#endif
488
489  //
490  // Note there is no need to use an atomic for this operation on the
491  // current group of target platforms... but this may change if
492  // atomic ops truly go through a different path.
493  //
494  // As noted above, this direct increment is probably faster and can
495  // always be followed by a fence.
496  //
497  // Furthermore, note that the key sorting orders all ttck keys
498  // before ttpk keys.
499  //
500
501  //
502  // FIXME -- if the SMEM store is wider than bank word count then we
503  // might want to odd-even interleave the TTP values if the target
504  // device can't handle 64-bit stores
505  //
506
507  //
508  // skipping per-key translation for now
509  //
510  smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1);
511}
512
513//
514// Note that skc_scatter_ttsb is *not* vectorizable unless the
515// architecture supports a "scatter-add" capability.  All relevant
516// GPUs support atomic add on shared/local memory and thus support
517// scatter-add.
518//
519
520static
521void
522skc_scatter_ttsb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
523                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
524                 skc_block_id_t                                  const sb_id)
525{
526  skc_uint         const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
527
528  SKC_RENDER_TTS_V const tts_v  = ttxb_extent[offset];
529
530  //
531  // Skipping per-key translation for now
532  //
533
534  // Index into tile
535  //
536  // The tiles are stored in column-major / height-major order
537  //
538  // The final column is a guard column that is OK to write to but
539  // will never be read.  It simplifies the TTSB scatter but could be
540  // predicated if SMEM is really at a premium.
541  //
542
543  SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v);
544
545#if 0
546  if (tts_v != SKC_TTS_INVALID)
547    printf("(%08X) = %u\n",tts_v,xy_idx);
548#endif
549
550  //
551  // adjust subpixel range to max y
552  //
553  // range is stored as [-32,31] and when read [0,31] is mapped to
554  // [1,32] because a dy of 0 is not possible.
555  //
556  // more succinctly: if dy >= 0 then ++dy
557  //
558  SKC_RENDER_TTS_V_BITFIELD const dy     = skc_tts_get_dy_v(tts_v);
559
560  //
561  // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid?
562  //
563
564  // this "min(x0) * 2 + dx" is equivalent to "x0 + x1"
565  SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v);
566
567  // Calculate left and right coverage contribution trapezoids
568  SKC_RENDER_TTS_V_BITFIELD const left   = dy * widths;
569  SKC_RENDER_TTS_V_BITFIELD const right  = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left;
570
571  //
572  // Accumulate altitudes and areas
573  //
574  // Optimization: if the device supports a CPU/SIMD vector-add or
575  // GPU/SIMT scatter-add atomic int2 add operation then placing the
576  // ALT and AREA values side-by-side would halve the number of
577  // additions.
578  //
579#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
580  //
581  // CPU/SIMD
582  //
583#undef  SKC_EXPAND_X
584#define SKC_EXPAND_X(I,S,C,P,A)                                 \
585  if (tts_v C != SKC_TTS_INVALID) {                             \
586    smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left  C;       \
587    smem->aN.area[                  xy_idx C] += right C;       \
588  }
589
590#else
591  //
592  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
593  //
594#undef  SKC_EXPAND_X
595#define SKC_EXPAND_X(I,S,C,P,A)                                         \
596  if (tts_v C != SKC_TTS_INVALID) {                                     \
597    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area +           \
598                                          SKC_TILE_HEIGHT   + xy_idx C, \
599                                          left C);                      \
600    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \
601                                          right C);                     \
602  }
603#endif
604
605  SKC_RENDER_TTSB_EXPAND();
606}
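
//
// Worked example (illustrative): for a segment within one pixel with
// subpixel endpoints x0=8, x1=24 and dy=4 -- assuming the 32-wide
// subpixel resolution implied above -- tx_subpixel=8 and sx=16, so
// widths = 8*2 + 16 = 32, left = 4*32 = 128 and right = 4*64 - 128 =
// 128.  Note that left + right always equals
// dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1), i.e. the two accumulators
// split the segment's full (doubled) trapezoid contribution between
// the pixel's left and right edges.
//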
607
608//
609// Note that 2048.0 can be represented exactly with fp16... fortuitous!
610//
611
612#define SKC_RENDER_FILL_MAX_AREA          (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)
613#define SKC_RENDER_FILL_MAX_AREA_2        (2u * SKC_RENDER_FILL_MAX_AREA)
614#define SKC_RENDER_FILL_EVEN_ODD_MASK     (SKC_RENDER_FILL_MAX_AREA_2 - 1)
615#define SKC_RENDER_FILL_MAX_AREA_RCP_F32  (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)
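//
// Assuming the 32x32-per-pixel subpixel resolution described in the
// TTS notes above, SKC_RENDER_FILL_MAX_AREA is 2 * 32 * 32 = 2048 --
// twice the geometric area of a fully covered pixel, since the
// scatter stage accumulates doubled trapezoid areas -- hence the fp16
// note above.
//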
616
617//
618//
619//
620
621static
622void
623skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
624                       union skc_tile_cover            * SKC_RESTRICT const cover,
625                       union skc_tile_color            * SKC_RESTRICT const color)
626{
627  SKC_RENDER_ACC_COVER_INT area = 0;
628
629  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
630  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
631    {
632      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
633      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
634      SKC_RENDER_TILE_COVER     const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA));
635
636      cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32);
637    }
638}
639
640static
641void
642skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
643                       union skc_tile_cover            * SKC_RESTRICT const cover,
644                       union skc_tile_color            * SKC_RESTRICT const color)
645{
646  SKC_RENDER_ACC_COVER_INT area = 0;
647
648  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
649  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
650    {
651      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
652      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
653      SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA));
654
655      cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32;
656    }
657}
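
//
// Minimal scalar sketch (illustrative, not used by the kernel) of the
// two fill-rule mappings above, assuming MAX_AREA = 2048:
//
#if 0
static float skc_example_cover_nonzero(int const area)
{
  unsigned const trapabs = abs(area);

  return (float)min(trapabs,2048u) / 2048.0f;
}

static float skc_example_cover_evenodd(int const area)
{
  unsigned const trapabs = abs(area);
  unsigned const reflect = abs((int)((trapabs & 4095u) - 2048u)); // triangle wave with period 2*MAX_AREA

  return (float)(2048u - reflect) / 2048.0f;
}
#endif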
658
659//
660//
661//
662
663static
664void
665skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
666                          uint                                 * SKC_RESTRICT const cmd_next,
667                          union skc_tile_color                 * SKC_RESTRICT const color)
668{
669  //
670  // rgba = solid fill
671  //
672  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
673
674  *cmd_next += 2;
675
676#if !defined( SKC_RENDER_TILE_COLOR_VECTOR )
677
678  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
679
680  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
681  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
682    color->aN.rgba[ii].r = rg.lo;
683
684  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
685  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
686    color->aN.rgba[ii].g = rg.hi;
687
688  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
689
690  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
691  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
692    color->aN.rgba[ii].b = ba.lo;
693
694  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
695  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
696    color->aN.rgba[ii].a = ba.hi;
697
698#else
699
700  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
701  SKC_RENDER_TILE_COLOR      const r  = rg.lo;
702
703  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
704  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
705    color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r);
706
707  SKC_RENDER_TILE_COLOR      const g  = rg.hi;
708
709  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
710  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
711    color->vN.rgba[ii].odd.even  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g);
712
713  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
714  SKC_RENDER_TILE_COLOR      const b  = ba.lo;
715
716  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
717  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
718    color->vN.rgba[ii].even.odd  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b);
719
720  SKC_RENDER_TILE_COLOR      const a  = ba.hi;
721
722  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
723  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
724    color->vN.rgba[ii].odd.odd   = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a);
725
726#endif
727}
728
729//
730// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
731//
732// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
733//
734// Lerp in two fma/mad ops:
735//
736//    t * b + ((-t) * a + a)
737//
738// Note: OpenCL documents mix() as being implemented as:
739//
740//    a + (b - a) * t
741//
742// But this may be a native instruction on some devices.  For example,
743// on GEN9 there is an LRP "linear interpolation" function but it
744// doesn't appear to support half floats.
745//
746
747#if 1
748#define SKC_LERP(a,b,t)  mad(t,b,mad(-(t),a,a))
749#else
750#define SKC_LERP(a,b,t)  mix(a,b,t)
751#endif
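//
// Quick check that the two forms agree:
//
//   mad(t,b,mad(-t,a,a)) = t*b + (a - t*a) = a + (b - a)*t = mix(a,b,t)
//
// (up to rounding -- the fused form keeps the products unrounded)
//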
752
753//
754// CPUs have a mock local address space so copying the gradient header
755// is probably not useful.  Just read directly from global.
756//
757
758#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
759#define SKC_RENDER_GRADIENT_SPACE  __local
760#else
761#define SKC_RENDER_GRADIENT_SPACE  __global
762#endif
763
764//
765// gradient is non-vertical
766//
767// removed the vertical (actually, horizontal) special case
768//
769
770static
771void
772skc_tile_color_fill_gradient_linear_nonvertical(__local  union skc_subgroup_smem     * SKC_RESTRICT const smem,
773                                                __global union skc_styling_cmd const * SKC_RESTRICT const commands,
774                                                uint                                 * SKC_RESTRICT const cmd_next,
775                                                union skc_tile_color                 * SKC_RESTRICT const color,
776                                                skc_ttck_hi_t                                       const ttck_hi)
777{
778  //
779  // Where is this tile?
780  //
781  // Note that the gradient is being sampled from pixel centers.
782  //
783  SKC_RENDER_GRADIENT_FLOAT const y =
784#undef  SKC_EXPAND_X
785#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P
786    (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) +
787    (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE));
788
789  float                     const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH);
790
791  //
792  // Get starting numerator and denominator
793  //
794  // Note: if gh[0].dx is exactly 0.0f then this is a vertical
795  // gradient and can be handled by a special opcode.
796  //
797  // Note: the mad() ordering is slightly different than the original
798  // CUDA implementation.
799  //
800  union skc_gradient_vector const gv       = { vload4(0,&commands[*cmd_next].f32) };
801
802  *cmd_next += 4;
803
804  float                     const gv_x_dot = mad(x,gv.dx,gv.p0);
805  SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot);
806
807  //
808  // Where are columns along gradient vector?
809  //
810  // TODO: Note that the gv_denom isn't multiplied through.
811  //
812  // Please double-check this... but I recall that in certain cases
813  // this wipes out some precision and results in minor but noticeable
814  // gradient artifacts.
815  //
816  // All arguments are scalars except gv_numer so a simpler
817  // evaluation might save some flops.
818  //
819
820  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
821  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
822    color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom;
823
824  //
825  // is gradient non-repeating, repeating or reflecting?
826  //
827  switch (commands[(*cmd_next)++].u32)
828    {
829    case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING:
830      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
831      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
832        color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f);
833      break;
834
835    case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING:
836      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
837      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
838        color->grad[ii].distance -= floor(color->grad[ii].distance);
839      break;
840
841    default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING
842      //
843      // OPTIMIZATION: Can this be done in fewer than ~4 ops?
844      //
845      // Note: OpenCL "rint()" is round-to-nearest-even integer!
846      //
847      // Note: the floor() "round to -inf" op is implemented in the
848      // GEN op 'FRC' so probably don't use trunc() when floor will
849      // suffice.
850      //
851
852      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
853      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
854        {
855          SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance);
856          color->grad[ii].distance = fabs(dist_abs - rint(dist_abs));
857        }
858    }
859
860  //
861  // initialize "stoplerp" for all columns
862  //
863  uint const slope_count = commands[(*cmd_next)++].u32;
864  uint const gd_n_v1     = commands[(*cmd_next)++].u32; // REMOVE ME
865
866  {
867    float const slope = commands[(*cmd_next)++].f32;
868
869    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
870    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
871      color->grad[ii].stoplerp = color->grad[ii].distance * slope;
872  }
873
874  //
875  // compute stoplerp for remaining stops
876  //
877  for (int jj=1; jj<slope_count; jj++)
878    {
879      float const floor = (float)jj;
880      float const slope = commands[(*cmd_next)++].f32;
881
882      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
883      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
884        color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp);
885    }
886
887  //
888  // copy gradient colors to local memory
889  //
890  uint const gd_n = slope_count + 1;
891
892#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
893  //
894  // copy entire gradient descriptor to local memory
895  //
896  for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE)
897    smem->cmds[ii].u32 = commands[*cmd_next + ii].u32;
898
899  __local  half const * const SKC_RESTRICT gc = smem->gc + 0;
900#else
901  //
902  // prefetch entire gradient header
903  //
904  // no noticeable impact on performance
905  //
906  // prefetch(&commands[*cmd_next].u32,gh_words);
907  //
908  __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0;
909#endif
910
911  //
912  // adjust cmd_next so that V1 structure is consumed -- FIXME
913  //
914  *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n);
915
916  //
917  // lerp between color pair stops
918  //
919  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
920  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
921    {
922      //
923      // Finally, we have the gradient stop index and the color stop
924      // pair lerp fraction
925      //
926      // Note that if these are vector values then a gather operation
927      // must occur -- there may be platforms (AVX-512?) that can
928      // perform an explicit gather on a vector type but it's not
929      // really expressible in OpenCL except implicitly with a
930      // workgroup of work items.
931      //
932      // ***********************
933      //
934      // FIXME -- USE HERB'S SINGLE FMA LERP
935      //
936      // ***********************
937      //
938      SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp);
939      SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp));
940
941      {
942        SKC_RENDER_TILE_COLOR lo, hi;
943
944#undef  SKC_EXPAND_X
945#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
946          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \
947          lo C                                = cc.lo;                  \
948          hi C                                = cc.hi;                  \
949        }
950
951        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
952
953        color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac);
954      }
955
956      //
957      //
958      //
959      {
960        SKC_RENDER_TILE_COLOR lo, hi;
961
962#undef  SKC_EXPAND_X
963#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
964          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \
965          lo C                                = cc.lo;                  \
966          hi C                                = cc.hi;                  \
967        }
968
969        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
970
971        color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac);
972      }
973
974      //
975      //
976      //
977      {
978        SKC_RENDER_TILE_COLOR lo, hi;
979
980#undef  SKC_EXPAND_X
981#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
982          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \
983          lo C                                = cc.lo;                  \
984          hi C                                = cc.hi;                  \
985        }
986
987        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
988
989        color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac);
990      }
991
992      //
993      //
994      //
995      {
996        SKC_RENDER_TILE_COLOR lo, hi;
997
998#undef  SKC_EXPAND_X
999#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
1000          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \
1001          lo C                                = cc.lo;                  \
1002          hi C                                = cc.hi;                  \
1003        }
1004
1005        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
1006
1007        color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac);
1008      }
1009    }
1010}
1011
1012//
1013//
1014//
1015
1016static
1017void
1018skc_tile_blend_over(union skc_tile_color       * SKC_RESTRICT const color_acc,
1019                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
1020                    union skc_tile_color const * SKC_RESTRICT const color_wip)
1021{
1022  //
1023  // fralunco = cover.wip * acc.a
1024  //
1025  // acc.r    =  fralunco * wip.r + acc.r
1026  // acc.g    =  fralunco * wip.g + acc.g
1027  // acc.b    =  fralunco * wip.b + acc.b
1028  // acc.a    = -fralunco * wip.a + acc.a
1029  //
1030
1031  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1032  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1033    {
1034      SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a;
1035
1036      color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
1037      color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
1038      color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
1039      color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
1040    }
1041}
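
//
// Note (reading aid): acc.a holds remaining transparency (1 - alpha),
// as the multiply blend comment below makes explicit, so this
// front-to-back "over" scales each WIP contribution by cover * acc.a
// and then reduces acc.a by fralunco * wip.a.
//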
1042
1043//
1044//
1045//
1046
1047static
1048void
1049skc_tile_blend_plus(union skc_tile_color       * SKC_RESTRICT const color_acc,
1050                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
1051                    union skc_tile_color const * SKC_RESTRICT const color_wip)
1052{
1053  //
1054  // cover_min = min(cover.wip,a.acc)
1055  //
1056  // r.acc =  cover_min * r.wip + r.acc
1057  // g.acc =  cover_min * g.wip + g.acc
1058  // b.acc =  cover_min * b.wip + b.acc
1059  // a.acc = -cover_min * a.wip + a.acc
1060  //
1061
1062  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1063  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1064    {
1065      SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a);
1066
1067      color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
1068      color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
1069      color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
1070      color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
1071    }
1072}
1073
1074//
1075//
1076//
1077
1078static
1079void
1080skc_tile_blend_multiply(union skc_tile_color       * SKC_RESTRICT const color_acc,
1081                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
1082                        union skc_tile_color const * SKC_RESTRICT const color_wip)
1083{
1084  //
1085  // r.acc = (cover.wip * r.wip) * r.acc
1086  // g.acc = (cover.wip * g.wip) * g.acc
1087  // b.acc = (cover.wip * b.wip) * b.acc
1088  // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha)
1089  //
1090
1091  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1092  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1093    {
1094      color_acc->aN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r;
1095      color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g;
1096      color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b;
1097      color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a;
1098    }
1099}
1100
1101//
1102//
1103//
1104
1105static
1106void
1107skc_tile_blend_knockout(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
1108                        union skc_tile_color       * SKC_RESTRICT const color_acc,
1109                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
1110                        union skc_tile_color const * SKC_RESTRICT const color_wip)
1111{
1112  //
1113  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
1114  // cover.acc         = cover.acc + cover.wip.contrib
1115  //
1116  // r.acc =  cover.wip.contrib * r.wip + r.acc
1117  // g.acc =  cover.wip.contrib * g.wip + g.acc
1118  // b.acc =  cover.wip.contrib * b.wip + b.acc
1119  // a.acc = -cover.wip.contrib * a.wip + a.acc
1120  //
1121
1122  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1123  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1124    {
1125      SKC_RENDER_TILE_COVER const contrib = (1 - cover_acc->aN.c[ii]) * cover_wip->aN.c[ii];
1126
1127      cover_acc->aN.c[ii]     += contrib;
1128
1129      color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
1130      color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
1131      color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
1132      color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
1133    }
1134}
1135
1136//
1137//
1138//
1139
1140static
1141void
1142skc_tile_cover_msk_copy_wip(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
1143                            union skc_tile_cover const * SKC_RESTRICT const cover_wip)
1144{
1145#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
1146
1147  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1148  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1149    cover_msk->aN.c[ii] = cover_wip->aN.c[ii];
1150
1151#else
1152
1153  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
1154  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
1155    cover_msk->vN.c[ii] = cover_wip->vN.c[ii];
1156
1157#endif
1158}
1159
1160//
1161//
1162//
1163
1164static
1165void
1166skc_tile_cover_msk_copy_acc(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
1167                            union skc_tile_cover const * SKC_RESTRICT const cover_acc)
1168{
1169#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
1170
1171  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1172  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1173    cover_msk->aN.c[ii] = cover_acc->aN.c[ii];
1174
1175#else
1176
1177  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
1178  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
1179    cover_msk->vN.c[ii] = cover_acc->vN.c[ii];
1180
1181#endif
1182}
1183
1184//
1185//
1186//
1187
1188static
1189void
1190skc_tile_cover_accumulate(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
1191                          union skc_tile_cover const * SKC_RESTRICT const cover_wip)
1192{
1193  //
1194  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
1195  // cover.acc         = cover.acc + cover.wip.contrib
1196  //
1197
1198  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1199  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1200    cover_acc->aN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]);
1201}
1202
1203//
1204//
1205//
1206
1207static
1208void
1209skc_tile_cover_wip_mask(union skc_tile_cover       * SKC_RESTRICT const cover_wip,
1210                        union skc_tile_cover const * SKC_RESTRICT const cover_msk)
1211{
1212  //
1213  // cover.wip *= cover.msk
1214  //
1215
1216  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1217  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1218    cover_wip->aN.c[ii] *= cover_msk->aN.c[ii];
1219}
1220
1221//
1222//
1223//
1224
1225static
1226void
1227skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover)
1228{
1229#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
1230
1231  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1232  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1233    cover->aN.c[ii] = 0;
1234
1235#else
1236  //
1237  // GEN9 compiler underperforms on this
1238  //
1239
1240  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
1241  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
1242    cover->vN.c[ii] = 0;
1243
1244#endif
1245}
1246
1247static
1248void
1249skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover)
1250{
1251#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
1252
1253  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1254  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1255    cover->aN.c[ii] = 0;
1256
1257#else
1258  //
1259  // GEN9 compiler underperforms on this
1260  //
1261
1262  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
1263  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
1264    cover->vN.c[ii] = 0;
1265
1266#endif
1267}
1268
1269static
1270void
1271skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover)
1272{
1273#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
1274
1275  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1276  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1277    cover->aN.c[ii] = 0;
1278
1279#else
1280  //
1281  // GEN9 compiler underperforms on this
1282  //
1283
1284  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
1285  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
1286    cover->vN.c[ii] = 0;
1287
1288#endif
1289}
1290
1291//
1292//
1293//
1294
1295static
1296void
1297skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover)
1298{
1299#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
1300
1301  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1302  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1303    cover->aN.c[ii] = 1;
1304
1305#else
1306  //
1307  // GEN9 compiler underperforms on this
1308  //
1309
1310  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
1311  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
1312    cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE;
1313
1314#endif
1315}
1316
1317//
1318//
1319//
1320
1321static
1322void
1323skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover)
1324{
1325#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
1326
1327  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1328  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1329    cover->aN.c[ii] = 1 - cover->aN.c[ii];
1330
1331#else
1332
1333  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
1334  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
1335    cover->vN.c[ii] = 1 - cover->vN.c[ii];
1336
1337#endif
1338}
1339
1340//
1341//
1342//
1343
1344static
1345void
1346skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color)
1347{
1348#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
1349
1350  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1351  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1352    {
1353      color->aN.rgba[ii].r = 0;
1354      color->aN.rgba[ii].g = 0;
1355      color->aN.rgba[ii].b = 0;
1356      color->aN.rgba[ii].a = 1;
1357    }
1358
1359#else
1360  //
1361  // DISABLED ON GEN9 -- probably a compiler bug
1362  //
1363  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
1364  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
1365    color->vN.rgba[ii].even.even = 0;
1366
1367  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
1368  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
1369    color->vN.rgba[ii].odd.even  = 0;
1370
1371  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
1372  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
1373    color->vN.rgba[ii].even.odd  = 0;
1374
1375  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
1376  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
1377    color->vN.rgba[ii].odd.odd   = 1;
1378#endif
1379}
1380
1381static
1382void
1383skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color)
1384{
1385#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
1386
1387  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1388  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1389    {
1390      color->aN.rgba[ii].r = 0;
1391      color->aN.rgba[ii].g = 0;
1392      color->aN.rgba[ii].b = 0;
1393      color->aN.rgba[ii].a = 1;
1394    }
1395
1396#else
1397  //
1398  // DISABLED ON GEN9 -- probably a compiler bug
1399  //
1400  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
1401  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
1402    color->vN.rgba[ii].even.even = 0;
1403
1404  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
1405  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
1406    color->vN.rgba[ii].odd.even  = 0;
1407
1408  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
1409  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
1410    color->vN.rgba[ii].even.odd  = 0;
1411
1412  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
1413  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
1414    color->vN.rgba[ii].odd.odd   = 1;
1415#endif
1416}
1417
1418//
1419//
1420//
1421
1422static
1423bool
1424skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color)
1425{
1426  //
1427  // returns true if tile is opaque
1428  //
1429  // various hacks to test for complete tile opacity
1430  //
1431  // note that front-to-back currently has alpha at 0.0f -- this can
1432  // be harmonized to use a traditional alpha if we want to support
1433  // rendering in either direction
1434  //
1435  // hack -- ADD/MAX/OR all alphas together and test for non-zero
1436  //
1437  SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a;
1438
1439  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
1440  for (uint ii=1; ii<SKC_TILE_WIDTH; ii++)
1441    t += color->aN.rgba[ii].a;
1442
1443#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
1444  //
1445  // SIMD
1446  //
1447  return !any(t != ( 0 ));
1448
1449#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
1450  //
1451  // SIMT - scalar per lane
1452  //
1453  return !sub_group_any(t != 0);
1454
1455#else
1456  //
1457  // SIMT - vector per lane
1458  //
1459  return !sub_group_any(any(t != ( 0 )));
1460
1461#endif
1462
1463  //
1464  // TODO: The alternative vector-per-lane implementation below is
1465  // *not* believed to be performant because the terse vector-wide
1466  // test is just hiding a series of comparisons and is likely worse
1467  // than the blind ADD/MAX/OR'ing of all alphas followed by a single
1468  // test.
1469  //
1470#if 0
1471  //
1472  // SIMT - vector per lane
1473  //
1474
1475  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1)))
1476  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
1477    {
1478      if (sub_group_any(any(color->vN.ba[ii].a != ( 0 ))))
1479        return false;
1480    }
1481
1482  return true;
1483#endif
1484}
1485
1486//
1487//
1488//
1489
1490static
1491void
1492skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
1493                         uint                                 * SKC_RESTRICT const cmd_next,
1494                         union skc_tile_color                 * SKC_RESTRICT const color)
1495{
1496  //
1497  // acc.r = acc.a * r + acc.r
1498  // acc.g = acc.a * g + acc.g
1499  // acc.b = acc.a * b + acc.b
1500  //
1501  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
1502
1503  *cmd_next += 2;
1504
1505  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
1506
1507  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1508  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1509    color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r);
1510
1511  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1512  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1513    color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g);
1514
1515  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
1516
1517  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1518  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1519    color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b);
1520}
1521
1522//
1523//
1524//
1525
1526// #define SKC_SURFACE_IS_BUFFER
1527#ifdef  SKC_SURFACE_IS_BUFFER
1528
1529static
1530void
1531skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface,
1532                              skc_uint                                           const surface_pitch,
1533                              union skc_tile_color          const * SKC_RESTRICT const color,
1534                              skc_ttck_hi_t                                      const ttck_hi)
1535{
1536  //
1537  // NEW MAJOR OPTIMIZATION:
1538  //
1539  // Rotating and rasterizing the original world transform by -90
1540  // degrees and then rendering the scene by +90 degrees allows the
1541  // entire final surface composite to be performed in perfectly
1542  // coalesced wide transactions.
1543  //
1544  // For this reason, linear access to the framebuffer is preferred.
1545  //
1546  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
1547  //
1548  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
1549  //
1550  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
1551  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
1552  //
1553  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
1554  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
1555  //
1556  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
1557  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
1558  //
1559  uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE;
1560  uint const x     = skc_ttck_hi_get_x(ttck_hi);
1561  uint const y     = skc_ttck_hi_get_y(ttck_hi) ;
1562  uint const base  = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane();
1563
1564  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1565  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1566    {
1567      SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 );
1568
1569      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255);
1570      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8;
1571      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16;
1572
1573      surface[base + ii * pitch] = rgba;
1574
1575      // printf("%08v2X\n",rgba);
1576    }
1577}
1578
1579#else
1580
1581static
1582void
1583skc_surface_composite_u8_rgba(__write_only image2d_t                          surface,
1584                              union skc_tile_color const * SKC_RESTRICT const color,
1585                              skc_ttck_hi_t                                   const ttck_hi)
1586{
1587  //
1588  // NEW MAJOR OPTIMIZATION:
1589  //
1590  // Rotating and rasterizing the original world transform by -90
1591  // degrees and then rendering the scene by +90 degrees allows the
1592  // entire final surface composite to be performed in perfectly
1593  // coalesced wide transactions.
1594  //
1595  // For this reason, linear access to the framebuffer is preferred.
1596  //
1597  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
1598  //
1599  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
1600  //
1601  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
1602  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
1603  //
1604  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
1605  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
1606  //
1607  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
1608  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
1609  //
1610
1611#if 1
1612  int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
1613  int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
1614
1615  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1616  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1617    {
1618#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
1619
1620#undef  SKC_EXPAND_X
1621#define SKC_EXPAND_X(I,S,C,P,A) {                       \
1622        SKC_RENDER_SURFACE_WRITE(surface,               \
1623                                 (int2)(x,y+I),         \
1624                                 color->iN.rgba[ii] A); \
1625      }
1626
1627#else
1628
1629#undef  SKC_EXPAND_X
1630#define SKC_EXPAND_X(I,S,C,P,A) {                               \
1631        SKC_RENDER_SURFACE_COLOR const rgba =                   \
1632          (SKC_RENDER_SURFACE_COLOR)                            \
1633          (color->aN.rgba[ii].r C,                              \
1634           color->aN.rgba[ii].g C,                              \
1635           color->aN.rgba[ii].b C,                              \
1636           1.0);                                                \
1637        SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba);   \
1638      }
1639
1640#endif
1641
1642      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
1643
1644      x += 1;
1645    }
1646#else
1647    int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
1648    int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
1649
1650    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
1651    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
1652      {
1653#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
1654
1655#undef  SKC_EXPAND_X
1656#define SKC_EXPAND_X(I,S,C,P,A) {                       \
1657        SKC_RENDER_SURFACE_WRITE(surface,               \
1658                                 (int2)(x+I,y+ii),      \
1659                                 color->iN.rgba[ii] A); \
1660      }
1661
1662#else
1663
1664#undef  SKC_EXPAND_X
1665#define SKC_EXPAND_X(I,S,C,P,A) {                               \
1666      SKC_RENDER_SURFACE_COLOR const rgba =                     \
1667        (SKC_RENDER_SURFACE_COLOR)                              \
1668        (color->aN.rgba[ii].r C,                                \
1669        color->aN.rgba[ii].g C,                                 \
1670        color->aN.rgba[ii].b C,                                 \
1671        1.0);                                                   \
1672      SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba);  \
1673    }
1674
1675#endif
1676
1677      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
1678    }
1679
1680#endif
1681}
1682
1683#endif
1684
1685//
1686//
1687//
1688static
1689uint const
1690skc_ttck_lane(uint const ttck_idx)
1691{
1692  return ttck_idx & SKC_RENDER_SUBGROUP_MASK;
1693}
1694
1695//
1696// RENDER KERNEL
1697//
1698
1699__kernel
1700SKC_RENDER_KERNEL_ATTRIBS
1701void
1702skc_kernel_render(__global   union  skc_layer_node   const * SKC_RESTRICT const layers,
1703                  __global   struct skc_group_node   const * SKC_RESTRICT const groups,
1704                  __global   union  skc_styling_cmd  const * SKC_RESTRICT const commands,     // FIXME -- rename
1705
1706                  __global   skc_ttck_t              const * SKC_RESTRICT const ttck_keys,    // rename: keys
1707                  skc_uint                                                const ttck_count,   // rename: key_count
1708
1709                  __global   uint                    const * SKC_RESTRICT const ttck_offsets, // rename: offsets
1710                  skc_uint                                                const tile_count,   // rename: offset_count
1711
1712                  __global   skc_ttxb_t              const * SKC_RESTRICT const ttxb_extent,
1713#ifdef SKC_SURFACE_IS_BUFFER
1714                  __global   void                          * SKC_RESTRICT const surface,
1715#else
1716                  __write_only image2d_t                                        surface,
1717#endif
1718#ifdef SKC_SURFACE_IS_BUFFER
1719                  skc_uint                                                const surface_pitch,
1720#endif
1721                  uint4                                                   const tile_clip)    // rename: clip
1722{
1723  //
1724  // Each subgroup is responsible for a tile.  No extra subgroups are
1725  // launched.
1726  //
1727  // FIXME -- might be better implemented as a "grid stride loop" if
1728  // Intel GEN really has a local memory "quantum" of 4KB which means
1729  // we would need to launch 4 subgroups per workgroup.
1730  //
1731  // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB.
1732  //
1733
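  //
  // For reference only: a hedged sketch (not part of the original
  // source) of the "grid stride loop" alternative mentioned above.
  // It assumes the total number of subgroups in flight is
  // get_num_groups(0) * SKC_RENDER_WORKGROUP_SUBGROUPS and that the
  // loop body would render the tile described by ttck_offsets[idx].
  //
#if 0
  for (uint idx  = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id();
       idx  < tile_count;
       idx += get_num_groups(0) * SKC_RENDER_WORKGROUP_SUBGROUPS)
    {
      // ... process ttck_offsets[idx] ...
    }
#endif
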
  //
  // declare tile cover and color registers
  //
  // this used to be a neat unified struct but the Intel GEN compiler
  // wasn't cooperating and was spilling to private memory even though
  // all registers were indexed by constants
  //
  union skc_tile_color  color_wip;
  union skc_tile_color  color_acc;

  union skc_tile_cover  cover_wip;
  union skc_tile_cover  cover_acc;
  union skc_tile_cover  cover_msk;

  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler recognizes get_group_id(0) as a
  // uniform, but the alternative calculation used when there are
  // multiple subgroups per workgroup is not recognized as uniform and
  // drives spillage elsewhere.
  //
#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const ttck_offset_idx = get_group_id(0);
#else
  skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

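  //
  // Worked example (hypothetical launch parameters, added commentary):
  // with SKC_RENDER_WORKGROUP_SUBGROUPS == 4, the subgroup with
  // get_group_id(0) == 3 and get_sub_group_id() == 2 handles
  // ttck_offset_idx 3 * 4 + 2 = 14.
  //
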
  //
  // load the starting ttck for this offset and get a bound on the max
  // number of keys that might be loaded
  //
  // these are uniform across all subgroup lanes
  //
  skc_uint ttck_idx = ttck_offsets[ttck_offset_idx];

  //
  // FIXME -- SIMD/CPU version should probably load a 256-bit (4-wide)
  // vector of ttck keys
  //
#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK

  skc_ttck_t ttck = ttck_keys[ttck_idx];

#else

  uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK;
  uint const ttck_lane = ttck_idx &  SKC_RENDER_SUBGROUP_MASK;
  skc_ttck_t ttck_s    = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)];

#endif

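  //
  // Worked example of the coalesced-load indexing above (illustrative
  // values, added commentary): with a subgroup size of 8 and
  // ttck_idx == 13, ttck_base is 8 and ttck_lane is 5, so lanes 5..7
  // read keys 13..15 while lanes 0..4 redundantly re-read key 13;
  // every index is clamped to ttck_count-1.
  //
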
  //
  // set up style group/layer state
  //
  struct skc_styling_group {
    union skc_group_range range;
    skc_uint              depth;
    skc_uint              id;
  } group;

  group.range.lo = 0;
  group.range.hi = SKC_UINT_MAX;
  group.depth    = SKC_UINT_MAX;
  group.id       = SKC_UINT_MAX;

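  //
  // Note (added commentary): depth and id start at SKC_UINT_MAX as
  // "no group entered yet" sentinels -- the first ++group.depth wraps
  // to 0 when the outermost group is entered, and a later
  // --group.depth wrapping back to SKC_UINT_MAX signals that the
  // outermost group has been exited.
  //
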
  //
  // start with clear tile opacity, knockout and flag bits
  //
  // uint color_acc_opacity  = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
  // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
  //
  skc_uint flags = 0;

  //
  // declare and initialize accumulators
  //
#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
  __local union skc_subgroup_smem                      smem[1];
#else
  __local union skc_subgroup_smem                      smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS];
  __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id();
#endif

#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
  //
  // select the initial ttck key
  //
  skc_ttck_t ttck;
#if 0
  ttck    = sub_group_broadcast(ttck_s,ttck_lane);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN
#else
  ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND
  ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane);
#endif

#endif

  //
  // save the first key so we know what tile we're in
  //
  skc_ttck_t ttck0 = ttck;

  //
  // evaluate the coarse clip as late as possible
  //
  skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi);

  if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x))
    return;

  skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi);

  if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y))
    return;

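  //
  // Worked example (hypothetical clip values, added commentary): with
  // tile_clip packed as lo = (2,3) and hi = (10,7), only tiles with
  // ttck_hi_x in [2,10) and ttck_hi_y in [3,7) survive -- the hi
  // bound is exclusive.
  //
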
#if 0
  printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y);
#endif

  //
  // load -> scatter -> flush
  //
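  // Added commentary: ttck keys for a tile are contiguous and ordered
  // by (y, x, layer), so each pass of the loop below scatters the
  // TTSB/TTPB blocks of one layer of the current tile and then runs
  // that layer's styling commands before moving on.
  //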
  while (true)
    {
      // if scattering is disabled then just run through ttck keys
      bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0;

      // need to clear accumulators before a scatter loop
      if (is_scatter_enabled)
        {
          skc_tile_aa_zero(smem);
        }

      do {
        // skip scattering?
        if (is_scatter_enabled)
          {
            skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo);

            if (skc_ttck_lo_is_prefix(ttck.lo)) {
              skc_scatter_ttpb(ttxb_extent,smem,xb_id);
            } else {
              skc_scatter_ttsb(ttxb_extent,smem,xb_id);
            }
          }

        //
        // any ttck keys left?
        //
        if (++ttck_idx >= ttck_count)
          {
            flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
            break;
          }

        //
        // process next ttck key
        //
#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
        //
        // SIMD -- read next key
        //
        ttck = ttck_keys[ttck_idx];
#else
        //
        // SIMT -- refresh the ttck_s?
        //
        uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK;

        if (ttck_lane_next == 0)
          ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)];

        //
        // broadcast next key to entire subgroup
        //
#if 0
        ttck    = sub_group_broadcast(ttck_s,ttck_lane_next);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN
#else
        ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND
        ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next);
#endif
#endif
        // continue scattering if on same YXL layer
      } while (skc_ttck_equal_yxl(ttck0,ttck));

      // finalize if no longer on same YX tile
      if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi))
        {
          // unwind the tile styling and exit after this flush
          flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
        }

      //
      // given: new layer id from ttck key
      //
      // load [layer id]{ group id, depth }
      //
      // if within current group's layer range
      //
      //   if at same depth
      //
      //     load and execute cover>[mask>]color>blend commands
      //
      //   else if not at same depth then move deeper
      //
      //     for all groups in group trail from cur depth to new depth
      //       enter group, saving and initializing regs as necessary
      //     increment depth and update layer range
      //     load and execute cover>[mask>]color>blend commands
      //
      // else not within layer range
      //
      //   exit current group, restoring regs as necessary
      //   decrement depth and update layer range
      //
      //
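      // Illustrative trace (hypothetical ids, added commentary): if
      // the current group sits at depth 1 and the new layer's parent
      // group contains the layer and sits at depth 2, one pass of the
      // loop below enters that parent group and runs its "enter"
      // commands; the next pass then runs the layer's own
      // cover/color/blend commands.
      //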
      skc_layer_id         const layer_id_new   = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi
      union skc_layer_node const layer_node_new = layers[layer_id_new];

      // clear flag that controls group/layer traversal
      flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE;

      do {
        bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0;

        //
        // is layer a child of the current parent group?
        //
        uint cmd_next = 0;

        if (!unwind && (layer_node_new.parent == group.id))
          {
            // execute this layer's cmds
            cmd_next = layer_node_new.cmds;

            // if this is final then configure so groups get unwound, otherwise we're done
            flags   |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)
                        ? SKC_TILE_FLAGS_FLUSH_UNWIND
                        : SKC_TILE_FLAGS_FLUSH_COMPLETE);
          }
        else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi))
          {
            //
            // is layer in a child group?
            //
            union skc_group_parents const gp = groups[layer_node_new.parent].parents;
            uint                    const gn = gp.depth - ++group.depth;

            if (gn == 0)
              group.id = layer_node_new.parent;
            else
              group.id = commands[gp.base + gn - 1].parent;

            // update group layer range
            group.range = groups[group.id].range;

            // enter current group
            cmd_next    = groups[group.id].cmds.enter;
          }
        else // otherwise, exit this group
          {
            // leave current group
            cmd_next = groups[group.id].cmds.leave;

            // decrement group depth
            if (--group.depth == SKC_UINT_MAX)
              {
                flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE;
              }
            else
              {
                // get parents base of current group
                uint const gnpb = groups[group.id].parents.base;

                // get parent of current group
                group.id    = commands[gnpb].parent;

                // update group layer range
                group.range = groups[group.id].range;
              }
          }

        //
        // execute cmds
        //
        while (true)
          {
            union skc_styling_cmd const cmd = commands[cmd_next++];

            switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE)
              {
              case SKC_STYLING_OPCODE_NOOP:
                break;

              case SKC_STYLING_OPCODE_COVER_NONZERO:
                skc_tile_cover_nonzero(smem,&cover_wip,&color_wip);
                break;

              case SKC_STYLING_OPCODE_COVER_EVENODD:
                skc_tile_cover_evenodd(smem,&cover_wip,&color_wip);
                break;

              case SKC_STYLING_OPCODE_COVER_ACCUMULATE:
                skc_tile_cover_accumulate(&cover_acc,&cover_wip);
                break;

              case SKC_STYLING_OPCODE_COVER_MASK:
                skc_tile_cover_wip_mask(&cover_wip,&cover_msk);
                break;

              case SKC_STYLING_OPCODE_COVER_WIP_ZERO:
                skc_tile_cover_wip_zero(&cover_wip);
                break;

              case SKC_STYLING_OPCODE_COVER_ACC_ZERO:
                skc_tile_cover_acc_zero(&cover_acc);
                break;

              case SKC_STYLING_OPCODE_COVER_MASK_ZERO:
                skc_tile_cover_msk_zero(&cover_msk);
                break;

              case SKC_STYLING_OPCODE_COVER_MASK_ONE:
                skc_tile_cover_msk_one(&cover_msk);
                break;

              case SKC_STYLING_OPCODE_COVER_MASK_INVERT:
                skc_tile_cover_msk_invert(&cover_msk);
                break;

              case SKC_STYLING_OPCODE_COLOR_FILL_SOLID:
                skc_tile_color_fill_solid(commands,&cmd_next,&color_wip);
                break;

              case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:
                //
                // FIXME -- gradients shouldn't be executing so much
                // conditionally driven code at runtime since we *know*
                // the gradient style on the host and can just create a
                // new styling command to exploit this.
                //
                // FIXME -- it might be time to try using the GPU's
                // sampler on a linear array of half4 vectors -- it
                // might outperform the explicit load/lerp routines.
                //
                // FIXME -- optimizing for vertical gradients (uhhh,
                // they're actually horizontal due to the -90 degree
                // view transform) is nice but is it worthwhile to
                // have this in the kernel?  Easy to add it back...
                //
#if defined( SKC_ARCH_GEN9 )
                // disable gradients due to excessive spillage -- fix later
                cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32);
#else
                skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);
#endif
                break;


              case SKC_STYLING_OPCODE_COLOR_WIP_ZERO:
                skc_tile_color_wip_zero(&color_wip);
                break;

              case SKC_STYLING_OPCODE_COLOR_ACC_ZERO:
                skc_tile_color_acc_zero(&color_acc);
                break;

              case SKC_STYLING_OPCODE_BLEND_OVER:
                skc_tile_blend_over(&color_acc,&cover_wip,&color_wip);
                break;

              case SKC_STYLING_OPCODE_BLEND_PLUS:
                skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip);
                break;

              case SKC_STYLING_OPCODE_BLEND_MULTIPLY:
                skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip);
                break;

              case SKC_STYLING_OPCODE_BLEND_KNOCKOUT:
                skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip);
                break;

              case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:
                skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip);
                break;

              case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:
                skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc);
                break;

              case SKC_STYLING_OPCODE_BACKGROUND_OVER:
                skc_tile_background_over(commands,&cmd_next,&color_acc);
                break;

              case SKC_STYLING_OPCODE_SURFACE_COMPOSITE:
#ifdef SKC_SURFACE_IS_BUFFER
                skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi);
#else
                skc_surface_composite_u8_rgba(surface,              &color_acc,ttck0.hi);
#endif
                break;

              case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:
                if (skc_tile_color_test_opacity(&color_acc))
                  flags |= SKC_TILE_FLAGS_SCATTER_SKIP;
                break;

              default:
                return; // this is an illegal opcode -- trap and die!
              }

            //
            // if the sign bit is set then this was the final command
            //
            if (cmd.s32 < 0)
              break;
          }

        // continue as long as the tile flush isn't complete
      } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0);

      // return if this was the final flush
      if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)
        return;

      // update the wip ttck key
      ttck0 = ttck;
    }
}

//
//
//
