/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "tile.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"

// #define SKC_ARCH_AVX2
// #define SKC_RASTERIZE_SIMD_USES_SMEM

#define PRINTF_ENABLE       0
#define PRINTF_BLOCK_COUNT  0

//
// NOTE:
//
// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
//
// NOTE:
//
// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP.
//
//

#if 0 // SKC_ARCH_AVX2

// #define SKC_RASTERIZE_SUBGROUP_SIZE            1
// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2         3
// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1

// #define SKC_TTXB_WORDS                         8

// #define SKC_RASTERIZE_FLOAT                    float8
// #define SKC_RASTERIZE_UINT                     uint8
// #define SKC_RASTERIZE_INT                      int8
// #define SKC_RASTERIZE_PREDICATE                int8

// #define SKC_RASTERIZE_BIN_BLOCK                uint16
// #define SKC_RASTERIZE_BIN                      uint8

// #define SKC_RASTERIZE_POOL                     uint8
// #define SKC_RASTERIZE_POOL_SCALE               6

// #define SKC_RASTERIZE_TILE_HASH_X_BITS         1
// #define SKC_RASTERIZE_TILE_HASH_Y_BITS         2

// #define SKC_RASTERIZE_VECTOR_EXPAND()          SKC_EXPAND_8()

#endif

//
// SIMT
//

#define SKC_RASTERIZE_BLOCK_ID_V_SIZE  SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_SIZE      SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_MASK      (SKC_RASTERIZE_TTSK_V_SIZE - 1)

//
//
//

#define SKC_RASTERIZE_VECTOR_SIZE        (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)

//
//
//

#define SKC_RASTERIZE_YX_INIT    0x7FFF7FFF // { +32767, +32767 }
#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 }

//
//
//

#define SKC_RASTERIZE_TILE_HASH_X_MASK    SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
#define SKC_RASTERIZE_TILE_HASH_Y_MASK    SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BITS      (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_BITS  (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
#define SKC_RASTERIZE_TILE_HASH_BIN_MASK  SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)

//
// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
//
// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
//
// Lerp in two fma/mad ops:
//
//   t * b + ((-t) * a + a)
//
// Note: OpenCL documents mix() as being implemented as:
//
//   a + (b - a) * t
//
// But this may be a native instruction on some devices. For example,
// on GEN9 there is an LRP "linear interpolation" opcode but it
// doesn't appear to support half floats.
//
// Feel free to toggle this option and then benchmark and inspect the
// generated code. We really want the double FMA to be generated when
// there isn't support for a LERP/MIX operation.
//

#if 1
#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a))
#else
#define SKC_LERP(a,b,t) mix(a,b,t)
#endif
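
//
// Illustrative check (example values, not from the pipeline): with
// a = 1.0f and b = 9.0f the two-FMA form is exact at both endpoints:
//
//   t = 0:  mad(0,9,mad(-0,1,1)) = 0 + (0 + 1) = 1 == a
//   t = 1:  mad(1,9,mad(-1,1,1)) = 9 + (-1 + 1) = 9 == b
//
// whereas a + (b - a) * t is only guaranteed to be exact at t = 0.
//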
128// 129// FIXME -- verify if the platform needs explicit MAD operations even 130// if a "--fastmath" option is available at compile time. It might 131// make sense to explicitly use MAD calls if the platform requires it. 132// 133 134#if 1 135#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c)) 136#else 137#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c) 138#endif 139 140// 141// 142// 143 144#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()) 145 146// 147// 148// 149 150union skc_bp_elem 151{ 152 skc_uint u32; 153 skc_tagged_block_id_t tag_id; 154 skc_float coord; 155}; 156 157// 158// 159// 160 161struct skc_subgroup_smem 162{ 163 // 164 // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member 165 // 166#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM ) 167 struct { 168 union { 169 170 skc_uint winner; 171 172 struct { 173 skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; 174 } aN; 175 176 struct { 177 SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; 178 } vN; 179 }; 180 } subgroup; 181#endif 182 183 // 184 // work-in-progress TTSB blocks and associated YX keys 185 // 186 union { 187 struct { 188 // FIXME -- some typedefs are valid here 189 skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS]; 190 skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; 191 skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; 192 skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; 193 } aN; 194#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) 195 struct { 196 SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; 197 SKC_RASTERIZE_BIN yx; 198 SKC_RASTERIZE_BIN id; 199 SKC_RASTERIZE_BIN count; 200 } vN; 201#endif 202 } bin; 203}; 204 205// 206// 207// 208 209#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) 210#define skc_subgroup_lane() 0 211#else 212#define skc_subgroup_lane() get_sub_group_local_id() 213#endif 214 215// 216// 217// 218 219#define SKC_PROJECT(tv,x,y,xp,yp) \ 220 { \ 221 float const d = native_recip(fma(x,tv->w0,fma(y,tv->w1,1.0f))); \ 222 xp *= d; \ 223 yp *= d; \ 224 } 225 226// 227// replenish block ids 228// 229// note that you can't overrun the block id pool since it's a ring 230// 231 232static 233void 234skc_blocks_replenish(skc_uint * const blocks_next, 235 skc_block_id_v_t * const blocks, 236 __global SKC_ATOMIC_UINT volatile * const bp_atomics, 237 skc_uint const bp_mask, // pow2 modulo mask for block pool ring 238 __global skc_block_id_t const * const bp_ids) 239{ 240 // 241 // get a new vector of block ids -- this is kind of a narrow 242 // allocation but subblocks help stretch out the pool. 243 // 244 // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids 245 // 246 skc_uint bp_idx = 0; 247 248 if (skc_subgroup_lane() == 0) 249 { 250 bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS, 251 SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads 252#if 0 253 printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE); 254#endif 255 } 256 257 bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask; 258 *blocks = bp_ids[bp_idx]; 259 *blocks_next = 0; 260} 261 262// 263// 264// 265 266static 267skc_block_id_t 268skc_blocks_get_next(skc_uint * const blocks_next, 269 skc_block_id_v_t * const blocks, 270 __global SKC_ATOMIC_UINT volatile * const bp_atomics, 271 skc_uint const bp_mask, // pow2 modulo mask for block pool ring 272 __global skc_block_id_t const * const bp_ids) 273{ 274 // replenish? 

//
//
//

static
skc_block_id_t
skc_blocks_get_next(skc_uint                          * const blocks_next,
                    skc_block_id_v_t                  * const blocks,
                    __global SKC_ATOMIC_UINT volatile * const bp_atomics,
                    skc_uint                            const bp_mask, // pow2 modulo mask for block pool ring
                    __global skc_block_id_t const     * const bp_ids)
{
  // replenish?
  if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
    {
      skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
    }

#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);

#else
  //
  // SIMD
  //
  skc_block_id_t id = blocks->s0;

  skc_shuffle_down_1(*blocks);

#endif

  *blocks_next += 1;

  return id;
}

//
// subblock allocator
//

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2

static
skc_block_id_t
skc_subblocks_get_next(skc_block_id_t                    * const subblocks,
                       skc_uint                          * const blocks_next,
                       skc_block_id_v_t                  * const blocks,
                       __global SKC_ATOMIC_UINT volatile * const bp_atomics,
                       skc_uint                            const bp_mask, // pow2 modulo mask for block pool ring
                       __global skc_block_id_t const     * const bp_ids)
{
  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
    {
      *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
    }

  skc_block_id_t const sb_id = *subblocks;

  *subblocks += 1;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("= %u\n",sb_id);
#endif

  return sb_id;
}


#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS()  subblocks, blocks

#else

#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS()  blocks

#endif

//
//
//

static
skc_block_id_t
skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
                  skc_uint                          * const blocks_next,
                  __global SKC_ATOMIC_UINT volatile * const bp_atomics,
                  skc_uint                            const bp_mask, // pow2 modulo mask for block pool ring
                  __global skc_block_id_t const     * const bp_ids,
                  __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                  skc_ttsk_v_t                      * const sk_v,
                  skc_uint                          * const sk_v_next,
                  __global skc_ttsk_s_t             * const sk_extent,
                  skc_uint                            const new_yx)
{
#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
                                                       blocks_next,
                                                       blocks,
                                                       bp_atomics,
                                                       bp_mask,
                                                       bp_ids);
#else
  skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
                                                    blocks,
                                                    bp_atomics,
                                                    bp_mask, // pow2 modulo mask for block pool ring
                                                    bp_ids);
#endif

  if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
    {
      sk_v->lo = new_id;
      sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
#if 0
      printf("@ ( %3u, %3u ) %u\n",
             (new_yx >> 12) & 0xFFF,
             (new_yx      ) & 0xFFF,
             new_id);
#endif
    }

  *sk_v_next += 1;

  if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
    {
      *sk_v_next = 0;

      skc_uint sk_idx = 0;

      if (skc_subgroup_lane() == 0)
        {
          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
#if 0
          printf("+ %u\n",sk_idx);
#endif
        }

      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();

#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
      if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
#endif
        {
          sk_extent[sk_idx] = *sk_v;
#if 0
          printf("> %u : %v2u\n",sk_idx,*sk_v);
#endif
        }
    }

  return new_id;
}
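
//
// Sketch of the append protocol above (illustrative; assumes
// SKC_RASTERIZE_TTSK_V_SIZE == 8): the Nth appended key is written
// into lane (N & 7)'s copy of sk_v, so eight consecutive appends fill
// lanes 0..7. On the eighth append, lane 0 atomically reserves eight
// slots in sk_extent and the subgroup stores all accumulated keys in
// a single coalesced write.
//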

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // Note that there isn't a built-in horizontal scan for vectors so
  // we'll define some here for various widths.
  //
  // FIXME -- a scalar version might be faster so put in a
  // compile-time switch to select between implementations
  //

#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  // 01
  //  0 +
  // --
  // 01
  SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
  return w;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  // 0123
  //  012 +
  // ----
  // 0123
  //   01 +
  // ----
  // 0123
  //
  SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
  SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
  return x;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  // 01234567
  //  0123456 +
  // --------
  // 01234567
  //   012345 +
  // --------
  // 01234567
  //     0123 +
  // --------
  // 01234567
  //
  SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
  SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
  return y;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  // 0123456789abcdef
  //  0123456789abcde +
  // ----------------
  // 0123456789abcdef
  //   0123456789abcd +
  // ----------------
  // 0123456789abcdef
  //     0123456789ab +
  // ----------------
  // 0123456789abcdef
  //         01234567 +
  // ----------------
  // 0123456789abcdef
  //
  SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
  SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
  SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
  return z;

#endif

#else
  //
  // SIMT
  //

  return sub_group_scan_inclusive_add(v);

#endif
}
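
//
// Worked example of the shifted-MAD scan above (illustrative, 4-wide
// case):
//
//   v                            = ( 1, 2, 3,  4 )
//   w = v.s3012 * (0,1,1,1) + v  = ( 1, 3, 5,  7 )
//   x = w.s2301 * (0,0,1,1) + w  = ( 1, 3, 6, 10 )
//
// i.e. the inclusive prefix sum of v in log2(4) = 2 steps.
//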

//
//
//

static
SKC_RASTERIZE_UINT
skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // Note that there isn't a built-in horizontal scan for vectors so
  // we'll define some here for various widths.
  //
  // FIXME -- a scalar version might be faster so put in a
  // compile-time switch to select between implementations
  //

#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  // 01
  //  0 +
  // --
  // 01
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
  return w;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  // 0123
  //  012 +
  // ----
  // 0123
  //   01 +
  // ----
  // 0123
  //
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
  return x;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  // 01234567
  //  0123456 +
  // --------
  // 01234567
  //   012345 +
  // --------
  // 01234567
  //     0123 +
  // --------
  // 01234567
  //
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
  return y;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  // 0123456789abcdef
  //  0123456789abcde +
  // ----------------
  // 0123456789abcdef
  //   0123456789abcd +
  // ----------------
  // 0123456789abcdef
  //     0123456789ab +
  // ----------------
  // 0123456789abcdef
  //         01234567 +
  // ----------------
  // 0123456789abcdef
  //
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
  SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
  return z;

#endif

#else
  //
  // SIMT
  //

  return sub_group_scan_inclusive_add(v);

#endif
}

//
//
//

static
SKC_RASTERIZE_UINT
skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // Note that there isn't a built-in horizontal scan for vectors so
  // we'll define some here for various widths.
  //
  // FIXME -- a scalar version might be faster so put in a
  // compile-time switch to select between implementations
  //

#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  // 01
  // 00 max
  // --
  // 01
  SKC_RASTERIZE_UINT const w = max(v.s00,v);
  return w;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  // 0123
  // 0012 +
  // ----
  // 0123
  // 0101 +
  // ----
  // 0123
  //
  SKC_RASTERIZE_UINT const w = max(v.s0012,v);
  SKC_RASTERIZE_UINT const x = max(w.s0101,w);
  return x;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  // 01234567
  // 00123456 +
  // --------
  // 01234567
  // 01012345 +
  // --------
  // 01234567
  // 01230123 +
  // --------
  // 01234567
  //
  SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
  SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
  SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
  return y;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  // 0123456789abcdef
  // 00123456789abcde +
  // ----------------
  // 0123456789abcdef
  // 010123456789abcd +
  // ----------------
  // 0123456789abcdef
  // 01230123456789ab +
  // ----------------
  // 0123456789abcdef
  // 0123456701234567 +
  // ----------------
  // 0123456789abcdef
  //
  SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
  SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
  SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
  SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
  return z;

#endif

#else
  //
  // SIMT
  //

  return sub_group_scan_inclusive_max(v);

#endif
}

//
//
//

static
float
skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return v.s1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return v.s3;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return v.s7;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return v.sf;
#endif

#else
  //
  // SIMT
  //
  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);

#endif
}

//
//
//

static
SKC_RASTERIZE_UINT
skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return v.s1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return v.s3;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return v.s7;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return v.sf;
#endif

#else
  //
  // SIMT
  //
  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);

#endif
}

//
//
//

static
float
skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#else
  return v.s0;
#endif

#else
  //
  // SIMT
  //
  return sub_group_broadcast(v,0);

#endif
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
                     SKC_RASTERIZE_UINT  const i)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#else
  return shuffle(v,i);
#endif

#else
  //
  // SIMT
  //
  return intel_sub_group_shuffle(v,i);

#endif
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
                          SKC_RASTERIZE_FLOAT const c) // current
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // FIXME -- there are alternative formulations here:
  //
  // Option 1:
  //
  //   select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
  //
  // Option 2:
  //
  //   p is a scalar
  //   t = c.rotate(+1)
  //   t.s0 = p;
  //
  // Option 3: ...
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return p;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return shuffle2(p,c,(uint2)(1,2));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return shuffle2(p,c,(uint4)(3,4,5,6));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
#endif

#else
  //
  // SIMT
  //
  return intel_sub_group_shuffle_up(p,c,1);

#endif
}

//
//
//

static
bool
skc_is_lane_first()
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  return true;
#else
  //
  // SIMT
  //
  return get_sub_group_local_id() == 0;
#endif
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_delta_offset()
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return 1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
#endif

#else
  //
  // SIMT
  //
  return 1.0f + get_sub_group_local_id();

#endif

}

//
//
//

static
int
skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  return any(p);
#else
  //
  // SIMT
  //
  return sub_group_any(p);
#endif
}

//
//
//

#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)

void
skc_segment_next(__global union skc_bp_elem * const bp_elems,
                 skc_uint                   * const nodeword,
                 skc_block_id_t             * const id)
{
  if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
    {
      if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
        {
          *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
        }

      skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;

      *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
    }
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
{
  return native_sqrt(x * x + y * y);
}

//
// Wang's Formula (1985)
//

#define SKC_WANG_PIXEL_RESL  0.25f // <-- this can be tuned

#define SKC_WANG_EPSILON     (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)

#define SKC_WANG_CUBIC       ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
#define SKC_WANG_QUADRATIC   ((2.0f       ) / (8.0f * SKC_WANG_EPSILON))

#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y)
#define SKC_WANG_SQRT(x)     native_sqrt(x)

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
                        SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
                        SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
                        SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
{
  //
  // Return the number of evenly spaced (in the parametric sense) line
  // segments that are guaranteed to be within "epsilon" error of the
  // curve.
  //
  // We're then going to take multiples of the reciprocal of this
  // number so that the segmentation can be distributed across the
  // subgroup.
  //
  // Note, this can probably be slightly optimized per architecture
  // but it's probably far from being a hotspot since it's all
  // straight-line unpredicated code.
  //
  // The result is an integral float in [1.0,#segments].
  //
  // Note that even if all of the control points are coincident, the
  // max(1.0f) will categorize this as a line of 1 segment.
  //
  // This is what we want! We want to convert cubics to lines as
  // easily as possible and *then* cull lines that are either
  // horizontal or zero length.
  //
  return max(1.0f,
             ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *
                                SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),
                                                    fabs(t3x - 2.0f * t2x + t1x)),
                                                max(fabs(t2y - 2.0f * t1y + t0y),
                                                    fabs(t3y - 2.0f * t2y + t1y))))));
}

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
                            SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
                            SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
{
  return max(1.0f,
             ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
                                SKC_WANG_LENGTH(t2x - 2.0f * t1x + t0x,
                                                t2y - 2.0f * t1y + t0y))));
}

//
// rational curves
//

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_cubic_rat()
{
  return 0.0f;
}

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_quad_rat()
{
  return 0.0f;
}
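
//
// Worked example of the cubic bound above (illustrative; assumes
// SKC_SUBPIXEL_RESL_X_F32 == 32.0f, so SKC_WANG_EPSILON == 8.0f and
// SKC_WANG_CUBIC == 6/64 == 0.09375): for the subpixel-space control
// cage
//
//   t0 = (0,0), t1 = (100,0), t2 = (200,100), t3 = (300,100)
//
// the second differences are (0,+/-100), the length term is 100, and
//
//   segs = max(1, ceil(sqrt(0.09375 * 100))) = ceil(3.06...) = 4
//
// i.e. four evenly spaced parametric segments keep the flattened
// cubic within epsilon of the true curve.
//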

//
// flush any work-in-progress blocks and return unused block ids
//

static
void
skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
             __global union skc_bp_elem        * const bp_elems,
             __global uint                     * const bp_ids,
             skc_uint                            const bp_mask,
             __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
             skc_block_id_v_t                  * const blocks,
             skc_uint                            const blocks_next,
             skc_ttsk_v_t                      * const sk_v,
             skc_uint                            const sk_v_next,
             __global skc_ttsk_s_t             * const sk_extent,
             __local struct skc_subgroup_smem volatile * const smem)
{
  //
  // flush non-empty bins
  //
  // FIXME -- accelerate this iteration/search with a subgroup operation
  //
  for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
    {
      if (smem->bin.aN.count[ii] > 0)
        {
          skc_block_id_v_t const id  = smem->bin.aN.id[ii];
          skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
          skc_uint         const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
#if 0
          printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
#endif
          bp_elems[idx].u32 = tts;
        }

      //
      // FIXME -- vectorize with vstoreN()
      //
    }

  //
  // return remaining block ids back to the pool
  //
  skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;

  if (blocks_rem > 0)
    {
      skc_uint bp_idx = 0;

      if (skc_subgroup_lane() == 0)
        {
          bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);

#if 0
          printf("r-: %8u + %u\n",bp_idx,blocks_rem);
#endif
        }

      bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;

      if (skc_subgroup_lane() >= blocks_next)
        {
          bp_ids[bp_idx] = *blocks;
        }
    }

  //
  // flush work-in-progress ryx keys
  //
  if (sk_v_next > 0)
    {
      skc_uint sk_idx = 0;

      if (skc_subgroup_lane() == 0)
        {
          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
#if 0
          printf("* %u\n",sk_idx);
#endif
        }

      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();

      if (skc_subgroup_lane() < sk_v_next)
        {
          sk_extent[sk_idx] = *sk_v;
        }
    }
}

//
// If there are lanes that were unable to append to a bin because
// their hashes collided with a bin's current ryx key, then those
// bins must be ejected.
//
// Note that we do not eject "full" bins because lazily waiting for a
// collision results in simpler code.
1164// 1165 1166static 1167void 1168skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics, 1169 __global union skc_bp_elem * const bp_elems, 1170 __global uint * const bp_ids, 1171 skc_uint const bp_mask, 1172 __global SKC_ATOMIC_UINT volatile * const cohort_atomics, 1173 skc_block_id_t * const subblocks, 1174 skc_block_id_v_t * const blocks, 1175 skc_uint * const blocks_next, 1176 skc_ttsk_v_t * const sk_v, 1177 skc_uint * const sk_v_next, 1178 __global skc_ttsk_s_t * const sk_extent, 1179 __local struct skc_subgroup_smem volatile * const smem, 1180 SKC_RASTERIZE_UINT const hash, 1181 SKC_RASTERIZE_UINT const yx, 1182 SKC_RASTERIZE_PREDICATE is_collision) // pass by value 1183{ 1184#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) 1185 // 1186 // SIMD 1187 // 1188 1189 // 1190 // FIXME -- this code is now stale with the changes to the 1191 // subblock/block allocation strategy 1192 // 1193 1194 // 1195 // get local TTSB ID queue count 1196 // 1197 skc_uint ttsb_id_count = smem->pool.count; // scalar 1198 1199 // init hash bit mask 1200 skc_uint component_mask = 0; 1201 1202 for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++) 1203 { 1204 // if no collision continue 1205 if (((int*)&is_collision)[cc] == 0) 1206 continue; 1207 1208 uint const winner = ((uint*)&hash)[cc]; 1209 uint const component_bit = 1u << winner; 1210 1211 // if already processed this hash then continue 1212 if (component_mask & component_bit) 1213 continue; 1214 1215 // update component mask 1216 component_mask |= component_bit; 1217 1218 // 1219 // new winner requires ejecting the old TTSB 1220 // 1221 if (smem->bin.aN.count[winner] > 0) 1222 { 1223 skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); 1224 1225 bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; 1226 } 1227 1228 // 1229 // ensure there is at least one TTSK and TTSB ID 1230 // 1231 if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE) 1232 { 1233 // 1234 // update remaining count 1235 // 1236 ttsb_id_count = 0; 1237 1238 // 1239 // flush accumulated ttsk_ryx keys 1240 // 1241 uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE 1242 (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count 1243 1244#if 0 1245 printf("# %u\n",idx); 1246#endif 1247 1248 for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE) 1249 { 1250 ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii); 1251 } 1252 1253 // 1254 // allocate more ttsb ids from pool 1255 // 1256 uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads 1257 1258 for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE) 1259 smem->pool.aN.id[ii] = bp_ids[id + ii]; 1260 } 1261 1262 // 1263 // invalidate the winning block 1264 // 1265 1266 // 1267 // update bin with winning yx, new ttsb id and zero count 1268 // 1269 // all lanes are loading/storing from/to the same index 1270 // 1271 smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID ); 1272 smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count]; 1273 smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc]; 1274 smem->bin.aN.count[winner] = 0; 1275 1276 // 1277 // update count 1278 // 1279 ttsb_id_count += 1; 1280 } 1281 1282 // 1283 // save count 1284 // 1285 smem->pool.count = ttsb_id_count; 1286 1287#else 1288 // 1289 // SIMT 1290 // 1291 1292 do { 1293 // 1294 // only one lane will win! 
    //
    if (is_collision)
      smem->subgroup.winner = hash;

    barrier(CLK_LOCAL_MEM_FENCE);

    //
    // which bin is being ejected?
    //
    skc_uint const winner = smem->subgroup.winner;

    //
    // which colliding hash is taking over the bin?
    //
    SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);

    //
    // all lanes with the same hash will try to store but only one
    // lane will win
    //
    if (is_winner)
      smem->subgroup.winner = yx;

    barrier(CLK_LOCAL_MEM_FENCE);

    //
    // flush this block to the pool
    //
    if (smem->bin.aN.count[winner] > 0)
      {
        skc_block_id_v_t const id  = smem->bin.aN.id[winner];
        skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
        skc_uint         const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
#if 0
        printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
#endif
        bp_elems[idx].u32 = tts;
      }

    //
    // append new ttsk
    //
    skc_uint       const new_yx = smem->subgroup.winner;
    skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
                                                    blocks_next,
                                                    bp_atomics,
                                                    bp_mask, // pow2 modulo mask for block pool ring
                                                    bp_ids,
                                                    cohort_atomics,
                                                    sk_v,
                                                    sk_v_next,
                                                    sk_extent,
                                                    new_yx);

#if 0
    if (get_sub_group_local_id() == 0) {
      printf(">>> %9u\n",new_id);
    }
#endif

    //
    // update bin with winning yx, new ttsb id and zero count
    //
    smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
    smem->bin.aN.yx   [winner]                      = new_yx;
    smem->bin.aN.id   [winner]                      = new_id;
    smem->bin.aN.count[winner]                      = 0;

    //
    // remove all lanes matching this hash
    //
    is_collision = is_collision && !is_winner;

    //
    // exit if nothing left to do
    //
  } while (sub_group_any(is_collision));

#endif
}

//
// scatter scan max
//
static
SKC_RASTERIZE_UINT
skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
                     SKC_RASTERIZE_FLOAT                         const iss,
                     SKC_RASTERIZE_FLOAT                         const ess)
{
  //
  // prefix sums determine which lanes we're going to work on next
  //
  SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
  SKC_RASTERIZE_UINT      const scratch_idx      = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
  //
  // SIMD APPROACH 1: SIMT'ISH
  //

  // zero the volatile smem scratchpad using vector syntax
  smem->subgroup.vN.scratch[0] = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                 \
  if (is_scratch_store C)                       \
    smem->subgroup.aN.scratch[scratch_idx C] = I;

  SKC_RASTERIZE_VECTOR_EXPAND();

  // propagate lanes to right using max scan
  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);

#else
  //
  // SIMD APPROACH 2: SCALAR'ISH
  //

  SKC_RASTERIZE_UINT source = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                 \
  if (is_scratch_store C)                       \
    ((uint *)&source)[scratch_idx C] = I;

  SKC_RASTERIZE_VECTOR_EXPAND();

  for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
#endif

#else
  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad using vector syntax
  //
  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );

  //
  // store source lane at starting lane
  //
  if (is_scratch_store)
    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();

  //
  // propagate lanes to right using max scan
  //
  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
#endif

  return source;
}
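
//
// Worked example of the scatter-scan above (illustrative, 4-lane
// subgroup): per-lane segment counts of { 2, 1, 3, 2 } scan to
// iss = { 2, 3, 6, 8 } and ess = { 0, 2, 3, 6 }. In-range lanes
// scatter their lane index into the zeroed scratchpad:
//
//   scratch = { 0, 0, 1, 2 }  ->  max scan  ->  source = { 0, 0, 1, 2 }
//
// so lanes 0-1 work on the 1st line's two segments, lane 2 on the
// 2nd line's single segment, and lane 3 starts the 3rd line's
// segments; the caller then decrements iss/ess and the remaining
// count by the subgroup size and loops until all work is consumed.
//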

//
// sliver lines into subpixels
//

static
void
skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
           __global union skc_bp_elem        * const bp_elems,
           __global uint                     * const bp_ids,
           skc_uint                            const bp_mask,
           __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
           skc_block_id_t                    * const subblocks,
           skc_block_id_v_t                  * const blocks,
           skc_uint                          * const blocks_next,
           skc_ttsk_v_t                      * const sk_v,
           skc_uint                          * const sk_v_next,
           __global skc_ttsk_s_t             * const sk_extent,
           __local struct skc_subgroup_smem volatile * const smem,
           SKC_RASTERIZE_FLOAT                 const l0x,
           SKC_RASTERIZE_FLOAT                 const l0y,
           SKC_RASTERIZE_FLOAT                 const l1x,
           SKC_RASTERIZE_FLOAT                 const l1y)
{
  //
  // Y-SLIVERING
  // -----------
  //
  // immediately sliver all multi-pixel lines into 1-pixel high lines
  //
  // note this implicitly squelches horizontal lines
  //
  // there is another test for horizontal lines after x-slivering
  // is complete
  //

  //
  // will we need to flip the sign of y_delta ?
  //
  SKC_RASTERIZE_PREDICATE const y_lt   = (l0y <= l1y);
  SKC_RASTERIZE_UINT      const dy_xor = y_lt ? 0 : 0x80000000;

  //
  // save 1/dy
  //
  SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);

  //
  // how many non-horizontal subpixel y-axis slivers are there?
  //
  SKC_RASTERIZE_FLOAT const y_min  = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
  SKC_RASTERIZE_FLOAT const y_max  = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
  SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max;
  SKC_RASTERIZE_FLOAT       y_segs = y_max - y_min;

  //
  // inclusive subgroup scan of y_segs
  //
  SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs);
  SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs;
  float               y_rem = skc_subgroup_last_float(y_iss);

  //
  // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
  //
  if (y_segs == 0.0f)
    y_iss = 0.0f;

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
#endif
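
  //
  // Worked example of the downhill setup above (illustrative): for a
  // line with l0y > l1y spanning subpixel rows 2..5, y_lt is false,
  // so y_base == y_max == 5 and dy_xor == 0x80000000. Inside the loop
  // below, each lane's y_delta in { 1, 2, 3 } is sign-flipped by the
  // XOR to { -1, -2, -3 }, stepping n1y down from y_base to 4, 3, 2
  // (with the final lane snapped to the true endpoint) -- the same
  // slivers the uphill case produces, just emitted in reverse order.
  //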

  //
  // these values don't matter on first iteration
  //
  SKC_RASTERIZE_FLOAT n1x_prev = 0;
  SKC_RASTERIZE_FLOAT n1y_prev = 0;

  //
  // loop until done
  //
  while (y_rem > 0.0f)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);

      //
      // get the line at lane y_source
      //
      SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
      SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
      SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
      SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);

      //
      // every lane will create a 1 pixel tall line "sliver"
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if numerator == 1 then this is the first lane
      // if numerator == s then this is the last lane
      //
      SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
      SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source);

      SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_y_last  = (y_delta >= y_count);

      // toggle y_delta sign
      SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));

      //
      // calculate "right" line segment endpoint
      //
      SKC_RASTERIZE_FLOAT       n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
      SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
      SKC_RASTERIZE_FLOAT       n1x = round(SKC_LERP(m0x,m1x,n_t));

      //
      // override n1 if this is the last point
      //
      n1y = select(n1y,m1y,is_y_last);
      n1x = select(n1x,m1x,is_y_last);

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
      SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);

      //
      // override shuffle up if this is the first line segment
      //
      n0y = select(n0y,m0y,is_y_first);
      n0x = select(n0x,m0x,is_y_first);

      //
      // save previous right endpoint
      //
      n1x_prev = n1x;
      n1y_prev = n1y;

      //
      // decrement by subgroup size
      //
      y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

#if 0
      //
      // debug
      //
      if (n0y != n1y) {
        printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
      }
#endif

      //
      // X-SLIVERING
      // -----------
      //
      // now sliver 1-pixel high lines into either vertical or
      // 1-pixel wide lines
      //
      // save original direction and work with increasing x
      //
      SKC_RASTERIZE_PREDICATE const x_lt   = (n0x <= n1x);
      SKC_RASTERIZE_UINT      const dx_xor = x_lt ? 0 : 0x80000000;

      //
      // save 1/dx
      //
      SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x);

      //
      // how many subpixel x-axis slivers are there?
      //
      SKC_RASTERIZE_FLOAT const x_min  = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
      SKC_RASTERIZE_FLOAT const x_max  = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
      SKC_RASTERIZE_FLOAT const x_base = x_lt ? x_min : x_max;
      SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f);

      //
      // inclusive subgroup scan of x_segs
      //
      SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs);
      SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs;
      float               x_rem = skc_subgroup_last_float(x_iss);

      //
      // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails
      //
      //if (x_segs == 0.0f)
      //  x_iss = 0.0f;

      //
      // these values don't matter on first iteration
      //
      SKC_RASTERIZE_FLOAT p1x_prev = 0;
      SKC_RASTERIZE_FLOAT p1y_prev = 0;

      //
      // loop until done
      //
      while (x_rem > 0)
        {
          //
          // distribute work across lanes
          //
          SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);

          //
          // get the line at lane x_source
          //
          SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
          SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
          SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
          SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);

          //
          // every lane will create a 1 pixel wide line "sliver"
          //
          // FIXME -- this gets expanded on SIMD
          //
          // if numerator == 1 then this is the first lane
          // if numerator == s then this is the last lane
          //
          SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
          SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source);

          SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
          SKC_RASTERIZE_PREDICATE const is_x_last  = (x_delta >= x_count);

          // toggle x_delta sign
          SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));

          //
          // calculate "right" line segment endpoint
          //
          SKC_RASTERIZE_FLOAT       p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
          SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
          SKC_RASTERIZE_FLOAT       p1y = round(SKC_LERP(o0y,o1y,p_t));

          //
          // override p1 if this is the last point
          //
          p1x = select(p1x,o1x,is_x_last);
          p1y = select(p1y,o1y,is_x_last);

          //
          // shuffle up "left" line segment endpoint
          //
          // NOTE: Intel's shuffle_up is unique with its elegant
          // "previous" argument so don't get used to it
          //
          SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
          SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);

          //
          // override shuffle up if this is the first line segment
          //
          p0x = select(p0x,o0x,is_x_first);
          p0y = select(p0y,o0y,is_x_first);

          //
          // save previous right endpoint
          //
          p1x_prev = p1x;
          p1y_prev = p1y;

          //
          // decrement by subgroup size
          //
          x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
          x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
          x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

          //
          // only non-horizontal subpixel lines are valid
          //
          SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);

          //
          // if no lanes are active then continue
          //
          // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
          // IMPACTS PERFORMANCE (+12% ?)
          //
          // IT SHOULDN'T !!!
          //
#if 0
          if (!skc_subgroup_any(is_active))
            continue;
#endif

          //
          // Option 1: use SLM for explicitly managed coalesced stores
          //
          // 1. which tile does this line belong?
          // 2. hash tile coordinates
          // 3. lookup hash
          // 4. if tile matches then SLM append keys
          // 5. if tile doesn't match
          //    a. flush
          //    b. create new TTSK_RYX
          //    c. obtain TTSB block from pool
          //    d. goto 3.
          //

          //
          // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
          //
          // 1. which tile does this line belong?
          // 2. hash tile coordinates
          // 3. lookup hash
          // 4. if tile matches then GMEM append keys
          // 5. if tile doesn't match
          //    a. flush (and invalidate empty elems)
          //    b. create new TTSK_RYX
          //    c. obtain TTSB block from pool
          //    d. goto 3.
          //

          //
          // The virtual rasterization surface is very large and
          // signed: +/- ~64K-256K, depending on the architecture.
          //
          // Rasters must be clipped to the virtual surface and,
          // optionally, clipped even further on a per raster
          // basis.
          //

          //
          // Clip to the per-raster clip
          //

          /*

            CLIP HERE

          */

          //
          // Hash the tile coordinates
          //
          // This table lists nominal values for each architecture.
          // We want to choose values that naturally fit the
          // "width" of the architecture.
          //
          //   SIMD  RANGE    BITS  MAX RANGE  MAX BINS  HASH BITS
          //   ----  -------  ----  ---------  --------  ---------
          //     4   [0,  4]   3    [0,  7]       10     mod(10)  <-- SSE42, ?
          //     8   [0,  8]   4    [0, 15]        8     3        <-- GEN*,AVX*
          //    16   [0, 16]   5    [0, 31]        6     mod(6)   <-- GEN*,?
          //    32   [0, 32]   6    [0, 63]        5     mod(5)   <-- CUDA,PowerVR,Adreno,GEN*
          //    64   [0, 64]   7    [0,127]        4     2        <-- AMD Radeon
          //
          // NOTE: When possible, bias the hash toward using more y
          // bits because of:
          //
          //   1. the 90 degree counter-clockwise rotation that we put
          //      in place to offset the render-time clockwise
          //      rotation
          //
          //   2. the likely presence of left-to-right or
          //      right-to-left glyphs.
          //
          // For power-of-two bins, the hash is easy.
          //
          // For non-power-of-two, we may want to either implement a
          // fast mod (compiler should do this for us... hahahaha) or
          // drop down to the next power-of-two.
          //
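          //
          // Worked example of the hash computed below (illustrative;
          // assumes SKC_RASTERIZE_TILE_HASH_X_BITS == 1 and
          // SKC_RASTERIZE_TILE_HASH_Y_BITS == 2 -- the values in the
          // commented AVX2 config at the top of this file -- i.e. 8
          // bins):
          //
          //   hash = ((tile_y & 3) << 1) | (tile_x & 1)
          //
          // so tile (y=5,x=3) lands in bin ((5 & 3) << 1) | (3 & 1) = 3,
          // horizontally adjacent tiles alternate between 2 bins, and
          // vertically adjacent tiles cycle through 4 bin rows.
          //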

          //
          // FIXME -- this snarl is not good -- can probably reduce
          // some of the sign casting but some is there to vectorize a
          // scalar
          //
          SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
          SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);

          SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
          SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);

          SKC_RASTERIZE_INT const min_y = min(z0y,z1y);
          SKC_RASTERIZE_INT const max_y = max(z0y,z1y);

          SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;

          SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
          SKC_RASTERIZE_INT        dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);

          //
          // map [+1,+32] to [ 0,+31]
          // map [-1,-32] to [-1,-32]
          //
          SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26;

          SKC_RASTERIZE_INT const min_x  = min(z0x,z1x);
          SKC_RASTERIZE_INT const max_x  = max(z0x,z1x);
          SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;

          SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
          SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);

          SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx;

          SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
                                           (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));

          SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));

#if 0
          printf("(%3u, %3u)\n",tile_y,tile_x);
#endif

#if 0
          if (is_active)
            printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
#endif

          //
          // debug
          //
#if 0 // PRINTF_ENABLE

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
          if (is_active C)                                              \
            printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);

          SKC_RASTERIZE_VECTOR_EXPAND();
#else
          if (is_active)
            printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
#endif

#endif
          //
          // flush all active lanes
          //
          while (true)
            {
              //
              // either gather load or vector load+shuffle the yx keys
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              SKC_RASTERIZE_BIN  const yx_bin = smem->bin.vN.yx;
              SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash);
#else
              SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash];
#endif

              //
              // does yx for lane match yx for hash?
              //
              SKC_RASTERIZE_UINT      const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
              SKC_RASTERIZE_PREDICATE const is_match  = (yx_cur == active_yx);

              //
              // OpenCL spec: "When casting a bool to a vector integer
              // data type, the vector components will be set to -1
              // (i.e. all bits set) if the vector bool value is true
              // and 0 otherwise."
              //
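              //
              // Worked example of the packed per-bin counting below
              // (illustrative; assumes 8 bins with
              // SKC_RASTERIZE_TILE_HASH_BIN_BITS == 4, so all eight
              // 4-bit counters fit in one uint): matching lanes with
              // hashes { 2, 2, 5, 2 } contribute
              //
              //   h     = { 0x100, 0x100, 0x100000, 0x100 }
              //   h_iss = { 0x100, 0x200, 0x100200, 0x100300 }
              //
              // so each lane's 1-based slot within its bin is
              //
              //   (h_iss >> (4*hash)) & 0xF  =  { 1, 2, 1, 3 }
              //
              // and the nibbles of h_total hold every bin's new
              // element count -- all bins are counted in one scan.
              //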
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
              SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match;
#else
              SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0}
#endif
              //
              // how many new elements for each matching hash bin?
              //
              SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
              SKC_RASTERIZE_UINT const h     = h_match << h_shl;

              //
              // prefix sum all of the bins in parallel
              //
              SKC_RASTERIZE_UINT const h_iss   = skc_subgroup_scan_inclusive_add_uint(h);
              SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss);

              //
              // current bin counts
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              SKC_RASTERIZE_BIN  const count_bin = smem->bin.vN.count;
              SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash);
#else
              SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash];
#endif

              //
              // calculate where each cache-hit and in-bounds tts should be stored
              //
              SKC_RASTERIZE_UINT const ttsb_index = (h_iss   >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
              SKC_RASTERIZE_UINT const count_new  = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;

              //
              // which lanes can append to a matching bin?
              //
              SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);

              //
              // scatter append tts elements to bin blocks
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              //
              // SIMD
              //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
              if (is_append C)                                          \
                {                                                       \
                  smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C;     \
                  smem->bin.aN.count[hash C]               = count_new C; \
                }

              SKC_RASTERIZE_VECTOR_EXPAND();
#else
              //
              // SIMT
              //
              if (is_append)
                {
                  smem->bin.aN.ttsb [hash][ttsb_index] = tts;
                  smem->bin.aN.count[hash]             = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
                }
#endif
              //
              // try to keep predicate updates SIMD-friendly and
              // outside of predicated code paths -- this is not
              // always how we would normally do things on SIMT but
              // either approach is acceptable
              //

              //
              // mask off lanes/components that successfully appended
              //
              is_active = is_active && !is_append;

              //
              // are there any active lanes left?
              //
              if (!skc_subgroup_any(is_active))
                break;

              //
              // If there are active lanes that couldn't be appended
              // to a bin because their hashes collided with the bin's
              // current ryx key, then those bins must be ejected.
              //
              // Note that we do not eject "full" bins because lazily
              // waiting for a collision results in simpler code.
              //
              skc_flush(bp_atomics,
                        bp_elems,
                        bp_ids,
                        bp_mask,
                        cohort_atomics,
                        subblocks,
                        blocks,
                        blocks_next,
                        sk_v,
                        sk_v_next,
                        sk_extent,
                        smem,
                        hash,
                        yx,
                        is_active);
            }
        }
    }
}

//
// INITIALIZE SMEM
//
// Note that SIMD/SIMT have nearly the same syntax.
2046// 2047static 2048void 2049skc_smem_init(__local struct skc_subgroup_smem volatile * const smem) 2050{ 2051 // 2052 // initialize smem bins 2053 // 2054#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) 2055 // 2056 // SIMD 2057 // 2058 smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT ); 2059 smem->bin.vN.count = ( 0 ); 2060#else 2061 // 2062 // SIMT 2063 // 2064 int idx = skc_subgroup_lane(); 2065 2066#if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) 2067 if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT) 2068#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) 2069 for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE) 2070#endif 2071 { 2072 smem->bin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT ); 2073 smem->bin.aN.count[idx] = ( 0 ); 2074 } 2075#endif 2076} 2077 2078// 2079// RASTERIZE CUBIC KERNEL 2080// 2081 2082static 2083void 2084skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, 2085 __global union skc_bp_elem * const bp_elems, 2086 __global uint * const bp_ids, 2087 skc_uint const bp_mask, 2088 2089 __global SKC_ATOMIC_UINT volatile * const cohort_atomics, 2090 __global skc_ttsk_s_t * const sk_extent, 2091 2092 __local struct skc_subgroup_smem volatile * const smem, 2093 2094 skc_uint * const nodeword, 2095 skc_block_id_t * const id, 2096 2097 union skc_transform const * const tv, 2098 union skc_path_clip const * const cv, 2099 skc_uint const cohort) 2100{ 2101 // 2102 // the initial segment idx and segments-per-block constant determine 2103 // how many block ids will need to be loaded 2104 // 2105 SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; 2106 2107 skc_segment_next(bp_elems,nodeword,id); 2108 2109 SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; 2110 2111 skc_segment_next(bp_elems,nodeword,id); 2112 2113 SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; 2114 2115 skc_segment_next(bp_elems,nodeword,id); 2116 2117 SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; 2118 2119 skc_segment_next(bp_elems,nodeword,id); 2120 2121 SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; 2122 2123 skc_segment_next(bp_elems,nodeword,id); 2124 2125 SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; 2126 2127 skc_segment_next(bp_elems,nodeword,id); 2128 2129 SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; 2130 2131 skc_segment_next(bp_elems,nodeword,id); 2132 2133 SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; 2134 2135 // 2136 // apply transform 2137 // 2138 // note that we only care if the end points are rounded to subpixel precision 2139 // 2140 // FIXME -- transformation is currently affine-only support perspective later 2141 // 2142 // the affine transformation requires 8 FMA + 2 ROUND operations 2143 // 2144 2145 SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx + c0y * tv->shx + tv->tx; 2146 SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy + tv->ty; 2147 2148 SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; 2149 SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; 2150 2151 SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx + c2y * tv->shx + tv->tx; 2152 SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy + tv->ty; 2153 2154 SKC_RASTERIZE_FLOAT t3x = c3x * tv->sx + c3y * tv->shx + tv->tx; 2155 SKC_RASTERIZE_FLOAT t3y = c3x * tv->shy + c3y * tv->sy + tv->ty; 2156 2157 // 2158 // FIXME -- 
//
// RASTERIZE CUBIC KERNEL
//

static
void
skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                     __global union skc_bp_elem        * const bp_elems,
                     __global uint                     * const bp_ids,
                     skc_uint                            const bp_mask,

                     __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                     __global skc_ttsk_s_t             * const sk_extent,

                     __local struct skc_subgroup_smem volatile * const smem,

                     skc_uint                          * const nodeword,
                     skc_block_id_t                    * const id,

                     union skc_transform const         * const tv,
                     union skc_path_clip const         * const cv,
                     skc_uint                            const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  //
  // apply transform
  //
  // note that we only care that the endpoints are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only -- support perspective later
  //
  // the affine transformation requires 16 FMA + 4 ROUND operations
  //

  SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t3x = c3x * tv->sx  + c3y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t3y = c3x * tv->shy + c3y * tv->sy  + tv->ty;

  //
  // FIXME -- this is temporary support for projection
  //
  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);

  if (!is_affine)
    {
      SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
      SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
      SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
      SKC_PROJECT(tv,c3x,c3y,t3x,t3y);
    }

  b0x = round(b0x);
  b0y = round(b0y);

  t3x = round(t3x);
  t3y = round(t3y);
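
  //
  // (When the transform is projective, SKC_PROJECT above divides each
  // point by w = x * w0 + y * w1 + 1 -- the bottom row of the 3x3
  // homogeneous transform -- so the affine fast path simply skips the
  // per-point reciprocal.)
  //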
  //
  //
  //
#if PRINTF_ENABLE

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                 \
  printf("{ { %.02f, %.02f }, { %.02f, %.02f },"                \
         " { %.02f, %.02f }, { %.02f, %.02f } },\n",            \
         b0x C,b0y C,t1x C,t1y C,                               \
         t2x C,t2y C,t3x C,t3y C);

  SKC_RASTERIZE_VECTOR_EXPAND();

#else

  printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
         b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);

#endif

#endif

  //
  // OLD APPROACH
  // ------------
  //
  // The Spinel CUDA rasterizer was significantly more complex and
  // performed a few different tasks that are probably best kept
  // separate.
  //
  // The Spinel rasterizer held a Bezier's 4-element x and y
  // coordinates in adjacent lanes.  This simplified intermingling of
  // single-lane 4-coordinate line segments with two-lane cubic
  // Beziers.
  //
  // After transformation of the input segments, the Spinel rasterizer
  // would test cubics for flatness and, if flat, collapse the
  // adjacent lanes into a single line lane and an empty lane.
  //
  // Any lines would then be appended to a line queue.
  //
  // Any cubics would then be subdivided.
  //
  // The reclassification process would be repeated.
  //
  // NEW APPROACH
  // ------------
  //
  // Assume we're only working with cubics in this kernel.
  //
  // Optimization: if the curve is a special case -- a cusp, has 1+
  // inflections, or a loop -- it might be beneficial to subdivide the
  // control cage 1+ times in order to separate the flatter segments
  // from the high-velocity region(s).
  //
  // This means we want to split using the [a,b] formulation to
  // _directly_ subdivide and produce a new control cage.
  //
  // Wang's Formula is still useful even if we subdivide once or twice
  // as it's so cheap that it might give some useful hints about where
  // the high-velocity sections of the curve reside.
  //
  // But it seems like using Wang's and directly flattening to line
  // segments without any subdivision is good enough for the limited
  // set of test cases that I've tried.
  //
  // So... use Wang's Formula to estimate how many line segments are
  // required to properly flatten the cubics.
  //
  // Then use inclusive/exclusive scans to put all the lanes to work:
  //
  //   1. segmenting cubics to line segments
  //
  //   2. slivering line segments into 1-pixel high line segments
  //
  //   3. slivering 1-pixel high line segments into 1-pixel wide line
  //      segments
  //
  // MORE BACKGROUND ON NEW APPROACH
  // -------------------------------
  //
  // Two options for handling line segments:
  //
  //   1. append the line segments onto an SLM array until enough
  //      work has been accrued (Spinel does this)
  //
  //   2. immediately sliver the potentially multi-pixel line
  //      segments into subpixel lines
  //
  // The advantage of (1) is that it guarantees the slivering process
  // will, on average, be emitting a full subgroup of subpixel lines.
  //
  // The advantage of (2) is that it reduces code complexity and
  // leaves more room for SLM tile bins.  The difference between
  // Spinel and Skia Compute is that Wang's Formula guarantees there
  // will be a full subgroup of multi-pixel lines on every iteration
  // except possibly the last.
  //
  // Note that wider GPU architectures might benefit from (1) and
  // other work accumulation strategies because they minimize
  // partial-warp workloads in the final iteration of each stage.
  // They also minimize the sunk cost of the uniform control logic
  // steps.
  //
  // So let's implement (2) for now...
  //

  //
  // And... begin!
  //
  // Estimate how many line segments are in the quad/cubic curve.
  //
  // Wang's Formula will return zero if the control points are
  // collinear but we bump it up to 1.0f.
  //
  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);

  //
  // if there are free registers then precalculate the reciprocal of
  // each segment estimate since it will never change
  //
  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);

  //
  // inclusive add scan of estimated line segments
  // exclusive add scan of estimated line segments
  // total number of estimated line segments
  //
  SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
  SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
  float               s_rem = skc_subgroup_last_float(s_iss); // scalar
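
  //
  // For reference, a scalar sketch of the shape of the flattening
  // estimate used above (an illustrative guess -- the actual
  // skc_wangs_formula_cubic() implementation appears earlier in this
  // file and the tolerance is a device config decision).  Wang's
  // Formula for a degree-d Bezier bounds the segment count n so that
  // every chord stays within eps of the curve:
  //
  //   n = ceil( sqrt( d*(d-1)/(8*eps) * max(||p0-2*p1+p2||, ||p1-2*p2+p3||) ) )   (d = 3)
  //
#if 0
  static float
  wangs_formula_cubic_model(float2 const p0, float2 const p1,
                            float2 const p2, float2 const p3, float const eps)
  {
    float2 const d0 = p0 - 2.0f * p1 + p2; // second differences of the cage
    float2 const d1 = p1 - 2.0f * p2 + p3;
    float  const m  = sqrt(fmax(dot(d0,d0),dot(d1,d1)));

    return fmax(1.0f,ceil(sqrt(0.75f / eps * m))); // d=3: 3*2/8 = 0.75 -- bump collinear cages to 1
  }
#endif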
  //
  // Precompute cubic polynomial coefficients from the transformed
  // control cage so we can shuffle them in on each iteration of the
  // outer loop and then evaluate the polynomial in Horner form:
  //
  //                                  |  1  0  0  0 | | c0 |
  //                                  |             | |    |
  //                                  | -3  3  0  0 | | c1 |
  //   B(t) = [ 1 t^1 t^2 t^3 ] *     |             | |    |
  //                                  |  3 -6  3  0 | | c2 |
  //                                  |             | |    |
  //                                  | -1  3 -3  1 | | c3 |
  //
  SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x);                // 2 - 1 MAD + MUL
  SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y);                // 2 - 1 MAD + MUL

  SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x));  // 3 - 2 MAD + MUL
  SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y));  // 3 - 2 MAD + MUL

  SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
  SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB
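
  //
  // Quick sanity check on the monomial coefficients above: Horner
  // evaluation yields b0 at t=0 (the first endpoint) and at t=1 yields
  //
  //   b0 + b1 + b2 + b3
  //     = b0 + (3*t1 - 3*b0) + (3*b0 - 6*t1 + 3*t2) + (3*t1 - 3*t2 + t3 - b0)
  //     = t3
  //
  // i.e. the last endpoint, as expected.
  //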
  //
  // these values don't matter on the first iteration
  //
  SKC_RASTERIZE_FLOAT l1x_prev = 0;
  SKC_RASTERIZE_FLOAT l1y_prev = 0;

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t   subblocks = 0;
#endif

  //
  // loop until done
  //
  while (s_rem > 0)
    {
      //
      // distribute work across lanes -- see the worked example after
      // this function
      //
      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

      //
      // every lane has a fraction to work off of
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if delta == 1     then this is the first lane
      // if delta == count then this is the last  lane
      //
      SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
      SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source);

      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

      //
      // init parametric t
      //
      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

      //
      // if last then override to a hard 1.0f
      //
      s_t = is_s_last ? 1.0f : s_t;

      //
      // decrement by subgroup size
      //
      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

      //
      // now every lane knows what to do and the following lines will
      // pump out up to SUBGROUP_SIZE line segments
      //
      // obtain the src vertices through shared memory or via a shuffle
      //

      //
      // shuffle in the polynomial coefficients from their source lane
      //
      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

      SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
      SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);

      //
      // calculate "right" line segment endpoint using Horner form
      //
      SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
      SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
      SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

      //
      // save previous right endpoint
      //
      l1x_prev = l1x;
      l1y_prev = l1y;

      //
      // override shuffle up if this is the first line segment
      //
      l0x = select(l0x,s0x,is_s_first);
      l0y = select(l0y,s0y,is_s_first);

      //
      // sliver lines
      //
      skc_sliver(bp_atomics,
                 bp_elems,
                 bp_ids,
                 bp_mask,
                 cohort_atomics,
                 &subblocks,
                 &blocks,
                 &blocks_next,
                 &sk_v,
                 &sk_v_next,
                 sk_extent,
                 smem,
                 l0x,l0y,l1x,l1y);
    }

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}
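
//
// Worked example of the lane-distribution scan in the loop above,
// assuming an illustrative 4-lane subgroup.  If the per-lane segment
// estimates are:
//
//   s_segs = { 3, 1, 2, 1 }  =>  s_iss = { 3, 4, 6, 7 }
//                                s_ess = { 0, 3, 4, 6 }
//
// then the first iteration assigns the lanes the source lanes
// { 0, 0, 0, 1 }: lanes 0-2 emit segments 1-3 of curve 0 and lane 3
// emits the single segment of curve 1.  After decrementing by the
// subgroup size, the next iteration emits the remaining 3 segments of
// curves 2 and 3 with one lane left idle.
//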
//
// RASTERIZE QUAD KERNEL
//

static
void
skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                    __global union skc_bp_elem        * const bp_elems,
                    __global uint                     * const bp_ids,
                    skc_uint                            const bp_mask,

                    __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                    __global skc_ttsk_s_t             * const sk_extent,

                    __local struct skc_subgroup_smem volatile * const smem,

                    skc_uint                          * const nodeword,
                    skc_block_id_t                    * const id,

                    union skc_transform const         * const tv,
                    union skc_path_clip const         * const cv,
                    skc_uint                            const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  //
  // apply transform
  //
  // note that we only care that the endpoints are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only -- support perspective later
  //
  // the affine transformation requires 12 FMA + 4 ROUND operations
  //
  SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;

  //
  // FIXME -- this is temporary support for projection
  //
  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);

  if (!is_affine)
    {
      SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
      SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
      SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
    }

  b0x = round(b0x);
  b0y = round(b0y);

  t2x = round(t2x);
  t2y = round(t2y);

  //
  // Estimate how many line segments are in the quad/cubic curve.
  //
  // Wang's Formula will return zero if the control points are
  // collinear but we bump it up to 1.0f.
  //
  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);

  //
  // if there are free registers then precalculate the reciprocal of
  // each segment estimate since it will never change
  //
  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);

  //
  // inclusive add scan of estimated line segments
  // exclusive add scan of estimated line segments
  // total number of estimated line segments
  //
  SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
  SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
  float               s_rem = skc_subgroup_last_float(s_iss); // scalar
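
  //
  // (For quadratics the analogous bound from Wang's Formula -- again
  // a sketch of its shape, assuming the same tolerance convention as
  // the cubic variant -- is:
  //
  //   n = ceil( sqrt( 2*1/(8*eps) * ||p0 - 2*p1 + p2|| ) )
  //
  // with the single second difference of the 3-point cage.)
  //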
  //
  // Precompute quadratic polynomial coefficients from the control
  // cage so we can shuffle them in on each iteration of the outer
  // loop and then evaluate the polynomial in Horner form:
  //
  //                             |  1  0  0 | | c0 |
  //                             |          | |    |
  //   B(t) = [ 1 t^1 t^2 ] *    | -2  2  0 | | c1 |
  //                             |          | |    |
  //                             |  1 -2  1 | | c2 |
  //
  SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
  SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL

  SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x);  // 2 - 1 MAD + ADD
  SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y);  // 2 - 1 MAD + ADD
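
  //
  // As with the cubic case, a quick sanity check: at t=1 Horner
  // evaluation yields
  //
  //   b0 + b1 + b2 = b0 + (2*t1 - 2*b0) + (b0 - 2*t1 + t2) = t2
  //
  // i.e. the last endpoint, as expected.
  //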
  //
  // these values don't matter on the first iteration
  //
  SKC_RASTERIZE_FLOAT l1x_prev = 0;
  SKC_RASTERIZE_FLOAT l1y_prev = 0;

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t   subblocks = 0;
#endif

  //
  // loop until done
  //
  while (s_rem > 0)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

      //
      // every lane has a fraction to work off of
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if delta == 1     then this is the first lane
      // if delta == count then this is the last  lane
      //
      SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
      SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source);

      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

      //
      // init parametric t
      //
      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

      //
      // if last then override to a hard 1.0f
      //
      s_t = is_s_last ? 1.0f : s_t;

      //
      // decrement by subgroup size
      //
      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

      //
      // now every lane knows what to do and the following lines will
      // pump out up to SUBGROUP_SIZE line segments
      //
      // obtain the src vertices through shared memory or via a shuffle
      //

      //
      // shuffle in the polynomial coefficients from their source lane
      //
      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

      //
      // calculate "right" line segment endpoint using Horner form
      //
      SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
      SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
      SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

      //
      // save previous right endpoint
      //
      l1x_prev = l1x;
      l1y_prev = l1y;

      //
      // override shuffle up if this is the first line segment
      //
      l0x = select(l0x,s0x,is_s_first);
      l0y = select(l0y,s0y,is_s_first);

      //
      // sliver lines
      //
      skc_sliver(bp_atomics,
                 bp_elems,
                 bp_ids,
                 bp_mask,
                 cohort_atomics,
                 &subblocks,
                 &blocks,
                 &blocks_next,
                 &sk_v,
                 &sk_v_next,
                 sk_extent,
                 smem,
                 l0x,l0y,l1x,l1y);
    }

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}

//
// RASTERIZE LINE KERNEL
//

static
void
skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                    __global union skc_bp_elem        * const bp_elems,
                    __global uint                     * const bp_ids,
                    skc_uint                            const bp_mask,

                    __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                    __global skc_ttsk_s_t             * const sk_extent,

                    __local struct skc_subgroup_smem volatile * const smem,

                    skc_uint                          * const nodeword,
                    skc_block_id_t                    * const id,

                    union skc_transform const         * const tv,
                    union skc_path_clip const         * const cv,
                    skc_uint                            const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);
  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
#endif

  //
  // apply transform
  //
  // note that we only care that the endpoints are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only
  // FIXME -- support perspective later
  //
  // the affine transformation requires 8 FMA + 4 ROUND operations
  //
  SKC_RASTERIZE_FLOAT l0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT l0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT l1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT l1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  //
  // FIXME -- this is temporary support for projection
  //
  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);

  if (!is_affine) {
    SKC_PROJECT(tv,c0x,c0y,l0x,l0y);
    SKC_PROJECT(tv,c1x,c1y,l1x,l1y);
  }

  l0x = round(l0x);
  l0y = round(l0y);

  l1x = round(l1x);
  l1y = round(l1y);

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
#endif

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t   subblocks = 0;
#endif

  //
  // sliver lines
  //
  skc_sliver(bp_atomics,
             bp_elems,
             bp_ids,
             bp_mask,
             cohort_atomics,
             &subblocks,
             &blocks,
             &blocks_next,
             &sk_v,
             &sk_v_next,
             sk_extent,
             smem,
             l0x,l0y,l1x,l1y);

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}
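
//
// Note on the affine transform above: each transformed coordinate is
// an expression of the form a*b + c*d + e, i.e. two fused ops:
//
//   l0x = fma(c0x,tv->sx,fma(c0y,tv->shx,tv->tx));
//
// The mul/add form relies on the compiler contracting to FMA/MAD --
// writing fma() explicitly, as sketched here, is an alternative if
// inspection of the generated code shows it isn't.
//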
//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                         __global union skc_bp_elem        * const bp_elems,
                         __global uint                     * const bp_ids,
                         skc_uint                            const bp_mask,

                         __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                         __global skc_ttsk_s_t             * const sk_extent,

                         __global float8 const             * const transforms, // FIXME -- __constant
                         __global float4 const             * const clips,      // FIXME -- __constant
                         __global union skc_cmd_rasterize const * const cmds,  // FIXME -- __constant
                         skc_uint                            const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile smem[1];
#else
  __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and is driving register spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

#if 0
  if (get_sub_group_local_id() == 0)
    printf("+cmd_idx = %u\n",cmd_idx);
#endif

  //
  // if workgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("-cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("[ %u ]< %u, %u, %u, %u >\n",
           cmd_idx,
           cmd.nodeword,
           SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
           SKC_CMD_RASTERIZE_GET_CLIP(cmd),
           SKC_CMD_RASTERIZE_GET_COHORT(cmd));
#endif

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_tag      tag      = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either via initializing a transform stack
  // with the subpixel-resolution-scaled identity or by scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
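
  //
  // A hypothetical host-side helper illustrating the pre-scaling
  // contract described above (name and layout are assumptions for
  // illustration -- the real host code lives elsewhere): the x row is
  // scaled by the x subpixel resolution, the y row by the y subpixel
  // resolution, and the projective terms are left untouched.
  //
#if 0
  void
  host_scale_transform_to_subpixel(float t[8]) // { sx shx tx shy sy ty w0 w1 }
  {
    t[0] *= SKC_SUBPIXEL_RESL_X_F32; // sx
    t[1] *= SKC_SUBPIXEL_RESL_X_F32; // shx
    t[2] *= SKC_SUBPIXEL_RESL_X_F32; // tx
    t[3] *= SKC_SUBPIXEL_RESL_Y_F32; // shy
    t[4] *= SKC_SUBPIXEL_RESL_Y_F32; // sy
    t[5] *= SKC_SUBPIXEL_RESL_Y_F32; // ty
  }
#endif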
  switch (tag)
    {
    case SKC_BLOCK_ID_TAG_PATH_LINE:
      skc_rasterize_lines(bp_atomics,
                          bp_elems,
                          bp_ids,
                          bp_mask,
                          cohort_atomics,
                          sk_extent,
                          smem,
                          &nodeword,&id,
                          &tv,&cv,cohort);
      break;

    case SKC_BLOCK_ID_TAG_PATH_QUAD:
      skc_rasterize_quads(bp_atomics,
                          bp_elems,
                          bp_ids,
                          bp_mask,
                          cohort_atomics,
                          sk_extent,
                          smem,
                          &nodeword,&id,
                          &tv,&cv,cohort);
      break;

    case SKC_BLOCK_ID_TAG_PATH_CUBIC:
      skc_rasterize_cubics(bp_atomics,
                           bp_elems,
                           bp_ids,
                           bp_mask,
                           cohort_atomics,
                           sk_extent,
                           smem,
                           &nodeword,&id,
                           &tv,&cv,cohort);
      break;

    case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD:  // FIXME -- rational quads aren't implemented yet
      break;
    case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC: // FIXME -- rational cubics aren't implemented yet
      break;

    default:
      break;
    }
}
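
//
// The kernels below are single-segment-type variants of the
// dispatching kernel above: they skip the tag read and jump straight
// to the matching rasterizer.  Presumably the host enqueues these
// when a rasterization cohort is known to be homogeneous.
//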
//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                           __global union skc_bp_elem        * const bp_elems,
                           __global uint                     * const bp_ids,
                           skc_uint                            const bp_mask,

                           __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                           __global skc_ttsk_s_t             * const sk_extent,

                           __global float8 const             * const transforms, // FIXME -- __constant
                           __global float4 const             * const clips,      // FIXME -- __constant
                           __global union skc_cmd_rasterize const * const cmds,  // FIXME -- __constant
                           skc_uint                            const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile smem[1];
#else
  __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and is driving register spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // if workgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either via initializing a transform stack
  // with the subpixel-resolution-scaled identity or by scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

  skc_rasterize_lines(bp_atomics,
                      bp_elems,
                      bp_ids,
                      bp_mask,
                      cohort_atomics,
                      sk_extent,
                      smem,
                      &nodeword,&id,
                      &tv,&cv,cohort);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                           __global union skc_bp_elem        * const bp_elems,
                           __global uint                     * const bp_ids,
                           skc_uint                            const bp_mask,

                           __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                           __global skc_ttsk_s_t             * const sk_extent,

                           __global float8 const             * const transforms, // FIXME -- __constant
                           __global float4 const             * const clips,      // FIXME -- __constant
                           __global union skc_cmd_rasterize const * const cmds,  // FIXME -- __constant
                           skc_uint                            const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile smem[1];
#else
  __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and is driving register spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // if workgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either via initializing a transform stack
  // with the subpixel-resolution-scaled identity or by scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

  skc_rasterize_quads(bp_atomics,
                      bp_elems,
                      bp_ids,
                      bp_mask,
                      cohort_atomics,
                      sk_extent,
                      smem,
                      &nodeword,&id,
                      &tv,&cv,cohort);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                            __global union skc_bp_elem        * const bp_elems,
                            __global uint                     * const bp_ids,
                            skc_uint                            const bp_mask,

                            __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                            __global skc_ttsk_s_t             * const sk_extent,

                            __global float8 const             * const transforms, // FIXME -- __constant
                            __global float4 const             * const clips,      // FIXME -- __constant
                            __global union skc_cmd_rasterize const * const cmds,  // FIXME -- __constant
                            skc_uint                            const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile smem[1];
#else
  __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and is driving register spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // if workgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either via initializing a transform stack
  // with the subpixel-resolution-scaled identity or by scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

  skc_rasterize_cubics(bp_atomics,
                       bp_elems,
                       bp_ids,
                       bp_mask,
                       cohort_atomics,
                       sk_extent,
                       smem,
                       &nodeword,&id,
                       &tv,&cv,cohort);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                               __global union skc_bp_elem        * const bp_elems,
                               __global uint                     * const bp_ids,
                               skc_uint                            const bp_mask,

                               __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                               __global skc_ttsk_s_t             * const sk_extent,

                               __global float8 const             * const transforms, // FIXME -- __constant
                               __global float4 const             * const clips,      // FIXME -- __constant
                               __global union skc_cmd_rasterize const * const cmds,  // FIXME -- __constant
                               skc_uint                            const count)
{
  ; // FIXME -- rational quad rasterization is not yet implemented
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
                                __global union skc_bp_elem        * const bp_elems,
                                __global uint                     * const bp_ids,
                                skc_uint                            const bp_mask,

                                __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                                __global skc_ttsk_s_t             * const sk_extent,

                                __global float8 const             * const transforms, // FIXME -- __constant
                                __global float4 const             * const clips,      // FIXME -- __constant
                                __global union skc_cmd_rasterize const * const cmds,  // FIXME -- __constant
                                skc_uint                            const count)
{
  ; // FIXME -- rational cubic rasterization is not yet implemented
}

//
//
//