/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "tile.h"
#include "common.h"
#include "raster.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"

//
//
//

#define SKC_PLACE_SUBGROUP_MASK      (SKC_PLACE_SUBGROUP_SIZE - 1)
#define SKC_PLACE_SUBGROUP_LAST      (SKC_PLACE_SUBGROUP_SIZE - 1)

//
//
//

#define SKC_PLACE_SMEM_COUNT_TTSK    SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SMEM_COUNT_TTPK    SKC_RASTER_NODE_MAX_TTPK

//
//
//

#define SKC_PLACE_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)

//
//
//

#if   ( SKC_PLACE_X == 1 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_1()
#define SKC_PLACE_EXPAND_I_LAST      0

#elif ( SKC_PLACE_X == 2 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_2()
#define SKC_PLACE_EXPAND_I_LAST      1

#elif ( SKC_PLACE_X == 4 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_4()
#define SKC_PLACE_EXPAND_I_LAST      3

#elif ( SKC_PLACE_X == 8 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_8()
#define SKC_PLACE_EXPAND_I_LAST      7

#elif ( SKC_PLACE_X == 16 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_16()
#define SKC_PLACE_EXPAND_I_LAST      15
#endif

//
// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.
//
// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
// KERNELS USE DIFFERENT SUBGROUP SIZES.
//
// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
//
// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32).  VULKAN MAY
// ONLY SUPPORT A SUBGROUP SIZE OF 16.
//
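
//
// ILLUSTRATION ONLY -- a minimal sketch (assuming the PREFIX and PLACE
// kernels share the same subgroup size) of how one 64-bit key per lane
// is reassembled from the two coalesced 32-bit writes described above:
// row I of a block stores SKC_PLACE_SUBGROUP_SIZE lo words followed by
// SKC_PLACE_SUBGROUP_SIZE hi words.  The stride macros defined below
// generalize this to mismatched subgroup sizes.  The helper name is
// hypothetical and is not used by this kernel.
//
#if 0
static
ulong
skc_place_load_key_sketch(__global skc_uint const * const block, // base of the key block
                          skc_uint            const       row)   // row index I
{
  skc_uint const lane = get_sub_group_local_id();
  skc_uint const lo   = block[row * 2 * SKC_PLACE_SUBGROUP_SIZE                           + lane]; // lo words first
  skc_uint const hi   = block[row * 2 * SKC_PLACE_SUBGROUP_SIZE + SKC_PLACE_SUBGROUP_SIZE + lane]; // hi words follow

  return ((ulong)hi << 32) | lo;
}
#endif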

#if   ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )

#define SKC_PLACE_STRIDE_H(L)             (L)
#define SKC_PLACE_STRIDE_V_LO(I)          (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)          (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)

#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO          (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK     (SKC_PLACE_SUBGROUP_RATIO - 1)
#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))

#define SKC_PLACE_STRIDE_H(L)             (L)
#define SKC_PLACE_STRIDE_V_LO(I)          (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)          (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)

#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO          (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK     (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask

#define SKC_PLACE_STRIDE_H(L)             (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_V_LO(I)          (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)          (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)

#endif

//
// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
//

#define SKC_PLACE_IS_ALL_HEADER_ROW(i)  (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_NOT_HEADER_ROW(i)  ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_TRAILING_ROW(i)    (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)

#define SKC_PLACE_IS_HEADER_ROW_KEY(i)  ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))

//
// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX
//
#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
#define SKC_PLACE_NODE_LESS_THAN(i,k)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))
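
//
// ILLUSTRATION ONLY -- a worked example of the wraparound above,
// assuming a subgroup size of 8 and a raster header of
// SKC_RASTER_HEAD_DWORDS == 8 (the "currently 8" case noted earlier):
//
//   row 1 / lane 2 :  1 * 8 + 2 - 8  =  2          -> compared against k
//   row 0 / lane 2 :  0 * 8 + 2 - 8  =  0xFFFFFFFA -> still inside the
//                                                     header, so the
//                                                     unsigned "< k"
//                                                     test always fails
//
// (the actual constants come from raster.h)
//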

//
// TTSK v2:
//
//  0                                          63
//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |
//  +---------+--------+---------+-----+-----+
//  |   27    | 1 (=0) | 12 (=0) | 12  | 12  |
//
//
// TTPK v2:
//
//  0                                       63
//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
//  +---------+--------+------+-----+-----+
//  |   27    | 1 (=1) |  12  | 12  | 12  |
//
//

//
// TTCK (32-BIT COMPARE) v1:
//
//  0                                                            63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   18  |  7  |  7  |
//
//
// TTCK (32-BIT COMPARE) v2:
//
//  0                                                            63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   15  |  9  |  8  |
//
//
// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
//
//  0                                                            63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          27          |    1   |    1   |   18  |  9  |  8  |
//

union skc_subgroup_smem
{
  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE

  struct {
    struct {
      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } lo;

    struct {
      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } hi;

    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
  };
};

//
// scatter scan max
//
static
skc_int_v_t
skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
                     skc_int_v_t                               const iss,
                     skc_int_v_t                               const ess)
{
  //
  // prefix sums determine which lanes we're going to work on next
  //
  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
  skc_int_v_t  const scratch_idx      = max(ess,0);

  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad using vector syntax
  //
  smem->scratch[get_sub_group_local_id()] = ( 0 );

  //
  // store source lane at starting lane
  //
  if (is_scratch_store) {
    smem->scratch[scratch_idx] = get_sub_group_local_id();
  }

  //
  // propagate lanes to right using max scan
  //
  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);

  return source;
}
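
//
// ILLUSTRATION ONLY -- a worked example of the scatter-scan-max idiom
// above, assuming a subgroup size of 8 and three TTPK keys on lanes
// 0..2 with spans { 2, 1, 5 } (remaining lanes hold no key):
//
//   span                :  2  1  5  0  0  0  0  0
//   iss (inclusive sum) :  2  3  8  8  8  8  8  8
//   ess (exclusive sum) :  0  2  3  8  8  8  8  8
//
// lanes 0..2 scatter their lane id to scratch[ess]:
//
//   scratch             :  0  0  1  2  0  0  0  0
//   inclusive max scan  :  0  0  1  2  2  2  2  2   <-- "source"
//
// so output slots 0..1 expand the key on lane 0, slot 2 the key on
// lane 1, and slots 3..7 the key on lane 2; the caller then derives
// each slot's tile offset as dx = lane - ess[source].
//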

//
//
//

static
skc_bool
skc_xk_clip(union skc_tile_clip const * const tile_clip,
            skc_ttxk_t                * const xk)
{
  //
  // clip the sk and pk keys
  //
  // if fully clipped then return false
  //
  // alternatively -- we can expand all these keys in place
  //
  // alternatively -- keep sk and pk keys segregated because sk
  // represents the vast majority of keys and they are easier to
  // process.  don't mess with the fastpath!
  //
  return false;
}

//
//
//

static
skc_ttck_t
skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place             const    * const cmd,
             skc_uint                                   const sk_idx)
{
  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
  skc_uint const hi = smem->hi.sk[sk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

static
skc_ttck_t
skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place             const    * const cmd,
             skc_uint                                   const pk_idx,
             skc_uint                                   const dx)
{
  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
  skc_uint const hi = smem->hi.pk[pk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty +      SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

//
//
//

static
void
skc_ttsk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
               __global skc_ttck_t                       * const ck_extent,
               __local  union skc_subgroup_smem volatile  * const smem,
               union skc_cmd_place              const     * const cmd,
               skc_uint                                     const sk)
{
  //
  // Pretty sure you can never ever have an sk count equal to 0
  //
  skc_uint ck_base = 0;

  // the first lane performs the block pool allocation with an atomic increment
  if (get_sub_group_local_id() == 0) {
    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
  }

  // broadcast base to all lanes
  ck_base = sub_group_broadcast(ck_base,0);

  // convert sk keys to ck keys
  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
    {
      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
    }
}

//
//
//

static
skc_int
skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
                  skc_uint                                   const idx)
{
  skc_uint const lo      = smem->lo.pk[idx];
  skc_uint const hi      = smem->hi.pk[idx];

  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;

  return (span_lo | span_hi) + 1;
}

//
//
//
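
//
// ILLUSTRATION ONLY -- per the TTPK layout above (27-bit ID, 1 prefix
// bit, 12-bit span), the span straddles the lo/hi words.  Assuming the
// low 4 span bits occupy the top of the lo word and the remaining 8
// bits sit at the bottom of the hi word (which is what the 27 + 1 bit
// ID/prefix fields leave room for), an encoded span of 18 (0x12)
// decodes as:
//
//   span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN          = 0x02
//   span_hi = (hi & 0xFF) << SKC_TTXK_LO_BITS_SPAN   = 0x10
//
//   (span_lo | span_hi) + 1                          = 19 tiles
//
// (the authoritative bit split comes from the SKC_TTXK_* constants in
// the headers; the 4/8 split here is inferred from the key diagram)
//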

static
void
skc_ttpk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
               __global skc_ttck_t                       * const ck_extent,
               __local  union skc_subgroup_smem volatile  * const smem,
               union skc_cmd_place              const     * const cmd,
               skc_uint                                     const pk)
{
  // bail out if the pk queue is empty
  if (pk == 0)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("%u\n",pk);
#endif

  //
  // FIXME -- this nested loop iterates over the queue processing a
  // subgroup of 64-bit keys at a time.  This is probably not the most
  // efficient approach so investigate how to store and iterate over a
  // wider than subgroup (node-sized) queue of keys.
  //

  // round up so we work with full subgroups
  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
  skc_uint       ii    = 0;

  // nested loop that expands all ttpk keys
#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
#endif
    {
      skc_uint idx  = ii + get_sub_group_local_id();
      skc_int  span = 0;

      // how many tiles does this ttpk span?
      if (idx < pk)
        span = skc_ttpk_get_span(smem,idx);

      // we need inclusive, exclusive and total
      skc_int iss = sub_group_scan_inclusive_add(span);
      skc_int ess = iss - span;
      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);

      // printf("%u : %u\n",span,iss);
      // continue;

      // atomically allocate space for the pk keys
      skc_uint ck_base = 0;

      // the first lane performs the block pool allocation with an atomic increment
      if (get_sub_group_local_id() == 0) {
        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
      }

      // broadcast the atomically allocated extent base to all lanes
      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();

      //
      // FIXME -- this loop would probably be faster if the ttpk keys
      // were held in registers and accessed with shuffles instead of
      // SMEM loads
      //

      //
      // loop until there are no more expanded pk keys
      //
      while (true)
        {
          skc_int const source = skc_scatter_scan_max(smem,iss,ess);
          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);

          // store valid ck keys to gmem
          if (get_sub_group_local_id() < rem) {
            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
          }

          // decrement the remainder
          rem -= SKC_PLACE_SUBGROUP_SIZE;

          if (rem <= 0)
            break;

          // increment/decrement indices
          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
          iss    -= SKC_PLACE_SUBGROUP_SIZE;
          ess    -= SKC_PLACE_SUBGROUP_SIZE;
        }
    }
}

//
//
//

static
skc_uint
skc_ballot(skc_uint * const xk, skc_uint const is_xk)
{
#if 0
  //
  // FIXME -- when available, this should use the idiom:
  //
  //   ballot() + lane_mask_less_than_or_equal + popcount()
  //
  // Supported by:
  //
  //   - Vulkan 1.1 / SPIR-V 1.3
  //   - CUDA
  //   - AVX2 (SSE*?)
  //
#else
  //
  // otherwise, emulate with an inclusive scan (yuk)
  //
  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);

  skc_uint const xk_idx = *xk + prefix - is_xk;

  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);

#if 0
  printf("< %3u >\n",xk_idx);
#endif

  return xk_idx;
#endif
}
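
//
// ILLUSTRATION ONLY -- a sketch of the ballot idiom mentioned in the
// FIXME above, assuming the (much later) cl_khr_subgroup_ballot
// extension -- or an equivalent SPIR-V / CUDA intrinsic -- were
// available on the target.  The helper name is hypothetical and is not
// used by this kernel.
//
#if 0
static
skc_uint
skc_ballot_sketch(skc_uint * const xk, skc_uint const is_xk)
{
  uint4    const ballot = sub_group_ballot(is_xk);                              // one bit per lane
  skc_uint const xk_idx = *xk + sub_group_ballot_exclusive_bit_count(ballot);   // set lanes below this lane

  *xk += sub_group_ballot_bit_count(ballot);                                    // total set lanes in the subgroup

  return xk_idx;
}
#endif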

//
//
//
__kernel
SKC_PLACE_KERNEL_ATTRIBS
void
skc_kernel_place(__global skc_bp_elem_t             * const bp_elems,
                 __global SKC_ATOMIC_UINT  volatile * const place_atomics,
                 __global skc_ttck_t                * const ck_extent,
                 __global union skc_cmd_place const * const cmds,
                 __global skc_block_id_t            * const map,
                 skc_uint4                            const clip,
                 skc_uint                             const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  __local union skc_subgroup_smem volatile         smem[1];
#else
  __local union skc_subgroup_smem volatile         smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
  __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // This is a subgroup-centric kernel
  //
  // Which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform, but the alternative calculation used
  // when there are multiple subgroups per workgroup is not cooperating
  // and is driving spillage elsewhere.
  //
  // Test the raster's translated bounds against the composition's
  // tile clip
  //
  // There are 3 cases:
  //
  //   - the raster is completely clipped -> return
  //   - the raster is partially clipped  -> all keys must be clipped
  //   - the raster is not clipped        -> no keys are tested
  //
  //   (an illustrative sketch follows this comment block)
  //
  // There are at least 4 implementations of place and we want to
  // special-case them as much as possible so that, at the least, the
  // fastpath remains fast.
  //
  //   - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
  //
  //   - implement CLIPPED + NO TRANSLATION path
  //
  //   - implement NO CLIP + TRANSLATION path
  //
  //   - implement CLIPPED + TRANSLATION path
  //
  //
  // FIXME/OPTIMIZATION: split the scan accumulator into a triple-bin
  // 12:12:8 integer where:
  //
  //   12: ttsk
  //   12: ttpk
  //    8: /dev/null -- clipped or invalid key
  //
  // Three kinds of nodes in a raster's list:
  //
  //   - the head node
  //   - an internal node
  //   - the final node
  //
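
  //
  // ILLUSTRATION ONLY -- a hypothetical sketch of the 3-way
  // classification described above, assuming the translated raster
  // tile bounds were available as (rx0,ry0)-(rx1,ry1) and that the
  // "clip" argument packs the composition's tile clip as (x0,y0,x1,y1).
  // None of these names exist in the current code:
  //
  //   if (rx1 < clip.s0 || ry1 < clip.s1 || rx0 > clip.s2 || ry0 > clip.s3)
  //     return;               // completely clipped -- nothing to place
  //   else if (rx0 >= clip.s0 && ry0 >= clip.s1 && rx1 <= clip.s2 && ry1 <= clip.s3)
  //     ;                     // not clipped -- take the fastpath, test no keys
  //   else
  //     ;                     // partially clipped -- clip every key
  //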

#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const cmd_idx = get_group_id(0);
#else
  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  // load command
  union skc_cmd_place const cmd = cmds[cmd_idx];

  // get the raster header from the raster host id -- scalar
  skc_block_id_t id = map[cmd.raster_h];

  //
  // load all of the head block ttxk keys into registers
  //
  // FIXME -- this pattern lends itself to using the higher
  // performance Intel GEN block load instructions
  //
  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
  union skc_raster_node_elem const h##I = {                     \
    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],    \
               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] }   \
  };

  SKC_PLACE_EXPAND();
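
#if 0
  //
  // ILLUSTRATION ONLY -- with SKC_PLACE_X == 2 the expansion above is
  // roughly equivalent to declaring one register pair per row
  // (assuming SKC_EXPAND_2() invokes SKC_EXPAND_X for I = 0 and 1):
  //
  union skc_raster_node_elem const h0 = {
    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(0)],
               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(0)] }
  };
  union skc_raster_node_elem const h1 = {
    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(1)],
               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(1)] }
  };
#endif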

  //
  // load raster header counts -- we only need the "nodes" and "keys"
  // words but the keys we loaded are doublewords.
  //
  // FIXME -- this can be made portable with compile-time macro expansion
  //
  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS

  //
  //
  //
#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                    \
  printf("%5u : %6u : %3u : %08X . %08X - %08X\n",                 \
         nodes,keys,                                               \
         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),     \
         h##I.u32v2.hi,h##I.u32v2.lo,                              \
         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

  SKC_PLACE_EXPAND();
#endif

  //
#if 0
  if (get_sub_group_local_id() == 0) {
    printf("place: %u / %u / %u\n",head_id,nodes,keys);
  }
#endif

  {
    //
    // classify every key in the header
    //
    // keys: 0 is not a key / 1 is a key
    // skpk: 0 is sk / 1 is pk
    //
    skc_uint bits_keys = 0;
    skc_uint bits_skpk = 0;

    //
    // calculate bits_keys
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
      if (idx < keys) {                                                 \
        bits_keys |= (1u << I);                                         \
      }                                                                 \
      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                               \
        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                        \
          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {    \
            bits_keys &= ~(1u << I);                                    \
          }                                                             \
        }                                                               \
      }                                                                 \
    }

    SKC_PLACE_EXPAND();

    //
    // blindly calculate bits_skpk
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
    }

    SKC_PLACE_EXPAND();
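
    //
    // ILLUSTRATION ONLY: the TTXK prefix bit of this lane's key in row
    // I sits at bit SKC_TTXK_LO_OFFSET_PREFIX of the lo word, so
    // masking it and shifting right by (SKC_TTXK_LO_OFFSET_PREFIX - I)
    // deposits it at bit I of bits_skpk.  For example, if only this
    // lane's row-1 key is a TTPK (prefix == 1) then bits_skpk == 0b10.
    //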

#if 0
    printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

    //
    // the next pointer is the last element of the last row.  save it
    // now because this might be recognized as a subgroup-uniform/scalar.
    //
    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

    //
    // append SK keys first
    //
    skc_uint const bits_sk = bits_keys & ~bits_skpk;
    skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                      \
      skc_uint is_sk  = (bits_sk >> I) & 1;                     \
      skc_uint sk_idx = skc_ballot(&sk,is_sk);                  \
      if (is_sk) {                                              \
        smem->lo.sk[sk_idx] = h##I.xk.lo;                       \
        smem->hi.sk[sk_idx] = h##I.xk.hi;                       \
      }                                                         \
    }

    SKC_PLACE_EXPAND();

    //
    // append PK keys next
    //
    skc_uint const bits_pk = bits_keys & bits_skpk;
    skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                      \
      skc_uint is_pk  = (bits_pk >> I) & 1;                     \
      skc_uint pk_idx = skc_ballot(&pk,is_pk);                  \
      if (is_pk) {                                              \
        smem->lo.pk[pk_idx] = h##I.xk.lo;                       \
        smem->hi.pk[pk_idx] = h##I.xk.hi;                       \
      }                                                         \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2u * %2u\n",sk,pk);
#endif
    //
    // flush the keys
    //
    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
  }

  //
  // we're done if there was only a head node
  //
  if (nodes == 0)
    return;

  //
  // decrement keys
  //
  keys -= SKC_RASTER_HEAD_COUNT_KEYS;

  //
  // otherwise, append the keys in the trailing nodes to smem
  //
  while (true)
    {
      //
      // load all of the node block ttxk keys into registers
      //
      // FIXME -- this pattern lends itself to using the higher
      // performance Intel GEN block load instructions
      //
      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                     \
      union skc_raster_node_elem const n##I = {                     \
        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],    \
                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] }   \
      };

      SKC_PLACE_EXPAND();

#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                     \
      printf("%5u : %6u : %3u : %08X . %08X - %08X\n",              \
             nodes,keys,                                            \
             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),  \
             n##I.u32v2.hi,n##I.u32v2.lo,                           \
             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

      SKC_PLACE_EXPAND();
#endif

      //
      // classify every key in the node
      //
      // keys: 0 is not a key / 1 is a key
      // skpk: 0 is sk / 1 is pk
      //
      skc_uint bits_keys = 0;
      skc_uint bits_skpk = 0;

      //
      // calculate bits_keys
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
      if (idx < keys) {                                                 \
        bits_keys |= (1u << I);                                         \
      }                                                                 \
      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                               \
        if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                        \
          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {    \
            bits_keys &= ~(1u << I);                                    \
          }                                                             \
        }                                                               \
      }                                                                 \
    }

      SKC_PLACE_EXPAND();

      //
      // blindly calculate bits_skpk
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
      bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
    }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

      //
      // the next pointer is the last element of the last row.  save it
      // now because this might be recognized as a subgroup-uniform/scalar.
      //
      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

      //
      // append SK keys first
      //
      skc_uint const bits_sk = bits_keys & ~bits_skpk;
      skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                               \
      skc_uint is_sk  = (bits_sk >> I) & 1;                     \
      skc_uint sk_idx = skc_ballot(&sk,is_sk);                  \
      if (is_sk) {                                              \
        smem->lo.sk[sk_idx] = n##I.xk.lo;                       \
        smem->hi.sk[sk_idx] = n##I.xk.hi;                       \
      }                                                         \
    }

      SKC_PLACE_EXPAND();

      //
      // append PK keys next
      //
      skc_uint const bits_pk = bits_keys & bits_skpk;
      skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                               \
      skc_uint is_pk  = (bits_pk >> I) & 1;                     \
      skc_uint pk_idx = skc_ballot(&pk,is_pk);                  \
      if (is_pk) {                                              \
        smem->lo.pk[pk_idx] = n##I.xk.lo;                       \
        smem->hi.pk[pk_idx] = n##I.xk.hi;                       \
      }                                                         \
    }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2u * %2u\n",sk,pk);
#endif
      //
      // if the total for either the sk or pk queue reaches the
      // highwater mark then flush it to the extent
      //
      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);

      //
      // if this was the last node then we're done
      //
      if (--nodes == 0)
        return;

      //
      // otherwise decrement keys
      //
      keys -= SKC_RASTER_NODE_COUNT_KEYS;
    }
}

//
//
//