/*
 * hs_kernel_bs_0
 *
 * Each work-item reads 16 ulong keys from `vin`, runs a fixed sequence of
 * compare-exchange steps on them (both within its own registers and against
 * other lanes of its subgroup), and writes the 16 results to the same
 * indices of `vout`.
 *
 * Addressing: a subgroup of 8 lanes owns a slab of 8 * 16 consecutive keys.
 * The slab starts at (get_global_id(0) rounded down to a multiple of 8) * 16;
 * lane L touches elements  base + L + 8 * k  for k = 0..15, held in r1..r16.
 *
 * NOTE(review): the comparator sequence looks like a 16-input sorting
 * network followed by bitonic-style flip/half merges across the 8 lanes
 * (a generated "block sort" kernel); the resulting key order within the
 * slab is not stated here - confirm against the generator.
 *
 * The helper macros below are private to this kernel and are #undef'd after
 * it.  The stage macros intentionally refer to the kernel's locals r1..r16.
 */

/* Lanes per subgroup (also the required work-group size for this kernel). */
#define HS_BS0_LANES (1 << 3)
#define HS_BS0_LANE_MASK (HS_BS0_LANES - 1)

/* Keys held by each lane. */
#define HS_BS0_ROWS 16

/* Global index of this lane's k-th key; rows are HS_BS0_LANES apart. */
#define HS_BS0_ROW(k) (slab_idx + HS_BS0_LANES * (k))

/*
 * In-register compare-exchange: afterwards `lo` holds the smaller key and
 * `hi` the larger one.  Equal keys are swapped as well, which leaves the
 * values unchanged.
 */
#define HS_BS0_CMP_XCHG(lo, hi)                                               \
  do {                                                                        \
    if ((lo) >= (hi)) {                                                       \
      ulong const swap_tmp = (lo);                                            \
      (lo) = (hi);                                                            \
      (hi) = swap_tmp;                                                        \
    }                                                                         \
  } while (0)

/*
 * Cross-lane exchange of a mirrored register pair.  `a` is compared with the
 * partner lane's `b`, and `b` with the partner lane's `a`.  The lane with
 * the lower id (is_low != 0) keeps the smaller key of each comparison, the
 * other lane keeps the larger one.  Both shuffles read the registers before
 * either one is overwritten.
 */
#define HS_BS0_FLIP_PAIR(a, b, partner, is_low)                               \
  do {                                                                        \
    ulong const from_a = intel_sub_group_shuffle(a, partner);                 \
    ulong const from_b = intel_sub_group_shuffle(b, partner);                 \
    a = (((a) <= from_b) ^ (is_low)) ? from_b : (a);                          \
    b = (((b) <= from_a) ^ (is_low)) ? from_a : (b);                          \
  } while (0)

/* Flip stage: pair every lane with lane (id ^ xor_mask), registers mirrored
 * as (r1,r16), (r2,r15), ..., (r8,r9). */
#define HS_BS0_FLIP(xor_mask)                                                 \
  do {                                                                        \
    uint const partner = get_sub_group_local_id() ^ (xor_mask);               \
    int const is_low = get_sub_group_local_id() < partner;                    \
    HS_BS0_FLIP_PAIR(r1, r16, partner, is_low);                               \
    HS_BS0_FLIP_PAIR(r2, r15, partner, is_low);                               \
    HS_BS0_FLIP_PAIR(r3, r14, partner, is_low);                               \
    HS_BS0_FLIP_PAIR(r4, r13, partner, is_low);                               \
    HS_BS0_FLIP_PAIR(r5, r12, partner, is_low);                               \
    HS_BS0_FLIP_PAIR(r6, r11, partner, is_low);                               \
    HS_BS0_FLIP_PAIR(r7, r10, partner, is_low);                               \
    HS_BS0_FLIP_PAIR(r8, r9, partner, is_low);                                \
  } while (0)

/*
 * Cross-lane exchange of one register with the same register of the partner
 * lane: the lower-id lane keeps the smaller key, the other the larger one.
 */
#define HS_BS0_HALF_ONE(r, partner, is_low)                                   \
  do {                                                                        \
    ulong const from_partner = intel_sub_group_shuffle(r, partner);           \
    r = (((r) <= from_partner) ^ (is_low)) ? from_partner : (r);              \
  } while (0)

/* Half stage: pair every lane with lane (id ^ xor_mask), register by
 * register, r1 through r16. */
#define HS_BS0_HALF(xor_mask)                                                 \
  do {                                                                        \
    uint const partner = get_sub_group_local_id() ^ (xor_mask);               \
    int const is_low = get_sub_group_local_id() < partner;                    \
    HS_BS0_HALF_ONE(r1, partner, is_low);                                     \
    HS_BS0_HALF_ONE(r2, partner, is_low);                                     \
    HS_BS0_HALF_ONE(r3, partner, is_low);                                     \
    HS_BS0_HALF_ONE(r4, partner, is_low);                                     \
    HS_BS0_HALF_ONE(r5, partner, is_low);                                     \
    HS_BS0_HALF_ONE(r6, partner, is_low);                                     \
    HS_BS0_HALF_ONE(r7, partner, is_low);                                     \
    HS_BS0_HALF_ONE(r8, partner, is_low);                                     \
    HS_BS0_HALF_ONE(r9, partner, is_low);                                     \
    HS_BS0_HALF_ONE(r10, partner, is_low);                                    \
    HS_BS0_HALF_ONE(r11, partner, is_low);                                    \
    HS_BS0_HALF_ONE(r12, partner, is_low);                                    \
    HS_BS0_HALF_ONE(r13, partner, is_low);                                    \
    HS_BS0_HALF_ONE(r14, partner, is_low);                                    \
    HS_BS0_HALF_ONE(r15, partner, is_low);                                    \
    HS_BS0_HALF_ONE(r16, partner, is_low);                                    \
  } while (0)

/*
 * In-register merge that follows every cross-lane stage: the odd registers
 * (r1, r3, ..., r15) and the even registers (r2, r4, ..., r16) each go
 * through compare-exchanges at distance 8, 4 and 2 registers apart, and
 * finally each adjacent odd/even pair is compare-exchanged.
 */
#define HS_BS0_MERGE_REGS()                                                   \
  do {                                                                        \
    HS_BS0_CMP_XCHG(r1, r9);                                                  \
    HS_BS0_CMP_XCHG(r5, r13);                                                 \
    HS_BS0_CMP_XCHG(r1, r5);                                                  \
    HS_BS0_CMP_XCHG(r9, r13);                                                 \
    HS_BS0_CMP_XCHG(r3, r11);                                                 \
    HS_BS0_CMP_XCHG(r7, r15);                                                 \
    HS_BS0_CMP_XCHG(r3, r7);                                                  \
    HS_BS0_CMP_XCHG(r11, r15);                                                \
    HS_BS0_CMP_XCHG(r1, r3);                                                  \
    HS_BS0_CMP_XCHG(r5, r7);                                                  \
    HS_BS0_CMP_XCHG(r9, r11);                                                 \
    HS_BS0_CMP_XCHG(r13, r15);                                                \
    HS_BS0_CMP_XCHG(r2, r10);                                                 \
    HS_BS0_CMP_XCHG(r6, r14);                                                 \
    HS_BS0_CMP_XCHG(r2, r6);                                                  \
    HS_BS0_CMP_XCHG(r10, r14);                                                \
    HS_BS0_CMP_XCHG(r4, r12);                                                 \
    HS_BS0_CMP_XCHG(r8, r16);                                                 \
    HS_BS0_CMP_XCHG(r4, r8);                                                  \
    HS_BS0_CMP_XCHG(r12, r16);                                                \
    HS_BS0_CMP_XCHG(r2, r4);                                                  \
    HS_BS0_CMP_XCHG(r6, r8);                                                  \
    HS_BS0_CMP_XCHG(r10, r12);                                                \
    HS_BS0_CMP_XCHG(r14, r16);                                                \
    HS_BS0_CMP_XCHG(r1, r2);                                                  \
    HS_BS0_CMP_XCHG(r3, r4);                                                  \
    HS_BS0_CMP_XCHG(r5, r6);                                                  \
    HS_BS0_CMP_XCHG(r7, r8);                                                  \
    HS_BS0_CMP_XCHG(r9, r10);                                                 \
    HS_BS0_CMP_XCHG(r11, r12);                                                \
    HS_BS0_CMP_XCHG(r13, r14);                                                \
    HS_BS0_CMP_XCHG(r15, r16);                                                \
  } while (0)

__kernel __attribute__((intel_reqd_sub_group_size(HS_BS0_LANES)))
__attribute__((reqd_work_group_size(HS_BS0_LANES * 1, 1, 1))) void
hs_kernel_bs_0(__global ulong const* const restrict vin,
               __global ulong* const restrict vout)
{
  /* First element of this lane's column inside the subgroup's slab. */
  uint const slab_idx =
    (get_global_id(0) & ~HS_BS0_LANE_MASK) * HS_BS0_ROWS +
    (get_local_id(0) & HS_BS0_LANE_MASK);

  /* Load the lane's 16 keys, one per slab row. */
  ulong r1 = vin[HS_BS0_ROW(0)];
  ulong r2 = vin[HS_BS0_ROW(1)];
  ulong r3 = vin[HS_BS0_ROW(2)];
  ulong r4 = vin[HS_BS0_ROW(3)];
  ulong r5 = vin[HS_BS0_ROW(4)];
  ulong r6 = vin[HS_BS0_ROW(5)];
  ulong r7 = vin[HS_BS0_ROW(6)];
  ulong r8 = vin[HS_BS0_ROW(7)];
  ulong r9 = vin[HS_BS0_ROW(8)];
  ulong r10 = vin[HS_BS0_ROW(9)];
  ulong r11 = vin[HS_BS0_ROW(10)];
  ulong r12 = vin[HS_BS0_ROW(11)];
  ulong r13 = vin[HS_BS0_ROW(12)];
  ulong r14 = vin[HS_BS0_ROW(13)];
  ulong r15 = vin[HS_BS0_ROW(14)];
  ulong r16 = vin[HS_BS0_ROW(15)];

  /*
   * In-register network: 60 compare-exchanges over r1..r16, listed in
   * execution order (left to right, top to bottom).
   */
  /* neighbours, distance 1 */
  HS_BS0_CMP_XCHG(r1, r2);
  HS_BS0_CMP_XCHG(r3, r4);
  HS_BS0_CMP_XCHG(r5, r6);
  HS_BS0_CMP_XCHG(r7, r8);
  HS_BS0_CMP_XCHG(r9, r10);
  HS_BS0_CMP_XCHG(r11, r12);
  HS_BS0_CMP_XCHG(r13, r14);
  HS_BS0_CMP_XCHG(r15, r16);
  /* distance 2 */
  HS_BS0_CMP_XCHG(r1, r3);
  HS_BS0_CMP_XCHG(r5, r7);
  HS_BS0_CMP_XCHG(r9, r11);
  HS_BS0_CMP_XCHG(r13, r15);
  HS_BS0_CMP_XCHG(r2, r4);
  HS_BS0_CMP_XCHG(r6, r8);
  HS_BS0_CMP_XCHG(r10, r12);
  HS_BS0_CMP_XCHG(r14, r16);
  /* distance 4 */
  HS_BS0_CMP_XCHG(r1, r5);
  HS_BS0_CMP_XCHG(r9, r13);
  HS_BS0_CMP_XCHG(r2, r6);
  HS_BS0_CMP_XCHG(r10, r14);
  HS_BS0_CMP_XCHG(r3, r7);
  HS_BS0_CMP_XCHG(r11, r15);
  HS_BS0_CMP_XCHG(r4, r8);
  HS_BS0_CMP_XCHG(r12, r16);
  /* distance 8 */
  HS_BS0_CMP_XCHG(r1, r9);
  HS_BS0_CMP_XCHG(r2, r10);
  HS_BS0_CMP_XCHG(r3, r11);
  HS_BS0_CMP_XCHG(r4, r12);
  HS_BS0_CMP_XCHG(r5, r13);
  HS_BS0_CMP_XCHG(r6, r14);
  HS_BS0_CMP_XCHG(r7, r15);
  HS_BS0_CMP_XCHG(r8, r16);
  /* irregular tail of the network */
  HS_BS0_CMP_XCHG(r6, r11);
  HS_BS0_CMP_XCHG(r7, r10);
  HS_BS0_CMP_XCHG(r4, r13);
  HS_BS0_CMP_XCHG(r14, r15);
  HS_BS0_CMP_XCHG(r8, r12);
  HS_BS0_CMP_XCHG(r2, r3);
  HS_BS0_CMP_XCHG(r5, r9);
  HS_BS0_CMP_XCHG(r2, r5);
  HS_BS0_CMP_XCHG(r8, r14);
  HS_BS0_CMP_XCHG(r3, r9);
  HS_BS0_CMP_XCHG(r12, r15);
  HS_BS0_CMP_XCHG(r3, r5);
  HS_BS0_CMP_XCHG(r6, r7);
  HS_BS0_CMP_XCHG(r10, r11);
  HS_BS0_CMP_XCHG(r12, r14);
  HS_BS0_CMP_XCHG(r4, r9);
  HS_BS0_CMP_XCHG(r8, r13);
  HS_BS0_CMP_XCHG(r7, r9);
  HS_BS0_CMP_XCHG(r11, r13);
  HS_BS0_CMP_XCHG(r4, r6);
  HS_BS0_CMP_XCHG(r8, r10);
  HS_BS0_CMP_XCHG(r4, r5);
  HS_BS0_CMP_XCHG(r6, r7);
  HS_BS0_CMP_XCHG(r8, r9);
  HS_BS0_CMP_XCHG(r10, r11);
  HS_BS0_CMP_XCHG(r12, r13);
  HS_BS0_CMP_XCHG(r7, r8);
  HS_BS0_CMP_XCHG(r9, r10);

  /* Cross-lane round 1: partner = lane ^ 1. */
  HS_BS0_FLIP(1);
  HS_BS0_MERGE_REGS();

  /* Cross-lane round 2: flip with lane ^ 3, then half with lane ^ 1. */
  HS_BS0_FLIP(3);
  HS_BS0_HALF(1);
  HS_BS0_MERGE_REGS();

  /* Cross-lane round 3: flip with lane ^ 7, then halves with ^ 2 and ^ 1. */
  HS_BS0_FLIP(7);
  HS_BS0_HALF(2);
  HS_BS0_HALF(1);
  HS_BS0_MERGE_REGS();

  /* Store the 16 keys back to the same slab positions in the output. */
  vout[HS_BS0_ROW(0)] = r1;
  vout[HS_BS0_ROW(1)] = r2;
  vout[HS_BS0_ROW(2)] = r3;
  vout[HS_BS0_ROW(3)] = r4;
  vout[HS_BS0_ROW(4)] = r5;
  vout[HS_BS0_ROW(5)] = r6;
  vout[HS_BS0_ROW(6)] = r7;
  vout[HS_BS0_ROW(7)] = r8;
  vout[HS_BS0_ROW(8)] = r9;
  vout[HS_BS0_ROW(9)] = r10;
  vout[HS_BS0_ROW(10)] = r11;
  vout[HS_BS0_ROW(11)] = r12;
  vout[HS_BS0_ROW(12)] = r13;
  vout[HS_BS0_ROW(13)] = r14;
  vout[HS_BS0_ROW(14)] = r15;
  vout[HS_BS0_ROW(15)] = r16;
}

#undef HS_BS0_MERGE_REGS
#undef HS_BS0_HALF
#undef HS_BS0_HALF_ONE
#undef HS_BS0_FLIP
#undef HS_BS0_FLIP_PAIR
#undef HS_BS0_CMP_XCHG
#undef HS_BS0_ROW
#undef HS_BS0_ROWS
#undef HS_BS0_LANE_MASK
#undef HS_BS0_LANES

1189 1190__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) 1191__attribute__((reqd_work_group_size((1 << 3) * 2, 1, 1))) void 1192hs_kernel_bs_1(__global ulong const* const restrict vin, 1193 __global ulong* const restrict vout) 1194{ 1195 __local struct 1196 { 1197 ulong m[16 * 16]; 1198 } shared; 1199 1200 uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 + 1201 (get_local_id(0) & ((1 << 3) - 1)); 1202 ulong r1 = vin[gmem_idx + (1 << 3) * 0]; 1203 ulong r2 = vin[gmem_idx + (1 << 3) * 1]; 1204 ulong r3 = vin[gmem_idx + (1 << 3) * 2]; 1205 ulong r4 = vin[gmem_idx + (1 << 3) * 3]; 1206 
ulong r5 = vin[gmem_idx + (1 << 3) * 4]; 1207 ulong r6 = vin[gmem_idx + (1 << 3) * 5]; 1208 ulong r7 = vin[gmem_idx + (1 << 3) * 6]; 1209 ulong r8 = vin[gmem_idx + (1 << 3) * 7]; 1210 ulong r9 = vin[gmem_idx + (1 << 3) * 8]; 1211 ulong r10 = vin[gmem_idx + (1 << 3) * 9]; 1212 ulong r11 = vin[gmem_idx + (1 << 3) * 10]; 1213 ulong r12 = vin[gmem_idx + (1 << 3) * 11]; 1214 ulong r13 = vin[gmem_idx + (1 << 3) * 12]; 1215 ulong r14 = vin[gmem_idx + (1 << 3) * 13]; 1216 ulong r15 = vin[gmem_idx + (1 << 3) * 14]; 1217 ulong r16 = vin[gmem_idx + (1 << 3) * 15]; 1218 if (r1 >= r2) { 1219 ulong const t = r1; 1220 r1 = r2; 1221 r2 = t; 1222 }; 1223 if (r3 >= r4) { 1224 ulong const t = r3; 1225 r3 = r4; 1226 r4 = t; 1227 }; 1228 if (r5 >= r6) { 1229 ulong const t = r5; 1230 r5 = r6; 1231 r6 = t; 1232 }; 1233 if (r7 >= r8) { 1234 ulong const t = r7; 1235 r7 = r8; 1236 r8 = t; 1237 }; 1238 if (r9 >= r10) { 1239 ulong const t = r9; 1240 r9 = r10; 1241 r10 = t; 1242 }; 1243 if (r11 >= r12) { 1244 ulong const t = r11; 1245 r11 = r12; 1246 r12 = t; 1247 }; 1248 if (r13 >= r14) { 1249 ulong const t = r13; 1250 r13 = r14; 1251 r14 = t; 1252 }; 1253 if (r15 >= r16) { 1254 ulong const t = r15; 1255 r15 = r16; 1256 r16 = t; 1257 }; 1258 if (r1 >= r3) { 1259 ulong const t = r1; 1260 r1 = r3; 1261 r3 = t; 1262 }; 1263 if (r5 >= r7) { 1264 ulong const t = r5; 1265 r5 = r7; 1266 r7 = t; 1267 }; 1268 if (r9 >= r11) { 1269 ulong const t = r9; 1270 r9 = r11; 1271 r11 = t; 1272 }; 1273 if (r13 >= r15) { 1274 ulong const t = r13; 1275 r13 = r15; 1276 r15 = t; 1277 }; 1278 if (r2 >= r4) { 1279 ulong const t = r2; 1280 r2 = r4; 1281 r4 = t; 1282 }; 1283 if (r6 >= r8) { 1284 ulong const t = r6; 1285 r6 = r8; 1286 r8 = t; 1287 }; 1288 if (r10 >= r12) { 1289 ulong const t = r10; 1290 r10 = r12; 1291 r12 = t; 1292 }; 1293 if (r14 >= r16) { 1294 ulong const t = r14; 1295 r14 = r16; 1296 r16 = t; 1297 }; 1298 if (r1 >= r5) { 1299 ulong const t = r1; 1300 r1 = r5; 1301 r5 = t; 1302 }; 1303 if (r9 >= r13) 
{ 1304 ulong const t = r9; 1305 r9 = r13; 1306 r13 = t; 1307 }; 1308 if (r2 >= r6) { 1309 ulong const t = r2; 1310 r2 = r6; 1311 r6 = t; 1312 }; 1313 if (r10 >= r14) { 1314 ulong const t = r10; 1315 r10 = r14; 1316 r14 = t; 1317 }; 1318 if (r3 >= r7) { 1319 ulong const t = r3; 1320 r3 = r7; 1321 r7 = t; 1322 }; 1323 if (r11 >= r15) { 1324 ulong const t = r11; 1325 r11 = r15; 1326 r15 = t; 1327 }; 1328 if (r4 >= r8) { 1329 ulong const t = r4; 1330 r4 = r8; 1331 r8 = t; 1332 }; 1333 if (r12 >= r16) { 1334 ulong const t = r12; 1335 r12 = r16; 1336 r16 = t; 1337 }; 1338 if (r1 >= r9) { 1339 ulong const t = r1; 1340 r1 = r9; 1341 r9 = t; 1342 }; 1343 if (r2 >= r10) { 1344 ulong const t = r2; 1345 r2 = r10; 1346 r10 = t; 1347 }; 1348 if (r3 >= r11) { 1349 ulong const t = r3; 1350 r3 = r11; 1351 r11 = t; 1352 }; 1353 if (r4 >= r12) { 1354 ulong const t = r4; 1355 r4 = r12; 1356 r12 = t; 1357 }; 1358 if (r5 >= r13) { 1359 ulong const t = r5; 1360 r5 = r13; 1361 r13 = t; 1362 }; 1363 if (r6 >= r14) { 1364 ulong const t = r6; 1365 r6 = r14; 1366 r14 = t; 1367 }; 1368 if (r7 >= r15) { 1369 ulong const t = r7; 1370 r7 = r15; 1371 r15 = t; 1372 }; 1373 if (r8 >= r16) { 1374 ulong const t = r8; 1375 r8 = r16; 1376 r16 = t; 1377 }; 1378 if (r6 >= r11) { 1379 ulong const t = r6; 1380 r6 = r11; 1381 r11 = t; 1382 }; 1383 if (r7 >= r10) { 1384 ulong const t = r7; 1385 r7 = r10; 1386 r10 = t; 1387 }; 1388 if (r4 >= r13) { 1389 ulong const t = r4; 1390 r4 = r13; 1391 r13 = t; 1392 }; 1393 if (r14 >= r15) { 1394 ulong const t = r14; 1395 r14 = r15; 1396 r15 = t; 1397 }; 1398 if (r8 >= r12) { 1399 ulong const t = r8; 1400 r8 = r12; 1401 r12 = t; 1402 }; 1403 if (r2 >= r3) { 1404 ulong const t = r2; 1405 r2 = r3; 1406 r3 = t; 1407 }; 1408 if (r5 >= r9) { 1409 ulong const t = r5; 1410 r5 = r9; 1411 r9 = t; 1412 }; 1413 if (r2 >= r5) { 1414 ulong const t = r2; 1415 r2 = r5; 1416 r5 = t; 1417 }; 1418 if (r8 >= r14) { 1419 ulong const t = r8; 1420 r8 = r14; 1421 r14 = t; 1422 }; 1423 if (r3 
>= r9) { 1424 ulong const t = r3; 1425 r3 = r9; 1426 r9 = t; 1427 }; 1428 if (r12 >= r15) { 1429 ulong const t = r12; 1430 r12 = r15; 1431 r15 = t; 1432 }; 1433 if (r3 >= r5) { 1434 ulong const t = r3; 1435 r3 = r5; 1436 r5 = t; 1437 }; 1438 if (r6 >= r7) { 1439 ulong const t = r6; 1440 r6 = r7; 1441 r7 = t; 1442 }; 1443 if (r10 >= r11) { 1444 ulong const t = r10; 1445 r10 = r11; 1446 r11 = t; 1447 }; 1448 if (r12 >= r14) { 1449 ulong const t = r12; 1450 r12 = r14; 1451 r14 = t; 1452 }; 1453 if (r4 >= r9) { 1454 ulong const t = r4; 1455 r4 = r9; 1456 r9 = t; 1457 }; 1458 if (r8 >= r13) { 1459 ulong const t = r8; 1460 r8 = r13; 1461 r13 = t; 1462 }; 1463 if (r7 >= r9) { 1464 ulong const t = r7; 1465 r7 = r9; 1466 r9 = t; 1467 }; 1468 if (r11 >= r13) { 1469 ulong const t = r11; 1470 r11 = r13; 1471 r13 = t; 1472 }; 1473 if (r4 >= r6) { 1474 ulong const t = r4; 1475 r4 = r6; 1476 r6 = t; 1477 }; 1478 if (r8 >= r10) { 1479 ulong const t = r8; 1480 r8 = r10; 1481 r10 = t; 1482 }; 1483 if (r4 >= r5) { 1484 ulong const t = r4; 1485 r4 = r5; 1486 r5 = t; 1487 }; 1488 if (r6 >= r7) { 1489 ulong const t = r6; 1490 r6 = r7; 1491 r7 = t; 1492 }; 1493 if (r8 >= r9) { 1494 ulong const t = r8; 1495 r8 = r9; 1496 r9 = t; 1497 }; 1498 if (r10 >= r11) { 1499 ulong const t = r10; 1500 r10 = r11; 1501 r11 = t; 1502 }; 1503 if (r12 >= r13) { 1504 ulong const t = r12; 1505 r12 = r13; 1506 r13 = t; 1507 }; 1508 if (r7 >= r8) { 1509 ulong const t = r7; 1510 r7 = r8; 1511 r8 = t; 1512 }; 1513 if (r9 >= r10) { 1514 ulong const t = r9; 1515 r9 = r10; 1516 r10 = t; 1517 }; 1518 { 1519 uint const flip_lane_idx = get_sub_group_local_id() ^ 1; 1520 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 1521 ; 1522 { 1523 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 1524 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 1525 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 1526 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 1527 }; 1528 { 1529 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 1530 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 1531 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 1532 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 1533 }; 1534 { 1535 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 1536 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 1537 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 1538 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 1539 }; 1540 { 1541 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 1542 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 1543 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 1544 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 1545 }; 1546 { 1547 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 1548 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 1549 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 1550 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 1551 }; 1552 { 1553 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 1554 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 1555 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 1556 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 1557 }; 1558 { 1559 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 1560 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 1561 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 1562 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 1563 }; 1564 { 1565 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 1566 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 1567 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 1568 r9 = ((r9 <= ta) ^ t_lt) ? 
ta : r9; 1569 }; 1570 } 1571 if (r1 >= r9) { 1572 ulong const t = r1; 1573 r1 = r9; 1574 r9 = t; 1575 }; 1576 if (r5 >= r13) { 1577 ulong const t = r5; 1578 r5 = r13; 1579 r13 = t; 1580 }; 1581 if (r1 >= r5) { 1582 ulong const t = r1; 1583 r1 = r5; 1584 r5 = t; 1585 }; 1586 if (r9 >= r13) { 1587 ulong const t = r9; 1588 r9 = r13; 1589 r13 = t; 1590 }; 1591 if (r3 >= r11) { 1592 ulong const t = r3; 1593 r3 = r11; 1594 r11 = t; 1595 }; 1596 if (r7 >= r15) { 1597 ulong const t = r7; 1598 r7 = r15; 1599 r15 = t; 1600 }; 1601 if (r3 >= r7) { 1602 ulong const t = r3; 1603 r3 = r7; 1604 r7 = t; 1605 }; 1606 if (r11 >= r15) { 1607 ulong const t = r11; 1608 r11 = r15; 1609 r15 = t; 1610 }; 1611 if (r1 >= r3) { 1612 ulong const t = r1; 1613 r1 = r3; 1614 r3 = t; 1615 }; 1616 if (r5 >= r7) { 1617 ulong const t = r5; 1618 r5 = r7; 1619 r7 = t; 1620 }; 1621 if (r9 >= r11) { 1622 ulong const t = r9; 1623 r9 = r11; 1624 r11 = t; 1625 }; 1626 if (r13 >= r15) { 1627 ulong const t = r13; 1628 r13 = r15; 1629 r15 = t; 1630 }; 1631 if (r2 >= r10) { 1632 ulong const t = r2; 1633 r2 = r10; 1634 r10 = t; 1635 }; 1636 if (r6 >= r14) { 1637 ulong const t = r6; 1638 r6 = r14; 1639 r14 = t; 1640 }; 1641 if (r2 >= r6) { 1642 ulong const t = r2; 1643 r2 = r6; 1644 r6 = t; 1645 }; 1646 if (r10 >= r14) { 1647 ulong const t = r10; 1648 r10 = r14; 1649 r14 = t; 1650 }; 1651 if (r4 >= r12) { 1652 ulong const t = r4; 1653 r4 = r12; 1654 r12 = t; 1655 }; 1656 if (r8 >= r16) { 1657 ulong const t = r8; 1658 r8 = r16; 1659 r16 = t; 1660 }; 1661 if (r4 >= r8) { 1662 ulong const t = r4; 1663 r4 = r8; 1664 r8 = t; 1665 }; 1666 if (r12 >= r16) { 1667 ulong const t = r12; 1668 r12 = r16; 1669 r16 = t; 1670 }; 1671 if (r2 >= r4) { 1672 ulong const t = r2; 1673 r2 = r4; 1674 r4 = t; 1675 }; 1676 if (r6 >= r8) { 1677 ulong const t = r6; 1678 r6 = r8; 1679 r8 = t; 1680 }; 1681 if (r10 >= r12) { 1682 ulong const t = r10; 1683 r10 = r12; 1684 r12 = t; 1685 }; 1686 if (r14 >= r16) { 1687 ulong const t = r14; 1688 
r14 = r16; 1689 r16 = t; 1690 }; 1691 if (r1 >= r2) { 1692 ulong const t = r1; 1693 r1 = r2; 1694 r2 = t; 1695 }; 1696 if (r3 >= r4) { 1697 ulong const t = r3; 1698 r3 = r4; 1699 r4 = t; 1700 }; 1701 if (r5 >= r6) { 1702 ulong const t = r5; 1703 r5 = r6; 1704 r6 = t; 1705 }; 1706 if (r7 >= r8) { 1707 ulong const t = r7; 1708 r7 = r8; 1709 r8 = t; 1710 }; 1711 if (r9 >= r10) { 1712 ulong const t = r9; 1713 r9 = r10; 1714 r10 = t; 1715 }; 1716 if (r11 >= r12) { 1717 ulong const t = r11; 1718 r11 = r12; 1719 r12 = t; 1720 }; 1721 if (r13 >= r14) { 1722 ulong const t = r13; 1723 r13 = r14; 1724 r14 = t; 1725 }; 1726 if (r15 >= r16) { 1727 ulong const t = r15; 1728 r15 = r16; 1729 r16 = t; 1730 }; 1731 { 1732 uint const flip_lane_idx = get_sub_group_local_id() ^ 3; 1733 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 1734 ; 1735 { 1736 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 1737 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 1738 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 1739 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 1740 }; 1741 { 1742 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 1743 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 1744 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 1745 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 1746 }; 1747 { 1748 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 1749 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 1750 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 1751 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 1752 }; 1753 { 1754 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 1755 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 1756 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 1757 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 1758 }; 1759 { 1760 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 1761 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 1762 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 1763 r12 = ((r12 <= ta) ^ t_lt) ? 
ta : r12; 1764 }; 1765 { 1766 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 1767 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 1768 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 1769 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 1770 }; 1771 { 1772 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 1773 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 1774 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 1775 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 1776 }; 1777 { 1778 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 1779 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 1780 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 1781 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 1782 }; 1783 } 1784 { 1785 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 1786 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1787 ; 1788 { 1789 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1790 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1791 }; 1792 { 1793 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1794 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1795 }; 1796 { 1797 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1798 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1799 }; 1800 { 1801 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1802 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1803 }; 1804 { 1805 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1806 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1807 }; 1808 { 1809 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1810 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1811 }; 1812 { 1813 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1814 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1815 }; 1816 { 1817 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1818 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1819 }; 1820 { 1821 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 1822 r9 = ((r9 <= ta) ^ t_lt) ? 
ta : r9; 1823 }; 1824 { 1825 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 1826 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 1827 }; 1828 { 1829 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 1830 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 1831 }; 1832 { 1833 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 1834 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 1835 }; 1836 { 1837 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 1838 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 1839 }; 1840 { 1841 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 1842 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 1843 }; 1844 { 1845 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 1846 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 1847 }; 1848 { 1849 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 1850 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 1851 }; 1852 } 1853 if (r1 >= r9) { 1854 ulong const t = r1; 1855 r1 = r9; 1856 r9 = t; 1857 }; 1858 if (r5 >= r13) { 1859 ulong const t = r5; 1860 r5 = r13; 1861 r13 = t; 1862 }; 1863 if (r1 >= r5) { 1864 ulong const t = r1; 1865 r1 = r5; 1866 r5 = t; 1867 }; 1868 if (r9 >= r13) { 1869 ulong const t = r9; 1870 r9 = r13; 1871 r13 = t; 1872 }; 1873 if (r3 >= r11) { 1874 ulong const t = r3; 1875 r3 = r11; 1876 r11 = t; 1877 }; 1878 if (r7 >= r15) { 1879 ulong const t = r7; 1880 r7 = r15; 1881 r15 = t; 1882 }; 1883 if (r3 >= r7) { 1884 ulong const t = r3; 1885 r3 = r7; 1886 r7 = t; 1887 }; 1888 if (r11 >= r15) { 1889 ulong const t = r11; 1890 r11 = r15; 1891 r15 = t; 1892 }; 1893 if (r1 >= r3) { 1894 ulong const t = r1; 1895 r1 = r3; 1896 r3 = t; 1897 }; 1898 if (r5 >= r7) { 1899 ulong const t = r5; 1900 r5 = r7; 1901 r7 = t; 1902 }; 1903 if (r9 >= r11) { 1904 ulong const t = r9; 1905 r9 = r11; 1906 r11 = t; 1907 }; 1908 if (r13 >= r15) { 1909 ulong const t = r13; 1910 r13 = r15; 1911 r15 = t; 1912 }; 1913 if (r2 >= r10) { 1914 ulong const t = r2; 1915 r2 = r10; 1916 r10 = t; 1917 }; 1918 if 
(r6 >= r14) { 1919 ulong const t = r6; 1920 r6 = r14; 1921 r14 = t; 1922 }; 1923 if (r2 >= r6) { 1924 ulong const t = r2; 1925 r2 = r6; 1926 r6 = t; 1927 }; 1928 if (r10 >= r14) { 1929 ulong const t = r10; 1930 r10 = r14; 1931 r14 = t; 1932 }; 1933 if (r4 >= r12) { 1934 ulong const t = r4; 1935 r4 = r12; 1936 r12 = t; 1937 }; 1938 if (r8 >= r16) { 1939 ulong const t = r8; 1940 r8 = r16; 1941 r16 = t; 1942 }; 1943 if (r4 >= r8) { 1944 ulong const t = r4; 1945 r4 = r8; 1946 r8 = t; 1947 }; 1948 if (r12 >= r16) { 1949 ulong const t = r12; 1950 r12 = r16; 1951 r16 = t; 1952 }; 1953 if (r2 >= r4) { 1954 ulong const t = r2; 1955 r2 = r4; 1956 r4 = t; 1957 }; 1958 if (r6 >= r8) { 1959 ulong const t = r6; 1960 r6 = r8; 1961 r8 = t; 1962 }; 1963 if (r10 >= r12) { 1964 ulong const t = r10; 1965 r10 = r12; 1966 r12 = t; 1967 }; 1968 if (r14 >= r16) { 1969 ulong const t = r14; 1970 r14 = r16; 1971 r16 = t; 1972 }; 1973 if (r1 >= r2) { 1974 ulong const t = r1; 1975 r1 = r2; 1976 r2 = t; 1977 }; 1978 if (r3 >= r4) { 1979 ulong const t = r3; 1980 r3 = r4; 1981 r4 = t; 1982 }; 1983 if (r5 >= r6) { 1984 ulong const t = r5; 1985 r5 = r6; 1986 r6 = t; 1987 }; 1988 if (r7 >= r8) { 1989 ulong const t = r7; 1990 r7 = r8; 1991 r8 = t; 1992 }; 1993 if (r9 >= r10) { 1994 ulong const t = r9; 1995 r9 = r10; 1996 r10 = t; 1997 }; 1998 if (r11 >= r12) { 1999 ulong const t = r11; 2000 r11 = r12; 2001 r12 = t; 2002 }; 2003 if (r13 >= r14) { 2004 ulong const t = r13; 2005 r13 = r14; 2006 r14 = t; 2007 }; 2008 if (r15 >= r16) { 2009 ulong const t = r15; 2010 r15 = r16; 2011 r16 = t; 2012 }; 2013 { 2014 uint const flip_lane_idx = get_sub_group_local_id() ^ 7; 2015 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 2016 ; 2017 { 2018 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 2019 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 2020 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 2021 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 2022 }; 2023 { 2024 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 2025 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 2026 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 2027 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 2028 }; 2029 { 2030 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 2031 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 2032 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 2033 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 2034 }; 2035 { 2036 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 2037 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 2038 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 2039 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 2040 }; 2041 { 2042 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 2043 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 2044 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 2045 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 2046 }; 2047 { 2048 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 2049 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 2050 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 2051 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 2052 }; 2053 { 2054 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 2055 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 2056 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 2057 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 2058 }; 2059 { 2060 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 2061 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 2062 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 2063 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 2064 }; 2065 } 2066 { 2067 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 2068 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2069 ; 2070 { 2071 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2072 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 2073 }; 2074 { 2075 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2076 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2077 }; 2078 { 2079 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2080 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2081 }; 2082 { 2083 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2084 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2085 }; 2086 { 2087 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2088 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2089 }; 2090 { 2091 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2092 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2093 }; 2094 { 2095 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2096 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2097 }; 2098 { 2099 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2100 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2101 }; 2102 { 2103 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 2104 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 2105 }; 2106 { 2107 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 2108 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 2109 }; 2110 { 2111 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 2112 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 2113 }; 2114 { 2115 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 2116 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 2117 }; 2118 { 2119 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 2120 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 2121 }; 2122 { 2123 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 2124 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 2125 }; 2126 { 2127 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 2128 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 2129 }; 2130 { 2131 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 2132 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 2133 }; 2134 } 2135 { 2136 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 2137 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2138 ; 2139 { 2140 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2141 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2142 }; 2143 { 2144 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2145 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2146 }; 2147 { 2148 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2149 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2150 }; 2151 { 2152 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2153 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2154 }; 2155 { 2156 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2157 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2158 }; 2159 { 2160 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2161 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2162 }; 2163 { 2164 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2165 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2166 }; 2167 { 2168 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2169 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2170 }; 2171 { 2172 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 2173 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 2174 }; 2175 { 2176 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 2177 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 2178 }; 2179 { 2180 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 2181 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 2182 }; 2183 { 2184 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 2185 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 2186 }; 2187 { 2188 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 2189 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 2190 }; 2191 { 2192 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 2193 r14 = ((r14 <= ta) ^ t_lt) ? 
ta : r14; 2194 }; 2195 { 2196 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 2197 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 2198 }; 2199 { 2200 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 2201 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 2202 }; 2203 } 2204 if (r1 >= r9) { 2205 ulong const t = r1; 2206 r1 = r9; 2207 r9 = t; 2208 }; 2209 if (r5 >= r13) { 2210 ulong const t = r5; 2211 r5 = r13; 2212 r13 = t; 2213 }; 2214 if (r1 >= r5) { 2215 ulong const t = r1; 2216 r1 = r5; 2217 r5 = t; 2218 }; 2219 if (r9 >= r13) { 2220 ulong const t = r9; 2221 r9 = r13; 2222 r13 = t; 2223 }; 2224 if (r3 >= r11) { 2225 ulong const t = r3; 2226 r3 = r11; 2227 r11 = t; 2228 }; 2229 if (r7 >= r15) { 2230 ulong const t = r7; 2231 r7 = r15; 2232 r15 = t; 2233 }; 2234 if (r3 >= r7) { 2235 ulong const t = r3; 2236 r3 = r7; 2237 r7 = t; 2238 }; 2239 if (r11 >= r15) { 2240 ulong const t = r11; 2241 r11 = r15; 2242 r15 = t; 2243 }; 2244 if (r1 >= r3) { 2245 ulong const t = r1; 2246 r1 = r3; 2247 r3 = t; 2248 }; 2249 if (r5 >= r7) { 2250 ulong const t = r5; 2251 r5 = r7; 2252 r7 = t; 2253 }; 2254 if (r9 >= r11) { 2255 ulong const t = r9; 2256 r9 = r11; 2257 r11 = t; 2258 }; 2259 if (r13 >= r15) { 2260 ulong const t = r13; 2261 r13 = r15; 2262 r15 = t; 2263 }; 2264 if (r2 >= r10) { 2265 ulong const t = r2; 2266 r2 = r10; 2267 r10 = t; 2268 }; 2269 if (r6 >= r14) { 2270 ulong const t = r6; 2271 r6 = r14; 2272 r14 = t; 2273 }; 2274 if (r2 >= r6) { 2275 ulong const t = r2; 2276 r2 = r6; 2277 r6 = t; 2278 }; 2279 if (r10 >= r14) { 2280 ulong const t = r10; 2281 r10 = r14; 2282 r14 = t; 2283 }; 2284 if (r4 >= r12) { 2285 ulong const t = r4; 2286 r4 = r12; 2287 r12 = t; 2288 }; 2289 if (r8 >= r16) { 2290 ulong const t = r8; 2291 r8 = r16; 2292 r16 = t; 2293 }; 2294 if (r4 >= r8) { 2295 ulong const t = r4; 2296 r4 = r8; 2297 r8 = t; 2298 }; 2299 if (r12 >= r16) { 2300 ulong const t = r12; 2301 r12 = r16; 2302 r16 = t; 2303 }; 2304 if (r2 >= r4) { 2305 ulong const t = r2; 2306 
r2 = r4; 2307 r4 = t; 2308 }; 2309 if (r6 >= r8) { 2310 ulong const t = r6; 2311 r6 = r8; 2312 r8 = t; 2313 }; 2314 if (r10 >= r12) { 2315 ulong const t = r10; 2316 r10 = r12; 2317 r12 = t; 2318 }; 2319 if (r14 >= r16) { 2320 ulong const t = r14; 2321 r14 = r16; 2322 r16 = t; 2323 }; 2324 if (r1 >= r2) { 2325 ulong const t = r1; 2326 r1 = r2; 2327 r2 = t; 2328 }; 2329 if (r3 >= r4) { 2330 ulong const t = r3; 2331 r3 = r4; 2332 r4 = t; 2333 }; 2334 if (r5 >= r6) { 2335 ulong const t = r5; 2336 r5 = r6; 2337 r6 = t; 2338 }; 2339 if (r7 >= r8) { 2340 ulong const t = r7; 2341 r7 = r8; 2342 r8 = t; 2343 }; 2344 if (r9 >= r10) { 2345 ulong const t = r9; 2346 r9 = r10; 2347 r10 = t; 2348 }; 2349 if (r11 >= r12) { 2350 ulong const t = r11; 2351 r11 = r12; 2352 r12 = t; 2353 }; 2354 if (r13 >= r14) { 2355 ulong const t = r13; 2356 r13 = r14; 2357 r14 = t; 2358 }; 2359 if (r15 >= r16) { 2360 ulong const t = r15; 2361 r15 = r16; 2362 r16 = t; 2363 }; 2364 uint const smem_l_idx = 2365 get_sub_group_id() * ((1 << 3) * 2) + get_sub_group_local_id(); 2366 uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 3) * 2) + 2367 (get_sub_group_local_id() ^ ((1 << 3) - 1)); 2368 shared.m[get_local_id(0) + (2 * (1 << 3) * 0)] = r1; 2369 shared.m[get_local_id(0) + (2 * (1 << 3) * 1)] = r16; 2370 shared.m[get_local_id(0) + (2 * (1 << 3) * 2)] = r2; 2371 shared.m[get_local_id(0) + (2 * (1 << 3) * 3)] = r15; 2372 shared.m[get_local_id(0) + (2 * (1 << 3) * 4)] = r3; 2373 shared.m[get_local_id(0) + (2 * (1 << 3) * 5)] = r14; 2374 shared.m[get_local_id(0) + (2 * (1 << 3) * 6)] = r4; 2375 shared.m[get_local_id(0) + (2 * (1 << 3) * 7)] = r13; 2376 shared.m[get_local_id(0) + (2 * (1 << 3) * 8)] = r5; 2377 shared.m[get_local_id(0) + (2 * (1 << 3) * 9)] = r12; 2378 shared.m[get_local_id(0) + (2 * (1 << 3) * 10)] = r6; 2379 shared.m[get_local_id(0) + (2 * (1 << 3) * 11)] = r11; 2380 shared.m[get_local_id(0) + (2 * (1 << 3) * 12)] = r7; 2381 shared.m[get_local_id(0) + (2 * (1 << 3) * 13)] = r10; 
2382 shared.m[get_local_id(0) + (2 * (1 << 3) * 14)] = r8; 2383 shared.m[get_local_id(0) + (2 * (1 << 3) * 15)] = r9; 2384 barrier(CLK_LOCAL_MEM_FENCE); 2385 { 2386 { 2387 ulong r0_1 = shared.m[smem_l_idx + (0)]; 2388 ulong r0_2 = shared.m[smem_r_idx + (8)]; 2389 if (r0_1 >= r0_2) { 2390 ulong const t = r0_1; 2391 r0_1 = r0_2; 2392 r0_2 = t; 2393 }; 2394 shared.m[smem_l_idx + (0)] = r0_1; 2395 shared.m[smem_r_idx + (8)] = r0_2; 2396 } 2397 { 2398 ulong r0_1 = shared.m[smem_l_idx + (32)]; 2399 ulong r0_2 = shared.m[smem_r_idx + (40)]; 2400 if (r0_1 >= r0_2) { 2401 ulong const t = r0_1; 2402 r0_1 = r0_2; 2403 r0_2 = t; 2404 }; 2405 shared.m[smem_l_idx + (32)] = r0_1; 2406 shared.m[smem_r_idx + (40)] = r0_2; 2407 } 2408 { 2409 ulong r0_1 = shared.m[smem_l_idx + (64)]; 2410 ulong r0_2 = shared.m[smem_r_idx + (72)]; 2411 if (r0_1 >= r0_2) { 2412 ulong const t = r0_1; 2413 r0_1 = r0_2; 2414 r0_2 = t; 2415 }; 2416 shared.m[smem_l_idx + (64)] = r0_1; 2417 shared.m[smem_r_idx + (72)] = r0_2; 2418 } 2419 { 2420 ulong r0_1 = shared.m[smem_l_idx + (96)]; 2421 ulong r0_2 = shared.m[smem_r_idx + (104)]; 2422 if (r0_1 >= r0_2) { 2423 ulong const t = r0_1; 2424 r0_1 = r0_2; 2425 r0_2 = t; 2426 }; 2427 shared.m[smem_l_idx + (96)] = r0_1; 2428 shared.m[smem_r_idx + (104)] = r0_2; 2429 } 2430 { 2431 ulong r0_1 = shared.m[smem_l_idx + (128)]; 2432 ulong r0_2 = shared.m[smem_r_idx + (136)]; 2433 if (r0_1 >= r0_2) { 2434 ulong const t = r0_1; 2435 r0_1 = r0_2; 2436 r0_2 = t; 2437 }; 2438 shared.m[smem_l_idx + (128)] = r0_1; 2439 shared.m[smem_r_idx + (136)] = r0_2; 2440 } 2441 { 2442 ulong r0_1 = shared.m[smem_l_idx + (160)]; 2443 ulong r0_2 = shared.m[smem_r_idx + (168)]; 2444 if (r0_1 >= r0_2) { 2445 ulong const t = r0_1; 2446 r0_1 = r0_2; 2447 r0_2 = t; 2448 }; 2449 shared.m[smem_l_idx + (160)] = r0_1; 2450 shared.m[smem_r_idx + (168)] = r0_2; 2451 } 2452 { 2453 ulong r0_1 = shared.m[smem_l_idx + (192)]; 2454 ulong r0_2 = shared.m[smem_r_idx + (200)]; 2455 if (r0_1 >= r0_2) { 2456 
ulong const t = r0_1; 2457 r0_1 = r0_2; 2458 r0_2 = t; 2459 }; 2460 shared.m[smem_l_idx + (192)] = r0_1; 2461 shared.m[smem_r_idx + (200)] = r0_2; 2462 } 2463 { 2464 ulong r0_1 = shared.m[smem_l_idx + (224)]; 2465 ulong r0_2 = shared.m[smem_r_idx + (232)]; 2466 if (r0_1 >= r0_2) { 2467 ulong const t = r0_1; 2468 r0_1 = r0_2; 2469 r0_2 = t; 2470 }; 2471 shared.m[smem_l_idx + (224)] = r0_1; 2472 shared.m[smem_r_idx + (232)] = r0_2; 2473 } 2474 } 2475 barrier(CLK_LOCAL_MEM_FENCE); 2476 r1 = shared.m[get_local_id(0) + (2 * (1 << 3) * 0)]; 2477 r16 = shared.m[get_local_id(0) + (2 * (1 << 3) * 1)]; 2478 r2 = shared.m[get_local_id(0) + (2 * (1 << 3) * 2)]; 2479 r15 = shared.m[get_local_id(0) + (2 * (1 << 3) * 3)]; 2480 r3 = shared.m[get_local_id(0) + (2 * (1 << 3) * 4)]; 2481 r14 = shared.m[get_local_id(0) + (2 * (1 << 3) * 5)]; 2482 r4 = shared.m[get_local_id(0) + (2 * (1 << 3) * 6)]; 2483 r13 = shared.m[get_local_id(0) + (2 * (1 << 3) * 7)]; 2484 r5 = shared.m[get_local_id(0) + (2 * (1 << 3) * 8)]; 2485 r12 = shared.m[get_local_id(0) + (2 * (1 << 3) * 9)]; 2486 r6 = shared.m[get_local_id(0) + (2 * (1 << 3) * 10)]; 2487 r11 = shared.m[get_local_id(0) + (2 * (1 << 3) * 11)]; 2488 r7 = shared.m[get_local_id(0) + (2 * (1 << 3) * 12)]; 2489 r10 = shared.m[get_local_id(0) + (2 * (1 << 3) * 13)]; 2490 r8 = shared.m[get_local_id(0) + (2 * (1 << 3) * 14)]; 2491 r9 = shared.m[get_local_id(0) + (2 * (1 << 3) * 15)]; 2492 { 2493 { 2494 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 2495 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2496 ; 2497 { 2498 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2499 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2500 }; 2501 { 2502 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2503 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2504 }; 2505 { 2506 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2507 r3 = ((r3 <= ta) ^ t_lt) ? 
ta : r3; 2508 }; 2509 { 2510 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2511 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2512 }; 2513 { 2514 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2515 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2516 }; 2517 { 2518 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2519 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2520 }; 2521 { 2522 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2523 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2524 }; 2525 { 2526 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2527 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2528 }; 2529 { 2530 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 2531 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 2532 }; 2533 { 2534 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 2535 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 2536 }; 2537 { 2538 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 2539 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 2540 }; 2541 { 2542 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 2543 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 2544 }; 2545 { 2546 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 2547 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 2548 }; 2549 { 2550 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 2551 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 2552 }; 2553 { 2554 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 2555 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 2556 }; 2557 { 2558 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 2559 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 2560 }; 2561 } 2562 { 2563 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 2564 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2565 ; 2566 { 2567 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2568 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 2569 }; 2570 { 2571 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2572 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2573 }; 2574 { 2575 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2576 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2577 }; 2578 { 2579 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2580 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2581 }; 2582 { 2583 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2584 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2585 }; 2586 { 2587 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2588 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2589 }; 2590 { 2591 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2592 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2593 }; 2594 { 2595 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2596 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2597 }; 2598 { 2599 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 2600 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 2601 }; 2602 { 2603 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 2604 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 2605 }; 2606 { 2607 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 2608 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 2609 }; 2610 { 2611 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 2612 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 2613 }; 2614 { 2615 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 2616 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 2617 }; 2618 { 2619 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 2620 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 2621 }; 2622 { 2623 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 2624 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 2625 }; 2626 { 2627 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 2628 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 2629 }; 2630 } 2631 { 2632 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 2633 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2634 ; 2635 { 2636 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2637 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2638 }; 2639 { 2640 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2641 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2642 }; 2643 { 2644 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2645 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2646 }; 2647 { 2648 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2649 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2650 }; 2651 { 2652 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2653 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2654 }; 2655 { 2656 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2657 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2658 }; 2659 { 2660 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2661 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2662 }; 2663 { 2664 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2665 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2666 }; 2667 { 2668 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 2669 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 2670 }; 2671 { 2672 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 2673 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 2674 }; 2675 { 2676 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 2677 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 2678 }; 2679 { 2680 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 2681 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 2682 }; 2683 { 2684 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 2685 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 2686 }; 2687 { 2688 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 2689 r14 = ((r14 <= ta) ^ t_lt) ? 
ta : r14; 2690 }; 2691 { 2692 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 2693 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 2694 }; 2695 { 2696 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 2697 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 2698 }; 2699 } 2700 if (r1 >= r9) { 2701 ulong const t = r1; 2702 r1 = r9; 2703 r9 = t; 2704 }; 2705 if (r5 >= r13) { 2706 ulong const t = r5; 2707 r5 = r13; 2708 r13 = t; 2709 }; 2710 if (r1 >= r5) { 2711 ulong const t = r1; 2712 r1 = r5; 2713 r5 = t; 2714 }; 2715 if (r9 >= r13) { 2716 ulong const t = r9; 2717 r9 = r13; 2718 r13 = t; 2719 }; 2720 if (r3 >= r11) { 2721 ulong const t = r3; 2722 r3 = r11; 2723 r11 = t; 2724 }; 2725 if (r7 >= r15) { 2726 ulong const t = r7; 2727 r7 = r15; 2728 r15 = t; 2729 }; 2730 if (r3 >= r7) { 2731 ulong const t = r3; 2732 r3 = r7; 2733 r7 = t; 2734 }; 2735 if (r11 >= r15) { 2736 ulong const t = r11; 2737 r11 = r15; 2738 r15 = t; 2739 }; 2740 if (r1 >= r3) { 2741 ulong const t = r1; 2742 r1 = r3; 2743 r3 = t; 2744 }; 2745 if (r5 >= r7) { 2746 ulong const t = r5; 2747 r5 = r7; 2748 r7 = t; 2749 }; 2750 if (r9 >= r11) { 2751 ulong const t = r9; 2752 r9 = r11; 2753 r11 = t; 2754 }; 2755 if (r13 >= r15) { 2756 ulong const t = r13; 2757 r13 = r15; 2758 r15 = t; 2759 }; 2760 if (r2 >= r10) { 2761 ulong const t = r2; 2762 r2 = r10; 2763 r10 = t; 2764 }; 2765 if (r6 >= r14) { 2766 ulong const t = r6; 2767 r6 = r14; 2768 r14 = t; 2769 }; 2770 if (r2 >= r6) { 2771 ulong const t = r2; 2772 r2 = r6; 2773 r6 = t; 2774 }; 2775 if (r10 >= r14) { 2776 ulong const t = r10; 2777 r10 = r14; 2778 r14 = t; 2779 }; 2780 if (r4 >= r12) { 2781 ulong const t = r4; 2782 r4 = r12; 2783 r12 = t; 2784 }; 2785 if (r8 >= r16) { 2786 ulong const t = r8; 2787 r8 = r16; 2788 r16 = t; 2789 }; 2790 if (r4 >= r8) { 2791 ulong const t = r4; 2792 r4 = r8; 2793 r8 = t; 2794 }; 2795 if (r12 >= r16) { 2796 ulong const t = r12; 2797 r12 = r16; 2798 r16 = t; 2799 }; 2800 if (r2 >= r4) { 2801 ulong const t = r2; 2802 
r2 = r4; 2803 r4 = t; 2804 }; 2805 if (r6 >= r8) { 2806 ulong const t = r6; 2807 r6 = r8; 2808 r8 = t; 2809 }; 2810 if (r10 >= r12) { 2811 ulong const t = r10; 2812 r10 = r12; 2813 r12 = t; 2814 }; 2815 if (r14 >= r16) { 2816 ulong const t = r14; 2817 r14 = r16; 2818 r16 = t; 2819 }; 2820 if (r1 >= r2) { 2821 ulong const t = r1; 2822 r1 = r2; 2823 r2 = t; 2824 }; 2825 if (r3 >= r4) { 2826 ulong const t = r3; 2827 r3 = r4; 2828 r4 = t; 2829 }; 2830 if (r5 >= r6) { 2831 ulong const t = r5; 2832 r5 = r6; 2833 r6 = t; 2834 }; 2835 if (r7 >= r8) { 2836 ulong const t = r7; 2837 r7 = r8; 2838 r8 = t; 2839 }; 2840 if (r9 >= r10) { 2841 ulong const t = r9; 2842 r9 = r10; 2843 r10 = t; 2844 }; 2845 if (r11 >= r12) { 2846 ulong const t = r11; 2847 r11 = r12; 2848 r12 = t; 2849 }; 2850 if (r13 >= r14) { 2851 ulong const t = r13; 2852 r13 = r14; 2853 r14 = t; 2854 }; 2855 if (r15 >= r16) { 2856 ulong const t = r15; 2857 r15 = r16; 2858 r16 = t; 2859 }; 2860 } 2861 vout[gmem_idx + (1 << 3) * 0] = r1; 2862 vout[gmem_idx + (1 << 3) * 1] = r2; 2863 vout[gmem_idx + (1 << 3) * 2] = r3; 2864 vout[gmem_idx + (1 << 3) * 3] = r4; 2865 vout[gmem_idx + (1 << 3) * 4] = r5; 2866 vout[gmem_idx + (1 << 3) * 5] = r6; 2867 vout[gmem_idx + (1 << 3) * 6] = r7; 2868 vout[gmem_idx + (1 << 3) * 7] = r8; 2869 vout[gmem_idx + (1 << 3) * 8] = r9; 2870 vout[gmem_idx + (1 << 3) * 9] = r10; 2871 vout[gmem_idx + (1 << 3) * 10] = r11; 2872 vout[gmem_idx + (1 << 3) * 11] = r12; 2873 vout[gmem_idx + (1 << 3) * 12] = r13; 2874 vout[gmem_idx + (1 << 3) * 13] = r14; 2875 vout[gmem_idx + (1 << 3) * 14] = r15; 2876 vout[gmem_idx + (1 << 3) * 15] = r16; 2877} 2878 2879__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) 2880__attribute__((reqd_work_group_size((1 << 3) * 4, 1, 1))) void 2881hs_kernel_bs_2(__global ulong const* const restrict vin, 2882 __global ulong* const restrict vout) 2883{ 2884 __local struct 2885 { 2886 ulong m[32 * 16]; 2887 } shared; 2888 2889 uint const gmem_idx = (get_global_id(0) & 
~((1 << 3) - 1)) * 16 + 2890 (get_local_id(0) & ((1 << 3) - 1)); 2891 ulong r1 = vin[gmem_idx + (1 << 3) * 0]; 2892 ulong r2 = vin[gmem_idx + (1 << 3) * 1]; 2893 ulong r3 = vin[gmem_idx + (1 << 3) * 2]; 2894 ulong r4 = vin[gmem_idx + (1 << 3) * 3]; 2895 ulong r5 = vin[gmem_idx + (1 << 3) * 4]; 2896 ulong r6 = vin[gmem_idx + (1 << 3) * 5]; 2897 ulong r7 = vin[gmem_idx + (1 << 3) * 6]; 2898 ulong r8 = vin[gmem_idx + (1 << 3) * 7]; 2899 ulong r9 = vin[gmem_idx + (1 << 3) * 8]; 2900 ulong r10 = vin[gmem_idx + (1 << 3) * 9]; 2901 ulong r11 = vin[gmem_idx + (1 << 3) * 10]; 2902 ulong r12 = vin[gmem_idx + (1 << 3) * 11]; 2903 ulong r13 = vin[gmem_idx + (1 << 3) * 12]; 2904 ulong r14 = vin[gmem_idx + (1 << 3) * 13]; 2905 ulong r15 = vin[gmem_idx + (1 << 3) * 14]; 2906 ulong r16 = vin[gmem_idx + (1 << 3) * 15]; 2907 if (r1 >= r2) { 2908 ulong const t = r1; 2909 r1 = r2; 2910 r2 = t; 2911 }; 2912 if (r3 >= r4) { 2913 ulong const t = r3; 2914 r3 = r4; 2915 r4 = t; 2916 }; 2917 if (r5 >= r6) { 2918 ulong const t = r5; 2919 r5 = r6; 2920 r6 = t; 2921 }; 2922 if (r7 >= r8) { 2923 ulong const t = r7; 2924 r7 = r8; 2925 r8 = t; 2926 }; 2927 if (r9 >= r10) { 2928 ulong const t = r9; 2929 r9 = r10; 2930 r10 = t; 2931 }; 2932 if (r11 >= r12) { 2933 ulong const t = r11; 2934 r11 = r12; 2935 r12 = t; 2936 }; 2937 if (r13 >= r14) { 2938 ulong const t = r13; 2939 r13 = r14; 2940 r14 = t; 2941 }; 2942 if (r15 >= r16) { 2943 ulong const t = r15; 2944 r15 = r16; 2945 r16 = t; 2946 }; 2947 if (r1 >= r3) { 2948 ulong const t = r1; 2949 r1 = r3; 2950 r3 = t; 2951 }; 2952 if (r5 >= r7) { 2953 ulong const t = r5; 2954 r5 = r7; 2955 r7 = t; 2956 }; 2957 if (r9 >= r11) { 2958 ulong const t = r9; 2959 r9 = r11; 2960 r11 = t; 2961 }; 2962 if (r13 >= r15) { 2963 ulong const t = r13; 2964 r13 = r15; 2965 r15 = t; 2966 }; 2967 if (r2 >= r4) { 2968 ulong const t = r2; 2969 r2 = r4; 2970 r4 = t; 2971 }; 2972 if (r6 >= r8) { 2973 ulong const t = r6; 2974 r6 = r8; 2975 r8 = t; 2976 }; 2977 if (r10 >= r12) 
{ 2978 ulong const t = r10; 2979 r10 = r12; 2980 r12 = t; 2981 }; 2982 if (r14 >= r16) { 2983 ulong const t = r14; 2984 r14 = r16; 2985 r16 = t; 2986 }; 2987 if (r1 >= r5) { 2988 ulong const t = r1; 2989 r1 = r5; 2990 r5 = t; 2991 }; 2992 if (r9 >= r13) { 2993 ulong const t = r9; 2994 r9 = r13; 2995 r13 = t; 2996 }; 2997 if (r2 >= r6) { 2998 ulong const t = r2; 2999 r2 = r6; 3000 r6 = t; 3001 }; 3002 if (r10 >= r14) { 3003 ulong const t = r10; 3004 r10 = r14; 3005 r14 = t; 3006 }; 3007 if (r3 >= r7) { 3008 ulong const t = r3; 3009 r3 = r7; 3010 r7 = t; 3011 }; 3012 if (r11 >= r15) { 3013 ulong const t = r11; 3014 r11 = r15; 3015 r15 = t; 3016 }; 3017 if (r4 >= r8) { 3018 ulong const t = r4; 3019 r4 = r8; 3020 r8 = t; 3021 }; 3022 if (r12 >= r16) { 3023 ulong const t = r12; 3024 r12 = r16; 3025 r16 = t; 3026 }; 3027 if (r1 >= r9) { 3028 ulong const t = r1; 3029 r1 = r9; 3030 r9 = t; 3031 }; 3032 if (r2 >= r10) { 3033 ulong const t = r2; 3034 r2 = r10; 3035 r10 = t; 3036 }; 3037 if (r3 >= r11) { 3038 ulong const t = r3; 3039 r3 = r11; 3040 r11 = t; 3041 }; 3042 if (r4 >= r12) { 3043 ulong const t = r4; 3044 r4 = r12; 3045 r12 = t; 3046 }; 3047 if (r5 >= r13) { 3048 ulong const t = r5; 3049 r5 = r13; 3050 r13 = t; 3051 }; 3052 if (r6 >= r14) { 3053 ulong const t = r6; 3054 r6 = r14; 3055 r14 = t; 3056 }; 3057 if (r7 >= r15) { 3058 ulong const t = r7; 3059 r7 = r15; 3060 r15 = t; 3061 }; 3062 if (r8 >= r16) { 3063 ulong const t = r8; 3064 r8 = r16; 3065 r16 = t; 3066 }; 3067 if (r6 >= r11) { 3068 ulong const t = r6; 3069 r6 = r11; 3070 r11 = t; 3071 }; 3072 if (r7 >= r10) { 3073 ulong const t = r7; 3074 r7 = r10; 3075 r10 = t; 3076 }; 3077 if (r4 >= r13) { 3078 ulong const t = r4; 3079 r4 = r13; 3080 r13 = t; 3081 }; 3082 if (r14 >= r15) { 3083 ulong const t = r14; 3084 r14 = r15; 3085 r15 = t; 3086 }; 3087 if (r8 >= r12) { 3088 ulong const t = r8; 3089 r8 = r12; 3090 r12 = t; 3091 }; 3092 if (r2 >= r3) { 3093 ulong const t = r2; 3094 r2 = r3; 3095 r3 = t; 3096 }; 3097 
if (r5 >= r9) { 3098 ulong const t = r5; 3099 r5 = r9; 3100 r9 = t; 3101 }; 3102 if (r2 >= r5) { 3103 ulong const t = r2; 3104 r2 = r5; 3105 r5 = t; 3106 }; 3107 if (r8 >= r14) { 3108 ulong const t = r8; 3109 r8 = r14; 3110 r14 = t; 3111 }; 3112 if (r3 >= r9) { 3113 ulong const t = r3; 3114 r3 = r9; 3115 r9 = t; 3116 }; 3117 if (r12 >= r15) { 3118 ulong const t = r12; 3119 r12 = r15; 3120 r15 = t; 3121 }; 3122 if (r3 >= r5) { 3123 ulong const t = r3; 3124 r3 = r5; 3125 r5 = t; 3126 }; 3127 if (r6 >= r7) { 3128 ulong const t = r6; 3129 r6 = r7; 3130 r7 = t; 3131 }; 3132 if (r10 >= r11) { 3133 ulong const t = r10; 3134 r10 = r11; 3135 r11 = t; 3136 }; 3137 if (r12 >= r14) { 3138 ulong const t = r12; 3139 r12 = r14; 3140 r14 = t; 3141 }; 3142 if (r4 >= r9) { 3143 ulong const t = r4; 3144 r4 = r9; 3145 r9 = t; 3146 }; 3147 if (r8 >= r13) { 3148 ulong const t = r8; 3149 r8 = r13; 3150 r13 = t; 3151 }; 3152 if (r7 >= r9) { 3153 ulong const t = r7; 3154 r7 = r9; 3155 r9 = t; 3156 }; 3157 if (r11 >= r13) { 3158 ulong const t = r11; 3159 r11 = r13; 3160 r13 = t; 3161 }; 3162 if (r4 >= r6) { 3163 ulong const t = r4; 3164 r4 = r6; 3165 r6 = t; 3166 }; 3167 if (r8 >= r10) { 3168 ulong const t = r8; 3169 r8 = r10; 3170 r10 = t; 3171 }; 3172 if (r4 >= r5) { 3173 ulong const t = r4; 3174 r4 = r5; 3175 r5 = t; 3176 }; 3177 if (r6 >= r7) { 3178 ulong const t = r6; 3179 r6 = r7; 3180 r7 = t; 3181 }; 3182 if (r8 >= r9) { 3183 ulong const t = r8; 3184 r8 = r9; 3185 r9 = t; 3186 }; 3187 if (r10 >= r11) { 3188 ulong const t = r10; 3189 r10 = r11; 3190 r11 = t; 3191 }; 3192 if (r12 >= r13) { 3193 ulong const t = r12; 3194 r12 = r13; 3195 r13 = t; 3196 }; 3197 if (r7 >= r8) { 3198 ulong const t = r7; 3199 r7 = r8; 3200 r8 = t; 3201 }; 3202 if (r9 >= r10) { 3203 ulong const t = r9; 3204 r9 = r10; 3205 r10 = t; 3206 }; 3207 { 3208 uint const flip_lane_idx = get_sub_group_local_id() ^ 1; 3209 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 3210 ; 3211 { 3212 ulong const ta = 
intel_sub_group_shuffle(r1, flip_lane_idx); 3213 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 3214 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 3215 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 3216 }; 3217 { 3218 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 3219 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 3220 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 3221 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 3222 }; 3223 { 3224 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 3225 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 3226 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 3227 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 3228 }; 3229 { 3230 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 3231 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 3232 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 3233 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 3234 }; 3235 { 3236 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 3237 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 3238 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 3239 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 3240 }; 3241 { 3242 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 3243 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 3244 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 3245 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 3246 }; 3247 { 3248 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 3249 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 3250 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 3251 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 3252 }; 3253 { 3254 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 3255 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 3256 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 3257 r9 = ((r9 <= ta) ^ t_lt) ? 
ta : r9; 3258 }; 3259 } 3260 if (r1 >= r9) { 3261 ulong const t = r1; 3262 r1 = r9; 3263 r9 = t; 3264 }; 3265 if (r5 >= r13) { 3266 ulong const t = r5; 3267 r5 = r13; 3268 r13 = t; 3269 }; 3270 if (r1 >= r5) { 3271 ulong const t = r1; 3272 r1 = r5; 3273 r5 = t; 3274 }; 3275 if (r9 >= r13) { 3276 ulong const t = r9; 3277 r9 = r13; 3278 r13 = t; 3279 }; 3280 if (r3 >= r11) { 3281 ulong const t = r3; 3282 r3 = r11; 3283 r11 = t; 3284 }; 3285 if (r7 >= r15) { 3286 ulong const t = r7; 3287 r7 = r15; 3288 r15 = t; 3289 }; 3290 if (r3 >= r7) { 3291 ulong const t = r3; 3292 r3 = r7; 3293 r7 = t; 3294 }; 3295 if (r11 >= r15) { 3296 ulong const t = r11; 3297 r11 = r15; 3298 r15 = t; 3299 }; 3300 if (r1 >= r3) { 3301 ulong const t = r1; 3302 r1 = r3; 3303 r3 = t; 3304 }; 3305 if (r5 >= r7) { 3306 ulong const t = r5; 3307 r5 = r7; 3308 r7 = t; 3309 }; 3310 if (r9 >= r11) { 3311 ulong const t = r9; 3312 r9 = r11; 3313 r11 = t; 3314 }; 3315 if (r13 >= r15) { 3316 ulong const t = r13; 3317 r13 = r15; 3318 r15 = t; 3319 }; 3320 if (r2 >= r10) { 3321 ulong const t = r2; 3322 r2 = r10; 3323 r10 = t; 3324 }; 3325 if (r6 >= r14) { 3326 ulong const t = r6; 3327 r6 = r14; 3328 r14 = t; 3329 }; 3330 if (r2 >= r6) { 3331 ulong const t = r2; 3332 r2 = r6; 3333 r6 = t; 3334 }; 3335 if (r10 >= r14) { 3336 ulong const t = r10; 3337 r10 = r14; 3338 r14 = t; 3339 }; 3340 if (r4 >= r12) { 3341 ulong const t = r4; 3342 r4 = r12; 3343 r12 = t; 3344 }; 3345 if (r8 >= r16) { 3346 ulong const t = r8; 3347 r8 = r16; 3348 r16 = t; 3349 }; 3350 if (r4 >= r8) { 3351 ulong const t = r4; 3352 r4 = r8; 3353 r8 = t; 3354 }; 3355 if (r12 >= r16) { 3356 ulong const t = r12; 3357 r12 = r16; 3358 r16 = t; 3359 }; 3360 if (r2 >= r4) { 3361 ulong const t = r2; 3362 r2 = r4; 3363 r4 = t; 3364 }; 3365 if (r6 >= r8) { 3366 ulong const t = r6; 3367 r6 = r8; 3368 r8 = t; 3369 }; 3370 if (r10 >= r12) { 3371 ulong const t = r10; 3372 r10 = r12; 3373 r12 = t; 3374 }; 3375 if (r14 >= r16) { 3376 ulong const t = r14; 3377 
r14 = r16; 3378 r16 = t; 3379 }; 3380 if (r1 >= r2) { 3381 ulong const t = r1; 3382 r1 = r2; 3383 r2 = t; 3384 }; 3385 if (r3 >= r4) { 3386 ulong const t = r3; 3387 r3 = r4; 3388 r4 = t; 3389 }; 3390 if (r5 >= r6) { 3391 ulong const t = r5; 3392 r5 = r6; 3393 r6 = t; 3394 }; 3395 if (r7 >= r8) { 3396 ulong const t = r7; 3397 r7 = r8; 3398 r8 = t; 3399 }; 3400 if (r9 >= r10) { 3401 ulong const t = r9; 3402 r9 = r10; 3403 r10 = t; 3404 }; 3405 if (r11 >= r12) { 3406 ulong const t = r11; 3407 r11 = r12; 3408 r12 = t; 3409 }; 3410 if (r13 >= r14) { 3411 ulong const t = r13; 3412 r13 = r14; 3413 r14 = t; 3414 }; 3415 if (r15 >= r16) { 3416 ulong const t = r15; 3417 r15 = r16; 3418 r16 = t; 3419 }; 3420 { 3421 uint const flip_lane_idx = get_sub_group_local_id() ^ 3; 3422 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 3423 ; 3424 { 3425 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 3426 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 3427 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 3428 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 3429 }; 3430 { 3431 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 3432 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 3433 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 3434 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 3435 }; 3436 { 3437 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 3438 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 3439 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 3440 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 3441 }; 3442 { 3443 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 3444 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 3445 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 3446 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 3447 }; 3448 { 3449 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 3450 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 3451 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 3452 r12 = ((r12 <= ta) ^ t_lt) ? 
ta : r12; 3453 }; 3454 { 3455 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 3456 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 3457 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 3458 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 3459 }; 3460 { 3461 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 3462 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 3463 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 3464 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 3465 }; 3466 { 3467 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 3468 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 3469 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 3470 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 3471 }; 3472 } 3473 { 3474 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 3475 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3476 ; 3477 { 3478 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3479 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3480 }; 3481 { 3482 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3483 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3484 }; 3485 { 3486 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3487 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3488 }; 3489 { 3490 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3491 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3492 }; 3493 { 3494 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3495 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3496 }; 3497 { 3498 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3499 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3500 }; 3501 { 3502 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3503 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3504 }; 3505 { 3506 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3507 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3508 }; 3509 { 3510 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 3511 r9 = ((r9 <= ta) ^ t_lt) ? 
ta : r9; 3512 }; 3513 { 3514 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 3515 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 3516 }; 3517 { 3518 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 3519 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 3520 }; 3521 { 3522 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 3523 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 3524 }; 3525 { 3526 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 3527 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 3528 }; 3529 { 3530 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 3531 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 3532 }; 3533 { 3534 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 3535 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 3536 }; 3537 { 3538 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 3539 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 3540 }; 3541 } 3542 if (r1 >= r9) { 3543 ulong const t = r1; 3544 r1 = r9; 3545 r9 = t; 3546 }; 3547 if (r5 >= r13) { 3548 ulong const t = r5; 3549 r5 = r13; 3550 r13 = t; 3551 }; 3552 if (r1 >= r5) { 3553 ulong const t = r1; 3554 r1 = r5; 3555 r5 = t; 3556 }; 3557 if (r9 >= r13) { 3558 ulong const t = r9; 3559 r9 = r13; 3560 r13 = t; 3561 }; 3562 if (r3 >= r11) { 3563 ulong const t = r3; 3564 r3 = r11; 3565 r11 = t; 3566 }; 3567 if (r7 >= r15) { 3568 ulong const t = r7; 3569 r7 = r15; 3570 r15 = t; 3571 }; 3572 if (r3 >= r7) { 3573 ulong const t = r3; 3574 r3 = r7; 3575 r7 = t; 3576 }; 3577 if (r11 >= r15) { 3578 ulong const t = r11; 3579 r11 = r15; 3580 r15 = t; 3581 }; 3582 if (r1 >= r3) { 3583 ulong const t = r1; 3584 r1 = r3; 3585 r3 = t; 3586 }; 3587 if (r5 >= r7) { 3588 ulong const t = r5; 3589 r5 = r7; 3590 r7 = t; 3591 }; 3592 if (r9 >= r11) { 3593 ulong const t = r9; 3594 r9 = r11; 3595 r11 = t; 3596 }; 3597 if (r13 >= r15) { 3598 ulong const t = r13; 3599 r13 = r15; 3600 r15 = t; 3601 }; 3602 if (r2 >= r10) { 3603 ulong const t = r2; 3604 r2 = r10; 3605 r10 = t; 3606 }; 3607 if 
(r6 >= r14) { 3608 ulong const t = r6; 3609 r6 = r14; 3610 r14 = t; 3611 }; 3612 if (r2 >= r6) { 3613 ulong const t = r2; 3614 r2 = r6; 3615 r6 = t; 3616 }; 3617 if (r10 >= r14) { 3618 ulong const t = r10; 3619 r10 = r14; 3620 r14 = t; 3621 }; 3622 if (r4 >= r12) { 3623 ulong const t = r4; 3624 r4 = r12; 3625 r12 = t; 3626 }; 3627 if (r8 >= r16) { 3628 ulong const t = r8; 3629 r8 = r16; 3630 r16 = t; 3631 }; 3632 if (r4 >= r8) { 3633 ulong const t = r4; 3634 r4 = r8; 3635 r8 = t; 3636 }; 3637 if (r12 >= r16) { 3638 ulong const t = r12; 3639 r12 = r16; 3640 r16 = t; 3641 }; 3642 if (r2 >= r4) { 3643 ulong const t = r2; 3644 r2 = r4; 3645 r4 = t; 3646 }; 3647 if (r6 >= r8) { 3648 ulong const t = r6; 3649 r6 = r8; 3650 r8 = t; 3651 }; 3652 if (r10 >= r12) { 3653 ulong const t = r10; 3654 r10 = r12; 3655 r12 = t; 3656 }; 3657 if (r14 >= r16) { 3658 ulong const t = r14; 3659 r14 = r16; 3660 r16 = t; 3661 }; 3662 if (r1 >= r2) { 3663 ulong const t = r1; 3664 r1 = r2; 3665 r2 = t; 3666 }; 3667 if (r3 >= r4) { 3668 ulong const t = r3; 3669 r3 = r4; 3670 r4 = t; 3671 }; 3672 if (r5 >= r6) { 3673 ulong const t = r5; 3674 r5 = r6; 3675 r6 = t; 3676 }; 3677 if (r7 >= r8) { 3678 ulong const t = r7; 3679 r7 = r8; 3680 r8 = t; 3681 }; 3682 if (r9 >= r10) { 3683 ulong const t = r9; 3684 r9 = r10; 3685 r10 = t; 3686 }; 3687 if (r11 >= r12) { 3688 ulong const t = r11; 3689 r11 = r12; 3690 r12 = t; 3691 }; 3692 if (r13 >= r14) { 3693 ulong const t = r13; 3694 r13 = r14; 3695 r14 = t; 3696 }; 3697 if (r15 >= r16) { 3698 ulong const t = r15; 3699 r15 = r16; 3700 r16 = t; 3701 }; 3702 { 3703 uint const flip_lane_idx = get_sub_group_local_id() ^ 7; 3704 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 3705 ; 3706 { 3707 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 3708 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 3709 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 3710 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 3711 }; 3712 { 3713 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 3714 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 3715 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 3716 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 3717 }; 3718 { 3719 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 3720 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 3721 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 3722 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 3723 }; 3724 { 3725 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 3726 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 3727 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 3728 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 3729 }; 3730 { 3731 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 3732 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 3733 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 3734 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 3735 }; 3736 { 3737 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 3738 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 3739 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 3740 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 3741 }; 3742 { 3743 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 3744 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 3745 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 3746 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 3747 }; 3748 { 3749 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 3750 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 3751 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 3752 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 3753 }; 3754 } 3755 { 3756 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 3757 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3758 ; 3759 { 3760 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3761 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 3762 }; 3763 { 3764 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3765 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3766 }; 3767 { 3768 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3769 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3770 }; 3771 { 3772 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3773 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3774 }; 3775 { 3776 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3777 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3778 }; 3779 { 3780 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3781 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3782 }; 3783 { 3784 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3785 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3786 }; 3787 { 3788 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3789 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3790 }; 3791 { 3792 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 3793 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 3794 }; 3795 { 3796 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 3797 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 3798 }; 3799 { 3800 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 3801 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 3802 }; 3803 { 3804 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 3805 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 3806 }; 3807 { 3808 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 3809 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 3810 }; 3811 { 3812 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 3813 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 3814 }; 3815 { 3816 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 3817 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 3818 }; 3819 { 3820 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 3821 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 3822 }; 3823 } 3824 { 3825 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 3826 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3827 ; 3828 { 3829 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3830 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3831 }; 3832 { 3833 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3834 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3835 }; 3836 { 3837 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3838 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3839 }; 3840 { 3841 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3842 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3843 }; 3844 { 3845 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3846 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3847 }; 3848 { 3849 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3850 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3851 }; 3852 { 3853 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3854 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3855 }; 3856 { 3857 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3858 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3859 }; 3860 { 3861 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 3862 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 3863 }; 3864 { 3865 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 3866 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 3867 }; 3868 { 3869 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 3870 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 3871 }; 3872 { 3873 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 3874 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 3875 }; 3876 { 3877 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 3878 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 3879 }; 3880 { 3881 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 3882 r14 = ((r14 <= ta) ^ t_lt) ? 
ta : r14; 3883 }; 3884 { 3885 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 3886 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 3887 }; 3888 { 3889 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 3890 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 3891 }; 3892 } 3893 if (r1 >= r9) { 3894 ulong const t = r1; 3895 r1 = r9; 3896 r9 = t; 3897 }; 3898 if (r5 >= r13) { 3899 ulong const t = r5; 3900 r5 = r13; 3901 r13 = t; 3902 }; 3903 if (r1 >= r5) { 3904 ulong const t = r1; 3905 r1 = r5; 3906 r5 = t; 3907 }; 3908 if (r9 >= r13) { 3909 ulong const t = r9; 3910 r9 = r13; 3911 r13 = t; 3912 }; 3913 if (r3 >= r11) { 3914 ulong const t = r3; 3915 r3 = r11; 3916 r11 = t; 3917 }; 3918 if (r7 >= r15) { 3919 ulong const t = r7; 3920 r7 = r15; 3921 r15 = t; 3922 }; 3923 if (r3 >= r7) { 3924 ulong const t = r3; 3925 r3 = r7; 3926 r7 = t; 3927 }; 3928 if (r11 >= r15) { 3929 ulong const t = r11; 3930 r11 = r15; 3931 r15 = t; 3932 }; 3933 if (r1 >= r3) { 3934 ulong const t = r1; 3935 r1 = r3; 3936 r3 = t; 3937 }; 3938 if (r5 >= r7) { 3939 ulong const t = r5; 3940 r5 = r7; 3941 r7 = t; 3942 }; 3943 if (r9 >= r11) { 3944 ulong const t = r9; 3945 r9 = r11; 3946 r11 = t; 3947 }; 3948 if (r13 >= r15) { 3949 ulong const t = r13; 3950 r13 = r15; 3951 r15 = t; 3952 }; 3953 if (r2 >= r10) { 3954 ulong const t = r2; 3955 r2 = r10; 3956 r10 = t; 3957 }; 3958 if (r6 >= r14) { 3959 ulong const t = r6; 3960 r6 = r14; 3961 r14 = t; 3962 }; 3963 if (r2 >= r6) { 3964 ulong const t = r2; 3965 r2 = r6; 3966 r6 = t; 3967 }; 3968 if (r10 >= r14) { 3969 ulong const t = r10; 3970 r10 = r14; 3971 r14 = t; 3972 }; 3973 if (r4 >= r12) { 3974 ulong const t = r4; 3975 r4 = r12; 3976 r12 = t; 3977 }; 3978 if (r8 >= r16) { 3979 ulong const t = r8; 3980 r8 = r16; 3981 r16 = t; 3982 }; 3983 if (r4 >= r8) { 3984 ulong const t = r4; 3985 r4 = r8; 3986 r8 = t; 3987 }; 3988 if (r12 >= r16) { 3989 ulong const t = r12; 3990 r12 = r16; 3991 r16 = t; 3992 }; 3993 if (r2 >= r4) { 3994 ulong const t = r2; 3995 
r2 = r4; 3996 r4 = t; 3997 }; 3998 if (r6 >= r8) { 3999 ulong const t = r6; 4000 r6 = r8; 4001 r8 = t; 4002 }; 4003 if (r10 >= r12) { 4004 ulong const t = r10; 4005 r10 = r12; 4006 r12 = t; 4007 }; 4008 if (r14 >= r16) { 4009 ulong const t = r14; 4010 r14 = r16; 4011 r16 = t; 4012 }; 4013 if (r1 >= r2) { 4014 ulong const t = r1; 4015 r1 = r2; 4016 r2 = t; 4017 }; 4018 if (r3 >= r4) { 4019 ulong const t = r3; 4020 r3 = r4; 4021 r4 = t; 4022 }; 4023 if (r5 >= r6) { 4024 ulong const t = r5; 4025 r5 = r6; 4026 r6 = t; 4027 }; 4028 if (r7 >= r8) { 4029 ulong const t = r7; 4030 r7 = r8; 4031 r8 = t; 4032 }; 4033 if (r9 >= r10) { 4034 ulong const t = r9; 4035 r9 = r10; 4036 r10 = t; 4037 }; 4038 if (r11 >= r12) { 4039 ulong const t = r11; 4040 r11 = r12; 4041 r12 = t; 4042 }; 4043 if (r13 >= r14) { 4044 ulong const t = r13; 4045 r13 = r14; 4046 r14 = t; 4047 }; 4048 if (r15 >= r16) { 4049 ulong const t = r15; 4050 r15 = r16; 4051 r16 = t; 4052 }; 4053 uint const smem_l_idx = 4054 get_sub_group_id() * ((1 << 3) * 4) + get_sub_group_local_id(); 4055 uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 3) * 4) + 4056 (get_sub_group_local_id() ^ ((1 << 3) - 1)); 4057 shared.m[get_local_id(0) + (4 * (1 << 3) * 0)] = r1; 4058 shared.m[get_local_id(0) + (4 * (1 << 3) * 1)] = r16; 4059 shared.m[get_local_id(0) + (4 * (1 << 3) * 2)] = r2; 4060 shared.m[get_local_id(0) + (4 * (1 << 3) * 3)] = r15; 4061 shared.m[get_local_id(0) + (4 * (1 << 3) * 4)] = r3; 4062 shared.m[get_local_id(0) + (4 * (1 << 3) * 5)] = r14; 4063 shared.m[get_local_id(0) + (4 * (1 << 3) * 6)] = r4; 4064 shared.m[get_local_id(0) + (4 * (1 << 3) * 7)] = r13; 4065 shared.m[get_local_id(0) + (4 * (1 << 3) * 8)] = r5; 4066 shared.m[get_local_id(0) + (4 * (1 << 3) * 9)] = r12; 4067 shared.m[get_local_id(0) + (4 * (1 << 3) * 10)] = r6; 4068 shared.m[get_local_id(0) + (4 * (1 << 3) * 11)] = r11; 4069 shared.m[get_local_id(0) + (4 * (1 << 3) * 12)] = r7; 4070 shared.m[get_local_id(0) + (4 * (1 << 3) * 13)] = r10; 
4071 shared.m[get_local_id(0) + (4 * (1 << 3) * 14)] = r8; 4072 shared.m[get_local_id(0) + (4 * (1 << 3) * 15)] = r9; 4073 barrier(CLK_LOCAL_MEM_FENCE); 4074 { 4075 { 4076 ulong r0_1 = shared.m[smem_l_idx + (0)]; 4077 ulong r0_2 = shared.m[smem_r_idx + (8)]; 4078 if (r0_1 >= r0_2) { 4079 ulong const t = r0_1; 4080 r0_1 = r0_2; 4081 r0_2 = t; 4082 }; 4083 shared.m[smem_l_idx + (0)] = r0_1; 4084 shared.m[smem_r_idx + (8)] = r0_2; 4085 } 4086 { 4087 ulong r1_1 = shared.m[smem_l_idx + (16)]; 4088 ulong r1_2 = shared.m[smem_r_idx + (24)]; 4089 if (r1_1 >= r1_2) { 4090 ulong const t = r1_1; 4091 r1_1 = r1_2; 4092 r1_2 = t; 4093 }; 4094 shared.m[smem_l_idx + (16)] = r1_1; 4095 shared.m[smem_r_idx + (24)] = r1_2; 4096 } 4097 { 4098 ulong r0_1 = shared.m[smem_l_idx + (128)]; 4099 ulong r0_2 = shared.m[smem_r_idx + (136)]; 4100 if (r0_1 >= r0_2) { 4101 ulong const t = r0_1; 4102 r0_1 = r0_2; 4103 r0_2 = t; 4104 }; 4105 shared.m[smem_l_idx + (128)] = r0_1; 4106 shared.m[smem_r_idx + (136)] = r0_2; 4107 } 4108 { 4109 ulong r1_1 = shared.m[smem_l_idx + (144)]; 4110 ulong r1_2 = shared.m[smem_r_idx + (152)]; 4111 if (r1_1 >= r1_2) { 4112 ulong const t = r1_1; 4113 r1_1 = r1_2; 4114 r1_2 = t; 4115 }; 4116 shared.m[smem_l_idx + (144)] = r1_1; 4117 shared.m[smem_r_idx + (152)] = r1_2; 4118 } 4119 { 4120 ulong r0_1 = shared.m[smem_l_idx + (256)]; 4121 ulong r0_2 = shared.m[smem_r_idx + (264)]; 4122 if (r0_1 >= r0_2) { 4123 ulong const t = r0_1; 4124 r0_1 = r0_2; 4125 r0_2 = t; 4126 }; 4127 shared.m[smem_l_idx + (256)] = r0_1; 4128 shared.m[smem_r_idx + (264)] = r0_2; 4129 } 4130 { 4131 ulong r1_1 = shared.m[smem_l_idx + (272)]; 4132 ulong r1_2 = shared.m[smem_r_idx + (280)]; 4133 if (r1_1 >= r1_2) { 4134 ulong const t = r1_1; 4135 r1_1 = r1_2; 4136 r1_2 = t; 4137 }; 4138 shared.m[smem_l_idx + (272)] = r1_1; 4139 shared.m[smem_r_idx + (280)] = r1_2; 4140 } 4141 { 4142 ulong r0_1 = shared.m[smem_l_idx + (384)]; 4143 ulong r0_2 = shared.m[smem_r_idx + (392)]; 4144 if (r0_1 >= r0_2) { 
4145 ulong const t = r0_1; 4146 r0_1 = r0_2; 4147 r0_2 = t; 4148 }; 4149 shared.m[smem_l_idx + (384)] = r0_1; 4150 shared.m[smem_r_idx + (392)] = r0_2; 4151 } 4152 { 4153 ulong r1_1 = shared.m[smem_l_idx + (400)]; 4154 ulong r1_2 = shared.m[smem_r_idx + (408)]; 4155 if (r1_1 >= r1_2) { 4156 ulong const t = r1_1; 4157 r1_1 = r1_2; 4158 r1_2 = t; 4159 }; 4160 shared.m[smem_l_idx + (400)] = r1_1; 4161 shared.m[smem_r_idx + (408)] = r1_2; 4162 } 4163 } 4164 barrier(CLK_LOCAL_MEM_FENCE); 4165 r1 = shared.m[get_local_id(0) + (4 * (1 << 3) * 0)]; 4166 r16 = shared.m[get_local_id(0) + (4 * (1 << 3) * 1)]; 4167 r2 = shared.m[get_local_id(0) + (4 * (1 << 3) * 2)]; 4168 r15 = shared.m[get_local_id(0) + (4 * (1 << 3) * 3)]; 4169 r3 = shared.m[get_local_id(0) + (4 * (1 << 3) * 4)]; 4170 r14 = shared.m[get_local_id(0) + (4 * (1 << 3) * 5)]; 4171 r4 = shared.m[get_local_id(0) + (4 * (1 << 3) * 6)]; 4172 r13 = shared.m[get_local_id(0) + (4 * (1 << 3) * 7)]; 4173 r5 = shared.m[get_local_id(0) + (4 * (1 << 3) * 8)]; 4174 r12 = shared.m[get_local_id(0) + (4 * (1 << 3) * 9)]; 4175 r6 = shared.m[get_local_id(0) + (4 * (1 << 3) * 10)]; 4176 r11 = shared.m[get_local_id(0) + (4 * (1 << 3) * 11)]; 4177 r7 = shared.m[get_local_id(0) + (4 * (1 << 3) * 12)]; 4178 r10 = shared.m[get_local_id(0) + (4 * (1 << 3) * 13)]; 4179 r8 = shared.m[get_local_id(0) + (4 * (1 << 3) * 14)]; 4180 r9 = shared.m[get_local_id(0) + (4 * (1 << 3) * 15)]; 4181 { 4182 { 4183 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 4184 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4185 ; 4186 { 4187 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4188 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4189 }; 4190 { 4191 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4192 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4193 }; 4194 { 4195 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4196 r3 = ((r3 <= ta) ^ t_lt) ? 
ta : r3; 4197 }; 4198 { 4199 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4200 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4201 }; 4202 { 4203 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4204 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4205 }; 4206 { 4207 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4208 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4209 }; 4210 { 4211 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4212 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4213 }; 4214 { 4215 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4216 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4217 }; 4218 { 4219 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 4220 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 4221 }; 4222 { 4223 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 4224 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 4225 }; 4226 { 4227 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 4228 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 4229 }; 4230 { 4231 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 4232 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 4233 }; 4234 { 4235 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 4236 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 4237 }; 4238 { 4239 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 4240 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 4241 }; 4242 { 4243 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 4244 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 4245 }; 4246 { 4247 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 4248 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 4249 }; 4250 } 4251 { 4252 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 4253 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4254 ; 4255 { 4256 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4257 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 4258 }; 4259 { 4260 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4261 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4262 }; 4263 { 4264 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4265 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4266 }; 4267 { 4268 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4269 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4270 }; 4271 { 4272 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4273 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4274 }; 4275 { 4276 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4277 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4278 }; 4279 { 4280 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4281 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4282 }; 4283 { 4284 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4285 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4286 }; 4287 { 4288 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 4289 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 4290 }; 4291 { 4292 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 4293 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 4294 }; 4295 { 4296 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 4297 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 4298 }; 4299 { 4300 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 4301 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 4302 }; 4303 { 4304 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 4305 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 4306 }; 4307 { 4308 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 4309 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 4310 }; 4311 { 4312 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 4313 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 4314 }; 4315 { 4316 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 4317 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 4318 }; 4319 } 4320 { 4321 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 4322 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4323 ; 4324 { 4325 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4326 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4327 }; 4328 { 4329 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4330 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4331 }; 4332 { 4333 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4334 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4335 }; 4336 { 4337 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4338 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4339 }; 4340 { 4341 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4342 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4343 }; 4344 { 4345 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4346 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4347 }; 4348 { 4349 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4350 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4351 }; 4352 { 4353 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4354 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4355 }; 4356 { 4357 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 4358 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 4359 }; 4360 { 4361 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 4362 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 4363 }; 4364 { 4365 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 4366 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 4367 }; 4368 { 4369 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 4370 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 4371 }; 4372 { 4373 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 4374 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 4375 }; 4376 { 4377 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 4378 r14 = ((r14 <= ta) ^ t_lt) ? 
ta : r14; 4379 }; 4380 { 4381 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 4382 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 4383 }; 4384 { 4385 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 4386 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 4387 }; 4388 } 4389 if (r1 >= r9) { 4390 ulong const t = r1; 4391 r1 = r9; 4392 r9 = t; 4393 }; 4394 if (r5 >= r13) { 4395 ulong const t = r5; 4396 r5 = r13; 4397 r13 = t; 4398 }; 4399 if (r1 >= r5) { 4400 ulong const t = r1; 4401 r1 = r5; 4402 r5 = t; 4403 }; 4404 if (r9 >= r13) { 4405 ulong const t = r9; 4406 r9 = r13; 4407 r13 = t; 4408 }; 4409 if (r3 >= r11) { 4410 ulong const t = r3; 4411 r3 = r11; 4412 r11 = t; 4413 }; 4414 if (r7 >= r15) { 4415 ulong const t = r7; 4416 r7 = r15; 4417 r15 = t; 4418 }; 4419 if (r3 >= r7) { 4420 ulong const t = r3; 4421 r3 = r7; 4422 r7 = t; 4423 }; 4424 if (r11 >= r15) { 4425 ulong const t = r11; 4426 r11 = r15; 4427 r15 = t; 4428 }; 4429 if (r1 >= r3) { 4430 ulong const t = r1; 4431 r1 = r3; 4432 r3 = t; 4433 }; 4434 if (r5 >= r7) { 4435 ulong const t = r5; 4436 r5 = r7; 4437 r7 = t; 4438 }; 4439 if (r9 >= r11) { 4440 ulong const t = r9; 4441 r9 = r11; 4442 r11 = t; 4443 }; 4444 if (r13 >= r15) { 4445 ulong const t = r13; 4446 r13 = r15; 4447 r15 = t; 4448 }; 4449 if (r2 >= r10) { 4450 ulong const t = r2; 4451 r2 = r10; 4452 r10 = t; 4453 }; 4454 if (r6 >= r14) { 4455 ulong const t = r6; 4456 r6 = r14; 4457 r14 = t; 4458 }; 4459 if (r2 >= r6) { 4460 ulong const t = r2; 4461 r2 = r6; 4462 r6 = t; 4463 }; 4464 if (r10 >= r14) { 4465 ulong const t = r10; 4466 r10 = r14; 4467 r14 = t; 4468 }; 4469 if (r4 >= r12) { 4470 ulong const t = r4; 4471 r4 = r12; 4472 r12 = t; 4473 }; 4474 if (r8 >= r16) { 4475 ulong const t = r8; 4476 r8 = r16; 4477 r16 = t; 4478 }; 4479 if (r4 >= r8) { 4480 ulong const t = r4; 4481 r4 = r8; 4482 r8 = t; 4483 }; 4484 if (r12 >= r16) { 4485 ulong const t = r12; 4486 r12 = r16; 4487 r16 = t; 4488 }; 4489 if (r2 >= r4) { 4490 ulong const t = r2; 4491 
r2 = r4; 4492 r4 = t; 4493 }; 4494 if (r6 >= r8) { 4495 ulong const t = r6; 4496 r6 = r8; 4497 r8 = t; 4498 }; 4499 if (r10 >= r12) { 4500 ulong const t = r10; 4501 r10 = r12; 4502 r12 = t; 4503 }; 4504 if (r14 >= r16) { 4505 ulong const t = r14; 4506 r14 = r16; 4507 r16 = t; 4508 }; 4509 if (r1 >= r2) { 4510 ulong const t = r1; 4511 r1 = r2; 4512 r2 = t; 4513 }; 4514 if (r3 >= r4) { 4515 ulong const t = r3; 4516 r3 = r4; 4517 r4 = t; 4518 }; 4519 if (r5 >= r6) { 4520 ulong const t = r5; 4521 r5 = r6; 4522 r6 = t; 4523 }; 4524 if (r7 >= r8) { 4525 ulong const t = r7; 4526 r7 = r8; 4527 r8 = t; 4528 }; 4529 if (r9 >= r10) { 4530 ulong const t = r9; 4531 r9 = r10; 4532 r10 = t; 4533 }; 4534 if (r11 >= r12) { 4535 ulong const t = r11; 4536 r11 = r12; 4537 r12 = t; 4538 }; 4539 if (r13 >= r14) { 4540 ulong const t = r13; 4541 r13 = r14; 4542 r14 = t; 4543 }; 4544 if (r15 >= r16) { 4545 ulong const t = r15; 4546 r15 = r16; 4547 r16 = t; 4548 }; 4549 } 4550 shared.m[get_local_id(0) + (4 * (1 << 3) * 0)] = r1; 4551 shared.m[get_local_id(0) + (4 * (1 << 3) * 1)] = r16; 4552 shared.m[get_local_id(0) + (4 * (1 << 3) * 2)] = r2; 4553 shared.m[get_local_id(0) + (4 * (1 << 3) * 3)] = r15; 4554 shared.m[get_local_id(0) + (4 * (1 << 3) * 4)] = r3; 4555 shared.m[get_local_id(0) + (4 * (1 << 3) * 5)] = r14; 4556 shared.m[get_local_id(0) + (4 * (1 << 3) * 6)] = r4; 4557 shared.m[get_local_id(0) + (4 * (1 << 3) * 7)] = r13; 4558 shared.m[get_local_id(0) + (4 * (1 << 3) * 8)] = r5; 4559 shared.m[get_local_id(0) + (4 * (1 << 3) * 9)] = r12; 4560 shared.m[get_local_id(0) + (4 * (1 << 3) * 10)] = r6; 4561 shared.m[get_local_id(0) + (4 * (1 << 3) * 11)] = r11; 4562 shared.m[get_local_id(0) + (4 * (1 << 3) * 12)] = r7; 4563 shared.m[get_local_id(0) + (4 * (1 << 3) * 13)] = r10; 4564 shared.m[get_local_id(0) + (4 * (1 << 3) * 14)] = r8; 4565 shared.m[get_local_id(0) + (4 * (1 << 3) * 15)] = r9; 4566 barrier(CLK_LOCAL_MEM_FENCE); 4567 { 4568 { 4569 ulong r0_1 = shared.m[smem_l_idx + (0)]; 
4570 ulong r0_2 = shared.m[smem_l_idx + (8)]; 4571 ulong r0_3 = shared.m[smem_r_idx + (16)]; 4572 ulong r0_4 = shared.m[smem_r_idx + (24)]; 4573 if (r0_2 >= r0_3) { 4574 ulong const t = r0_2; 4575 r0_2 = r0_3; 4576 r0_3 = t; 4577 }; 4578 if (r0_1 >= r0_4) { 4579 ulong const t = r0_1; 4580 r0_1 = r0_4; 4581 r0_4 = t; 4582 }; 4583 if (r0_3 >= r0_4) { 4584 ulong const t = r0_3; 4585 r0_3 = r0_4; 4586 r0_4 = t; 4587 }; 4588 if (r0_1 >= r0_2) { 4589 ulong const t = r0_1; 4590 r0_1 = r0_2; 4591 r0_2 = t; 4592 }; 4593 shared.m[smem_l_idx + (0)] = r0_1; 4594 shared.m[smem_l_idx + (8)] = r0_2; 4595 shared.m[smem_r_idx + (16)] = r0_3; 4596 shared.m[smem_r_idx + (24)] = r0_4; 4597 } 4598 { 4599 ulong r0_1 = shared.m[smem_l_idx + (128)]; 4600 ulong r0_2 = shared.m[smem_l_idx + (136)]; 4601 ulong r0_3 = shared.m[smem_r_idx + (144)]; 4602 ulong r0_4 = shared.m[smem_r_idx + (152)]; 4603 if (r0_2 >= r0_3) { 4604 ulong const t = r0_2; 4605 r0_2 = r0_3; 4606 r0_3 = t; 4607 }; 4608 if (r0_1 >= r0_4) { 4609 ulong const t = r0_1; 4610 r0_1 = r0_4; 4611 r0_4 = t; 4612 }; 4613 if (r0_3 >= r0_4) { 4614 ulong const t = r0_3; 4615 r0_3 = r0_4; 4616 r0_4 = t; 4617 }; 4618 if (r0_1 >= r0_2) { 4619 ulong const t = r0_1; 4620 r0_1 = r0_2; 4621 r0_2 = t; 4622 }; 4623 shared.m[smem_l_idx + (128)] = r0_1; 4624 shared.m[smem_l_idx + (136)] = r0_2; 4625 shared.m[smem_r_idx + (144)] = r0_3; 4626 shared.m[smem_r_idx + (152)] = r0_4; 4627 } 4628 { 4629 ulong r0_1 = shared.m[smem_l_idx + (256)]; 4630 ulong r0_2 = shared.m[smem_l_idx + (264)]; 4631 ulong r0_3 = shared.m[smem_r_idx + (272)]; 4632 ulong r0_4 = shared.m[smem_r_idx + (280)]; 4633 if (r0_2 >= r0_3) { 4634 ulong const t = r0_2; 4635 r0_2 = r0_3; 4636 r0_3 = t; 4637 }; 4638 if (r0_1 >= r0_4) { 4639 ulong const t = r0_1; 4640 r0_1 = r0_4; 4641 r0_4 = t; 4642 }; 4643 if (r0_3 >= r0_4) { 4644 ulong const t = r0_3; 4645 r0_3 = r0_4; 4646 r0_4 = t; 4647 }; 4648 if (r0_1 >= r0_2) { 4649 ulong const t = r0_1; 4650 r0_1 = r0_2; 4651 r0_2 = t; 4652 }; 
4653 shared.m[smem_l_idx + (256)] = r0_1; 4654 shared.m[smem_l_idx + (264)] = r0_2; 4655 shared.m[smem_r_idx + (272)] = r0_3; 4656 shared.m[smem_r_idx + (280)] = r0_4; 4657 } 4658 { 4659 ulong r0_1 = shared.m[smem_l_idx + (384)]; 4660 ulong r0_2 = shared.m[smem_l_idx + (392)]; 4661 ulong r0_3 = shared.m[smem_r_idx + (400)]; 4662 ulong r0_4 = shared.m[smem_r_idx + (408)]; 4663 if (r0_2 >= r0_3) { 4664 ulong const t = r0_2; 4665 r0_2 = r0_3; 4666 r0_3 = t; 4667 }; 4668 if (r0_1 >= r0_4) { 4669 ulong const t = r0_1; 4670 r0_1 = r0_4; 4671 r0_4 = t; 4672 }; 4673 if (r0_3 >= r0_4) { 4674 ulong const t = r0_3; 4675 r0_3 = r0_4; 4676 r0_4 = t; 4677 }; 4678 if (r0_1 >= r0_2) { 4679 ulong const t = r0_1; 4680 r0_1 = r0_2; 4681 r0_2 = t; 4682 }; 4683 shared.m[smem_l_idx + (384)] = r0_1; 4684 shared.m[smem_l_idx + (392)] = r0_2; 4685 shared.m[smem_r_idx + (400)] = r0_3; 4686 shared.m[smem_r_idx + (408)] = r0_4; 4687 } 4688 } 4689 barrier(CLK_LOCAL_MEM_FENCE); 4690 r1 = shared.m[get_local_id(0) + (4 * (1 << 3) * 0)]; 4691 r16 = shared.m[get_local_id(0) + (4 * (1 << 3) * 1)]; 4692 r2 = shared.m[get_local_id(0) + (4 * (1 << 3) * 2)]; 4693 r15 = shared.m[get_local_id(0) + (4 * (1 << 3) * 3)]; 4694 r3 = shared.m[get_local_id(0) + (4 * (1 << 3) * 4)]; 4695 r14 = shared.m[get_local_id(0) + (4 * (1 << 3) * 5)]; 4696 r4 = shared.m[get_local_id(0) + (4 * (1 << 3) * 6)]; 4697 r13 = shared.m[get_local_id(0) + (4 * (1 << 3) * 7)]; 4698 r5 = shared.m[get_local_id(0) + (4 * (1 << 3) * 8)]; 4699 r12 = shared.m[get_local_id(0) + (4 * (1 << 3) * 9)]; 4700 r6 = shared.m[get_local_id(0) + (4 * (1 << 3) * 10)]; 4701 r11 = shared.m[get_local_id(0) + (4 * (1 << 3) * 11)]; 4702 r7 = shared.m[get_local_id(0) + (4 * (1 << 3) * 12)]; 4703 r10 = shared.m[get_local_id(0) + (4 * (1 << 3) * 13)]; 4704 r8 = shared.m[get_local_id(0) + (4 * (1 << 3) * 14)]; 4705 r9 = shared.m[get_local_id(0) + (4 * (1 << 3) * 15)]; 4706 { 4707 { 4708 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 4709 int const t_lt 
= get_sub_group_local_id() < half_lane_idx; 4710 ; 4711 { 4712 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4713 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4714 }; 4715 { 4716 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4717 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4718 }; 4719 { 4720 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4721 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4722 }; 4723 { 4724 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4725 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4726 }; 4727 { 4728 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4729 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4730 }; 4731 { 4732 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4733 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4734 }; 4735 { 4736 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4737 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4738 }; 4739 { 4740 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4741 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4742 }; 4743 { 4744 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 4745 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 4746 }; 4747 { 4748 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 4749 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 4750 }; 4751 { 4752 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 4753 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 4754 }; 4755 { 4756 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 4757 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 4758 }; 4759 { 4760 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 4761 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 4762 }; 4763 { 4764 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 4765 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 4766 }; 4767 { 4768 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 4769 r15 = ((r15 <= ta) ^ t_lt) ? 
ta : r15; 4770 }; 4771 { 4772 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 4773 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 4774 }; 4775 } 4776 { 4777 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 4778 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4779 ; 4780 { 4781 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4782 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4783 }; 4784 { 4785 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4786 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4787 }; 4788 { 4789 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4790 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4791 }; 4792 { 4793 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4794 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4795 }; 4796 { 4797 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4798 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4799 }; 4800 { 4801 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4802 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4803 }; 4804 { 4805 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4806 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4807 }; 4808 { 4809 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4810 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4811 }; 4812 { 4813 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 4814 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 4815 }; 4816 { 4817 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 4818 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 4819 }; 4820 { 4821 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 4822 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 4823 }; 4824 { 4825 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 4826 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 4827 }; 4828 { 4829 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 4830 r13 = ((r13 <= ta) ^ t_lt) ? 
ta : r13; 4831 }; 4832 { 4833 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 4834 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 4835 }; 4836 { 4837 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 4838 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 4839 }; 4840 { 4841 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 4842 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 4843 }; 4844 } 4845 { 4846 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 4847 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4848 ; 4849 { 4850 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4851 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4852 }; 4853 { 4854 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4855 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4856 }; 4857 { 4858 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4859 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4860 }; 4861 { 4862 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4863 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4864 }; 4865 { 4866 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4867 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4868 }; 4869 { 4870 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4871 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4872 }; 4873 { 4874 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4875 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4876 }; 4877 { 4878 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4879 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4880 }; 4881 { 4882 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 4883 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 4884 }; 4885 { 4886 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 4887 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 4888 }; 4889 { 4890 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 4891 r11 = ((r11 <= ta) ^ t_lt) ? 
ta : r11; 4892 }; 4893 { 4894 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 4895 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 4896 }; 4897 { 4898 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 4899 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 4900 }; 4901 { 4902 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 4903 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 4904 }; 4905 { 4906 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 4907 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 4908 }; 4909 { 4910 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 4911 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 4912 }; 4913 } 4914 if (r1 >= r9) { 4915 ulong const t = r1; 4916 r1 = r9; 4917 r9 = t; 4918 }; 4919 if (r5 >= r13) { 4920 ulong const t = r5; 4921 r5 = r13; 4922 r13 = t; 4923 }; 4924 if (r1 >= r5) { 4925 ulong const t = r1; 4926 r1 = r5; 4927 r5 = t; 4928 }; 4929 if (r9 >= r13) { 4930 ulong const t = r9; 4931 r9 = r13; 4932 r13 = t; 4933 }; 4934 if (r3 >= r11) { 4935 ulong const t = r3; 4936 r3 = r11; 4937 r11 = t; 4938 }; 4939 if (r7 >= r15) { 4940 ulong const t = r7; 4941 r7 = r15; 4942 r15 = t; 4943 }; 4944 if (r3 >= r7) { 4945 ulong const t = r3; 4946 r3 = r7; 4947 r7 = t; 4948 }; 4949 if (r11 >= r15) { 4950 ulong const t = r11; 4951 r11 = r15; 4952 r15 = t; 4953 }; 4954 if (r1 >= r3) { 4955 ulong const t = r1; 4956 r1 = r3; 4957 r3 = t; 4958 }; 4959 if (r5 >= r7) { 4960 ulong const t = r5; 4961 r5 = r7; 4962 r7 = t; 4963 }; 4964 if (r9 >= r11) { 4965 ulong const t = r9; 4966 r9 = r11; 4967 r11 = t; 4968 }; 4969 if (r13 >= r15) { 4970 ulong const t = r13; 4971 r13 = r15; 4972 r15 = t; 4973 }; 4974 if (r2 >= r10) { 4975 ulong const t = r2; 4976 r2 = r10; 4977 r10 = t; 4978 }; 4979 if (r6 >= r14) { 4980 ulong const t = r6; 4981 r6 = r14; 4982 r14 = t; 4983 }; 4984 if (r2 >= r6) { 4985 ulong const t = r2; 4986 r2 = r6; 4987 r6 = t; 4988 }; 4989 if (r10 >= r14) { 4990 ulong const t = r10; 4991 r10 = r14; 4992 r14 = t; 4993 }; 4994 if 
(r4 >= r12) { 4995 ulong const t = r4; 4996 r4 = r12; 4997 r12 = t; 4998 }; 4999 if (r8 >= r16) { 5000 ulong const t = r8; 5001 r8 = r16; 5002 r16 = t; 5003 }; 5004 if (r4 >= r8) { 5005 ulong const t = r4; 5006 r4 = r8; 5007 r8 = t; 5008 }; 5009 if (r12 >= r16) { 5010 ulong const t = r12; 5011 r12 = r16; 5012 r16 = t; 5013 }; 5014 if (r2 >= r4) { 5015 ulong const t = r2; 5016 r2 = r4; 5017 r4 = t; 5018 }; 5019 if (r6 >= r8) { 5020 ulong const t = r6; 5021 r6 = r8; 5022 r8 = t; 5023 }; 5024 if (r10 >= r12) { 5025 ulong const t = r10; 5026 r10 = r12; 5027 r12 = t; 5028 }; 5029 if (r14 >= r16) { 5030 ulong const t = r14; 5031 r14 = r16; 5032 r16 = t; 5033 }; 5034 if (r1 >= r2) { 5035 ulong const t = r1; 5036 r1 = r2; 5037 r2 = t; 5038 }; 5039 if (r3 >= r4) { 5040 ulong const t = r3; 5041 r3 = r4; 5042 r4 = t; 5043 }; 5044 if (r5 >= r6) { 5045 ulong const t = r5; 5046 r5 = r6; 5047 r6 = t; 5048 }; 5049 if (r7 >= r8) { 5050 ulong const t = r7; 5051 r7 = r8; 5052 r8 = t; 5053 }; 5054 if (r9 >= r10) { 5055 ulong const t = r9; 5056 r9 = r10; 5057 r10 = t; 5058 }; 5059 if (r11 >= r12) { 5060 ulong const t = r11; 5061 r11 = r12; 5062 r12 = t; 5063 }; 5064 if (r13 >= r14) { 5065 ulong const t = r13; 5066 r13 = r14; 5067 r14 = t; 5068 }; 5069 if (r15 >= r16) { 5070 ulong const t = r15; 5071 r15 = r16; 5072 r16 = t; 5073 }; 5074 } 5075 vout[gmem_idx + (1 << 3) * 0] = r1; 5076 vout[gmem_idx + (1 << 3) * 1] = r2; 5077 vout[gmem_idx + (1 << 3) * 2] = r3; 5078 vout[gmem_idx + (1 << 3) * 3] = r4; 5079 vout[gmem_idx + (1 << 3) * 4] = r5; 5080 vout[gmem_idx + (1 << 3) * 5] = r6; 5081 vout[gmem_idx + (1 << 3) * 6] = r7; 5082 vout[gmem_idx + (1 << 3) * 7] = r8; 5083 vout[gmem_idx + (1 << 3) * 8] = r9; 5084 vout[gmem_idx + (1 << 3) * 9] = r10; 5085 vout[gmem_idx + (1 << 3) * 10] = r11; 5086 vout[gmem_idx + (1 << 3) * 11] = r12; 5087 vout[gmem_idx + (1 << 3) * 12] = r13; 5088 vout[gmem_idx + (1 << 3) * 13] = r14; 5089 vout[gmem_idx + (1 << 3) * 14] = r15; 5090 vout[gmem_idx + (1 << 3) * 
15] = r16; 5091} 5092 5093__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) 5094__attribute__((reqd_work_group_size((1 << 3) * 8, 1, 1))) void 5095hs_kernel_bs_3(__global ulong const* const restrict vin, 5096 __global ulong* const restrict vout) 5097{ 5098 __local struct 5099 { 5100 ulong m[64 * 16]; 5101 } shared; 5102 5103 uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 + 5104 (get_local_id(0) & ((1 << 3) - 1)); 5105 ulong r1 = vin[gmem_idx + (1 << 3) * 0]; 5106 ulong r2 = vin[gmem_idx + (1 << 3) * 1]; 5107 ulong r3 = vin[gmem_idx + (1 << 3) * 2]; 5108 ulong r4 = vin[gmem_idx + (1 << 3) * 3]; 5109 ulong r5 = vin[gmem_idx + (1 << 3) * 4]; 5110 ulong r6 = vin[gmem_idx + (1 << 3) * 5]; 5111 ulong r7 = vin[gmem_idx + (1 << 3) * 6]; 5112 ulong r8 = vin[gmem_idx + (1 << 3) * 7]; 5113 ulong r9 = vin[gmem_idx + (1 << 3) * 8]; 5114 ulong r10 = vin[gmem_idx + (1 << 3) * 9]; 5115 ulong r11 = vin[gmem_idx + (1 << 3) * 10]; 5116 ulong r12 = vin[gmem_idx + (1 << 3) * 11]; 5117 ulong r13 = vin[gmem_idx + (1 << 3) * 12]; 5118 ulong r14 = vin[gmem_idx + (1 << 3) * 13]; 5119 ulong r15 = vin[gmem_idx + (1 << 3) * 14]; 5120 ulong r16 = vin[gmem_idx + (1 << 3) * 15]; 5121 if (r1 >= r2) { 5122 ulong const t = r1; 5123 r1 = r2; 5124 r2 = t; 5125 }; 5126 if (r3 >= r4) { 5127 ulong const t = r3; 5128 r3 = r4; 5129 r4 = t; 5130 }; 5131 if (r5 >= r6) { 5132 ulong const t = r5; 5133 r5 = r6; 5134 r6 = t; 5135 }; 5136 if (r7 >= r8) { 5137 ulong const t = r7; 5138 r7 = r8; 5139 r8 = t; 5140 }; 5141 if (r9 >= r10) { 5142 ulong const t = r9; 5143 r9 = r10; 5144 r10 = t; 5145 }; 5146 if (r11 >= r12) { 5147 ulong const t = r11; 5148 r11 = r12; 5149 r12 = t; 5150 }; 5151 if (r13 >= r14) { 5152 ulong const t = r13; 5153 r13 = r14; 5154 r14 = t; 5155 }; 5156 if (r15 >= r16) { 5157 ulong const t = r15; 5158 r15 = r16; 5159 r16 = t; 5160 }; 5161 if (r1 >= r3) { 5162 ulong const t = r1; 5163 r1 = r3; 5164 r3 = t; 5165 }; 5166 if (r5 >= r7) { 5167 ulong const t = r5; 5168 r5 
= r7; 5169 r7 = t; 5170 }; 5171 if (r9 >= r11) { 5172 ulong const t = r9; 5173 r9 = r11; 5174 r11 = t; 5175 }; 5176 if (r13 >= r15) { 5177 ulong const t = r13; 5178 r13 = r15; 5179 r15 = t; 5180 }; 5181 if (r2 >= r4) { 5182 ulong const t = r2; 5183 r2 = r4; 5184 r4 = t; 5185 }; 5186 if (r6 >= r8) { 5187 ulong const t = r6; 5188 r6 = r8; 5189 r8 = t; 5190 }; 5191 if (r10 >= r12) { 5192 ulong const t = r10; 5193 r10 = r12; 5194 r12 = t; 5195 }; 5196 if (r14 >= r16) { 5197 ulong const t = r14; 5198 r14 = r16; 5199 r16 = t; 5200 }; 5201 if (r1 >= r5) { 5202 ulong const t = r1; 5203 r1 = r5; 5204 r5 = t; 5205 }; 5206 if (r9 >= r13) { 5207 ulong const t = r9; 5208 r9 = r13; 5209 r13 = t; 5210 }; 5211 if (r2 >= r6) { 5212 ulong const t = r2; 5213 r2 = r6; 5214 r6 = t; 5215 }; 5216 if (r10 >= r14) { 5217 ulong const t = r10; 5218 r10 = r14; 5219 r14 = t; 5220 }; 5221 if (r3 >= r7) { 5222 ulong const t = r3; 5223 r3 = r7; 5224 r7 = t; 5225 }; 5226 if (r11 >= r15) { 5227 ulong const t = r11; 5228 r11 = r15; 5229 r15 = t; 5230 }; 5231 if (r4 >= r8) { 5232 ulong const t = r4; 5233 r4 = r8; 5234 r8 = t; 5235 }; 5236 if (r12 >= r16) { 5237 ulong const t = r12; 5238 r12 = r16; 5239 r16 = t; 5240 }; 5241 if (r1 >= r9) { 5242 ulong const t = r1; 5243 r1 = r9; 5244 r9 = t; 5245 }; 5246 if (r2 >= r10) { 5247 ulong const t = r2; 5248 r2 = r10; 5249 r10 = t; 5250 }; 5251 if (r3 >= r11) { 5252 ulong const t = r3; 5253 r3 = r11; 5254 r11 = t; 5255 }; 5256 if (r4 >= r12) { 5257 ulong const t = r4; 5258 r4 = r12; 5259 r12 = t; 5260 }; 5261 if (r5 >= r13) { 5262 ulong const t = r5; 5263 r5 = r13; 5264 r13 = t; 5265 }; 5266 if (r6 >= r14) { 5267 ulong const t = r6; 5268 r6 = r14; 5269 r14 = t; 5270 }; 5271 if (r7 >= r15) { 5272 ulong const t = r7; 5273 r7 = r15; 5274 r15 = t; 5275 }; 5276 if (r8 >= r16) { 5277 ulong const t = r8; 5278 r8 = r16; 5279 r16 = t; 5280 }; 5281 if (r6 >= r11) { 5282 ulong const t = r6; 5283 r6 = r11; 5284 r11 = t; 5285 }; 5286 if (r7 >= r10) { 5287 ulong const t = 
r7; 5288 r7 = r10; 5289 r10 = t; 5290 }; 5291 if (r4 >= r13) { 5292 ulong const t = r4; 5293 r4 = r13; 5294 r13 = t; 5295 }; 5296 if (r14 >= r15) { 5297 ulong const t = r14; 5298 r14 = r15; 5299 r15 = t; 5300 }; 5301 if (r8 >= r12) { 5302 ulong const t = r8; 5303 r8 = r12; 5304 r12 = t; 5305 }; 5306 if (r2 >= r3) { 5307 ulong const t = r2; 5308 r2 = r3; 5309 r3 = t; 5310 }; 5311 if (r5 >= r9) { 5312 ulong const t = r5; 5313 r5 = r9; 5314 r9 = t; 5315 }; 5316 if (r2 >= r5) { 5317 ulong const t = r2; 5318 r2 = r5; 5319 r5 = t; 5320 }; 5321 if (r8 >= r14) { 5322 ulong const t = r8; 5323 r8 = r14; 5324 r14 = t; 5325 }; 5326 if (r3 >= r9) { 5327 ulong const t = r3; 5328 r3 = r9; 5329 r9 = t; 5330 }; 5331 if (r12 >= r15) { 5332 ulong const t = r12; 5333 r12 = r15; 5334 r15 = t; 5335 }; 5336 if (r3 >= r5) { 5337 ulong const t = r3; 5338 r3 = r5; 5339 r5 = t; 5340 }; 5341 if (r6 >= r7) { 5342 ulong const t = r6; 5343 r6 = r7; 5344 r7 = t; 5345 }; 5346 if (r10 >= r11) { 5347 ulong const t = r10; 5348 r10 = r11; 5349 r11 = t; 5350 }; 5351 if (r12 >= r14) { 5352 ulong const t = r12; 5353 r12 = r14; 5354 r14 = t; 5355 }; 5356 if (r4 >= r9) { 5357 ulong const t = r4; 5358 r4 = r9; 5359 r9 = t; 5360 }; 5361 if (r8 >= r13) { 5362 ulong const t = r8; 5363 r8 = r13; 5364 r13 = t; 5365 }; 5366 if (r7 >= r9) { 5367 ulong const t = r7; 5368 r7 = r9; 5369 r9 = t; 5370 }; 5371 if (r11 >= r13) { 5372 ulong const t = r11; 5373 r11 = r13; 5374 r13 = t; 5375 }; 5376 if (r4 >= r6) { 5377 ulong const t = r4; 5378 r4 = r6; 5379 r6 = t; 5380 }; 5381 if (r8 >= r10) { 5382 ulong const t = r8; 5383 r8 = r10; 5384 r10 = t; 5385 }; 5386 if (r4 >= r5) { 5387 ulong const t = r4; 5388 r4 = r5; 5389 r5 = t; 5390 }; 5391 if (r6 >= r7) { 5392 ulong const t = r6; 5393 r6 = r7; 5394 r7 = t; 5395 }; 5396 if (r8 >= r9) { 5397 ulong const t = r8; 5398 r8 = r9; 5399 r9 = t; 5400 }; 5401 if (r10 >= r11) { 5402 ulong const t = r10; 5403 r10 = r11; 5404 r11 = t; 5405 }; 5406 if (r12 >= r13) { 5407 ulong const t = 
r12; 5408 r12 = r13; 5409 r13 = t; 5410 }; 5411 if (r7 >= r8) { 5412 ulong const t = r7; 5413 r7 = r8; 5414 r8 = t; 5415 }; 5416 if (r9 >= r10) { 5417 ulong const t = r9; 5418 r9 = r10; 5419 r10 = t; 5420 }; 5421 { 5422 uint const flip_lane_idx = get_sub_group_local_id() ^ 1; 5423 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 5424 ; 5425 { 5426 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 5427 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 5428 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 5429 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 5430 }; 5431 { 5432 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 5433 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 5434 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 5435 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 5436 }; 5437 { 5438 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 5439 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 5440 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 5441 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 5442 }; 5443 { 5444 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 5445 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 5446 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 5447 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 5448 }; 5449 { 5450 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 5451 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 5452 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 5453 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 5454 }; 5455 { 5456 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 5457 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 5458 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 5459 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 5460 }; 5461 { 5462 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 5463 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 5464 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 5465 r10 = ((r10 <= ta) ^ t_lt) ? 
ta : r10; 5466 }; 5467 { 5468 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 5469 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 5470 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 5471 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 5472 }; 5473 } 5474 if (r1 >= r9) { 5475 ulong const t = r1; 5476 r1 = r9; 5477 r9 = t; 5478 }; 5479 if (r5 >= r13) { 5480 ulong const t = r5; 5481 r5 = r13; 5482 r13 = t; 5483 }; 5484 if (r1 >= r5) { 5485 ulong const t = r1; 5486 r1 = r5; 5487 r5 = t; 5488 }; 5489 if (r9 >= r13) { 5490 ulong const t = r9; 5491 r9 = r13; 5492 r13 = t; 5493 }; 5494 if (r3 >= r11) { 5495 ulong const t = r3; 5496 r3 = r11; 5497 r11 = t; 5498 }; 5499 if (r7 >= r15) { 5500 ulong const t = r7; 5501 r7 = r15; 5502 r15 = t; 5503 }; 5504 if (r3 >= r7) { 5505 ulong const t = r3; 5506 r3 = r7; 5507 r7 = t; 5508 }; 5509 if (r11 >= r15) { 5510 ulong const t = r11; 5511 r11 = r15; 5512 r15 = t; 5513 }; 5514 if (r1 >= r3) { 5515 ulong const t = r1; 5516 r1 = r3; 5517 r3 = t; 5518 }; 5519 if (r5 >= r7) { 5520 ulong const t = r5; 5521 r5 = r7; 5522 r7 = t; 5523 }; 5524 if (r9 >= r11) { 5525 ulong const t = r9; 5526 r9 = r11; 5527 r11 = t; 5528 }; 5529 if (r13 >= r15) { 5530 ulong const t = r13; 5531 r13 = r15; 5532 r15 = t; 5533 }; 5534 if (r2 >= r10) { 5535 ulong const t = r2; 5536 r2 = r10; 5537 r10 = t; 5538 }; 5539 if (r6 >= r14) { 5540 ulong const t = r6; 5541 r6 = r14; 5542 r14 = t; 5543 }; 5544 if (r2 >= r6) { 5545 ulong const t = r2; 5546 r2 = r6; 5547 r6 = t; 5548 }; 5549 if (r10 >= r14) { 5550 ulong const t = r10; 5551 r10 = r14; 5552 r14 = t; 5553 }; 5554 if (r4 >= r12) { 5555 ulong const t = r4; 5556 r4 = r12; 5557 r12 = t; 5558 }; 5559 if (r8 >= r16) { 5560 ulong const t = r8; 5561 r8 = r16; 5562 r16 = t; 5563 }; 5564 if (r4 >= r8) { 5565 ulong const t = r4; 5566 r4 = r8; 5567 r8 = t; 5568 }; 5569 if (r12 >= r16) { 5570 ulong const t = r12; 5571 r12 = r16; 5572 r16 = t; 5573 }; 5574 if (r2 >= r4) { 5575 ulong const t = r2; 5576 r2 = r4; 5577 r4 = t; 
5578 }; 5579 if (r6 >= r8) { 5580 ulong const t = r6; 5581 r6 = r8; 5582 r8 = t; 5583 }; 5584 if (r10 >= r12) { 5585 ulong const t = r10; 5586 r10 = r12; 5587 r12 = t; 5588 }; 5589 if (r14 >= r16) { 5590 ulong const t = r14; 5591 r14 = r16; 5592 r16 = t; 5593 }; 5594 if (r1 >= r2) { 5595 ulong const t = r1; 5596 r1 = r2; 5597 r2 = t; 5598 }; 5599 if (r3 >= r4) { 5600 ulong const t = r3; 5601 r3 = r4; 5602 r4 = t; 5603 }; 5604 if (r5 >= r6) { 5605 ulong const t = r5; 5606 r5 = r6; 5607 r6 = t; 5608 }; 5609 if (r7 >= r8) { 5610 ulong const t = r7; 5611 r7 = r8; 5612 r8 = t; 5613 }; 5614 if (r9 >= r10) { 5615 ulong const t = r9; 5616 r9 = r10; 5617 r10 = t; 5618 }; 5619 if (r11 >= r12) { 5620 ulong const t = r11; 5621 r11 = r12; 5622 r12 = t; 5623 }; 5624 if (r13 >= r14) { 5625 ulong const t = r13; 5626 r13 = r14; 5627 r14 = t; 5628 }; 5629 if (r15 >= r16) { 5630 ulong const t = r15; 5631 r15 = r16; 5632 r16 = t; 5633 }; 5634 { 5635 uint const flip_lane_idx = get_sub_group_local_id() ^ 3; 5636 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 5637 ; 5638 { 5639 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 5640 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 5641 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 5642 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 5643 }; 5644 { 5645 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 5646 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 5647 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 5648 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 5649 }; 5650 { 5651 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 5652 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 5653 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 5654 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 5655 }; 5656 { 5657 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 5658 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 5659 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 5660 r13 = ((r13 <= ta) ^ t_lt) ? 
ta : r13; 5661 }; 5662 { 5663 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 5664 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 5665 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 5666 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 5667 }; 5668 { 5669 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 5670 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 5671 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 5672 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 5673 }; 5674 { 5675 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 5676 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 5677 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 5678 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 5679 }; 5680 { 5681 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 5682 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 5683 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 5684 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 5685 }; 5686 } 5687 { 5688 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 5689 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5690 ; 5691 { 5692 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5693 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5694 }; 5695 { 5696 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5697 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5698 }; 5699 { 5700 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5701 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5702 }; 5703 { 5704 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5705 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5706 }; 5707 { 5708 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5709 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5710 }; 5711 { 5712 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5713 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5714 }; 5715 { 5716 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5717 r7 = ((r7 <= ta) ^ t_lt) ? 
ta : r7; 5718 }; 5719 { 5720 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5721 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 5722 }; 5723 { 5724 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 5725 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 5726 }; 5727 { 5728 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 5729 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 5730 }; 5731 { 5732 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 5733 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 5734 }; 5735 { 5736 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 5737 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 5738 }; 5739 { 5740 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 5741 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 5742 }; 5743 { 5744 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 5745 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 5746 }; 5747 { 5748 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 5749 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 5750 }; 5751 { 5752 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 5753 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 5754 }; 5755 } 5756 if (r1 >= r9) { 5757 ulong const t = r1; 5758 r1 = r9; 5759 r9 = t; 5760 }; 5761 if (r5 >= r13) { 5762 ulong const t = r5; 5763 r5 = r13; 5764 r13 = t; 5765 }; 5766 if (r1 >= r5) { 5767 ulong const t = r1; 5768 r1 = r5; 5769 r5 = t; 5770 }; 5771 if (r9 >= r13) { 5772 ulong const t = r9; 5773 r9 = r13; 5774 r13 = t; 5775 }; 5776 if (r3 >= r11) { 5777 ulong const t = r3; 5778 r3 = r11; 5779 r11 = t; 5780 }; 5781 if (r7 >= r15) { 5782 ulong const t = r7; 5783 r7 = r15; 5784 r15 = t; 5785 }; 5786 if (r3 >= r7) { 5787 ulong const t = r3; 5788 r3 = r7; 5789 r7 = t; 5790 }; 5791 if (r11 >= r15) { 5792 ulong const t = r11; 5793 r11 = r15; 5794 r15 = t; 5795 }; 5796 if (r1 >= r3) { 5797 ulong const t = r1; 5798 r1 = r3; 5799 r3 = t; 5800 }; 5801 if (r5 >= r7) { 5802 ulong const t = r5; 5803 r5 = r7; 5804 r7 = t; 5805 }; 5806 if (r9 >= r11) { 5807 ulong const t = r9; 5808 r9 = r11; 5809 r11 = t; 5810 }; 5811 if (r13 >= r15) { 5812 ulong const t = r13; 5813 r13 = r15; 5814 r15 = t; 5815 }; 5816 if (r2 >= r10) { 5817 ulong const t = r2; 5818 r2 = r10; 5819 r10 = t; 5820 }; 5821 if (r6 >= r14) { 5822 ulong const t = r6; 5823 r6 = r14; 5824 r14 = t; 5825 }; 5826 if (r2 >= r6) { 5827 ulong const t = r2; 5828 r2 = r6; 5829 r6 = t; 5830 }; 5831 if (r10 >= r14) { 5832 ulong const t = r10; 5833 r10 = r14; 5834 r14 = t; 5835 }; 5836 if (r4 >= r12) { 5837 ulong const t = r4; 5838 r4 = r12; 5839 r12 = t; 5840 }; 5841 if (r8 >= r16) { 5842 ulong const t = r8; 5843 r8 = r16; 5844 r16 = t; 5845 }; 5846 if (r4 >= r8) { 5847 ulong const t = r4; 5848 r4 = r8; 5849 r8 = t; 5850 }; 5851 if (r12 >= r16) { 5852 ulong const t = r12; 5853 r12 = r16; 5854 r16 = t; 5855 }; 5856 if (r2 >= r4) { 5857 ulong const t = r2; 5858 r2 = r4; 5859 r4 = t; 5860 }; 5861 if (r6 >= r8) { 5862 ulong const t = r6; 5863 r6 = r8; 5864 r8 = t; 5865 }; 5866 if (r10 >= r12) { 5867 ulong const t = r10; 5868 r10 = r12; 5869 r12 = t; 5870 }; 5871 if (r14 >= r16) { 5872 ulong const t = r14; 5873 
r14 = r16; 5874 r16 = t; 5875 }; 5876 if (r1 >= r2) { 5877 ulong const t = r1; 5878 r1 = r2; 5879 r2 = t; 5880 }; 5881 if (r3 >= r4) { 5882 ulong const t = r3; 5883 r3 = r4; 5884 r4 = t; 5885 }; 5886 if (r5 >= r6) { 5887 ulong const t = r5; 5888 r5 = r6; 5889 r6 = t; 5890 }; 5891 if (r7 >= r8) { 5892 ulong const t = r7; 5893 r7 = r8; 5894 r8 = t; 5895 }; 5896 if (r9 >= r10) { 5897 ulong const t = r9; 5898 r9 = r10; 5899 r10 = t; 5900 }; 5901 if (r11 >= r12) { 5902 ulong const t = r11; 5903 r11 = r12; 5904 r12 = t; 5905 }; 5906 if (r13 >= r14) { 5907 ulong const t = r13; 5908 r13 = r14; 5909 r14 = t; 5910 }; 5911 if (r15 >= r16) { 5912 ulong const t = r15; 5913 r15 = r16; 5914 r16 = t; 5915 }; 5916 { 5917 uint const flip_lane_idx = get_sub_group_local_id() ^ 7; 5918 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 5919 ; 5920 { 5921 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 5922 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 5923 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 5924 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 5925 }; 5926 { 5927 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 5928 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 5929 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 5930 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 5931 }; 5932 { 5933 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 5934 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 5935 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 5936 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 5937 }; 5938 { 5939 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 5940 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 5941 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 5942 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 5943 }; 5944 { 5945 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 5946 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 5947 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 5948 r12 = ((r12 <= ta) ^ t_lt) ? 
ta : r12; 5949 }; 5950 { 5951 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 5952 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 5953 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 5954 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 5955 }; 5956 { 5957 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 5958 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 5959 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 5960 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 5961 }; 5962 { 5963 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 5964 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 5965 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 5966 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 5967 }; 5968 } 5969 { 5970 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 5971 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5972 ; 5973 { 5974 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5975 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5976 }; 5977 { 5978 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5979 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5980 }; 5981 { 5982 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5983 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5984 }; 5985 { 5986 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5987 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5988 }; 5989 { 5990 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5991 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5992 }; 5993 { 5994 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5995 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5996 }; 5997 { 5998 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5999 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6000 }; 6001 { 6002 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6003 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6004 }; 6005 { 6006 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 6007 r9 = ((r9 <= ta) ^ t_lt) ? 
ta : r9; 6008 }; 6009 { 6010 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 6011 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 6012 }; 6013 { 6014 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 6015 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 6016 }; 6017 { 6018 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 6019 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 6020 }; 6021 { 6022 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 6023 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 6024 }; 6025 { 6026 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 6027 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 6028 }; 6029 { 6030 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 6031 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 6032 }; 6033 { 6034 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 6035 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 6036 }; 6037 } 6038 { 6039 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 6040 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6041 ; 6042 { 6043 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6044 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6045 }; 6046 { 6047 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6048 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6049 }; 6050 { 6051 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6052 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6053 }; 6054 { 6055 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6056 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6057 }; 6058 { 6059 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6060 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6061 }; 6062 { 6063 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6064 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6065 }; 6066 { 6067 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6068 r7 = ((r7 <= ta) ^ t_lt) ? 
ta : r7; 6069 }; 6070 { 6071 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6072 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6073 }; 6074 { 6075 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 6076 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 6077 }; 6078 { 6079 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 6080 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 6081 }; 6082 { 6083 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 6084 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 6085 }; 6086 { 6087 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 6088 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 6089 }; 6090 { 6091 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 6092 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 6093 }; 6094 { 6095 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 6096 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 6097 }; 6098 { 6099 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 6100 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 6101 }; 6102 { 6103 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 6104 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 6105 }; 6106 } 6107 if (r1 >= r9) { 6108 ulong const t = r1; 6109 r1 = r9; 6110 r9 = t; 6111 }; 6112 if (r5 >= r13) { 6113 ulong const t = r5; 6114 r5 = r13; 6115 r13 = t; 6116 }; 6117 if (r1 >= r5) { 6118 ulong const t = r1; 6119 r1 = r5; 6120 r5 = t; 6121 }; 6122 if (r9 >= r13) { 6123 ulong const t = r9; 6124 r9 = r13; 6125 r13 = t; 6126 }; 6127 if (r3 >= r11) { 6128 ulong const t = r3; 6129 r3 = r11; 6130 r11 = t; 6131 }; 6132 if (r7 >= r15) { 6133 ulong const t = r7; 6134 r7 = r15; 6135 r15 = t; 6136 }; 6137 if (r3 >= r7) { 6138 ulong const t = r3; 6139 r3 = r7; 6140 r7 = t; 6141 }; 6142 if (r11 >= r15) { 6143 ulong const t = r11; 6144 r11 = r15; 6145 r15 = t; 6146 }; 6147 if (r1 >= r3) { 6148 ulong const t = r1; 6149 r1 = r3; 6150 r3 = t; 6151 }; 6152 if (r5 >= r7) { 6153 ulong const t = r5; 6154 r5 = r7; 6155 r7 = t; 6156 }; 6157 if (r9 >= r11) { 6158 ulong const t = r9; 6159 r9 = r11; 6160 r11 = t; 6161 }; 6162 if (r13 >= r15) { 6163 ulong const t = r13; 6164 r13 = r15; 6165 r15 = t; 6166 }; 6167 if (r2 >= r10) { 6168 ulong const t = r2; 6169 r2 = r10; 6170 r10 = t; 6171 }; 6172 if (r6 >= r14) { 6173 ulong const t = r6; 6174 r6 = r14; 6175 r14 = t; 6176 }; 6177 if (r2 >= r6) { 6178 ulong const t = r2; 6179 r2 = r6; 6180 r6 = t; 6181 }; 6182 if (r10 >= r14) { 6183 ulong const t = r10; 6184 r10 = r14; 6185 r14 = t; 6186 }; 6187 if (r4 >= r12) { 6188 ulong const t = r4; 6189 r4 = r12; 6190 r12 = t; 6191 }; 6192 if (r8 >= r16) { 6193 ulong const t = r8; 6194 r8 = r16; 6195 r16 = t; 6196 }; 6197 if (r4 >= r8) { 6198 ulong const t = r4; 6199 r4 = r8; 6200 r8 = t; 6201 }; 6202 if (r12 >= r16) { 6203 ulong const t = r12; 6204 r12 = r16; 6205 r16 = t; 6206 }; 6207 if (r2 >= r4) { 6208 ulong const t = r2; 6209 r2 = r4; 6210 r4 = t; 6211 }; 6212 if (r6 >= r8) { 6213 ulong const t = r6; 6214 r6 = r8; 6215 r8 = t; 6216 }; 6217 if (r10 >= r12) { 6218 ulong const t = r10; 6219 r10 = r12; 6220 r12 = t; 6221 }; 6222 if (r14 >= r16) { 6223 ulong const t = r14; 6224 
r14 = r16; 6225 r16 = t; 6226 }; 6227 if (r1 >= r2) { 6228 ulong const t = r1; 6229 r1 = r2; 6230 r2 = t; 6231 }; 6232 if (r3 >= r4) { 6233 ulong const t = r3; 6234 r3 = r4; 6235 r4 = t; 6236 }; 6237 if (r5 >= r6) { 6238 ulong const t = r5; 6239 r5 = r6; 6240 r6 = t; 6241 }; 6242 if (r7 >= r8) { 6243 ulong const t = r7; 6244 r7 = r8; 6245 r8 = t; 6246 }; 6247 if (r9 >= r10) { 6248 ulong const t = r9; 6249 r9 = r10; 6250 r10 = t; 6251 }; 6252 if (r11 >= r12) { 6253 ulong const t = r11; 6254 r11 = r12; 6255 r12 = t; 6256 }; 6257 if (r13 >= r14) { 6258 ulong const t = r13; 6259 r13 = r14; 6260 r14 = t; 6261 }; 6262 if (r15 >= r16) { 6263 ulong const t = r15; 6264 r15 = r16; 6265 r16 = t; 6266 }; 6267 uint const smem_l_idx = 6268 get_sub_group_id() * ((1 << 3) * 8) + get_sub_group_local_id(); 6269 uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 3) * 8) + 6270 (get_sub_group_local_id() ^ ((1 << 3) - 1)); 6271 shared.m[get_local_id(0) + (8 * (1 << 3) * 0)] = r1; 6272 shared.m[get_local_id(0) + (8 * (1 << 3) * 1)] = r16; 6273 shared.m[get_local_id(0) + (8 * (1 << 3) * 2)] = r2; 6274 shared.m[get_local_id(0) + (8 * (1 << 3) * 3)] = r15; 6275 shared.m[get_local_id(0) + (8 * (1 << 3) * 4)] = r3; 6276 shared.m[get_local_id(0) + (8 * (1 << 3) * 5)] = r14; 6277 shared.m[get_local_id(0) + (8 * (1 << 3) * 6)] = r4; 6278 shared.m[get_local_id(0) + (8 * (1 << 3) * 7)] = r13; 6279 shared.m[get_local_id(0) + (8 * (1 << 3) * 8)] = r5; 6280 shared.m[get_local_id(0) + (8 * (1 << 3) * 9)] = r12; 6281 shared.m[get_local_id(0) + (8 * (1 << 3) * 10)] = r6; 6282 shared.m[get_local_id(0) + (8 * (1 << 3) * 11)] = r11; 6283 shared.m[get_local_id(0) + (8 * (1 << 3) * 12)] = r7; 6284 shared.m[get_local_id(0) + (8 * (1 << 3) * 13)] = r10; 6285 shared.m[get_local_id(0) + (8 * (1 << 3) * 14)] = r8; 6286 shared.m[get_local_id(0) + (8 * (1 << 3) * 15)] = r9; 6287 barrier(CLK_LOCAL_MEM_FENCE); 6288 { 6289 { 6290 ulong r0_1 = shared.m[smem_l_idx + (0)]; 6291 ulong r0_2 = shared.m[smem_r_idx + 
(8)]; 6292 if (r0_1 >= r0_2) { 6293 ulong const t = r0_1; 6294 r0_1 = r0_2; 6295 r0_2 = t; 6296 }; 6297 shared.m[smem_l_idx + (0)] = r0_1; 6298 shared.m[smem_r_idx + (8)] = r0_2; 6299 } 6300 { 6301 ulong r1_1 = shared.m[smem_l_idx + (16)]; 6302 ulong r1_2 = shared.m[smem_r_idx + (24)]; 6303 if (r1_1 >= r1_2) { 6304 ulong const t = r1_1; 6305 r1_1 = r1_2; 6306 r1_2 = t; 6307 }; 6308 shared.m[smem_l_idx + (16)] = r1_1; 6309 shared.m[smem_r_idx + (24)] = r1_2; 6310 } 6311 { 6312 ulong r2_1 = shared.m[smem_l_idx + (32)]; 6313 ulong r2_2 = shared.m[smem_r_idx + (40)]; 6314 if (r2_1 >= r2_2) { 6315 ulong const t = r2_1; 6316 r2_1 = r2_2; 6317 r2_2 = t; 6318 }; 6319 shared.m[smem_l_idx + (32)] = r2_1; 6320 shared.m[smem_r_idx + (40)] = r2_2; 6321 } 6322 { 6323 ulong r3_1 = shared.m[smem_l_idx + (48)]; 6324 ulong r3_2 = shared.m[smem_r_idx + (56)]; 6325 if (r3_1 >= r3_2) { 6326 ulong const t = r3_1; 6327 r3_1 = r3_2; 6328 r3_2 = t; 6329 }; 6330 shared.m[smem_l_idx + (48)] = r3_1; 6331 shared.m[smem_r_idx + (56)] = r3_2; 6332 } 6333 { 6334 ulong r0_1 = shared.m[smem_l_idx + (512)]; 6335 ulong r0_2 = shared.m[smem_r_idx + (520)]; 6336 if (r0_1 >= r0_2) { 6337 ulong const t = r0_1; 6338 r0_1 = r0_2; 6339 r0_2 = t; 6340 }; 6341 shared.m[smem_l_idx + (512)] = r0_1; 6342 shared.m[smem_r_idx + (520)] = r0_2; 6343 } 6344 { 6345 ulong r1_1 = shared.m[smem_l_idx + (528)]; 6346 ulong r1_2 = shared.m[smem_r_idx + (536)]; 6347 if (r1_1 >= r1_2) { 6348 ulong const t = r1_1; 6349 r1_1 = r1_2; 6350 r1_2 = t; 6351 }; 6352 shared.m[smem_l_idx + (528)] = r1_1; 6353 shared.m[smem_r_idx + (536)] = r1_2; 6354 } 6355 { 6356 ulong r2_1 = shared.m[smem_l_idx + (544)]; 6357 ulong r2_2 = shared.m[smem_r_idx + (552)]; 6358 if (r2_1 >= r2_2) { 6359 ulong const t = r2_1; 6360 r2_1 = r2_2; 6361 r2_2 = t; 6362 }; 6363 shared.m[smem_l_idx + (544)] = r2_1; 6364 shared.m[smem_r_idx + (552)] = r2_2; 6365 } 6366 { 6367 ulong r3_1 = shared.m[smem_l_idx + (560)]; 6368 ulong r3_2 = shared.m[smem_r_idx + (568)]; 
6369 if (r3_1 >= r3_2) { 6370 ulong const t = r3_1; 6371 r3_1 = r3_2; 6372 r3_2 = t; 6373 }; 6374 shared.m[smem_l_idx + (560)] = r3_1; 6375 shared.m[smem_r_idx + (568)] = r3_2; 6376 } 6377 } 6378 barrier(CLK_LOCAL_MEM_FENCE); 6379 r1 = shared.m[get_local_id(0) + (8 * (1 << 3) * 0)]; 6380 r16 = shared.m[get_local_id(0) + (8 * (1 << 3) * 1)]; 6381 r2 = shared.m[get_local_id(0) + (8 * (1 << 3) * 2)]; 6382 r15 = shared.m[get_local_id(0) + (8 * (1 << 3) * 3)]; 6383 r3 = shared.m[get_local_id(0) + (8 * (1 << 3) * 4)]; 6384 r14 = shared.m[get_local_id(0) + (8 * (1 << 3) * 5)]; 6385 r4 = shared.m[get_local_id(0) + (8 * (1 << 3) * 6)]; 6386 r13 = shared.m[get_local_id(0) + (8 * (1 << 3) * 7)]; 6387 r5 = shared.m[get_local_id(0) + (8 * (1 << 3) * 8)]; 6388 r12 = shared.m[get_local_id(0) + (8 * (1 << 3) * 9)]; 6389 r6 = shared.m[get_local_id(0) + (8 * (1 << 3) * 10)]; 6390 r11 = shared.m[get_local_id(0) + (8 * (1 << 3) * 11)]; 6391 r7 = shared.m[get_local_id(0) + (8 * (1 << 3) * 12)]; 6392 r10 = shared.m[get_local_id(0) + (8 * (1 << 3) * 13)]; 6393 r8 = shared.m[get_local_id(0) + (8 * (1 << 3) * 14)]; 6394 r9 = shared.m[get_local_id(0) + (8 * (1 << 3) * 15)]; 6395 { 6396 { 6397 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 6398 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6399 ; 6400 { 6401 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6402 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6403 }; 6404 { 6405 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6406 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6407 }; 6408 { 6409 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6410 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6411 }; 6412 { 6413 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6414 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6415 }; 6416 { 6417 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6418 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 6419 }; 6420 { 6421 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6422 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6423 }; 6424 { 6425 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6426 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6427 }; 6428 { 6429 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6430 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6431 }; 6432 { 6433 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 6434 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 6435 }; 6436 { 6437 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 6438 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 6439 }; 6440 { 6441 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 6442 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 6443 }; 6444 { 6445 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 6446 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 6447 }; 6448 { 6449 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 6450 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 6451 }; 6452 { 6453 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 6454 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 6455 }; 6456 { 6457 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 6458 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 6459 }; 6460 { 6461 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 6462 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 6463 }; 6464 } 6465 { 6466 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 6467 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6468 ; 6469 { 6470 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6471 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6472 }; 6473 { 6474 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6475 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6476 }; 6477 { 6478 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6479 r3 = ((r3 <= ta) ^ t_lt) ? 
ta : r3; 6480 }; 6481 { 6482 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6483 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6484 }; 6485 { 6486 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6487 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6488 }; 6489 { 6490 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6491 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6492 }; 6493 { 6494 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6495 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6496 }; 6497 { 6498 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6499 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6500 }; 6501 { 6502 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 6503 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 6504 }; 6505 { 6506 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 6507 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 6508 }; 6509 { 6510 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 6511 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 6512 }; 6513 { 6514 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 6515 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 6516 }; 6517 { 6518 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 6519 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 6520 }; 6521 { 6522 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 6523 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 6524 }; 6525 { 6526 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 6527 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 6528 }; 6529 { 6530 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 6531 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 6532 }; 6533 } 6534 { 6535 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 6536 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6537 ; 6538 { 6539 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6540 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 6541 }; 6542 { 6543 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6544 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6545 }; 6546 { 6547 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6548 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6549 }; 6550 { 6551 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6552 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6553 }; 6554 { 6555 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6556 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6557 }; 6558 { 6559 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6560 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6561 }; 6562 { 6563 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6564 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6565 }; 6566 { 6567 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6568 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6569 }; 6570 { 6571 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 6572 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 6573 }; 6574 { 6575 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 6576 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 6577 }; 6578 { 6579 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 6580 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 6581 }; 6582 { 6583 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 6584 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 6585 }; 6586 { 6587 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 6588 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 6589 }; 6590 { 6591 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 6592 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 6593 }; 6594 { 6595 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 6596 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 6597 }; 6598 { 6599 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 6600 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 6601 }; 6602 } 6603 if (r1 >= r9) { 6604 ulong const t = r1; 6605 r1 = r9; 6606 r9 = t; 6607 }; 6608 if (r5 >= r13) { 6609 ulong const t = r5; 6610 r5 = r13; 6611 r13 = t; 6612 }; 6613 if (r1 >= r5) { 6614 ulong const t = r1; 6615 r1 = r5; 6616 r5 = t; 6617 }; 6618 if (r9 >= r13) { 6619 ulong const t = r9; 6620 r9 = r13; 6621 r13 = t; 6622 }; 6623 if (r3 >= r11) { 6624 ulong const t = r3; 6625 r3 = r11; 6626 r11 = t; 6627 }; 6628 if (r7 >= r15) { 6629 ulong const t = r7; 6630 r7 = r15; 6631 r15 = t; 6632 }; 6633 if (r3 >= r7) { 6634 ulong const t = r3; 6635 r3 = r7; 6636 r7 = t; 6637 }; 6638 if (r11 >= r15) { 6639 ulong const t = r11; 6640 r11 = r15; 6641 r15 = t; 6642 }; 6643 if (r1 >= r3) { 6644 ulong const t = r1; 6645 r1 = r3; 6646 r3 = t; 6647 }; 6648 if (r5 >= r7) { 6649 ulong const t = r5; 6650 r5 = r7; 6651 r7 = t; 6652 }; 6653 if (r9 >= r11) { 6654 ulong const t = r9; 6655 r9 = r11; 6656 r11 = t; 6657 }; 6658 if (r13 >= r15) { 6659 ulong const t = r13; 6660 r13 = r15; 6661 r15 = t; 6662 }; 6663 if (r2 >= r10) { 6664 ulong const t = r2; 6665 r2 = r10; 6666 r10 = t; 6667 }; 6668 if (r6 >= r14) { 6669 ulong const t = r6; 6670 r6 = r14; 6671 r14 = t; 6672 }; 6673 if (r2 >= r6) { 6674 ulong const t = r2; 6675 r2 = r6; 6676 r6 = t; 6677 }; 6678 if (r10 >= r14) { 6679 ulong const t = r10; 6680 r10 = r14; 6681 r14 = t; 6682 }; 6683 if (r4 >= r12) { 6684 ulong const t = r4; 6685 r4 = r12; 6686 r12 = t; 6687 }; 6688 if (r8 >= r16) { 6689 ulong const t = r8; 6690 r8 = r16; 6691 r16 = t; 6692 }; 6693 if (r4 >= r8) { 6694 ulong const t = r4; 6695 r4 = r8; 6696 r8 = t; 6697 }; 6698 if (r12 >= r16) { 6699 ulong const t = r12; 6700 r12 = r16; 6701 r16 = t; 6702 }; 6703 if (r2 >= r4) { 6704 ulong const t = r2; 6705 r2 = r4; 6706 r4 = t; 6707 }; 6708 if (r6 >= r8) { 6709 ulong const t = r6; 6710 r6 = r8; 6711 r8 = t; 6712 }; 6713 if (r10 >= r12) { 6714 ulong const t = r10; 6715 r10 = r12; 6716 r12 = t; 6717 }; 6718 if (r14 >= r16) { 6719 ulong const t = r14; 6720 
r14 = r16; 6721 r16 = t; 6722 }; 6723 if (r1 >= r2) { 6724 ulong const t = r1; 6725 r1 = r2; 6726 r2 = t; 6727 }; 6728 if (r3 >= r4) { 6729 ulong const t = r3; 6730 r3 = r4; 6731 r4 = t; 6732 }; 6733 if (r5 >= r6) { 6734 ulong const t = r5; 6735 r5 = r6; 6736 r6 = t; 6737 }; 6738 if (r7 >= r8) { 6739 ulong const t = r7; 6740 r7 = r8; 6741 r8 = t; 6742 }; 6743 if (r9 >= r10) { 6744 ulong const t = r9; 6745 r9 = r10; 6746 r10 = t; 6747 }; 6748 if (r11 >= r12) { 6749 ulong const t = r11; 6750 r11 = r12; 6751 r12 = t; 6752 }; 6753 if (r13 >= r14) { 6754 ulong const t = r13; 6755 r13 = r14; 6756 r14 = t; 6757 }; 6758 if (r15 >= r16) { 6759 ulong const t = r15; 6760 r15 = r16; 6761 r16 = t; 6762 }; 6763 } 6764 shared.m[get_local_id(0) + (8 * (1 << 3) * 0)] = r1; 6765 shared.m[get_local_id(0) + (8 * (1 << 3) * 1)] = r16; 6766 shared.m[get_local_id(0) + (8 * (1 << 3) * 2)] = r2; 6767 shared.m[get_local_id(0) + (8 * (1 << 3) * 3)] = r15; 6768 shared.m[get_local_id(0) + (8 * (1 << 3) * 4)] = r3; 6769 shared.m[get_local_id(0) + (8 * (1 << 3) * 5)] = r14; 6770 shared.m[get_local_id(0) + (8 * (1 << 3) * 6)] = r4; 6771 shared.m[get_local_id(0) + (8 * (1 << 3) * 7)] = r13; 6772 shared.m[get_local_id(0) + (8 * (1 << 3) * 8)] = r5; 6773 shared.m[get_local_id(0) + (8 * (1 << 3) * 9)] = r12; 6774 shared.m[get_local_id(0) + (8 * (1 << 3) * 10)] = r6; 6775 shared.m[get_local_id(0) + (8 * (1 << 3) * 11)] = r11; 6776 shared.m[get_local_id(0) + (8 * (1 << 3) * 12)] = r7; 6777 shared.m[get_local_id(0) + (8 * (1 << 3) * 13)] = r10; 6778 shared.m[get_local_id(0) + (8 * (1 << 3) * 14)] = r8; 6779 shared.m[get_local_id(0) + (8 * (1 << 3) * 15)] = r9; 6780 barrier(CLK_LOCAL_MEM_FENCE); 6781 { 6782 { 6783 ulong r0_1 = shared.m[smem_l_idx + (0)]; 6784 ulong r0_2 = shared.m[smem_l_idx + (8)]; 6785 ulong r0_3 = shared.m[smem_r_idx + (16)]; 6786 ulong r0_4 = shared.m[smem_r_idx + (24)]; 6787 if (r0_2 >= r0_3) { 6788 ulong const t = r0_2; 6789 r0_2 = r0_3; 6790 r0_3 = t; 6791 }; 6792 if (r0_1 >= 
r0_4) { 6793 ulong const t = r0_1; 6794 r0_1 = r0_4; 6795 r0_4 = t; 6796 }; 6797 if (r0_3 >= r0_4) { 6798 ulong const t = r0_3; 6799 r0_3 = r0_4; 6800 r0_4 = t; 6801 }; 6802 if (r0_1 >= r0_2) { 6803 ulong const t = r0_1; 6804 r0_1 = r0_2; 6805 r0_2 = t; 6806 }; 6807 shared.m[smem_l_idx + (0)] = r0_1; 6808 shared.m[smem_l_idx + (8)] = r0_2; 6809 shared.m[smem_r_idx + (16)] = r0_3; 6810 shared.m[smem_r_idx + (24)] = r0_4; 6811 } 6812 { 6813 ulong r1_1 = shared.m[smem_l_idx + (32)]; 6814 ulong r1_2 = shared.m[smem_l_idx + (40)]; 6815 ulong r1_3 = shared.m[smem_r_idx + (48)]; 6816 ulong r1_4 = shared.m[smem_r_idx + (56)]; 6817 if (r1_2 >= r1_3) { 6818 ulong const t = r1_2; 6819 r1_2 = r1_3; 6820 r1_3 = t; 6821 }; 6822 if (r1_1 >= r1_4) { 6823 ulong const t = r1_1; 6824 r1_1 = r1_4; 6825 r1_4 = t; 6826 }; 6827 if (r1_3 >= r1_4) { 6828 ulong const t = r1_3; 6829 r1_3 = r1_4; 6830 r1_4 = t; 6831 }; 6832 if (r1_1 >= r1_2) { 6833 ulong const t = r1_1; 6834 r1_1 = r1_2; 6835 r1_2 = t; 6836 }; 6837 shared.m[smem_l_idx + (32)] = r1_1; 6838 shared.m[smem_l_idx + (40)] = r1_2; 6839 shared.m[smem_r_idx + (48)] = r1_3; 6840 shared.m[smem_r_idx + (56)] = r1_4; 6841 } 6842 { 6843 ulong r0_1 = shared.m[smem_l_idx + (512)]; 6844 ulong r0_2 = shared.m[smem_l_idx + (520)]; 6845 ulong r0_3 = shared.m[smem_r_idx + (528)]; 6846 ulong r0_4 = shared.m[smem_r_idx + (536)]; 6847 if (r0_2 >= r0_3) { 6848 ulong const t = r0_2; 6849 r0_2 = r0_3; 6850 r0_3 = t; 6851 }; 6852 if (r0_1 >= r0_4) { 6853 ulong const t = r0_1; 6854 r0_1 = r0_4; 6855 r0_4 = t; 6856 }; 6857 if (r0_3 >= r0_4) { 6858 ulong const t = r0_3; 6859 r0_3 = r0_4; 6860 r0_4 = t; 6861 }; 6862 if (r0_1 >= r0_2) { 6863 ulong const t = r0_1; 6864 r0_1 = r0_2; 6865 r0_2 = t; 6866 }; 6867 shared.m[smem_l_idx + (512)] = r0_1; 6868 shared.m[smem_l_idx + (520)] = r0_2; 6869 shared.m[smem_r_idx + (528)] = r0_3; 6870 shared.m[smem_r_idx + (536)] = r0_4; 6871 } 6872 { 6873 ulong r1_1 = shared.m[smem_l_idx + (544)]; 6874 ulong r1_2 = 
shared.m[smem_l_idx + (552)]; 6875 ulong r1_3 = shared.m[smem_r_idx + (560)]; 6876 ulong r1_4 = shared.m[smem_r_idx + (568)]; 6877 if (r1_2 >= r1_3) { 6878 ulong const t = r1_2; 6879 r1_2 = r1_3; 6880 r1_3 = t; 6881 }; 6882 if (r1_1 >= r1_4) { 6883 ulong const t = r1_1; 6884 r1_1 = r1_4; 6885 r1_4 = t; 6886 }; 6887 if (r1_3 >= r1_4) { 6888 ulong const t = r1_3; 6889 r1_3 = r1_4; 6890 r1_4 = t; 6891 }; 6892 if (r1_1 >= r1_2) { 6893 ulong const t = r1_1; 6894 r1_1 = r1_2; 6895 r1_2 = t; 6896 }; 6897 shared.m[smem_l_idx + (544)] = r1_1; 6898 shared.m[smem_l_idx + (552)] = r1_2; 6899 shared.m[smem_r_idx + (560)] = r1_3; 6900 shared.m[smem_r_idx + (568)] = r1_4; 6901 } 6902 } 6903 barrier(CLK_LOCAL_MEM_FENCE); 6904 r1 = shared.m[get_local_id(0) + (8 * (1 << 3) * 0)]; 6905 r16 = shared.m[get_local_id(0) + (8 * (1 << 3) * 1)]; 6906 r2 = shared.m[get_local_id(0) + (8 * (1 << 3) * 2)]; 6907 r15 = shared.m[get_local_id(0) + (8 * (1 << 3) * 3)]; 6908 r3 = shared.m[get_local_id(0) + (8 * (1 << 3) * 4)]; 6909 r14 = shared.m[get_local_id(0) + (8 * (1 << 3) * 5)]; 6910 r4 = shared.m[get_local_id(0) + (8 * (1 << 3) * 6)]; 6911 r13 = shared.m[get_local_id(0) + (8 * (1 << 3) * 7)]; 6912 r5 = shared.m[get_local_id(0) + (8 * (1 << 3) * 8)]; 6913 r12 = shared.m[get_local_id(0) + (8 * (1 << 3) * 9)]; 6914 r6 = shared.m[get_local_id(0) + (8 * (1 << 3) * 10)]; 6915 r11 = shared.m[get_local_id(0) + (8 * (1 << 3) * 11)]; 6916 r7 = shared.m[get_local_id(0) + (8 * (1 << 3) * 12)]; 6917 r10 = shared.m[get_local_id(0) + (8 * (1 << 3) * 13)]; 6918 r8 = shared.m[get_local_id(0) + (8 * (1 << 3) * 14)]; 6919 r9 = shared.m[get_local_id(0) + (8 * (1 << 3) * 15)]; 6920 { 6921 { 6922 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 6923 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6924 ; 6925 { 6926 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6927 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 6928 }; 6929 { 6930 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6931 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6932 }; 6933 { 6934 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6935 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6936 }; 6937 { 6938 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6939 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6940 }; 6941 { 6942 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6943 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6944 }; 6945 { 6946 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6947 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6948 }; 6949 { 6950 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6951 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6952 }; 6953 { 6954 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6955 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6956 }; 6957 { 6958 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 6959 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 6960 }; 6961 { 6962 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 6963 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 6964 }; 6965 { 6966 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 6967 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 6968 }; 6969 { 6970 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 6971 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 6972 }; 6973 { 6974 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 6975 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 6976 }; 6977 { 6978 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 6979 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 6980 }; 6981 { 6982 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 6983 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 6984 }; 6985 { 6986 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 6987 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 6988 }; 6989 } 6990 { 6991 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 6992 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6993 ; 6994 { 6995 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6996 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6997 }; 6998 { 6999 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 7000 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 7001 }; 7002 { 7003 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 7004 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 7005 }; 7006 { 7007 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 7008 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 7009 }; 7010 { 7011 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 7012 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 7013 }; 7014 { 7015 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 7016 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 7017 }; 7018 { 7019 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 7020 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 7021 }; 7022 { 7023 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 7024 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 7025 }; 7026 { 7027 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 7028 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 7029 }; 7030 { 7031 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 7032 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 7033 }; 7034 { 7035 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 7036 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 7037 }; 7038 { 7039 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 7040 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 7041 }; 7042 { 7043 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 7044 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 7045 }; 7046 { 7047 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 7048 r14 = ((r14 <= ta) ^ t_lt) ? 
ta : r14; 7049 }; 7050 { 7051 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 7052 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 7053 }; 7054 { 7055 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 7056 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 7057 }; 7058 } 7059 { 7060 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 7061 int const t_lt = get_sub_group_local_id() < half_lane_idx; 7062 ; 7063 { 7064 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 7065 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 7066 }; 7067 { 7068 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 7069 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 7070 }; 7071 { 7072 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 7073 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 7074 }; 7075 { 7076 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 7077 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 7078 }; 7079 { 7080 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 7081 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 7082 }; 7083 { 7084 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 7085 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 7086 }; 7087 { 7088 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 7089 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 7090 }; 7091 { 7092 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 7093 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 7094 }; 7095 { 7096 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 7097 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 7098 }; 7099 { 7100 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 7101 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 7102 }; 7103 { 7104 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 7105 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 7106 }; 7107 { 7108 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 7109 r12 = ((r12 <= ta) ^ t_lt) ? 
ta : r12; 7110 }; 7111 { 7112 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 7113 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 7114 }; 7115 { 7116 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 7117 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 7118 }; 7119 { 7120 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 7121 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 7122 }; 7123 { 7124 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 7125 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 7126 }; 7127 } 7128 if (r1 >= r9) { 7129 ulong const t = r1; 7130 r1 = r9; 7131 r9 = t; 7132 }; 7133 if (r5 >= r13) { 7134 ulong const t = r5; 7135 r5 = r13; 7136 r13 = t; 7137 }; 7138 if (r1 >= r5) { 7139 ulong const t = r1; 7140 r1 = r5; 7141 r5 = t; 7142 }; 7143 if (r9 >= r13) { 7144 ulong const t = r9; 7145 r9 = r13; 7146 r13 = t; 7147 }; 7148 if (r3 >= r11) { 7149 ulong const t = r3; 7150 r3 = r11; 7151 r11 = t; 7152 }; 7153 if (r7 >= r15) { 7154 ulong const t = r7; 7155 r7 = r15; 7156 r15 = t; 7157 }; 7158 if (r3 >= r7) { 7159 ulong const t = r3; 7160 r3 = r7; 7161 r7 = t; 7162 }; 7163 if (r11 >= r15) { 7164 ulong const t = r11; 7165 r11 = r15; 7166 r15 = t; 7167 }; 7168 if (r1 >= r3) { 7169 ulong const t = r1; 7170 r1 = r3; 7171 r3 = t; 7172 }; 7173 if (r5 >= r7) { 7174 ulong const t = r5; 7175 r5 = r7; 7176 r7 = t; 7177 }; 7178 if (r9 >= r11) { 7179 ulong const t = r9; 7180 r9 = r11; 7181 r11 = t; 7182 }; 7183 if (r13 >= r15) { 7184 ulong const t = r13; 7185 r13 = r15; 7186 r15 = t; 7187 }; 7188 if (r2 >= r10) { 7189 ulong const t = r2; 7190 r2 = r10; 7191 r10 = t; 7192 }; 7193 if (r6 >= r14) { 7194 ulong const t = r6; 7195 r6 = r14; 7196 r14 = t; 7197 }; 7198 if (r2 >= r6) { 7199 ulong const t = r2; 7200 r2 = r6; 7201 r6 = t; 7202 }; 7203 if (r10 >= r14) { 7204 ulong const t = r10; 7205 r10 = r14; 7206 r14 = t; 7207 }; 7208 if (r4 >= r12) { 7209 ulong const t = r4; 7210 r4 = r12; 7211 r12 = t; 7212 }; 7213 if (r8 >= r16) { 7214 ulong const t = r8; 7215 
r8 = r16; 7216 r16 = t; 7217 }; 7218 if (r4 >= r8) { 7219 ulong const t = r4; 7220 r4 = r8; 7221 r8 = t; 7222 }; 7223 if (r12 >= r16) { 7224 ulong const t = r12; 7225 r12 = r16; 7226 r16 = t; 7227 }; 7228 if (r2 >= r4) { 7229 ulong const t = r2; 7230 r2 = r4; 7231 r4 = t; 7232 }; 7233 if (r6 >= r8) { 7234 ulong const t = r6; 7235 r6 = r8; 7236 r8 = t; 7237 }; 7238 if (r10 >= r12) { 7239 ulong const t = r10; 7240 r10 = r12; 7241 r12 = t; 7242 }; 7243 if (r14 >= r16) { 7244 ulong const t = r14; 7245 r14 = r16; 7246 r16 = t; 7247 }; 7248 if (r1 >= r2) { 7249 ulong const t = r1; 7250 r1 = r2; 7251 r2 = t; 7252 }; 7253 if (r3 >= r4) { 7254 ulong const t = r3; 7255 r3 = r4; 7256 r4 = t; 7257 }; 7258 if (r5 >= r6) { 7259 ulong const t = r5; 7260 r5 = r6; 7261 r6 = t; 7262 }; 7263 if (r7 >= r8) { 7264 ulong const t = r7; 7265 r7 = r8; 7266 r8 = t; 7267 }; 7268 if (r9 >= r10) { 7269 ulong const t = r9; 7270 r9 = r10; 7271 r10 = t; 7272 }; 7273 if (r11 >= r12) { 7274 ulong const t = r11; 7275 r11 = r12; 7276 r12 = t; 7277 }; 7278 if (r13 >= r14) { 7279 ulong const t = r13; 7280 r13 = r14; 7281 r14 = t; 7282 }; 7283 if (r15 >= r16) { 7284 ulong const t = r15; 7285 r15 = r16; 7286 r16 = t; 7287 }; 7288 } 7289 shared.m[get_local_id(0) + (8 * (1 << 3) * 0)] = r1; 7290 shared.m[get_local_id(0) + (8 * (1 << 3) * 1)] = r16; 7291 shared.m[get_local_id(0) + (8 * (1 << 3) * 2)] = r2; 7292 shared.m[get_local_id(0) + (8 * (1 << 3) * 3)] = r15; 7293 shared.m[get_local_id(0) + (8 * (1 << 3) * 4)] = r3; 7294 shared.m[get_local_id(0) + (8 * (1 << 3) * 5)] = r14; 7295 shared.m[get_local_id(0) + (8 * (1 << 3) * 6)] = r4; 7296 shared.m[get_local_id(0) + (8 * (1 << 3) * 7)] = r13; 7297 shared.m[get_local_id(0) + (8 * (1 << 3) * 8)] = r5; 7298 shared.m[get_local_id(0) + (8 * (1 << 3) * 9)] = r12; 7299 shared.m[get_local_id(0) + (8 * (1 << 3) * 10)] = r6; 7300 shared.m[get_local_id(0) + (8 * (1 << 3) * 11)] = r11; 7301 shared.m[get_local_id(0) + (8 * (1 << 3) * 12)] = r7; 7302 
shared.m[get_local_id(0) + (8 * (1 << 3) * 13)] = r10; 7303 shared.m[get_local_id(0) + (8 * (1 << 3) * 14)] = r8; 7304 shared.m[get_local_id(0) + (8 * (1 << 3) * 15)] = r9; 7305 barrier(CLK_LOCAL_MEM_FENCE); 7306 { 7307 { 7308 ulong r0_1 = shared.m[smem_l_idx + (0)]; 7309 ulong r0_2 = shared.m[smem_l_idx + (8)]; 7310 ulong r0_3 = shared.m[smem_l_idx + (16)]; 7311 ulong r0_4 = shared.m[smem_l_idx + (24)]; 7312 ulong r0_5 = shared.m[smem_r_idx + (32)]; 7313 ulong r0_6 = shared.m[smem_r_idx + (40)]; 7314 ulong r0_7 = shared.m[smem_r_idx + (48)]; 7315 ulong r0_8 = shared.m[smem_r_idx + (56)]; 7316 if (r0_4 >= r0_5) { 7317 ulong const t = r0_4; 7318 r0_4 = r0_5; 7319 r0_5 = t; 7320 }; 7321 if (r0_3 >= r0_6) { 7322 ulong const t = r0_3; 7323 r0_3 = r0_6; 7324 r0_6 = t; 7325 }; 7326 if (r0_2 >= r0_7) { 7327 ulong const t = r0_2; 7328 r0_2 = r0_7; 7329 r0_7 = t; 7330 }; 7331 if (r0_1 >= r0_8) { 7332 ulong const t = r0_1; 7333 r0_1 = r0_8; 7334 r0_8 = t; 7335 }; 7336 if (r0_5 >= r0_7) { 7337 ulong const t = r0_5; 7338 r0_5 = r0_7; 7339 r0_7 = t; 7340 }; 7341 if (r0_6 >= r0_8) { 7342 ulong const t = r0_6; 7343 r0_6 = r0_8; 7344 r0_8 = t; 7345 }; 7346 if (r0_5 >= r0_6) { 7347 ulong const t = r0_5; 7348 r0_5 = r0_6; 7349 r0_6 = t; 7350 }; 7351 if (r0_7 >= r0_8) { 7352 ulong const t = r0_7; 7353 r0_7 = r0_8; 7354 r0_8 = t; 7355 }; 7356 if (r0_1 >= r0_3) { 7357 ulong const t = r0_1; 7358 r0_1 = r0_3; 7359 r0_3 = t; 7360 }; 7361 if (r0_2 >= r0_4) { 7362 ulong const t = r0_2; 7363 r0_2 = r0_4; 7364 r0_4 = t; 7365 }; 7366 if (r0_1 >= r0_2) { 7367 ulong const t = r0_1; 7368 r0_1 = r0_2; 7369 r0_2 = t; 7370 }; 7371 if (r0_3 >= r0_4) { 7372 ulong const t = r0_3; 7373 r0_3 = r0_4; 7374 r0_4 = t; 7375 }; 7376 shared.m[smem_l_idx + (0)] = r0_1; 7377 shared.m[smem_l_idx + (8)] = r0_2; 7378 shared.m[smem_l_idx + (16)] = r0_3; 7379 shared.m[smem_l_idx + (24)] = r0_4; 7380 shared.m[smem_r_idx + (32)] = r0_5; 7381 shared.m[smem_r_idx + (40)] = r0_6; 7382 shared.m[smem_r_idx + (48)] = r0_7; 
7383 shared.m[smem_r_idx + (56)] = r0_8; 7384 } 7385 { 7386 ulong r0_1 = shared.m[smem_l_idx + (512)]; 7387 ulong r0_2 = shared.m[smem_l_idx + (520)]; 7388 ulong r0_3 = shared.m[smem_l_idx + (528)]; 7389 ulong r0_4 = shared.m[smem_l_idx + (536)]; 7390 ulong r0_5 = shared.m[smem_r_idx + (544)]; 7391 ulong r0_6 = shared.m[smem_r_idx + (552)]; 7392 ulong r0_7 = shared.m[smem_r_idx + (560)]; 7393 ulong r0_8 = shared.m[smem_r_idx + (568)]; 7394 if (r0_4 >= r0_5) { 7395 ulong const t = r0_4; 7396 r0_4 = r0_5; 7397 r0_5 = t; 7398 }; 7399 if (r0_3 >= r0_6) { 7400 ulong const t = r0_3; 7401 r0_3 = r0_6; 7402 r0_6 = t; 7403 }; 7404 if (r0_2 >= r0_7) { 7405 ulong const t = r0_2; 7406 r0_2 = r0_7; 7407 r0_7 = t; 7408 }; 7409 if (r0_1 >= r0_8) { 7410 ulong const t = r0_1; 7411 r0_1 = r0_8; 7412 r0_8 = t; 7413 }; 7414 if (r0_5 >= r0_7) { 7415 ulong const t = r0_5; 7416 r0_5 = r0_7; 7417 r0_7 = t; 7418 }; 7419 if (r0_6 >= r0_8) { 7420 ulong const t = r0_6; 7421 r0_6 = r0_8; 7422 r0_8 = t; 7423 }; 7424 if (r0_5 >= r0_6) { 7425 ulong const t = r0_5; 7426 r0_5 = r0_6; 7427 r0_6 = t; 7428 }; 7429 if (r0_7 >= r0_8) { 7430 ulong const t = r0_7; 7431 r0_7 = r0_8; 7432 r0_8 = t; 7433 }; 7434 if (r0_1 >= r0_3) { 7435 ulong const t = r0_1; 7436 r0_1 = r0_3; 7437 r0_3 = t; 7438 }; 7439 if (r0_2 >= r0_4) { 7440 ulong const t = r0_2; 7441 r0_2 = r0_4; 7442 r0_4 = t; 7443 }; 7444 if (r0_1 >= r0_2) { 7445 ulong const t = r0_1; 7446 r0_1 = r0_2; 7447 r0_2 = t; 7448 }; 7449 if (r0_3 >= r0_4) { 7450 ulong const t = r0_3; 7451 r0_3 = r0_4; 7452 r0_4 = t; 7453 }; 7454 shared.m[smem_l_idx + (512)] = r0_1; 7455 shared.m[smem_l_idx + (520)] = r0_2; 7456 shared.m[smem_l_idx + (528)] = r0_3; 7457 shared.m[smem_l_idx + (536)] = r0_4; 7458 shared.m[smem_r_idx + (544)] = r0_5; 7459 shared.m[smem_r_idx + (552)] = r0_6; 7460 shared.m[smem_r_idx + (560)] = r0_7; 7461 shared.m[smem_r_idx + (568)] = r0_8; 7462 } 7463 } 7464 barrier(CLK_LOCAL_MEM_FENCE); 7465 r1 = shared.m[get_local_id(0) + (8 * (1 << 3) * 0)]; 
7466 r16 = shared.m[get_local_id(0) + (8 * (1 << 3) * 1)]; 7467 r2 = shared.m[get_local_id(0) + (8 * (1 << 3) * 2)]; 7468 r15 = shared.m[get_local_id(0) + (8 * (1 << 3) * 3)]; 7469 r3 = shared.m[get_local_id(0) + (8 * (1 << 3) * 4)]; 7470 r14 = shared.m[get_local_id(0) + (8 * (1 << 3) * 5)]; 7471 r4 = shared.m[get_local_id(0) + (8 * (1 << 3) * 6)]; 7472 r13 = shared.m[get_local_id(0) + (8 * (1 << 3) * 7)]; 7473 r5 = shared.m[get_local_id(0) + (8 * (1 << 3) * 8)]; 7474 r12 = shared.m[get_local_id(0) + (8 * (1 << 3) * 9)]; 7475 r6 = shared.m[get_local_id(0) + (8 * (1 << 3) * 10)]; 7476 r11 = shared.m[get_local_id(0) + (8 * (1 << 3) * 11)]; 7477 r7 = shared.m[get_local_id(0) + (8 * (1 << 3) * 12)]; 7478 r10 = shared.m[get_local_id(0) + (8 * (1 << 3) * 13)]; 7479 r8 = shared.m[get_local_id(0) + (8 * (1 << 3) * 14)]; 7480 r9 = shared.m[get_local_id(0) + (8 * (1 << 3) * 15)]; 7481 { 7482 { 7483 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 7484 int const t_lt = get_sub_group_local_id() < half_lane_idx; 7485 ; 7486 { 7487 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 7488 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 7489 }; 7490 { 7491 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 7492 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 7493 }; 7494 { 7495 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 7496 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 7497 }; 7498 { 7499 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 7500 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 7501 }; 7502 { 7503 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 7504 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 7505 }; 7506 { 7507 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 7508 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 7509 }; 7510 { 7511 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 7512 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 7513 }; 7514 { 7515 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 7516 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 7517 }; 7518 { 7519 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 7520 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 7521 }; 7522 { 7523 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 7524 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 7525 }; 7526 { 7527 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 7528 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 7529 }; 7530 { 7531 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 7532 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 7533 }; 7534 { 7535 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 7536 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 7537 }; 7538 { 7539 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 7540 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 7541 }; 7542 { 7543 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 7544 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 7545 }; 7546 { 7547 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 7548 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 7549 }; 7550 } 7551 { 7552 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 7553 int const t_lt = get_sub_group_local_id() < half_lane_idx; 7554 ; 7555 { 7556 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 7557 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 7558 }; 7559 { 7560 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 7561 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 7562 }; 7563 { 7564 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 7565 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 7566 }; 7567 { 7568 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 7569 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 7570 }; 7571 { 7572 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 7573 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 7574 }; 7575 { 7576 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 7577 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 7578 }; 7579 { 7580 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 7581 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 7582 }; 7583 { 7584 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 7585 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 7586 }; 7587 { 7588 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 7589 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 7590 }; 7591 { 7592 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 7593 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 7594 }; 7595 { 7596 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 7597 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 7598 }; 7599 { 7600 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 7601 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 7602 }; 7603 { 7604 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 7605 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 7606 }; 7607 { 7608 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 7609 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 7610 }; 7611 { 7612 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 7613 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 7614 }; 7615 { 7616 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 7617 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 7618 }; 7619 } 7620 { 7621 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 7622 int const t_lt = get_sub_group_local_id() < half_lane_idx; 7623 ; 7624 { 7625 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 7626 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 7627 }; 7628 { 7629 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 7630 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 7631 }; 7632 { 7633 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 7634 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 7635 }; 7636 { 7637 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 7638 r4 = ((r4 <= ta) ^ t_lt) ? 
ta : r4; 7639 }; 7640 { 7641 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 7642 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 7643 }; 7644 { 7645 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 7646 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 7647 }; 7648 { 7649 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 7650 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 7651 }; 7652 { 7653 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 7654 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 7655 }; 7656 { 7657 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 7658 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 7659 }; 7660 { 7661 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 7662 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 7663 }; 7664 { 7665 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 7666 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 7667 }; 7668 { 7669 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 7670 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 7671 }; 7672 { 7673 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 7674 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 7675 }; 7676 { 7677 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 7678 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 7679 }; 7680 { 7681 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 7682 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 7683 }; 7684 { 7685 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 7686 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 7687 }; 7688 } 7689 if (r1 >= r9) { 7690 ulong const t = r1; 7691 r1 = r9; 7692 r9 = t; 7693 }; 7694 if (r5 >= r13) { 7695 ulong const t = r5; 7696 r5 = r13; 7697 r13 = t; 7698 }; 7699 if (r1 >= r5) { 7700 ulong const t = r1; 7701 r1 = r5; 7702 r5 = t; 7703 }; 7704 if (r9 >= r13) { 7705 ulong const t = r9; 7706 r9 = r13; 7707 r13 = t; 7708 }; 7709 if (r3 >= r11) { 7710 ulong const t = r3; 7711 r3 = r11; 7712 r11 = t; 7713 }; 7714 if (r7 >= r15) { 7715 ulong const t = r7; 7716 r7 = r15; 7717 r15 = t; 7718 }; 7719 if (r3 >= r7) { 7720 ulong const t = r3; 7721 r3 = r7; 7722 r7 = t; 7723 }; 7724 if (r11 >= r15) { 7725 ulong const t = r11; 7726 r11 = r15; 7727 r15 = t; 7728 }; 7729 if (r1 >= r3) { 7730 ulong const t = r1; 7731 r1 = r3; 7732 r3 = t; 7733 }; 7734 if (r5 >= r7) { 7735 ulong const t = r5; 7736 r5 = r7; 7737 r7 = t; 7738 }; 7739 if (r9 >= r11) { 7740 ulong const t = r9; 7741 r9 = r11; 7742 r11 = t; 7743 }; 7744 if (r13 >= r15) { 7745 ulong const t = r13; 7746 r13 = r15; 7747 r15 = t; 7748 }; 7749 if (r2 >= r10) { 7750 ulong const t = r2; 7751 r2 = r10; 7752 r10 = t; 7753 }; 7754 if (r6 >= r14) { 7755 ulong const t = r6; 7756 r6 = r14; 7757 r14 = t; 7758 }; 7759 if (r2 >= r6) { 7760 ulong const t = r2; 7761 r2 = r6; 7762 r6 = t; 7763 }; 7764 if (r10 >= r14) { 7765 ulong const t = r10; 7766 r10 = r14; 7767 r14 = t; 7768 }; 7769 if (r4 >= r12) { 7770 ulong const t = r4; 7771 r4 = r12; 7772 r12 = t; 7773 }; 7774 if (r8 >= r16) { 7775 ulong const t = r8; 7776 r8 = r16; 7777 r16 = t; 7778 }; 7779 if (r4 >= r8) { 7780 ulong const t = r4; 7781 r4 = r8; 7782 r8 = t; 7783 }; 7784 if (r12 >= r16) { 7785 ulong const t = r12; 7786 r12 = r16; 7787 r16 = t; 7788 }; 7789 if (r2 >= r4) { 7790 ulong const t = r2; 7791 r2 = r4; 7792 r4 = t; 7793 }; 7794 if (r6 >= r8) { 7795 ulong const t = r6; 7796 r6 = r8; 7797 r8 = t; 7798 }; 7799 if (r10 >= r12) { 7800 ulong const t = r10; 7801 r10 = r12; 7802 r12 = t; 7803 }; 7804 if (r14 >= r16) { 7805 ulong const t = r14; 7806 
r14 = r16; 7807 r16 = t; 7808 }; 7809 if (r1 >= r2) { 7810 ulong const t = r1; 7811 r1 = r2; 7812 r2 = t; 7813 }; 7814 if (r3 >= r4) { 7815 ulong const t = r3; 7816 r3 = r4; 7817 r4 = t; 7818 }; 7819 if (r5 >= r6) { 7820 ulong const t = r5; 7821 r5 = r6; 7822 r6 = t; 7823 }; 7824 if (r7 >= r8) { 7825 ulong const t = r7; 7826 r7 = r8; 7827 r8 = t; 7828 }; 7829 if (r9 >= r10) { 7830 ulong const t = r9; 7831 r9 = r10; 7832 r10 = t; 7833 }; 7834 if (r11 >= r12) { 7835 ulong const t = r11; 7836 r11 = r12; 7837 r12 = t; 7838 }; 7839 if (r13 >= r14) { 7840 ulong const t = r13; 7841 r13 = r14; 7842 r14 = t; 7843 }; 7844 if (r15 >= r16) { 7845 ulong const t = r15; 7846 r15 = r16; 7847 r16 = t; 7848 }; 7849 } 7850 vout[gmem_idx + (1 << 3) * 0] = r1; 7851 vout[gmem_idx + (1 << 3) * 1] = r2; 7852 vout[gmem_idx + (1 << 3) * 2] = r3; 7853 vout[gmem_idx + (1 << 3) * 3] = r4; 7854 vout[gmem_idx + (1 << 3) * 4] = r5; 7855 vout[gmem_idx + (1 << 3) * 5] = r6; 7856 vout[gmem_idx + (1 << 3) * 6] = r7; 7857 vout[gmem_idx + (1 << 3) * 7] = r8; 7858 vout[gmem_idx + (1 << 3) * 8] = r9; 7859 vout[gmem_idx + (1 << 3) * 9] = r10; 7860 vout[gmem_idx + (1 << 3) * 10] = r11; 7861 vout[gmem_idx + (1 << 3) * 11] = r12; 7862 vout[gmem_idx + (1 << 3) * 12] = r13; 7863 vout[gmem_idx + (1 << 3) * 13] = r14; 7864 vout[gmem_idx + (1 << 3) * 14] = r15; 7865 vout[gmem_idx + (1 << 3) * 15] = r16; 7866} 7867 7868__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) 7869__attribute__((reqd_work_group_size((1 << 3) * 16, 1, 1))) void 7870hs_kernel_bs_4(__global ulong const* const restrict vin, 7871 __global ulong* const restrict vout) 7872{ 7873 __local struct 7874 { 7875 ulong m[128 * 16]; 7876 } shared; 7877 7878 uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 + 7879 (get_local_id(0) & ((1 << 3) - 1)); 7880 ulong r1 = vin[gmem_idx + (1 << 3) * 0]; 7881 ulong r2 = vin[gmem_idx + (1 << 3) * 1]; 7882 ulong r3 = vin[gmem_idx + (1 << 3) * 2]; 7883 ulong r4 = vin[gmem_idx + (1 << 3) * 3]; 
7884 ulong r5 = vin[gmem_idx + (1 << 3) * 4]; 7885 ulong r6 = vin[gmem_idx + (1 << 3) * 5]; 7886 ulong r7 = vin[gmem_idx + (1 << 3) * 6]; 7887 ulong r8 = vin[gmem_idx + (1 << 3) * 7]; 7888 ulong r9 = vin[gmem_idx + (1 << 3) * 8]; 7889 ulong r10 = vin[gmem_idx + (1 << 3) * 9]; 7890 ulong r11 = vin[gmem_idx + (1 << 3) * 10]; 7891 ulong r12 = vin[gmem_idx + (1 << 3) * 11]; 7892 ulong r13 = vin[gmem_idx + (1 << 3) * 12]; 7893 ulong r14 = vin[gmem_idx + (1 << 3) * 13]; 7894 ulong r15 = vin[gmem_idx + (1 << 3) * 14]; 7895 ulong r16 = vin[gmem_idx + (1 << 3) * 15]; 7896 if (r1 >= r2) { 7897 ulong const t = r1; 7898 r1 = r2; 7899 r2 = t; 7900 }; 7901 if (r3 >= r4) { 7902 ulong const t = r3; 7903 r3 = r4; 7904 r4 = t; 7905 }; 7906 if (r5 >= r6) { 7907 ulong const t = r5; 7908 r5 = r6; 7909 r6 = t; 7910 }; 7911 if (r7 >= r8) { 7912 ulong const t = r7; 7913 r7 = r8; 7914 r8 = t; 7915 }; 7916 if (r9 >= r10) { 7917 ulong const t = r9; 7918 r9 = r10; 7919 r10 = t; 7920 }; 7921 if (r11 >= r12) { 7922 ulong const t = r11; 7923 r11 = r12; 7924 r12 = t; 7925 }; 7926 if (r13 >= r14) { 7927 ulong const t = r13; 7928 r13 = r14; 7929 r14 = t; 7930 }; 7931 if (r15 >= r16) { 7932 ulong const t = r15; 7933 r15 = r16; 7934 r16 = t; 7935 }; 7936 if (r1 >= r3) { 7937 ulong const t = r1; 7938 r1 = r3; 7939 r3 = t; 7940 }; 7941 if (r5 >= r7) { 7942 ulong const t = r5; 7943 r5 = r7; 7944 r7 = t; 7945 }; 7946 if (r9 >= r11) { 7947 ulong const t = r9; 7948 r9 = r11; 7949 r11 = t; 7950 }; 7951 if (r13 >= r15) { 7952 ulong const t = r13; 7953 r13 = r15; 7954 r15 = t; 7955 }; 7956 if (r2 >= r4) { 7957 ulong const t = r2; 7958 r2 = r4; 7959 r4 = t; 7960 }; 7961 if (r6 >= r8) { 7962 ulong const t = r6; 7963 r6 = r8; 7964 r8 = t; 7965 }; 7966 if (r10 >= r12) { 7967 ulong const t = r10; 7968 r10 = r12; 7969 r12 = t; 7970 }; 7971 if (r14 >= r16) { 7972 ulong const t = r14; 7973 r14 = r16; 7974 r16 = t; 7975 }; 7976 if (r1 >= r5) { 7977 ulong const t = r1; 7978 r1 = r5; 7979 r5 = t; 7980 }; 7981 if (r9 >= 
r13) { 7982 ulong const t = r9; 7983 r9 = r13; 7984 r13 = t; 7985 }; 7986 if (r2 >= r6) { 7987 ulong const t = r2; 7988 r2 = r6; 7989 r6 = t; 7990 }; 7991 if (r10 >= r14) { 7992 ulong const t = r10; 7993 r10 = r14; 7994 r14 = t; 7995 }; 7996 if (r3 >= r7) { 7997 ulong const t = r3; 7998 r3 = r7; 7999 r7 = t; 8000 }; 8001 if (r11 >= r15) { 8002 ulong const t = r11; 8003 r11 = r15; 8004 r15 = t; 8005 }; 8006 if (r4 >= r8) { 8007 ulong const t = r4; 8008 r4 = r8; 8009 r8 = t; 8010 }; 8011 if (r12 >= r16) { 8012 ulong const t = r12; 8013 r12 = r16; 8014 r16 = t; 8015 }; 8016 if (r1 >= r9) { 8017 ulong const t = r1; 8018 r1 = r9; 8019 r9 = t; 8020 }; 8021 if (r2 >= r10) { 8022 ulong const t = r2; 8023 r2 = r10; 8024 r10 = t; 8025 }; 8026 if (r3 >= r11) { 8027 ulong const t = r3; 8028 r3 = r11; 8029 r11 = t; 8030 }; 8031 if (r4 >= r12) { 8032 ulong const t = r4; 8033 r4 = r12; 8034 r12 = t; 8035 }; 8036 if (r5 >= r13) { 8037 ulong const t = r5; 8038 r5 = r13; 8039 r13 = t; 8040 }; 8041 if (r6 >= r14) { 8042 ulong const t = r6; 8043 r6 = r14; 8044 r14 = t; 8045 }; 8046 if (r7 >= r15) { 8047 ulong const t = r7; 8048 r7 = r15; 8049 r15 = t; 8050 }; 8051 if (r8 >= r16) { 8052 ulong const t = r8; 8053 r8 = r16; 8054 r16 = t; 8055 }; 8056 if (r6 >= r11) { 8057 ulong const t = r6; 8058 r6 = r11; 8059 r11 = t; 8060 }; 8061 if (r7 >= r10) { 8062 ulong const t = r7; 8063 r7 = r10; 8064 r10 = t; 8065 }; 8066 if (r4 >= r13) { 8067 ulong const t = r4; 8068 r4 = r13; 8069 r13 = t; 8070 }; 8071 if (r14 >= r15) { 8072 ulong const t = r14; 8073 r14 = r15; 8074 r15 = t; 8075 }; 8076 if (r8 >= r12) { 8077 ulong const t = r8; 8078 r8 = r12; 8079 r12 = t; 8080 }; 8081 if (r2 >= r3) { 8082 ulong const t = r2; 8083 r2 = r3; 8084 r3 = t; 8085 }; 8086 if (r5 >= r9) { 8087 ulong const t = r5; 8088 r5 = r9; 8089 r9 = t; 8090 }; 8091 if (r2 >= r5) { 8092 ulong const t = r2; 8093 r2 = r5; 8094 r5 = t; 8095 }; 8096 if (r8 >= r14) { 8097 ulong const t = r8; 8098 r8 = r14; 8099 r14 = t; 8100 }; 8101 if 
(r3 >= r9) { 8102 ulong const t = r3; 8103 r3 = r9; 8104 r9 = t; 8105 }; 8106 if (r12 >= r15) { 8107 ulong const t = r12; 8108 r12 = r15; 8109 r15 = t; 8110 }; 8111 if (r3 >= r5) { 8112 ulong const t = r3; 8113 r3 = r5; 8114 r5 = t; 8115 }; 8116 if (r6 >= r7) { 8117 ulong const t = r6; 8118 r6 = r7; 8119 r7 = t; 8120 }; 8121 if (r10 >= r11) { 8122 ulong const t = r10; 8123 r10 = r11; 8124 r11 = t; 8125 }; 8126 if (r12 >= r14) { 8127 ulong const t = r12; 8128 r12 = r14; 8129 r14 = t; 8130 }; 8131 if (r4 >= r9) { 8132 ulong const t = r4; 8133 r4 = r9; 8134 r9 = t; 8135 }; 8136 if (r8 >= r13) { 8137 ulong const t = r8; 8138 r8 = r13; 8139 r13 = t; 8140 }; 8141 if (r7 >= r9) { 8142 ulong const t = r7; 8143 r7 = r9; 8144 r9 = t; 8145 }; 8146 if (r11 >= r13) { 8147 ulong const t = r11; 8148 r11 = r13; 8149 r13 = t; 8150 }; 8151 if (r4 >= r6) { 8152 ulong const t = r4; 8153 r4 = r6; 8154 r6 = t; 8155 }; 8156 if (r8 >= r10) { 8157 ulong const t = r8; 8158 r8 = r10; 8159 r10 = t; 8160 }; 8161 if (r4 >= r5) { 8162 ulong const t = r4; 8163 r4 = r5; 8164 r5 = t; 8165 }; 8166 if (r6 >= r7) { 8167 ulong const t = r6; 8168 r6 = r7; 8169 r7 = t; 8170 }; 8171 if (r8 >= r9) { 8172 ulong const t = r8; 8173 r8 = r9; 8174 r9 = t; 8175 }; 8176 if (r10 >= r11) { 8177 ulong const t = r10; 8178 r10 = r11; 8179 r11 = t; 8180 }; 8181 if (r12 >= r13) { 8182 ulong const t = r12; 8183 r12 = r13; 8184 r13 = t; 8185 }; 8186 if (r7 >= r8) { 8187 ulong const t = r7; 8188 r7 = r8; 8189 r8 = t; 8190 }; 8191 if (r9 >= r10) { 8192 ulong const t = r9; 8193 r9 = r10; 8194 r10 = t; 8195 }; 8196 { 8197 uint const flip_lane_idx = get_sub_group_local_id() ^ 1; 8198 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 8199 ; 8200 { 8201 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 8202 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 8203 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 8204 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 8205 }; 8206 { 8207 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 8208 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 8209 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 8210 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 8211 }; 8212 { 8213 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 8214 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 8215 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 8216 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 8217 }; 8218 { 8219 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 8220 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 8221 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 8222 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 8223 }; 8224 { 8225 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 8226 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 8227 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 8228 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 8229 }; 8230 { 8231 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 8232 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 8233 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 8234 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 8235 }; 8236 { 8237 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 8238 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 8239 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 8240 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 8241 }; 8242 { 8243 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 8244 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 8245 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 8246 r9 = ((r9 <= ta) ^ t_lt) ? 
ta : r9; 8247 }; 8248 } 8249 if (r1 >= r9) { 8250 ulong const t = r1; 8251 r1 = r9; 8252 r9 = t; 8253 }; 8254 if (r5 >= r13) { 8255 ulong const t = r5; 8256 r5 = r13; 8257 r13 = t; 8258 }; 8259 if (r1 >= r5) { 8260 ulong const t = r1; 8261 r1 = r5; 8262 r5 = t; 8263 }; 8264 if (r9 >= r13) { 8265 ulong const t = r9; 8266 r9 = r13; 8267 r13 = t; 8268 }; 8269 if (r3 >= r11) { 8270 ulong const t = r3; 8271 r3 = r11; 8272 r11 = t; 8273 }; 8274 if (r7 >= r15) { 8275 ulong const t = r7; 8276 r7 = r15; 8277 r15 = t; 8278 }; 8279 if (r3 >= r7) { 8280 ulong const t = r3; 8281 r3 = r7; 8282 r7 = t; 8283 }; 8284 if (r11 >= r15) { 8285 ulong const t = r11; 8286 r11 = r15; 8287 r15 = t; 8288 }; 8289 if (r1 >= r3) { 8290 ulong const t = r1; 8291 r1 = r3; 8292 r3 = t; 8293 }; 8294 if (r5 >= r7) { 8295 ulong const t = r5; 8296 r5 = r7; 8297 r7 = t; 8298 }; 8299 if (r9 >= r11) { 8300 ulong const t = r9; 8301 r9 = r11; 8302 r11 = t; 8303 }; 8304 if (r13 >= r15) { 8305 ulong const t = r13; 8306 r13 = r15; 8307 r15 = t; 8308 }; 8309 if (r2 >= r10) { 8310 ulong const t = r2; 8311 r2 = r10; 8312 r10 = t; 8313 }; 8314 if (r6 >= r14) { 8315 ulong const t = r6; 8316 r6 = r14; 8317 r14 = t; 8318 }; 8319 if (r2 >= r6) { 8320 ulong const t = r2; 8321 r2 = r6; 8322 r6 = t; 8323 }; 8324 if (r10 >= r14) { 8325 ulong const t = r10; 8326 r10 = r14; 8327 r14 = t; 8328 }; 8329 if (r4 >= r12) { 8330 ulong const t = r4; 8331 r4 = r12; 8332 r12 = t; 8333 }; 8334 if (r8 >= r16) { 8335 ulong const t = r8; 8336 r8 = r16; 8337 r16 = t; 8338 }; 8339 if (r4 >= r8) { 8340 ulong const t = r4; 8341 r4 = r8; 8342 r8 = t; 8343 }; 8344 if (r12 >= r16) { 8345 ulong const t = r12; 8346 r12 = r16; 8347 r16 = t; 8348 }; 8349 if (r2 >= r4) { 8350 ulong const t = r2; 8351 r2 = r4; 8352 r4 = t; 8353 }; 8354 if (r6 >= r8) { 8355 ulong const t = r6; 8356 r6 = r8; 8357 r8 = t; 8358 }; 8359 if (r10 >= r12) { 8360 ulong const t = r10; 8361 r10 = r12; 8362 r12 = t; 8363 }; 8364 if (r14 >= r16) { 8365 ulong const t = r14; 8366 
r14 = r16; 8367 r16 = t; 8368 }; 8369 if (r1 >= r2) { 8370 ulong const t = r1; 8371 r1 = r2; 8372 r2 = t; 8373 }; 8374 if (r3 >= r4) { 8375 ulong const t = r3; 8376 r3 = r4; 8377 r4 = t; 8378 }; 8379 if (r5 >= r6) { 8380 ulong const t = r5; 8381 r5 = r6; 8382 r6 = t; 8383 }; 8384 if (r7 >= r8) { 8385 ulong const t = r7; 8386 r7 = r8; 8387 r8 = t; 8388 }; 8389 if (r9 >= r10) { 8390 ulong const t = r9; 8391 r9 = r10; 8392 r10 = t; 8393 }; 8394 if (r11 >= r12) { 8395 ulong const t = r11; 8396 r11 = r12; 8397 r12 = t; 8398 }; 8399 if (r13 >= r14) { 8400 ulong const t = r13; 8401 r13 = r14; 8402 r14 = t; 8403 }; 8404 if (r15 >= r16) { 8405 ulong const t = r15; 8406 r15 = r16; 8407 r16 = t; 8408 }; 8409 { 8410 uint const flip_lane_idx = get_sub_group_local_id() ^ 3; 8411 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 8412 ; 8413 { 8414 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 8415 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 8416 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 8417 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 8418 }; 8419 { 8420 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 8421 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 8422 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 8423 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 8424 }; 8425 { 8426 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 8427 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 8428 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 8429 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 8430 }; 8431 { 8432 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 8433 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 8434 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 8435 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 8436 }; 8437 { 8438 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 8439 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 8440 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 8441 r12 = ((r12 <= ta) ^ t_lt) ? 
ta : r12; 8442 }; 8443 { 8444 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 8445 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 8446 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 8447 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 8448 }; 8449 { 8450 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 8451 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 8452 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 8453 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 8454 }; 8455 { 8456 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 8457 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 8458 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 8459 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 8460 }; 8461 } 8462 { 8463 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 8464 int const t_lt = get_sub_group_local_id() < half_lane_idx; 8465 ; 8466 { 8467 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 8468 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 8469 }; 8470 { 8471 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 8472 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 8473 }; 8474 { 8475 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 8476 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 8477 }; 8478 { 8479 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 8480 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 8481 }; 8482 { 8483 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 8484 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 8485 }; 8486 { 8487 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 8488 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 8489 }; 8490 { 8491 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 8492 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 8493 }; 8494 { 8495 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 8496 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 8497 }; 8498 { 8499 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 8500 r9 = ((r9 <= ta) ^ t_lt) ? 
ta : r9; 8501 }; 8502 { 8503 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 8504 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 8505 }; 8506 { 8507 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 8508 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 8509 }; 8510 { 8511 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 8512 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 8513 }; 8514 { 8515 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 8516 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 8517 }; 8518 { 8519 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 8520 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 8521 }; 8522 { 8523 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 8524 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 8525 }; 8526 { 8527 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 8528 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 8529 }; 8530 } 8531 if (r1 >= r9) { 8532 ulong const t = r1; 8533 r1 = r9; 8534 r9 = t; 8535 }; 8536 if (r5 >= r13) { 8537 ulong const t = r5; 8538 r5 = r13; 8539 r13 = t; 8540 }; 8541 if (r1 >= r5) { 8542 ulong const t = r1; 8543 r1 = r5; 8544 r5 = t; 8545 }; 8546 if (r9 >= r13) { 8547 ulong const t = r9; 8548 r9 = r13; 8549 r13 = t; 8550 }; 8551 if (r3 >= r11) { 8552 ulong const t = r3; 8553 r3 = r11; 8554 r11 = t; 8555 }; 8556 if (r7 >= r15) { 8557 ulong const t = r7; 8558 r7 = r15; 8559 r15 = t; 8560 }; 8561 if (r3 >= r7) { 8562 ulong const t = r3; 8563 r3 = r7; 8564 r7 = t; 8565 }; 8566 if (r11 >= r15) { 8567 ulong const t = r11; 8568 r11 = r15; 8569 r15 = t; 8570 }; 8571 if (r1 >= r3) { 8572 ulong const t = r1; 8573 r1 = r3; 8574 r3 = t; 8575 }; 8576 if (r5 >= r7) { 8577 ulong const t = r5; 8578 r5 = r7; 8579 r7 = t; 8580 }; 8581 if (r9 >= r11) { 8582 ulong const t = r9; 8583 r9 = r11; 8584 r11 = t; 8585 }; 8586 if (r13 >= r15) { 8587 ulong const t = r13; 8588 r13 = r15; 8589 r15 = t; 8590 }; 8591 if (r2 >= r10) { 8592 ulong const t = r2; 8593 r2 = r10; 8594 r10 = t; 8595 }; 8596 if 
(r6 >= r14) { 8597 ulong const t = r6; 8598 r6 = r14; 8599 r14 = t; 8600 }; 8601 if (r2 >= r6) { 8602 ulong const t = r2; 8603 r2 = r6; 8604 r6 = t; 8605 }; 8606 if (r10 >= r14) { 8607 ulong const t = r10; 8608 r10 = r14; 8609 r14 = t; 8610 }; 8611 if (r4 >= r12) { 8612 ulong const t = r4; 8613 r4 = r12; 8614 r12 = t; 8615 }; 8616 if (r8 >= r16) { 8617 ulong const t = r8; 8618 r8 = r16; 8619 r16 = t; 8620 }; 8621 if (r4 >= r8) { 8622 ulong const t = r4; 8623 r4 = r8; 8624 r8 = t; 8625 }; 8626 if (r12 >= r16) { 8627 ulong const t = r12; 8628 r12 = r16; 8629 r16 = t; 8630 }; 8631 if (r2 >= r4) { 8632 ulong const t = r2; 8633 r2 = r4; 8634 r4 = t; 8635 }; 8636 if (r6 >= r8) { 8637 ulong const t = r6; 8638 r6 = r8; 8639 r8 = t; 8640 }; 8641 if (r10 >= r12) { 8642 ulong const t = r10; 8643 r10 = r12; 8644 r12 = t; 8645 }; 8646 if (r14 >= r16) { 8647 ulong const t = r14; 8648 r14 = r16; 8649 r16 = t; 8650 }; 8651 if (r1 >= r2) { 8652 ulong const t = r1; 8653 r1 = r2; 8654 r2 = t; 8655 }; 8656 if (r3 >= r4) { 8657 ulong const t = r3; 8658 r3 = r4; 8659 r4 = t; 8660 }; 8661 if (r5 >= r6) { 8662 ulong const t = r5; 8663 r5 = r6; 8664 r6 = t; 8665 }; 8666 if (r7 >= r8) { 8667 ulong const t = r7; 8668 r7 = r8; 8669 r8 = t; 8670 }; 8671 if (r9 >= r10) { 8672 ulong const t = r9; 8673 r9 = r10; 8674 r10 = t; 8675 }; 8676 if (r11 >= r12) { 8677 ulong const t = r11; 8678 r11 = r12; 8679 r12 = t; 8680 }; 8681 if (r13 >= r14) { 8682 ulong const t = r13; 8683 r13 = r14; 8684 r14 = t; 8685 }; 8686 if (r15 >= r16) { 8687 ulong const t = r15; 8688 r15 = r16; 8689 r16 = t; 8690 }; 8691 { 8692 uint const flip_lane_idx = get_sub_group_local_id() ^ 7; 8693 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 8694 ; 8695 { 8696 ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 8697 ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx); 8698 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 8699 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 8700 }; 8701 { 8702 ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 8703 ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx); 8704 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 8705 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 8706 }; 8707 { 8708 ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 8709 ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx); 8710 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 8711 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 8712 }; 8713 { 8714 ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 8715 ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx); 8716 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 8717 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 8718 }; 8719 { 8720 ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx); 8721 ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx); 8722 r5 = ((r5 <= tb) ^ t_lt) ? tb : r5; 8723 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 8724 }; 8725 { 8726 ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx); 8727 ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx); 8728 r6 = ((r6 <= tb) ^ t_lt) ? tb : r6; 8729 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 8730 }; 8731 { 8732 ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx); 8733 ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx); 8734 r7 = ((r7 <= tb) ^ t_lt) ? tb : r7; 8735 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 8736 }; 8737 { 8738 ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx); 8739 ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx); 8740 r8 = ((r8 <= tb) ^ t_lt) ? tb : r8; 8741 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 8742 }; 8743 } 8744 { 8745 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 8746 int const t_lt = get_sub_group_local_id() < half_lane_idx; 8747 ; 8748 { 8749 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 8750 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 8751 }; 8752 { 8753 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 8754 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 8755 }; 8756 { 8757 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 8758 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 8759 }; 8760 { 8761 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 8762 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 8763 }; 8764 { 8765 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 8766 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 8767 }; 8768 { 8769 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 8770 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 8771 }; 8772 { 8773 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 8774 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 8775 }; 8776 { 8777 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 8778 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 8779 }; 8780 { 8781 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 8782 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 8783 }; 8784 { 8785 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 8786 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 8787 }; 8788 { 8789 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 8790 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 8791 }; 8792 { 8793 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 8794 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 8795 }; 8796 { 8797 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 8798 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 8799 }; 8800 { 8801 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 8802 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 8803 }; 8804 { 8805 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 8806 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 8807 }; 8808 { 8809 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 8810 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 8811 }; 8812 } 8813 { 8814 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 8815 int const t_lt = get_sub_group_local_id() < half_lane_idx; 8816 ; 8817 { 8818 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 8819 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 8820 }; 8821 { 8822 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 8823 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 8824 }; 8825 { 8826 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 8827 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 8828 }; 8829 { 8830 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 8831 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 8832 }; 8833 { 8834 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 8835 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 8836 }; 8837 { 8838 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 8839 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 8840 }; 8841 { 8842 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 8843 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 8844 }; 8845 { 8846 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 8847 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 8848 }; 8849 { 8850 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 8851 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 8852 }; 8853 { 8854 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 8855 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 8856 }; 8857 { 8858 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 8859 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 8860 }; 8861 { 8862 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 8863 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 8864 }; 8865 { 8866 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 8867 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 8868 }; 8869 { 8870 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 8871 r14 = ((r14 <= ta) ^ t_lt) ? 
ta : r14; 8872 }; 8873 { 8874 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 8875 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 8876 }; 8877 { 8878 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 8879 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 8880 }; 8881 } 8882 if (r1 >= r9) { 8883 ulong const t = r1; 8884 r1 = r9; 8885 r9 = t; 8886 }; 8887 if (r5 >= r13) { 8888 ulong const t = r5; 8889 r5 = r13; 8890 r13 = t; 8891 }; 8892 if (r1 >= r5) { 8893 ulong const t = r1; 8894 r1 = r5; 8895 r5 = t; 8896 }; 8897 if (r9 >= r13) { 8898 ulong const t = r9; 8899 r9 = r13; 8900 r13 = t; 8901 }; 8902 if (r3 >= r11) { 8903 ulong const t = r3; 8904 r3 = r11; 8905 r11 = t; 8906 }; 8907 if (r7 >= r15) { 8908 ulong const t = r7; 8909 r7 = r15; 8910 r15 = t; 8911 }; 8912 if (r3 >= r7) { 8913 ulong const t = r3; 8914 r3 = r7; 8915 r7 = t; 8916 }; 8917 if (r11 >= r15) { 8918 ulong const t = r11; 8919 r11 = r15; 8920 r15 = t; 8921 }; 8922 if (r1 >= r3) { 8923 ulong const t = r1; 8924 r1 = r3; 8925 r3 = t; 8926 }; 8927 if (r5 >= r7) { 8928 ulong const t = r5; 8929 r5 = r7; 8930 r7 = t; 8931 }; 8932 if (r9 >= r11) { 8933 ulong const t = r9; 8934 r9 = r11; 8935 r11 = t; 8936 }; 8937 if (r13 >= r15) { 8938 ulong const t = r13; 8939 r13 = r15; 8940 r15 = t; 8941 }; 8942 if (r2 >= r10) { 8943 ulong const t = r2; 8944 r2 = r10; 8945 r10 = t; 8946 }; 8947 if (r6 >= r14) { 8948 ulong const t = r6; 8949 r6 = r14; 8950 r14 = t; 8951 }; 8952 if (r2 >= r6) { 8953 ulong const t = r2; 8954 r2 = r6; 8955 r6 = t; 8956 }; 8957 if (r10 >= r14) { 8958 ulong const t = r10; 8959 r10 = r14; 8960 r14 = t; 8961 }; 8962 if (r4 >= r12) { 8963 ulong const t = r4; 8964 r4 = r12; 8965 r12 = t; 8966 }; 8967 if (r8 >= r16) { 8968 ulong const t = r8; 8969 r8 = r16; 8970 r16 = t; 8971 }; 8972 if (r4 >= r8) { 8973 ulong const t = r4; 8974 r4 = r8; 8975 r8 = t; 8976 }; 8977 if (r12 >= r16) { 8978 ulong const t = r12; 8979 r12 = r16; 8980 r16 = t; 8981 }; 8982 if (r2 >= r4) { 8983 ulong const t = r2; 8984 
r2 = r4; 8985 r4 = t; 8986 }; 8987 if (r6 >= r8) { 8988 ulong const t = r6; 8989 r6 = r8; 8990 r8 = t; 8991 }; 8992 if (r10 >= r12) { 8993 ulong const t = r10; 8994 r10 = r12; 8995 r12 = t; 8996 }; 8997 if (r14 >= r16) { 8998 ulong const t = r14; 8999 r14 = r16; 9000 r16 = t; 9001 }; 9002 if (r1 >= r2) { 9003 ulong const t = r1; 9004 r1 = r2; 9005 r2 = t; 9006 }; 9007 if (r3 >= r4) { 9008 ulong const t = r3; 9009 r3 = r4; 9010 r4 = t; 9011 }; 9012 if (r5 >= r6) { 9013 ulong const t = r5; 9014 r5 = r6; 9015 r6 = t; 9016 }; 9017 if (r7 >= r8) { 9018 ulong const t = r7; 9019 r7 = r8; 9020 r8 = t; 9021 }; 9022 if (r9 >= r10) { 9023 ulong const t = r9; 9024 r9 = r10; 9025 r10 = t; 9026 }; 9027 if (r11 >= r12) { 9028 ulong const t = r11; 9029 r11 = r12; 9030 r12 = t; 9031 }; 9032 if (r13 >= r14) { 9033 ulong const t = r13; 9034 r13 = r14; 9035 r14 = t; 9036 }; 9037 if (r15 >= r16) { 9038 ulong const t = r15; 9039 r15 = r16; 9040 r16 = t; 9041 }; 9042 uint const smem_l_idx = 9043 get_sub_group_id() * ((1 << 3) * 16) + get_sub_group_local_id(); 9044 uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 3) * 16) + 9045 (get_sub_group_local_id() ^ ((1 << 3) - 1)); 9046 shared.m[get_local_id(0) + (16 * (1 << 3) * 0)] = r1; 9047 shared.m[get_local_id(0) + (16 * (1 << 3) * 1)] = r16; 9048 shared.m[get_local_id(0) + (16 * (1 << 3) * 2)] = r2; 9049 shared.m[get_local_id(0) + (16 * (1 << 3) * 3)] = r15; 9050 shared.m[get_local_id(0) + (16 * (1 << 3) * 4)] = r3; 9051 shared.m[get_local_id(0) + (16 * (1 << 3) * 5)] = r14; 9052 shared.m[get_local_id(0) + (16 * (1 << 3) * 6)] = r4; 9053 shared.m[get_local_id(0) + (16 * (1 << 3) * 7)] = r13; 9054 shared.m[get_local_id(0) + (16 * (1 << 3) * 8)] = r5; 9055 shared.m[get_local_id(0) + (16 * (1 << 3) * 9)] = r12; 9056 shared.m[get_local_id(0) + (16 * (1 << 3) * 10)] = r6; 9057 shared.m[get_local_id(0) + (16 * (1 << 3) * 11)] = r11; 9058 shared.m[get_local_id(0) + (16 * (1 << 3) * 12)] = r7; 9059 shared.m[get_local_id(0) + (16 * (1 << 3) 
* 13)] = r10; 9060 shared.m[get_local_id(0) + (16 * (1 << 3) * 14)] = r8; 9061 shared.m[get_local_id(0) + (16 * (1 << 3) * 15)] = r9; 9062 barrier(CLK_LOCAL_MEM_FENCE); 9063 { 9064 { 9065 ulong r0_1 = shared.m[smem_l_idx + (0)]; 9066 ulong r0_2 = shared.m[smem_r_idx + (8)]; 9067 if (r0_1 >= r0_2) { 9068 ulong const t = r0_1; 9069 r0_1 = r0_2; 9070 r0_2 = t; 9071 }; 9072 shared.m[smem_l_idx + (0)] = r0_1; 9073 shared.m[smem_r_idx + (8)] = r0_2; 9074 } 9075 { 9076 ulong r1_1 = shared.m[smem_l_idx + (16)]; 9077 ulong r1_2 = shared.m[smem_r_idx + (24)]; 9078 if (r1_1 >= r1_2) { 9079 ulong const t = r1_1; 9080 r1_1 = r1_2; 9081 r1_2 = t; 9082 }; 9083 shared.m[smem_l_idx + (16)] = r1_1; 9084 shared.m[smem_r_idx + (24)] = r1_2; 9085 } 9086 { 9087 ulong r2_1 = shared.m[smem_l_idx + (32)]; 9088 ulong r2_2 = shared.m[smem_r_idx + (40)]; 9089 if (r2_1 >= r2_2) { 9090 ulong const t = r2_1; 9091 r2_1 = r2_2; 9092 r2_2 = t; 9093 }; 9094 shared.m[smem_l_idx + (32)] = r2_1; 9095 shared.m[smem_r_idx + (40)] = r2_2; 9096 } 9097 { 9098 ulong r3_1 = shared.m[smem_l_idx + (48)]; 9099 ulong r3_2 = shared.m[smem_r_idx + (56)]; 9100 if (r3_1 >= r3_2) { 9101 ulong const t = r3_1; 9102 r3_1 = r3_2; 9103 r3_2 = t; 9104 }; 9105 shared.m[smem_l_idx + (48)] = r3_1; 9106 shared.m[smem_r_idx + (56)] = r3_2; 9107 } 9108 { 9109 ulong r4_1 = shared.m[smem_l_idx + (64)]; 9110 ulong r4_2 = shared.m[smem_r_idx + (72)]; 9111 if (r4_1 >= r4_2) { 9112 ulong const t = r4_1; 9113 r4_1 = r4_2; 9114 r4_2 = t; 9115 }; 9116 shared.m[smem_l_idx + (64)] = r4_1; 9117 shared.m[smem_r_idx + (72)] = r4_2; 9118 } 9119 { 9120 ulong r5_1 = shared.m[smem_l_idx + (80)]; 9121 ulong r5_2 = shared.m[smem_r_idx + (88)]; 9122 if (r5_1 >= r5_2) { 9123 ulong const t = r5_1; 9124 r5_1 = r5_2; 9125 r5_2 = t; 9126 }; 9127 shared.m[smem_l_idx + (80)] = r5_1; 9128 shared.m[smem_r_idx + (88)] = r5_2; 9129 } 9130 { 9131 ulong r6_1 = shared.m[smem_l_idx + (96)]; 9132 ulong r6_2 = shared.m[smem_r_idx + (104)]; 9133 if (r6_1 >= r6_2) { 
9134 ulong const t = r6_1; 9135 r6_1 = r6_2; 9136 r6_2 = t; 9137 }; 9138 shared.m[smem_l_idx + (96)] = r6_1; 9139 shared.m[smem_r_idx + (104)] = r6_2; 9140 } 9141 { 9142 ulong r7_1 = shared.m[smem_l_idx + (112)]; 9143 ulong r7_2 = shared.m[smem_r_idx + (120)]; 9144 if (r7_1 >= r7_2) { 9145 ulong const t = r7_1; 9146 r7_1 = r7_2; 9147 r7_2 = t; 9148 }; 9149 shared.m[smem_l_idx + (112)] = r7_1; 9150 shared.m[smem_r_idx + (120)] = r7_2; 9151 } 9152 } 9153 barrier(CLK_LOCAL_MEM_FENCE); 9154 r1 = shared.m[get_local_id(0) + (16 * (1 << 3) * 0)]; 9155 r16 = shared.m[get_local_id(0) + (16 * (1 << 3) * 1)]; 9156 r2 = shared.m[get_local_id(0) + (16 * (1 << 3) * 2)]; 9157 r15 = shared.m[get_local_id(0) + (16 * (1 << 3) * 3)]; 9158 r3 = shared.m[get_local_id(0) + (16 * (1 << 3) * 4)]; 9159 r14 = shared.m[get_local_id(0) + (16 * (1 << 3) * 5)]; 9160 r4 = shared.m[get_local_id(0) + (16 * (1 << 3) * 6)]; 9161 r13 = shared.m[get_local_id(0) + (16 * (1 << 3) * 7)]; 9162 r5 = shared.m[get_local_id(0) + (16 * (1 << 3) * 8)]; 9163 r12 = shared.m[get_local_id(0) + (16 * (1 << 3) * 9)]; 9164 r6 = shared.m[get_local_id(0) + (16 * (1 << 3) * 10)]; 9165 r11 = shared.m[get_local_id(0) + (16 * (1 << 3) * 11)]; 9166 r7 = shared.m[get_local_id(0) + (16 * (1 << 3) * 12)]; 9167 r10 = shared.m[get_local_id(0) + (16 * (1 << 3) * 13)]; 9168 r8 = shared.m[get_local_id(0) + (16 * (1 << 3) * 14)]; 9169 r9 = shared.m[get_local_id(0) + (16 * (1 << 3) * 15)]; 9170 { 9171 { 9172 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 9173 int const t_lt = get_sub_group_local_id() < half_lane_idx; 9174 ; 9175 { 9176 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 9177 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 9178 }; 9179 { 9180 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 9181 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 9182 }; 9183 { 9184 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 9185 r3 = ((r3 <= ta) ^ t_lt) ? 
ta : r3; 9186 }; 9187 { 9188 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 9189 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 9190 }; 9191 { 9192 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 9193 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 9194 }; 9195 { 9196 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 9197 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 9198 }; 9199 { 9200 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 9201 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 9202 }; 9203 { 9204 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 9205 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 9206 }; 9207 { 9208 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 9209 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 9210 }; 9211 { 9212 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 9213 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 9214 }; 9215 { 9216 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 9217 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 9218 }; 9219 { 9220 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 9221 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 9222 }; 9223 { 9224 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 9225 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 9226 }; 9227 { 9228 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 9229 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 9230 }; 9231 { 9232 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 9233 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 9234 }; 9235 { 9236 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 9237 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 9238 }; 9239 } 9240 { 9241 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 9242 int const t_lt = get_sub_group_local_id() < half_lane_idx; 9243 ; 9244 { 9245 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 9246 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 9247 }; 9248 { 9249 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 9250 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 9251 }; 9252 { 9253 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 9254 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 9255 }; 9256 { 9257 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 9258 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 9259 }; 9260 { 9261 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 9262 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 9263 }; 9264 { 9265 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 9266 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 9267 }; 9268 { 9269 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 9270 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 9271 }; 9272 { 9273 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 9274 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 9275 }; 9276 { 9277 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 9278 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 9279 }; 9280 { 9281 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 9282 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 9283 }; 9284 { 9285 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 9286 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 9287 }; 9288 { 9289 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 9290 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 9291 }; 9292 { 9293 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 9294 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 9295 }; 9296 { 9297 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 9298 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 9299 }; 9300 { 9301 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 9302 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 9303 }; 9304 { 9305 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 9306 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 9307 }; 9308 } 9309 { 9310 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 9311 int const t_lt = get_sub_group_local_id() < half_lane_idx; 9312 ; 9313 { 9314 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 9315 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 9316 }; 9317 { 9318 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 9319 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 9320 }; 9321 { 9322 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 9323 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 9324 }; 9325 { 9326 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 9327 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 9328 }; 9329 { 9330 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 9331 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 9332 }; 9333 { 9334 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 9335 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 9336 }; 9337 { 9338 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 9339 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 9340 }; 9341 { 9342 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 9343 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 9344 }; 9345 { 9346 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 9347 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 9348 }; 9349 { 9350 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 9351 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 9352 }; 9353 { 9354 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 9355 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 9356 }; 9357 { 9358 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 9359 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 9360 }; 9361 { 9362 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 9363 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 9364 }; 9365 { 9366 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 9367 r14 = ((r14 <= ta) ^ t_lt) ? 
ta : r14; 9368 }; 9369 { 9370 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 9371 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 9372 }; 9373 { 9374 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 9375 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 9376 }; 9377 } 9378 if (r1 >= r9) { 9379 ulong const t = r1; 9380 r1 = r9; 9381 r9 = t; 9382 }; 9383 if (r5 >= r13) { 9384 ulong const t = r5; 9385 r5 = r13; 9386 r13 = t; 9387 }; 9388 if (r1 >= r5) { 9389 ulong const t = r1; 9390 r1 = r5; 9391 r5 = t; 9392 }; 9393 if (r9 >= r13) { 9394 ulong const t = r9; 9395 r9 = r13; 9396 r13 = t; 9397 }; 9398 if (r3 >= r11) { 9399 ulong const t = r3; 9400 r3 = r11; 9401 r11 = t; 9402 }; 9403 if (r7 >= r15) { 9404 ulong const t = r7; 9405 r7 = r15; 9406 r15 = t; 9407 }; 9408 if (r3 >= r7) { 9409 ulong const t = r3; 9410 r3 = r7; 9411 r7 = t; 9412 }; 9413 if (r11 >= r15) { 9414 ulong const t = r11; 9415 r11 = r15; 9416 r15 = t; 9417 }; 9418 if (r1 >= r3) { 9419 ulong const t = r1; 9420 r1 = r3; 9421 r3 = t; 9422 }; 9423 if (r5 >= r7) { 9424 ulong const t = r5; 9425 r5 = r7; 9426 r7 = t; 9427 }; 9428 if (r9 >= r11) { 9429 ulong const t = r9; 9430 r9 = r11; 9431 r11 = t; 9432 }; 9433 if (r13 >= r15) { 9434 ulong const t = r13; 9435 r13 = r15; 9436 r15 = t; 9437 }; 9438 if (r2 >= r10) { 9439 ulong const t = r2; 9440 r2 = r10; 9441 r10 = t; 9442 }; 9443 if (r6 >= r14) { 9444 ulong const t = r6; 9445 r6 = r14; 9446 r14 = t; 9447 }; 9448 if (r2 >= r6) { 9449 ulong const t = r2; 9450 r2 = r6; 9451 r6 = t; 9452 }; 9453 if (r10 >= r14) { 9454 ulong const t = r10; 9455 r10 = r14; 9456 r14 = t; 9457 }; 9458 if (r4 >= r12) { 9459 ulong const t = r4; 9460 r4 = r12; 9461 r12 = t; 9462 }; 9463 if (r8 >= r16) { 9464 ulong const t = r8; 9465 r8 = r16; 9466 r16 = t; 9467 }; 9468 if (r4 >= r8) { 9469 ulong const t = r4; 9470 r4 = r8; 9471 r8 = t; 9472 }; 9473 if (r12 >= r16) { 9474 ulong const t = r12; 9475 r12 = r16; 9476 r16 = t; 9477 }; 9478 if (r2 >= r4) { 9479 ulong const t = r2; 9480 
r2 = r4; 9481 r4 = t; 9482 }; 9483 if (r6 >= r8) { 9484 ulong const t = r6; 9485 r6 = r8; 9486 r8 = t; 9487 }; 9488 if (r10 >= r12) { 9489 ulong const t = r10; 9490 r10 = r12; 9491 r12 = t; 9492 }; 9493 if (r14 >= r16) { 9494 ulong const t = r14; 9495 r14 = r16; 9496 r16 = t; 9497 }; 9498 if (r1 >= r2) { 9499 ulong const t = r1; 9500 r1 = r2; 9501 r2 = t; 9502 }; 9503 if (r3 >= r4) { 9504 ulong const t = r3; 9505 r3 = r4; 9506 r4 = t; 9507 }; 9508 if (r5 >= r6) { 9509 ulong const t = r5; 9510 r5 = r6; 9511 r6 = t; 9512 }; 9513 if (r7 >= r8) { 9514 ulong const t = r7; 9515 r7 = r8; 9516 r8 = t; 9517 }; 9518 if (r9 >= r10) { 9519 ulong const t = r9; 9520 r9 = r10; 9521 r10 = t; 9522 }; 9523 if (r11 >= r12) { 9524 ulong const t = r11; 9525 r11 = r12; 9526 r12 = t; 9527 }; 9528 if (r13 >= r14) { 9529 ulong const t = r13; 9530 r13 = r14; 9531 r14 = t; 9532 }; 9533 if (r15 >= r16) { 9534 ulong const t = r15; 9535 r15 = r16; 9536 r16 = t; 9537 }; 9538 } 9539 shared.m[get_local_id(0) + (16 * (1 << 3) * 0)] = r1; 9540 shared.m[get_local_id(0) + (16 * (1 << 3) * 1)] = r16; 9541 shared.m[get_local_id(0) + (16 * (1 << 3) * 2)] = r2; 9542 shared.m[get_local_id(0) + (16 * (1 << 3) * 3)] = r15; 9543 shared.m[get_local_id(0) + (16 * (1 << 3) * 4)] = r3; 9544 shared.m[get_local_id(0) + (16 * (1 << 3) * 5)] = r14; 9545 shared.m[get_local_id(0) + (16 * (1 << 3) * 6)] = r4; 9546 shared.m[get_local_id(0) + (16 * (1 << 3) * 7)] = r13; 9547 shared.m[get_local_id(0) + (16 * (1 << 3) * 8)] = r5; 9548 shared.m[get_local_id(0) + (16 * (1 << 3) * 9)] = r12; 9549 shared.m[get_local_id(0) + (16 * (1 << 3) * 10)] = r6; 9550 shared.m[get_local_id(0) + (16 * (1 << 3) * 11)] = r11; 9551 shared.m[get_local_id(0) + (16 * (1 << 3) * 12)] = r7; 9552 shared.m[get_local_id(0) + (16 * (1 << 3) * 13)] = r10; 9553 shared.m[get_local_id(0) + (16 * (1 << 3) * 14)] = r8; 9554 shared.m[get_local_id(0) + (16 * (1 << 3) * 15)] = r9; 9555 barrier(CLK_LOCAL_MEM_FENCE); 9556 { 9557 { 9558 ulong r0_1 = 
shared.m[smem_l_idx + (0)]; 9559 ulong r0_2 = shared.m[smem_l_idx + (8)]; 9560 ulong r0_3 = shared.m[smem_r_idx + (16)]; 9561 ulong r0_4 = shared.m[smem_r_idx + (24)]; 9562 if (r0_2 >= r0_3) { 9563 ulong const t = r0_2; 9564 r0_2 = r0_3; 9565 r0_3 = t; 9566 }; 9567 if (r0_1 >= r0_4) { 9568 ulong const t = r0_1; 9569 r0_1 = r0_4; 9570 r0_4 = t; 9571 }; 9572 if (r0_3 >= r0_4) { 9573 ulong const t = r0_3; 9574 r0_3 = r0_4; 9575 r0_4 = t; 9576 }; 9577 if (r0_1 >= r0_2) { 9578 ulong const t = r0_1; 9579 r0_1 = r0_2; 9580 r0_2 = t; 9581 }; 9582 shared.m[smem_l_idx + (0)] = r0_1; 9583 shared.m[smem_l_idx + (8)] = r0_2; 9584 shared.m[smem_r_idx + (16)] = r0_3; 9585 shared.m[smem_r_idx + (24)] = r0_4; 9586 } 9587 { 9588 ulong r1_1 = shared.m[smem_l_idx + (32)]; 9589 ulong r1_2 = shared.m[smem_l_idx + (40)]; 9590 ulong r1_3 = shared.m[smem_r_idx + (48)]; 9591 ulong r1_4 = shared.m[smem_r_idx + (56)]; 9592 if (r1_2 >= r1_3) { 9593 ulong const t = r1_2; 9594 r1_2 = r1_3; 9595 r1_3 = t; 9596 }; 9597 if (r1_1 >= r1_4) { 9598 ulong const t = r1_1; 9599 r1_1 = r1_4; 9600 r1_4 = t; 9601 }; 9602 if (r1_3 >= r1_4) { 9603 ulong const t = r1_3; 9604 r1_3 = r1_4; 9605 r1_4 = t; 9606 }; 9607 if (r1_1 >= r1_2) { 9608 ulong const t = r1_1; 9609 r1_1 = r1_2; 9610 r1_2 = t; 9611 }; 9612 shared.m[smem_l_idx + (32)] = r1_1; 9613 shared.m[smem_l_idx + (40)] = r1_2; 9614 shared.m[smem_r_idx + (48)] = r1_3; 9615 shared.m[smem_r_idx + (56)] = r1_4; 9616 } 9617 { 9618 ulong r2_1 = shared.m[smem_l_idx + (64)]; 9619 ulong r2_2 = shared.m[smem_l_idx + (72)]; 9620 ulong r2_3 = shared.m[smem_r_idx + (80)]; 9621 ulong r2_4 = shared.m[smem_r_idx + (88)]; 9622 if (r2_2 >= r2_3) { 9623 ulong const t = r2_2; 9624 r2_2 = r2_3; 9625 r2_3 = t; 9626 }; 9627 if (r2_1 >= r2_4) { 9628 ulong const t = r2_1; 9629 r2_1 = r2_4; 9630 r2_4 = t; 9631 }; 9632 if (r2_3 >= r2_4) { 9633 ulong const t = r2_3; 9634 r2_3 = r2_4; 9635 r2_4 = t; 9636 }; 9637 if (r2_1 >= r2_2) { 9638 ulong const t = r2_1; 9639 r2_1 = r2_2; 9640 
r2_2 = t; 9641 }; 9642 shared.m[smem_l_idx + (64)] = r2_1; 9643 shared.m[smem_l_idx + (72)] = r2_2; 9644 shared.m[smem_r_idx + (80)] = r2_3; 9645 shared.m[smem_r_idx + (88)] = r2_4; 9646 } 9647 { 9648 ulong r3_1 = shared.m[smem_l_idx + (96)]; 9649 ulong r3_2 = shared.m[smem_l_idx + (104)]; 9650 ulong r3_3 = shared.m[smem_r_idx + (112)]; 9651 ulong r3_4 = shared.m[smem_r_idx + (120)]; 9652 if (r3_2 >= r3_3) { 9653 ulong const t = r3_2; 9654 r3_2 = r3_3; 9655 r3_3 = t; 9656 }; 9657 if (r3_1 >= r3_4) { 9658 ulong const t = r3_1; 9659 r3_1 = r3_4; 9660 r3_4 = t; 9661 }; 9662 if (r3_3 >= r3_4) { 9663 ulong const t = r3_3; 9664 r3_3 = r3_4; 9665 r3_4 = t; 9666 }; 9667 if (r3_1 >= r3_2) { 9668 ulong const t = r3_1; 9669 r3_1 = r3_2; 9670 r3_2 = t; 9671 }; 9672 shared.m[smem_l_idx + (96)] = r3_1; 9673 shared.m[smem_l_idx + (104)] = r3_2; 9674 shared.m[smem_r_idx + (112)] = r3_3; 9675 shared.m[smem_r_idx + (120)] = r3_4; 9676 } 9677 } 9678 barrier(CLK_LOCAL_MEM_FENCE); 9679 r1 = shared.m[get_local_id(0) + (16 * (1 << 3) * 0)]; 9680 r16 = shared.m[get_local_id(0) + (16 * (1 << 3) * 1)]; 9681 r2 = shared.m[get_local_id(0) + (16 * (1 << 3) * 2)]; 9682 r15 = shared.m[get_local_id(0) + (16 * (1 << 3) * 3)]; 9683 r3 = shared.m[get_local_id(0) + (16 * (1 << 3) * 4)]; 9684 r14 = shared.m[get_local_id(0) + (16 * (1 << 3) * 5)]; 9685 r4 = shared.m[get_local_id(0) + (16 * (1 << 3) * 6)]; 9686 r13 = shared.m[get_local_id(0) + (16 * (1 << 3) * 7)]; 9687 r5 = shared.m[get_local_id(0) + (16 * (1 << 3) * 8)]; 9688 r12 = shared.m[get_local_id(0) + (16 * (1 << 3) * 9)]; 9689 r6 = shared.m[get_local_id(0) + (16 * (1 << 3) * 10)]; 9690 r11 = shared.m[get_local_id(0) + (16 * (1 << 3) * 11)]; 9691 r7 = shared.m[get_local_id(0) + (16 * (1 << 3) * 12)]; 9692 r10 = shared.m[get_local_id(0) + (16 * (1 << 3) * 13)]; 9693 r8 = shared.m[get_local_id(0) + (16 * (1 << 3) * 14)]; 9694 r9 = shared.m[get_local_id(0) + (16 * (1 << 3) * 15)]; 9695 { 9696 { 9697 uint const half_lane_idx = 
get_sub_group_local_id() ^ 4; 9698 int const t_lt = get_sub_group_local_id() < half_lane_idx; 9699 ; 9700 { 9701 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 9702 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 9703 }; 9704 { 9705 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 9706 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 9707 }; 9708 { 9709 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 9710 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 9711 }; 9712 { 9713 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 9714 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 9715 }; 9716 { 9717 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 9718 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 9719 }; 9720 { 9721 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 9722 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 9723 }; 9724 { 9725 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 9726 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 9727 }; 9728 { 9729 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 9730 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 9731 }; 9732 { 9733 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 9734 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 9735 }; 9736 { 9737 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 9738 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 9739 }; 9740 { 9741 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 9742 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 9743 }; 9744 { 9745 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 9746 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 9747 }; 9748 { 9749 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 9750 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 9751 }; 9752 { 9753 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 9754 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 9755 }; 9756 { 9757 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 9758 r15 = ((r15 <= ta) ^ t_lt) ? 
ta : r15; 9759 }; 9760 { 9761 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 9762 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 9763 }; 9764 } 9765 { 9766 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 9767 int const t_lt = get_sub_group_local_id() < half_lane_idx; 9768 ; 9769 { 9770 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 9771 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 9772 }; 9773 { 9774 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 9775 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 9776 }; 9777 { 9778 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 9779 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 9780 }; 9781 { 9782 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 9783 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 9784 }; 9785 { 9786 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 9787 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 9788 }; 9789 { 9790 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 9791 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 9792 }; 9793 { 9794 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 9795 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 9796 }; 9797 { 9798 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 9799 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 9800 }; 9801 { 9802 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 9803 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 9804 }; 9805 { 9806 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 9807 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 9808 }; 9809 { 9810 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 9811 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 9812 }; 9813 { 9814 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 9815 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 9816 }; 9817 { 9818 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 9819 r13 = ((r13 <= ta) ^ t_lt) ? 
ta : r13; 9820 }; 9821 { 9822 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 9823 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 9824 }; 9825 { 9826 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 9827 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 9828 }; 9829 { 9830 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 9831 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 9832 }; 9833 } 9834 { 9835 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 9836 int const t_lt = get_sub_group_local_id() < half_lane_idx; 9837 ; 9838 { 9839 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 9840 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 9841 }; 9842 { 9843 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 9844 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 9845 }; 9846 { 9847 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 9848 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 9849 }; 9850 { 9851 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 9852 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 9853 }; 9854 { 9855 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 9856 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 9857 }; 9858 { 9859 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 9860 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 9861 }; 9862 { 9863 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 9864 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 9865 }; 9866 { 9867 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 9868 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 9869 }; 9870 { 9871 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 9872 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 9873 }; 9874 { 9875 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 9876 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 9877 }; 9878 { 9879 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 9880 r11 = ((r11 <= ta) ^ t_lt) ? 
ta : r11; 9881 }; 9882 { 9883 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 9884 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 9885 }; 9886 { 9887 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 9888 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 9889 }; 9890 { 9891 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 9892 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 9893 }; 9894 { 9895 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 9896 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 9897 }; 9898 { 9899 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 9900 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 9901 }; 9902 } 9903 if (r1 >= r9) { 9904 ulong const t = r1; 9905 r1 = r9; 9906 r9 = t; 9907 }; 9908 if (r5 >= r13) { 9909 ulong const t = r5; 9910 r5 = r13; 9911 r13 = t; 9912 }; 9913 if (r1 >= r5) { 9914 ulong const t = r1; 9915 r1 = r5; 9916 r5 = t; 9917 }; 9918 if (r9 >= r13) { 9919 ulong const t = r9; 9920 r9 = r13; 9921 r13 = t; 9922 }; 9923 if (r3 >= r11) { 9924 ulong const t = r3; 9925 r3 = r11; 9926 r11 = t; 9927 }; 9928 if (r7 >= r15) { 9929 ulong const t = r7; 9930 r7 = r15; 9931 r15 = t; 9932 }; 9933 if (r3 >= r7) { 9934 ulong const t = r3; 9935 r3 = r7; 9936 r7 = t; 9937 }; 9938 if (r11 >= r15) { 9939 ulong const t = r11; 9940 r11 = r15; 9941 r15 = t; 9942 }; 9943 if (r1 >= r3) { 9944 ulong const t = r1; 9945 r1 = r3; 9946 r3 = t; 9947 }; 9948 if (r5 >= r7) { 9949 ulong const t = r5; 9950 r5 = r7; 9951 r7 = t; 9952 }; 9953 if (r9 >= r11) { 9954 ulong const t = r9; 9955 r9 = r11; 9956 r11 = t; 9957 }; 9958 if (r13 >= r15) { 9959 ulong const t = r13; 9960 r13 = r15; 9961 r15 = t; 9962 }; 9963 if (r2 >= r10) { 9964 ulong const t = r2; 9965 r2 = r10; 9966 r10 = t; 9967 }; 9968 if (r6 >= r14) { 9969 ulong const t = r6; 9970 r6 = r14; 9971 r14 = t; 9972 }; 9973 if (r2 >= r6) { 9974 ulong const t = r2; 9975 r2 = r6; 9976 r6 = t; 9977 }; 9978 if (r10 >= r14) { 9979 ulong const t = r10; 9980 r10 = r14; 9981 r14 = t; 9982 }; 9983 if 
(r4 >= r12) { 9984 ulong const t = r4; 9985 r4 = r12; 9986 r12 = t; 9987 }; 9988 if (r8 >= r16) { 9989 ulong const t = r8; 9990 r8 = r16; 9991 r16 = t; 9992 }; 9993 if (r4 >= r8) { 9994 ulong const t = r4; 9995 r4 = r8; 9996 r8 = t; 9997 }; 9998 if (r12 >= r16) { 9999 ulong const t = r12; 10000 r12 = r16; 10001 r16 = t; 10002 }; 10003 if (r2 >= r4) { 10004 ulong const t = r2; 10005 r2 = r4; 10006 r4 = t; 10007 }; 10008 if (r6 >= r8) { 10009 ulong const t = r6; 10010 r6 = r8; 10011 r8 = t; 10012 }; 10013 if (r10 >= r12) { 10014 ulong const t = r10; 10015 r10 = r12; 10016 r12 = t; 10017 }; 10018 if (r14 >= r16) { 10019 ulong const t = r14; 10020 r14 = r16; 10021 r16 = t; 10022 }; 10023 if (r1 >= r2) { 10024 ulong const t = r1; 10025 r1 = r2; 10026 r2 = t; 10027 }; 10028 if (r3 >= r4) { 10029 ulong const t = r3; 10030 r3 = r4; 10031 r4 = t; 10032 }; 10033 if (r5 >= r6) { 10034 ulong const t = r5; 10035 r5 = r6; 10036 r6 = t; 10037 }; 10038 if (r7 >= r8) { 10039 ulong const t = r7; 10040 r7 = r8; 10041 r8 = t; 10042 }; 10043 if (r9 >= r10) { 10044 ulong const t = r9; 10045 r9 = r10; 10046 r10 = t; 10047 }; 10048 if (r11 >= r12) { 10049 ulong const t = r11; 10050 r11 = r12; 10051 r12 = t; 10052 }; 10053 if (r13 >= r14) { 10054 ulong const t = r13; 10055 r13 = r14; 10056 r14 = t; 10057 }; 10058 if (r15 >= r16) { 10059 ulong const t = r15; 10060 r15 = r16; 10061 r16 = t; 10062 }; 10063 } 10064 shared.m[get_local_id(0) + (16 * (1 << 3) * 0)] = r1; 10065 shared.m[get_local_id(0) + (16 * (1 << 3) * 1)] = r16; 10066 shared.m[get_local_id(0) + (16 * (1 << 3) * 2)] = r2; 10067 shared.m[get_local_id(0) + (16 * (1 << 3) * 3)] = r15; 10068 shared.m[get_local_id(0) + (16 * (1 << 3) * 4)] = r3; 10069 shared.m[get_local_id(0) + (16 * (1 << 3) * 5)] = r14; 10070 shared.m[get_local_id(0) + (16 * (1 << 3) * 6)] = r4; 10071 shared.m[get_local_id(0) + (16 * (1 << 3) * 7)] = r13; 10072 shared.m[get_local_id(0) + (16 * (1 << 3) * 8)] = r5; 10073 shared.m[get_local_id(0) + (16 * (1 << 3) * 
9)] = r12; 10074 shared.m[get_local_id(0) + (16 * (1 << 3) * 10)] = r6; 10075 shared.m[get_local_id(0) + (16 * (1 << 3) * 11)] = r11; 10076 shared.m[get_local_id(0) + (16 * (1 << 3) * 12)] = r7; 10077 shared.m[get_local_id(0) + (16 * (1 << 3) * 13)] = r10; 10078 shared.m[get_local_id(0) + (16 * (1 << 3) * 14)] = r8; 10079 shared.m[get_local_id(0) + (16 * (1 << 3) * 15)] = r9; 10080 barrier(CLK_LOCAL_MEM_FENCE); 10081 { 10082 { 10083 ulong r0_1 = shared.m[smem_l_idx + (0)]; 10084 ulong r0_2 = shared.m[smem_l_idx + (8)]; 10085 ulong r0_3 = shared.m[smem_l_idx + (16)]; 10086 ulong r0_4 = shared.m[smem_l_idx + (24)]; 10087 ulong r0_5 = shared.m[smem_r_idx + (32)]; 10088 ulong r0_6 = shared.m[smem_r_idx + (40)]; 10089 ulong r0_7 = shared.m[smem_r_idx + (48)]; 10090 ulong r0_8 = shared.m[smem_r_idx + (56)]; 10091 if (r0_4 >= r0_5) { 10092 ulong const t = r0_4; 10093 r0_4 = r0_5; 10094 r0_5 = t; 10095 }; 10096 if (r0_3 >= r0_6) { 10097 ulong const t = r0_3; 10098 r0_3 = r0_6; 10099 r0_6 = t; 10100 }; 10101 if (r0_2 >= r0_7) { 10102 ulong const t = r0_2; 10103 r0_2 = r0_7; 10104 r0_7 = t; 10105 }; 10106 if (r0_1 >= r0_8) { 10107 ulong const t = r0_1; 10108 r0_1 = r0_8; 10109 r0_8 = t; 10110 }; 10111 if (r0_5 >= r0_7) { 10112 ulong const t = r0_5; 10113 r0_5 = r0_7; 10114 r0_7 = t; 10115 }; 10116 if (r0_6 >= r0_8) { 10117 ulong const t = r0_6; 10118 r0_6 = r0_8; 10119 r0_8 = t; 10120 }; 10121 if (r0_5 >= r0_6) { 10122 ulong const t = r0_5; 10123 r0_5 = r0_6; 10124 r0_6 = t; 10125 }; 10126 if (r0_7 >= r0_8) { 10127 ulong const t = r0_7; 10128 r0_7 = r0_8; 10129 r0_8 = t; 10130 }; 10131 if (r0_1 >= r0_3) { 10132 ulong const t = r0_1; 10133 r0_1 = r0_3; 10134 r0_3 = t; 10135 }; 10136 if (r0_2 >= r0_4) { 10137 ulong const t = r0_2; 10138 r0_2 = r0_4; 10139 r0_4 = t; 10140 }; 10141 if (r0_1 >= r0_2) { 10142 ulong const t = r0_1; 10143 r0_1 = r0_2; 10144 r0_2 = t; 10145 }; 10146 if (r0_3 >= r0_4) { 10147 ulong const t = r0_3; 10148 r0_3 = r0_4; 10149 r0_4 = t; 10150 }; 10151 
shared.m[smem_l_idx + (0)] = r0_1; 10152 shared.m[smem_l_idx + (8)] = r0_2; 10153 shared.m[smem_l_idx + (16)] = r0_3; 10154 shared.m[smem_l_idx + (24)] = r0_4; 10155 shared.m[smem_r_idx + (32)] = r0_5; 10156 shared.m[smem_r_idx + (40)] = r0_6; 10157 shared.m[smem_r_idx + (48)] = r0_7; 10158 shared.m[smem_r_idx + (56)] = r0_8; 10159 } 10160 { 10161 ulong r1_1 = shared.m[smem_l_idx + (64)]; 10162 ulong r1_2 = shared.m[smem_l_idx + (72)]; 10163 ulong r1_3 = shared.m[smem_l_idx + (80)]; 10164 ulong r1_4 = shared.m[smem_l_idx + (88)]; 10165 ulong r1_5 = shared.m[smem_r_idx + (96)]; 10166 ulong r1_6 = shared.m[smem_r_idx + (104)]; 10167 ulong r1_7 = shared.m[smem_r_idx + (112)]; 10168 ulong r1_8 = shared.m[smem_r_idx + (120)]; 10169 if (r1_4 >= r1_5) { 10170 ulong const t = r1_4; 10171 r1_4 = r1_5; 10172 r1_5 = t; 10173 }; 10174 if (r1_3 >= r1_6) { 10175 ulong const t = r1_3; 10176 r1_3 = r1_6; 10177 r1_6 = t; 10178 }; 10179 if (r1_2 >= r1_7) { 10180 ulong const t = r1_2; 10181 r1_2 = r1_7; 10182 r1_7 = t; 10183 }; 10184 if (r1_1 >= r1_8) { 10185 ulong const t = r1_1; 10186 r1_1 = r1_8; 10187 r1_8 = t; 10188 }; 10189 if (r1_5 >= r1_7) { 10190 ulong const t = r1_5; 10191 r1_5 = r1_7; 10192 r1_7 = t; 10193 }; 10194 if (r1_6 >= r1_8) { 10195 ulong const t = r1_6; 10196 r1_6 = r1_8; 10197 r1_8 = t; 10198 }; 10199 if (r1_5 >= r1_6) { 10200 ulong const t = r1_5; 10201 r1_5 = r1_6; 10202 r1_6 = t; 10203 }; 10204 if (r1_7 >= r1_8) { 10205 ulong const t = r1_7; 10206 r1_7 = r1_8; 10207 r1_8 = t; 10208 }; 10209 if (r1_1 >= r1_3) { 10210 ulong const t = r1_1; 10211 r1_1 = r1_3; 10212 r1_3 = t; 10213 }; 10214 if (r1_2 >= r1_4) { 10215 ulong const t = r1_2; 10216 r1_2 = r1_4; 10217 r1_4 = t; 10218 }; 10219 if (r1_1 >= r1_2) { 10220 ulong const t = r1_1; 10221 r1_1 = r1_2; 10222 r1_2 = t; 10223 }; 10224 if (r1_3 >= r1_4) { 10225 ulong const t = r1_3; 10226 r1_3 = r1_4; 10227 r1_4 = t; 10228 }; 10229 shared.m[smem_l_idx + (64)] = r1_1; 10230 shared.m[smem_l_idx + (72)] = r1_2; 10231 
shared.m[smem_l_idx + (80)] = r1_3; 10232 shared.m[smem_l_idx + (88)] = r1_4; 10233 shared.m[smem_r_idx + (96)] = r1_5; 10234 shared.m[smem_r_idx + (104)] = r1_6; 10235 shared.m[smem_r_idx + (112)] = r1_7; 10236 shared.m[smem_r_idx + (120)] = r1_8; 10237 } 10238 } 10239 barrier(CLK_LOCAL_MEM_FENCE); 10240 r1 = shared.m[get_local_id(0) + (16 * (1 << 3) * 0)]; 10241 r16 = shared.m[get_local_id(0) + (16 * (1 << 3) * 1)]; 10242 r2 = shared.m[get_local_id(0) + (16 * (1 << 3) * 2)]; 10243 r15 = shared.m[get_local_id(0) + (16 * (1 << 3) * 3)]; 10244 r3 = shared.m[get_local_id(0) + (16 * (1 << 3) * 4)]; 10245 r14 = shared.m[get_local_id(0) + (16 * (1 << 3) * 5)]; 10246 r4 = shared.m[get_local_id(0) + (16 * (1 << 3) * 6)]; 10247 r13 = shared.m[get_local_id(0) + (16 * (1 << 3) * 7)]; 10248 r5 = shared.m[get_local_id(0) + (16 * (1 << 3) * 8)]; 10249 r12 = shared.m[get_local_id(0) + (16 * (1 << 3) * 9)]; 10250 r6 = shared.m[get_local_id(0) + (16 * (1 << 3) * 10)]; 10251 r11 = shared.m[get_local_id(0) + (16 * (1 << 3) * 11)]; 10252 r7 = shared.m[get_local_id(0) + (16 * (1 << 3) * 12)]; 10253 r10 = shared.m[get_local_id(0) + (16 * (1 << 3) * 13)]; 10254 r8 = shared.m[get_local_id(0) + (16 * (1 << 3) * 14)]; 10255 r9 = shared.m[get_local_id(0) + (16 * (1 << 3) * 15)]; 10256 { 10257 { 10258 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 10259 int const t_lt = get_sub_group_local_id() < half_lane_idx; 10260 ; 10261 { 10262 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 10263 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 10264 }; 10265 { 10266 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 10267 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 10268 }; 10269 { 10270 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 10271 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 10272 }; 10273 { 10274 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 10275 r4 = ((r4 <= ta) ^ t_lt) ? 
ta : r4; 10276 }; 10277 { 10278 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 10279 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 10280 }; 10281 { 10282 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 10283 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 10284 }; 10285 { 10286 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 10287 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 10288 }; 10289 { 10290 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 10291 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 10292 }; 10293 { 10294 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 10295 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 10296 }; 10297 { 10298 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 10299 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 10300 }; 10301 { 10302 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 10303 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 10304 }; 10305 { 10306 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 10307 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 10308 }; 10309 { 10310 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 10311 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 10312 }; 10313 { 10314 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 10315 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 10316 }; 10317 { 10318 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 10319 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 10320 }; 10321 { 10322 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 10323 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 10324 }; 10325 } 10326 { 10327 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 10328 int const t_lt = get_sub_group_local_id() < half_lane_idx; 10329 ; 10330 { 10331 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 10332 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 10333 }; 10334 { 10335 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 10336 r2 = ((r2 <= ta) ^ t_lt) ? 
ta : r2; 10337 }; 10338 { 10339 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 10340 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 10341 }; 10342 { 10343 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 10344 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 10345 }; 10346 { 10347 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 10348 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 10349 }; 10350 { 10351 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 10352 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 10353 }; 10354 { 10355 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 10356 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 10357 }; 10358 { 10359 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 10360 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 10361 }; 10362 { 10363 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 10364 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 10365 }; 10366 { 10367 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 10368 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 10369 }; 10370 { 10371 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 10372 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 10373 }; 10374 { 10375 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 10376 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 10377 }; 10378 { 10379 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 10380 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 10381 }; 10382 { 10383 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 10384 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 10385 }; 10386 { 10387 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 10388 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 10389 }; 10390 { 10391 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 10392 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 10393 }; 10394 } 10395 { 10396 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 10397 int const t_lt = get_sub_group_local_id() < half_lane_idx; 10398 ; 10399 { 10400 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 10401 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 10402 }; 10403 { 10404 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 10405 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 10406 }; 10407 { 10408 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 10409 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 10410 }; 10411 { 10412 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 10413 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 10414 }; 10415 { 10416 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 10417 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 10418 }; 10419 { 10420 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 10421 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 10422 }; 10423 { 10424 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 10425 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 10426 }; 10427 { 10428 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 10429 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 10430 }; 10431 { 10432 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 10433 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 10434 }; 10435 { 10436 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 10437 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 10438 }; 10439 { 10440 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 10441 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 10442 }; 10443 { 10444 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 10445 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 10446 }; 10447 { 10448 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 10449 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 10450 }; 10451 { 10452 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 10453 r14 = ((r14 <= ta) ^ t_lt) ? 
ta : r14; 10454 }; 10455 { 10456 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 10457 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 10458 }; 10459 { 10460 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 10461 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 10462 }; 10463 } 10464 if (r1 >= r9) { 10465 ulong const t = r1; 10466 r1 = r9; 10467 r9 = t; 10468 }; 10469 if (r5 >= r13) { 10470 ulong const t = r5; 10471 r5 = r13; 10472 r13 = t; 10473 }; 10474 if (r1 >= r5) { 10475 ulong const t = r1; 10476 r1 = r5; 10477 r5 = t; 10478 }; 10479 if (r9 >= r13) { 10480 ulong const t = r9; 10481 r9 = r13; 10482 r13 = t; 10483 }; 10484 if (r3 >= r11) { 10485 ulong const t = r3; 10486 r3 = r11; 10487 r11 = t; 10488 }; 10489 if (r7 >= r15) { 10490 ulong const t = r7; 10491 r7 = r15; 10492 r15 = t; 10493 }; 10494 if (r3 >= r7) { 10495 ulong const t = r3; 10496 r3 = r7; 10497 r7 = t; 10498 }; 10499 if (r11 >= r15) { 10500 ulong const t = r11; 10501 r11 = r15; 10502 r15 = t; 10503 }; 10504 if (r1 >= r3) { 10505 ulong const t = r1; 10506 r1 = r3; 10507 r3 = t; 10508 }; 10509 if (r5 >= r7) { 10510 ulong const t = r5; 10511 r5 = r7; 10512 r7 = t; 10513 }; 10514 if (r9 >= r11) { 10515 ulong const t = r9; 10516 r9 = r11; 10517 r11 = t; 10518 }; 10519 if (r13 >= r15) { 10520 ulong const t = r13; 10521 r13 = r15; 10522 r15 = t; 10523 }; 10524 if (r2 >= r10) { 10525 ulong const t = r2; 10526 r2 = r10; 10527 r10 = t; 10528 }; 10529 if (r6 >= r14) { 10530 ulong const t = r6; 10531 r6 = r14; 10532 r14 = t; 10533 }; 10534 if (r2 >= r6) { 10535 ulong const t = r2; 10536 r2 = r6; 10537 r6 = t; 10538 }; 10539 if (r10 >= r14) { 10540 ulong const t = r10; 10541 r10 = r14; 10542 r14 = t; 10543 }; 10544 if (r4 >= r12) { 10545 ulong const t = r4; 10546 r4 = r12; 10547 r12 = t; 10548 }; 10549 if (r8 >= r16) { 10550 ulong const t = r8; 10551 r8 = r16; 10552 r16 = t; 10553 }; 10554 if (r4 >= r8) { 10555 ulong const t = r4; 10556 r4 = r8; 10557 r8 = t; 10558 }; 10559 if (r12 >= r16) { 10560 
ulong const t = r12; 10561 r12 = r16; 10562 r16 = t; 10563 }; 10564 if (r2 >= r4) { 10565 ulong const t = r2; 10566 r2 = r4; 10567 r4 = t; 10568 }; 10569 if (r6 >= r8) { 10570 ulong const t = r6; 10571 r6 = r8; 10572 r8 = t; 10573 }; 10574 if (r10 >= r12) { 10575 ulong const t = r10; 10576 r10 = r12; 10577 r12 = t; 10578 }; 10579 if (r14 >= r16) { 10580 ulong const t = r14; 10581 r14 = r16; 10582 r16 = t; 10583 }; 10584 if (r1 >= r2) { 10585 ulong const t = r1; 10586 r1 = r2; 10587 r2 = t; 10588 }; 10589 if (r3 >= r4) { 10590 ulong const t = r3; 10591 r3 = r4; 10592 r4 = t; 10593 }; 10594 if (r5 >= r6) { 10595 ulong const t = r5; 10596 r5 = r6; 10597 r6 = t; 10598 }; 10599 if (r7 >= r8) { 10600 ulong const t = r7; 10601 r7 = r8; 10602 r8 = t; 10603 }; 10604 if (r9 >= r10) { 10605 ulong const t = r9; 10606 r9 = r10; 10607 r10 = t; 10608 }; 10609 if (r11 >= r12) { 10610 ulong const t = r11; 10611 r11 = r12; 10612 r12 = t; 10613 }; 10614 if (r13 >= r14) { 10615 ulong const t = r13; 10616 r13 = r14; 10617 r14 = t; 10618 }; 10619 if (r15 >= r16) { 10620 ulong const t = r15; 10621 r15 = r16; 10622 r16 = t; 10623 }; 10624 } 10625 shared.m[get_local_id(0) + (16 * (1 << 3) * 0)] = r1; 10626 shared.m[get_local_id(0) + (16 * (1 << 3) * 1)] = r16; 10627 shared.m[get_local_id(0) + (16 * (1 << 3) * 2)] = r2; 10628 shared.m[get_local_id(0) + (16 * (1 << 3) * 3)] = r15; 10629 shared.m[get_local_id(0) + (16 * (1 << 3) * 4)] = r3; 10630 shared.m[get_local_id(0) + (16 * (1 << 3) * 5)] = r14; 10631 shared.m[get_local_id(0) + (16 * (1 << 3) * 6)] = r4; 10632 shared.m[get_local_id(0) + (16 * (1 << 3) * 7)] = r13; 10633 shared.m[get_local_id(0) + (16 * (1 << 3) * 8)] = r5; 10634 shared.m[get_local_id(0) + (16 * (1 << 3) * 9)] = r12; 10635 shared.m[get_local_id(0) + (16 * (1 << 3) * 10)] = r6; 10636 shared.m[get_local_id(0) + (16 * (1 << 3) * 11)] = r11; 10637 shared.m[get_local_id(0) + (16 * (1 << 3) * 12)] = r7; 10638 shared.m[get_local_id(0) + (16 * (1 << 3) * 13)] = r10; 10639 
shared.m[get_local_id(0) + (16 * (1 << 3) * 14)] = r8; 10640 shared.m[get_local_id(0) + (16 * (1 << 3) * 15)] = r9; 10641 barrier(CLK_LOCAL_MEM_FENCE); 10642 { 10643 { 10644 ulong r0_1 = shared.m[smem_l_idx + (0)]; 10645 ulong r0_2 = shared.m[smem_l_idx + (8)]; 10646 ulong r0_3 = shared.m[smem_l_idx + (16)]; 10647 ulong r0_4 = shared.m[smem_l_idx + (24)]; 10648 ulong r0_5 = shared.m[smem_l_idx + (32)]; 10649 ulong r0_6 = shared.m[smem_l_idx + (40)]; 10650 ulong r0_7 = shared.m[smem_l_idx + (48)]; 10651 ulong r0_8 = shared.m[smem_l_idx + (56)]; 10652 ulong r0_9 = shared.m[smem_r_idx + (64)]; 10653 ulong r0_10 = shared.m[smem_r_idx + (72)]; 10654 ulong r0_11 = shared.m[smem_r_idx + (80)]; 10655 ulong r0_12 = shared.m[smem_r_idx + (88)]; 10656 ulong r0_13 = shared.m[smem_r_idx + (96)]; 10657 ulong r0_14 = shared.m[smem_r_idx + (104)]; 10658 ulong r0_15 = shared.m[smem_r_idx + (112)]; 10659 ulong r0_16 = shared.m[smem_r_idx + (120)]; 10660 if (r0_8 >= r0_9) { 10661 ulong const t = r0_8; 10662 r0_8 = r0_9; 10663 r0_9 = t; 10664 }; 10665 if (r0_7 >= r0_10) { 10666 ulong const t = r0_7; 10667 r0_7 = r0_10; 10668 r0_10 = t; 10669 }; 10670 if (r0_6 >= r0_11) { 10671 ulong const t = r0_6; 10672 r0_6 = r0_11; 10673 r0_11 = t; 10674 }; 10675 if (r0_5 >= r0_12) { 10676 ulong const t = r0_5; 10677 r0_5 = r0_12; 10678 r0_12 = t; 10679 }; 10680 if (r0_4 >= r0_13) { 10681 ulong const t = r0_4; 10682 r0_4 = r0_13; 10683 r0_13 = t; 10684 }; 10685 if (r0_3 >= r0_14) { 10686 ulong const t = r0_3; 10687 r0_3 = r0_14; 10688 r0_14 = t; 10689 }; 10690 if (r0_2 >= r0_15) { 10691 ulong const t = r0_2; 10692 r0_2 = r0_15; 10693 r0_15 = t; 10694 }; 10695 if (r0_1 >= r0_16) { 10696 ulong const t = r0_1; 10697 r0_1 = r0_16; 10698 r0_16 = t; 10699 }; 10700 if (r0_9 >= r0_13) { 10701 ulong const t = r0_9; 10702 r0_9 = r0_13; 10703 r0_13 = t; 10704 }; 10705 if (r0_11 >= r0_15) { 10706 ulong const t = r0_11; 10707 r0_11 = r0_15; 10708 r0_15 = t; 10709 }; 10710 if (r0_9 >= r0_11) { 10711 ulong const 
t = r0_9; 10712 r0_9 = r0_11; 10713 r0_11 = t; 10714 }; 10715 if (r0_13 >= r0_15) { 10716 ulong const t = r0_13; 10717 r0_13 = r0_15; 10718 r0_15 = t; 10719 }; 10720 if (r0_10 >= r0_14) { 10721 ulong const t = r0_10; 10722 r0_10 = r0_14; 10723 r0_14 = t; 10724 }; 10725 if (r0_12 >= r0_16) { 10726 ulong const t = r0_12; 10727 r0_12 = r0_16; 10728 r0_16 = t; 10729 }; 10730 if (r0_10 >= r0_12) { 10731 ulong const t = r0_10; 10732 r0_10 = r0_12; 10733 r0_12 = t; 10734 }; 10735 if (r0_14 >= r0_16) { 10736 ulong const t = r0_14; 10737 r0_14 = r0_16; 10738 r0_16 = t; 10739 }; 10740 if (r0_9 >= r0_10) { 10741 ulong const t = r0_9; 10742 r0_9 = r0_10; 10743 r0_10 = t; 10744 }; 10745 if (r0_11 >= r0_12) { 10746 ulong const t = r0_11; 10747 r0_11 = r0_12; 10748 r0_12 = t; 10749 }; 10750 if (r0_13 >= r0_14) { 10751 ulong const t = r0_13; 10752 r0_13 = r0_14; 10753 r0_14 = t; 10754 }; 10755 if (r0_15 >= r0_16) { 10756 ulong const t = r0_15; 10757 r0_15 = r0_16; 10758 r0_16 = t; 10759 }; 10760 if (r0_1 >= r0_5) { 10761 ulong const t = r0_1; 10762 r0_1 = r0_5; 10763 r0_5 = t; 10764 }; 10765 if (r0_3 >= r0_7) { 10766 ulong const t = r0_3; 10767 r0_3 = r0_7; 10768 r0_7 = t; 10769 }; 10770 if (r0_1 >= r0_3) { 10771 ulong const t = r0_1; 10772 r0_1 = r0_3; 10773 r0_3 = t; 10774 }; 10775 if (r0_5 >= r0_7) { 10776 ulong const t = r0_5; 10777 r0_5 = r0_7; 10778 r0_7 = t; 10779 }; 10780 if (r0_2 >= r0_6) { 10781 ulong const t = r0_2; 10782 r0_2 = r0_6; 10783 r0_6 = t; 10784 }; 10785 if (r0_4 >= r0_8) { 10786 ulong const t = r0_4; 10787 r0_4 = r0_8; 10788 r0_8 = t; 10789 }; 10790 if (r0_2 >= r0_4) { 10791 ulong const t = r0_2; 10792 r0_2 = r0_4; 10793 r0_4 = t; 10794 }; 10795 if (r0_6 >= r0_8) { 10796 ulong const t = r0_6; 10797 r0_6 = r0_8; 10798 r0_8 = t; 10799 }; 10800 if (r0_1 >= r0_2) { 10801 ulong const t = r0_1; 10802 r0_1 = r0_2; 10803 r0_2 = t; 10804 }; 10805 if (r0_3 >= r0_4) { 10806 ulong const t = r0_3; 10807 r0_3 = r0_4; 10808 r0_4 = t; 10809 }; 10810 if (r0_5 >= r0_6) { 
10811 ulong const t = r0_5; 10812 r0_5 = r0_6; 10813 r0_6 = t; 10814 }; 10815 if (r0_7 >= r0_8) { 10816 ulong const t = r0_7; 10817 r0_7 = r0_8; 10818 r0_8 = t; 10819 }; 10820 shared.m[smem_l_idx + (0)] = r0_1; 10821 shared.m[smem_l_idx + (8)] = r0_2; 10822 shared.m[smem_l_idx + (16)] = r0_3; 10823 shared.m[smem_l_idx + (24)] = r0_4; 10824 shared.m[smem_l_idx + (32)] = r0_5; 10825 shared.m[smem_l_idx + (40)] = r0_6; 10826 shared.m[smem_l_idx + (48)] = r0_7; 10827 shared.m[smem_l_idx + (56)] = r0_8; 10828 shared.m[smem_r_idx + (64)] = r0_9; 10829 shared.m[smem_r_idx + (72)] = r0_10; 10830 shared.m[smem_r_idx + (80)] = r0_11; 10831 shared.m[smem_r_idx + (88)] = r0_12; 10832 shared.m[smem_r_idx + (96)] = r0_13; 10833 shared.m[smem_r_idx + (104)] = r0_14; 10834 shared.m[smem_r_idx + (112)] = r0_15; 10835 shared.m[smem_r_idx + (120)] = r0_16; 10836 } 10837 } 10838 barrier(CLK_LOCAL_MEM_FENCE); 10839 r1 = shared.m[get_local_id(0) + (16 * (1 << 3) * 0)]; 10840 r16 = shared.m[get_local_id(0) + (16 * (1 << 3) * 1)]; 10841 r2 = shared.m[get_local_id(0) + (16 * (1 << 3) * 2)]; 10842 r15 = shared.m[get_local_id(0) + (16 * (1 << 3) * 3)]; 10843 r3 = shared.m[get_local_id(0) + (16 * (1 << 3) * 4)]; 10844 r14 = shared.m[get_local_id(0) + (16 * (1 << 3) * 5)]; 10845 r4 = shared.m[get_local_id(0) + (16 * (1 << 3) * 6)]; 10846 r13 = shared.m[get_local_id(0) + (16 * (1 << 3) * 7)]; 10847 r5 = shared.m[get_local_id(0) + (16 * (1 << 3) * 8)]; 10848 r12 = shared.m[get_local_id(0) + (16 * (1 << 3) * 9)]; 10849 r6 = shared.m[get_local_id(0) + (16 * (1 << 3) * 10)]; 10850 r11 = shared.m[get_local_id(0) + (16 * (1 << 3) * 11)]; 10851 r7 = shared.m[get_local_id(0) + (16 * (1 << 3) * 12)]; 10852 r10 = shared.m[get_local_id(0) + (16 * (1 << 3) * 13)]; 10853 r8 = shared.m[get_local_id(0) + (16 * (1 << 3) * 14)]; 10854 r9 = shared.m[get_local_id(0) + (16 * (1 << 3) * 15)]; 10855 { 10856 { 10857 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 10858 int const t_lt = 
get_sub_group_local_id() < half_lane_idx; 10859 ; 10860 { 10861 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 10862 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 10863 }; 10864 { 10865 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 10866 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 10867 }; 10868 { 10869 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 10870 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 10871 }; 10872 { 10873 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 10874 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 10875 }; 10876 { 10877 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 10878 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 10879 }; 10880 { 10881 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 10882 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 10883 }; 10884 { 10885 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 10886 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 10887 }; 10888 { 10889 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 10890 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 10891 }; 10892 { 10893 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 10894 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 10895 }; 10896 { 10897 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 10898 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 10899 }; 10900 { 10901 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 10902 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 10903 }; 10904 { 10905 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 10906 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 10907 }; 10908 { 10909 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 10910 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 10911 }; 10912 { 10913 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 10914 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 10915 }; 10916 { 10917 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 10918 r15 = ((r15 <= ta) ^ t_lt) ? 
ta : r15; 10919 }; 10920 { 10921 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 10922 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 10923 }; 10924 } 10925 { 10926 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 10927 int const t_lt = get_sub_group_local_id() < half_lane_idx; 10928 ; 10929 { 10930 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 10931 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 10932 }; 10933 { 10934 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 10935 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 10936 }; 10937 { 10938 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 10939 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 10940 }; 10941 { 10942 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 10943 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 10944 }; 10945 { 10946 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 10947 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 10948 }; 10949 { 10950 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 10951 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 10952 }; 10953 { 10954 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 10955 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 10956 }; 10957 { 10958 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 10959 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 10960 }; 10961 { 10962 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 10963 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 10964 }; 10965 { 10966 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 10967 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 10968 }; 10969 { 10970 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 10971 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 10972 }; 10973 { 10974 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 10975 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 10976 }; 10977 { 10978 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 10979 r13 = ((r13 <= ta) ^ t_lt) ? 
ta : r13; 10980 }; 10981 { 10982 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 10983 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 10984 }; 10985 { 10986 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 10987 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 10988 }; 10989 { 10990 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 10991 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 10992 }; 10993 } 10994 { 10995 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 10996 int const t_lt = get_sub_group_local_id() < half_lane_idx; 10997 ; 10998 { 10999 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 11000 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 11001 }; 11002 { 11003 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 11004 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 11005 }; 11006 { 11007 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 11008 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 11009 }; 11010 { 11011 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 11012 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 11013 }; 11014 { 11015 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 11016 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 11017 }; 11018 { 11019 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 11020 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 11021 }; 11022 { 11023 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 11024 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 11025 }; 11026 { 11027 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 11028 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 11029 }; 11030 { 11031 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 11032 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 11033 }; 11034 { 11035 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 11036 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 11037 }; 11038 { 11039 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 11040 r11 = ((r11 <= ta) ^ t_lt) ? 
ta : r11; 11041 }; 11042 { 11043 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 11044 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 11045 }; 11046 { 11047 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 11048 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 11049 }; 11050 { 11051 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 11052 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 11053 }; 11054 { 11055 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 11056 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 11057 }; 11058 { 11059 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 11060 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 11061 }; 11062 } 11063 if (r1 >= r9) { 11064 ulong const t = r1; 11065 r1 = r9; 11066 r9 = t; 11067 }; 11068 if (r5 >= r13) { 11069 ulong const t = r5; 11070 r5 = r13; 11071 r13 = t; 11072 }; 11073 if (r1 >= r5) { 11074 ulong const t = r1; 11075 r1 = r5; 11076 r5 = t; 11077 }; 11078 if (r9 >= r13) { 11079 ulong const t = r9; 11080 r9 = r13; 11081 r13 = t; 11082 }; 11083 if (r3 >= r11) { 11084 ulong const t = r3; 11085 r3 = r11; 11086 r11 = t; 11087 }; 11088 if (r7 >= r15) { 11089 ulong const t = r7; 11090 r7 = r15; 11091 r15 = t; 11092 }; 11093 if (r3 >= r7) { 11094 ulong const t = r3; 11095 r3 = r7; 11096 r7 = t; 11097 }; 11098 if (r11 >= r15) { 11099 ulong const t = r11; 11100 r11 = r15; 11101 r15 = t; 11102 }; 11103 if (r1 >= r3) { 11104 ulong const t = r1; 11105 r1 = r3; 11106 r3 = t; 11107 }; 11108 if (r5 >= r7) { 11109 ulong const t = r5; 11110 r5 = r7; 11111 r7 = t; 11112 }; 11113 if (r9 >= r11) { 11114 ulong const t = r9; 11115 r9 = r11; 11116 r11 = t; 11117 }; 11118 if (r13 >= r15) { 11119 ulong const t = r13; 11120 r13 = r15; 11121 r15 = t; 11122 }; 11123 if (r2 >= r10) { 11124 ulong const t = r2; 11125 r2 = r10; 11126 r10 = t; 11127 }; 11128 if (r6 >= r14) { 11129 ulong const t = r6; 11130 r6 = r14; 11131 r14 = t; 11132 }; 11133 if (r2 >= r6) { 11134 ulong const t = r2; 11135 r2 = r6; 11136 r6 = t; 11137 }; 
11138 if (r10 >= r14) { 11139 ulong const t = r10; 11140 r10 = r14; 11141 r14 = t; 11142 }; 11143 if (r4 >= r12) { 11144 ulong const t = r4; 11145 r4 = r12; 11146 r12 = t; 11147 }; 11148 if (r8 >= r16) { 11149 ulong const t = r8; 11150 r8 = r16; 11151 r16 = t; 11152 }; 11153 if (r4 >= r8) { 11154 ulong const t = r4; 11155 r4 = r8; 11156 r8 = t; 11157 }; 11158 if (r12 >= r16) { 11159 ulong const t = r12; 11160 r12 = r16; 11161 r16 = t; 11162 }; 11163 if (r2 >= r4) { 11164 ulong const t = r2; 11165 r2 = r4; 11166 r4 = t; 11167 }; 11168 if (r6 >= r8) { 11169 ulong const t = r6; 11170 r6 = r8; 11171 r8 = t; 11172 }; 11173 if (r10 >= r12) { 11174 ulong const t = r10; 11175 r10 = r12; 11176 r12 = t; 11177 }; 11178 if (r14 >= r16) { 11179 ulong const t = r14; 11180 r14 = r16; 11181 r16 = t; 11182 }; 11183 if (r1 >= r2) { 11184 ulong const t = r1; 11185 r1 = r2; 11186 r2 = t; 11187 }; 11188 if (r3 >= r4) { 11189 ulong const t = r3; 11190 r3 = r4; 11191 r4 = t; 11192 }; 11193 if (r5 >= r6) { 11194 ulong const t = r5; 11195 r5 = r6; 11196 r6 = t; 11197 }; 11198 if (r7 >= r8) { 11199 ulong const t = r7; 11200 r7 = r8; 11201 r8 = t; 11202 }; 11203 if (r9 >= r10) { 11204 ulong const t = r9; 11205 r9 = r10; 11206 r10 = t; 11207 }; 11208 if (r11 >= r12) { 11209 ulong const t = r11; 11210 r11 = r12; 11211 r12 = t; 11212 }; 11213 if (r13 >= r14) { 11214 ulong const t = r13; 11215 r13 = r14; 11216 r14 = t; 11217 }; 11218 if (r15 >= r16) { 11219 ulong const t = r15; 11220 r15 = r16; 11221 r16 = t; 11222 }; 11223 } 11224 vout[gmem_idx + (1 << 3) * 0] = r1; 11225 vout[gmem_idx + (1 << 3) * 1] = r2; 11226 vout[gmem_idx + (1 << 3) * 2] = r3; 11227 vout[gmem_idx + (1 << 3) * 3] = r4; 11228 vout[gmem_idx + (1 << 3) * 4] = r5; 11229 vout[gmem_idx + (1 << 3) * 5] = r6; 11230 vout[gmem_idx + (1 << 3) * 6] = r7; 11231 vout[gmem_idx + (1 << 3) * 7] = r8; 11232 vout[gmem_idx + (1 << 3) * 8] = r9; 11233 vout[gmem_idx + (1 << 3) * 9] = r10; 11234 vout[gmem_idx + (1 << 3) * 10] = r11; 11235 
vout[gmem_idx + (1 << 3) * 11] = r12; 11236 vout[gmem_idx + (1 << 3) * 12] = r13; 11237 vout[gmem_idx + (1 << 3) * 13] = r14; 11238 vout[gmem_idx + (1 << 3) * 14] = r15; 11239 vout[gmem_idx + (1 << 3) * 15] = r16; 11240} 11241

/*
 * Kernel-local helpers for hs_kernel_bc_0.
 *
 * These are macros rather than functions so that every use expands to the
 * same statements, in the same order, operating directly on the kernel's
 * own variables (vout, key_base, r1..r16).  All of them are #undef'd right
 * after the kernel body.
 */

/* Key number `row` of this work-item: rows are (1 << 3) elements apart. */
#define HS_BC0_KEY(row) vout[key_base + (1 << 3) * (row)]

/*
 * Register compare-exchange: afterwards `lo` holds the smaller key and
 * `hi` the larger one.  Equal keys are swapped as well, which has no
 * visible effect.
 */
#define HS_BC0_ORDER(lo, hi)                                              \
  do {                                                                    \
    if (lo >= hi) {                                                       \
      ulong const held = lo;                                              \
      lo = hi;                                                            \
      hi = held;                                                          \
    }                                                                     \
  } while (0)

/*
 * Cross-lane compare-exchange of one register with lane `peer_lane`.
 * The lane with the lower id ends up with the minimum of the two keys,
 * the lane with the higher id with the maximum.
 */
#define HS_BC0_LANE_MINMAX(key)                                           \
  do {                                                                    \
    ulong const peer_key = intel_sub_group_shuffle(key, peer_lane);       \
    key = ((key <= peer_key) ^ is_lower_lane) ? peer_key : key;           \
  } while (0)

/*
 * One cross-lane stage: every register r1..r16 is min/max-exchanged with
 * the lane whose id differs from ours by XOR `distance`.
 */
#define HS_BC0_LANE_STAGE(distance)                                       \
  do {                                                                    \
    uint const peer_lane = get_sub_group_local_id() ^ (distance);         \
    int const is_lower_lane = get_sub_group_local_id() < peer_lane;       \
    HS_BC0_LANE_MINMAX(r1);                                               \
    HS_BC0_LANE_MINMAX(r2);                                               \
    HS_BC0_LANE_MINMAX(r3);                                               \
    HS_BC0_LANE_MINMAX(r4);                                               \
    HS_BC0_LANE_MINMAX(r5);                                               \
    HS_BC0_LANE_MINMAX(r6);                                               \
    HS_BC0_LANE_MINMAX(r7);                                               \
    HS_BC0_LANE_MINMAX(r8);                                               \
    HS_BC0_LANE_MINMAX(r9);                                               \
    HS_BC0_LANE_MINMAX(r10);                                              \
    HS_BC0_LANE_MINMAX(r11);                                              \
    HS_BC0_LANE_MINMAX(r12);                                              \
    HS_BC0_LANE_MINMAX(r13);                                              \
    HS_BC0_LANE_MINMAX(r14);                                              \
    HS_BC0_LANE_MINMAX(r15);                                              \
    HS_BC0_LANE_MINMAX(r16);                                              \
  } while (0)

/*
 * hs_kernel_bc_0 - in-place compare-exchange pass over `vout`.
 *
 * Launch shape (from the attributes): sub-groups of 8 lanes, work-groups
 * of 8 x 1 x 1 work-items.
 *
 * Each work-item loads 16 ulong keys from vout, starting at `key_base` and
 * spaced 8 elements apart, so 8 consecutive work-items together cover one
 * contiguous run of 128 keys.  The keys are then processed entirely in
 * registers:
 *   1. three cross-lane min/max stages at lane XOR distances 4, 2 and 1;
 *   2. register compare-exchanges at register distances 8, 4 and 2 (first
 *      among the odd-numbered registers, then among the even-numbered
 *      ones), followed by the adjacent pairs (r1,r2) .. (r15,r16).
 * Finally the 16 keys are written back to the positions they were read
 * from.
 *
 * NOTE(review): this has the shape of the final "clean" step of a bitonic
 * merge over a 128-key span; the ordering the input must already satisfy
 * is not visible in this kernel - confirm against the generator.
 *
 * vout: global buffer of ulong keys, read and overwritten in place.
 */
__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
__attribute__((reqd_work_group_size((1 << 3) * 1, 1, 1))) void
hs_kernel_bc_0(__global ulong* const restrict vout)
{
  /* Global id rounded down to a multiple of 8, times 16 keys per
     work-item, plus this work-item's position within its group of 8. */
  uint const key_base = (get_global_id(0) & ~((1 << 3) - 1)) * 16 +
                        (get_local_id(0) & ((1 << 3) - 1));

  /* Load the 16 keys owned by this work-item. */
  ulong r1 = HS_BC0_KEY(0);
  ulong r2 = HS_BC0_KEY(1);
  ulong r3 = HS_BC0_KEY(2);
  ulong r4 = HS_BC0_KEY(3);
  ulong r5 = HS_BC0_KEY(4);
  ulong r6 = HS_BC0_KEY(5);
  ulong r7 = HS_BC0_KEY(6);
  ulong r8 = HS_BC0_KEY(7);
  ulong r9 = HS_BC0_KEY(8);
  ulong r10 = HS_BC0_KEY(9);
  ulong r11 = HS_BC0_KEY(10);
  ulong r12 = HS_BC0_KEY(11);
  ulong r13 = HS_BC0_KEY(12);
  ulong r14 = HS_BC0_KEY(13);
  ulong r15 = HS_BC0_KEY(14);
  ulong r16 = HS_BC0_KEY(15);

  /* Cross-lane exchanges, halving the lane distance each stage. */
  HS_BC0_LANE_STAGE(4);
  HS_BC0_LANE_STAGE(2);
  HS_BC0_LANE_STAGE(1);

  /* Odd-numbered registers: distances 8, 4, then 2. */
  HS_BC0_ORDER(r1, r9);
  HS_BC0_ORDER(r5, r13);
  HS_BC0_ORDER(r1, r5);
  HS_BC0_ORDER(r9, r13);
  HS_BC0_ORDER(r3, r11);
  HS_BC0_ORDER(r7, r15);
  HS_BC0_ORDER(r3, r7);
  HS_BC0_ORDER(r11, r15);
  HS_BC0_ORDER(r1, r3);
  HS_BC0_ORDER(r5, r7);
  HS_BC0_ORDER(r9, r11);
  HS_BC0_ORDER(r13, r15);

  /* Even-numbered registers: distances 8, 4, then 2. */
  HS_BC0_ORDER(r2, r10);
  HS_BC0_ORDER(r6, r14);
  HS_BC0_ORDER(r2, r6);
  HS_BC0_ORDER(r10, r14);
  HS_BC0_ORDER(r4, r12);
  HS_BC0_ORDER(r8, r16);
  HS_BC0_ORDER(r4, r8);
  HS_BC0_ORDER(r12, r16);
  HS_BC0_ORDER(r2, r4);
  HS_BC0_ORDER(r6, r8);
  HS_BC0_ORDER(r10, r12);
  HS_BC0_ORDER(r14, r16);

  /* Adjacent register pairs. */
  HS_BC0_ORDER(r1, r2);
  HS_BC0_ORDER(r3, r4);
  HS_BC0_ORDER(r5, r6);
  HS_BC0_ORDER(r7, r8);
  HS_BC0_ORDER(r9, r10);
  HS_BC0_ORDER(r11, r12);
  HS_BC0_ORDER(r13, r14);
  HS_BC0_ORDER(r15, r16);

  /* Store the keys back to the slots they were loaded from. */
  HS_BC0_KEY(0) = r1;
  HS_BC0_KEY(1) = r2;
  HS_BC0_KEY(2) = r3;
  HS_BC0_KEY(3) = r4;
  HS_BC0_KEY(4) = r5;
  HS_BC0_KEY(5) = r6;
  HS_BC0_KEY(6) = r7;
  HS_BC0_KEY(7) = r8;
  HS_BC0_KEY(8) = r9;
  HS_BC0_KEY(9) = r10;
  HS_BC0_KEY(10) = r11;
  HS_BC0_KEY(11) = r12;
  HS_BC0_KEY(12) = r13;
  HS_BC0_KEY(13) = r14;
  HS_BC0_KEY(14) = r15;
  HS_BC0_KEY(15) = r16;
}

#undef HS_BC0_LANE_STAGE
#undef HS_BC0_LANE_MINMAX
#undef HS_BC0_ORDER
#undef HS_BC0_KEY

11650 11651__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) 11652__attribute__((reqd_work_group_size((1 << 3) * 2, 1, 1))) void 11653hs_kernel_bc_1(__global
ulong* const restrict vout) 11654{ 11655 __local struct 11656 { 11657 ulong m[16 * 16]; 11658 } shared; 11659 11660 uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 + 11661 (get_local_id(0) & ((1 << 3) - 1)); 11662 uint const gmem_l_idx = 11663 (get_global_id(0) & ~((1 << 3) * 2 - 1)) * 16 + get_local_id(0); 11664 uint const smem_l_idx = 11665 get_sub_group_id() * ((1 << 3) * 2) + get_sub_group_local_id(); 11666 { 11667 { 11668 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 0)]; 11669 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 16)]; 11670 if (r0_1 >= r0_2) { 11671 ulong const t = r0_1; 11672 r0_1 = r0_2; 11673 r0_2 = t; 11674 }; 11675 shared.m[smem_l_idx + (0)] = r0_1; 11676 shared.m[smem_l_idx + (8)] = r0_2; 11677 } 11678 { 11679 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 2)]; 11680 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 18)]; 11681 if (r0_1 >= r0_2) { 11682 ulong const t = r0_1; 11683 r0_1 = r0_2; 11684 r0_2 = t; 11685 }; 11686 shared.m[smem_l_idx + (32)] = r0_1; 11687 shared.m[smem_l_idx + (40)] = r0_2; 11688 } 11689 { 11690 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 4)]; 11691 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 20)]; 11692 if (r0_1 >= r0_2) { 11693 ulong const t = r0_1; 11694 r0_1 = r0_2; 11695 r0_2 = t; 11696 }; 11697 shared.m[smem_l_idx + (64)] = r0_1; 11698 shared.m[smem_l_idx + (72)] = r0_2; 11699 } 11700 { 11701 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 6)]; 11702 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 22)]; 11703 if (r0_1 >= r0_2) { 11704 ulong const t = r0_1; 11705 r0_1 = r0_2; 11706 r0_2 = t; 11707 }; 11708 shared.m[smem_l_idx + (96)] = r0_1; 11709 shared.m[smem_l_idx + (104)] = r0_2; 11710 } 11711 { 11712 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 8)]; 11713 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 24)]; 11714 if (r0_1 >= r0_2) { 11715 ulong const t = r0_1; 11716 r0_1 = r0_2; 11717 r0_2 = t; 11718 }; 11719 shared.m[smem_l_idx + (128)] = r0_1; 11720 shared.m[smem_l_idx + (136)] = r0_2; 11721 } 11722 { 11723 ulong r0_1 = 
vout[gmem_l_idx + ((1 << 3) * 10)]; 11724 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 26)]; 11725 if (r0_1 >= r0_2) { 11726 ulong const t = r0_1; 11727 r0_1 = r0_2; 11728 r0_2 = t; 11729 }; 11730 shared.m[smem_l_idx + (160)] = r0_1; 11731 shared.m[smem_l_idx + (168)] = r0_2; 11732 } 11733 { 11734 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 12)]; 11735 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 28)]; 11736 if (r0_1 >= r0_2) { 11737 ulong const t = r0_1; 11738 r0_1 = r0_2; 11739 r0_2 = t; 11740 }; 11741 shared.m[smem_l_idx + (192)] = r0_1; 11742 shared.m[smem_l_idx + (200)] = r0_2; 11743 } 11744 { 11745 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 14)]; 11746 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 30)]; 11747 if (r0_1 >= r0_2) { 11748 ulong const t = r0_1; 11749 r0_1 = r0_2; 11750 r0_2 = t; 11751 }; 11752 shared.m[smem_l_idx + (224)] = r0_1; 11753 shared.m[smem_l_idx + (232)] = r0_2; 11754 } 11755 } 11756 barrier(CLK_LOCAL_MEM_FENCE); 11757 ulong r1 = shared.m[get_local_id(0) + (2 * (1 << 3) * 0)]; 11758 ulong r2 = shared.m[get_local_id(0) + (2 * (1 << 3) * 1)]; 11759 ulong r3 = shared.m[get_local_id(0) + (2 * (1 << 3) * 2)]; 11760 ulong r4 = shared.m[get_local_id(0) + (2 * (1 << 3) * 3)]; 11761 ulong r5 = shared.m[get_local_id(0) + (2 * (1 << 3) * 4)]; 11762 ulong r6 = shared.m[get_local_id(0) + (2 * (1 << 3) * 5)]; 11763 ulong r7 = shared.m[get_local_id(0) + (2 * (1 << 3) * 6)]; 11764 ulong r8 = shared.m[get_local_id(0) + (2 * (1 << 3) * 7)]; 11765 ulong r9 = shared.m[get_local_id(0) + (2 * (1 << 3) * 8)]; 11766 ulong r10 = shared.m[get_local_id(0) + (2 * (1 << 3) * 9)]; 11767 ulong r11 = shared.m[get_local_id(0) + (2 * (1 << 3) * 10)]; 11768 ulong r12 = shared.m[get_local_id(0) + (2 * (1 << 3) * 11)]; 11769 ulong r13 = shared.m[get_local_id(0) + (2 * (1 << 3) * 12)]; 11770 ulong r14 = shared.m[get_local_id(0) + (2 * (1 << 3) * 13)]; 11771 ulong r15 = shared.m[get_local_id(0) + (2 * (1 << 3) * 14)]; 11772 ulong r16 = shared.m[get_local_id(0) + (2 * (1 << 3) * 15)]; 
11773 { 11774 { 11775 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 11776 int const t_lt = get_sub_group_local_id() < half_lane_idx; 11777 ; 11778 { 11779 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 11780 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 11781 }; 11782 { 11783 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 11784 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 11785 }; 11786 { 11787 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 11788 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 11789 }; 11790 { 11791 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 11792 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 11793 }; 11794 { 11795 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 11796 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 11797 }; 11798 { 11799 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 11800 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 11801 }; 11802 { 11803 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 11804 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 11805 }; 11806 { 11807 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 11808 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 11809 }; 11810 { 11811 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 11812 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 11813 }; 11814 { 11815 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 11816 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 11817 }; 11818 { 11819 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 11820 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 11821 }; 11822 { 11823 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 11824 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 11825 }; 11826 { 11827 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 11828 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 11829 }; 11830 { 11831 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 11832 r14 = ((r14 <= ta) ^ t_lt) ? 
ta : r14; 11833 }; 11834 { 11835 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 11836 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 11837 }; 11838 { 11839 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 11840 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 11841 }; 11842 } 11843 { 11844 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 11845 int const t_lt = get_sub_group_local_id() < half_lane_idx; 11846 ; 11847 { 11848 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 11849 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 11850 }; 11851 { 11852 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 11853 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 11854 }; 11855 { 11856 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 11857 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 11858 }; 11859 { 11860 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 11861 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 11862 }; 11863 { 11864 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 11865 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 11866 }; 11867 { 11868 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 11869 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 11870 }; 11871 { 11872 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 11873 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 11874 }; 11875 { 11876 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 11877 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 11878 }; 11879 { 11880 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 11881 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 11882 }; 11883 { 11884 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 11885 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 11886 }; 11887 { 11888 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 11889 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 11890 }; 11891 { 11892 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 11893 r12 = ((r12 <= ta) ^ t_lt) ? 
ta : r12; 11894 }; 11895 { 11896 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 11897 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 11898 }; 11899 { 11900 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 11901 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 11902 }; 11903 { 11904 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 11905 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 11906 }; 11907 { 11908 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 11909 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 11910 }; 11911 } 11912 { 11913 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 11914 int const t_lt = get_sub_group_local_id() < half_lane_idx; 11915 ; 11916 { 11917 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 11918 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 11919 }; 11920 { 11921 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 11922 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 11923 }; 11924 { 11925 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 11926 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 11927 }; 11928 { 11929 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 11930 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 11931 }; 11932 { 11933 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 11934 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 11935 }; 11936 { 11937 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 11938 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 11939 }; 11940 { 11941 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 11942 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 11943 }; 11944 { 11945 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 11946 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 11947 }; 11948 { 11949 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 11950 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 11951 }; 11952 { 11953 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 11954 r10 = ((r10 <= ta) ^ t_lt) ? 
ta : r10; 11955 }; 11956 { 11957 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 11958 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 11959 }; 11960 { 11961 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 11962 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 11963 }; 11964 { 11965 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 11966 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 11967 }; 11968 { 11969 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 11970 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 11971 }; 11972 { 11973 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 11974 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 11975 }; 11976 { 11977 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 11978 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 11979 }; 11980 } 11981 if (r1 >= r9) { 11982 ulong const t = r1; 11983 r1 = r9; 11984 r9 = t; 11985 }; 11986 if (r5 >= r13) { 11987 ulong const t = r5; 11988 r5 = r13; 11989 r13 = t; 11990 }; 11991 if (r1 >= r5) { 11992 ulong const t = r1; 11993 r1 = r5; 11994 r5 = t; 11995 }; 11996 if (r9 >= r13) { 11997 ulong const t = r9; 11998 r9 = r13; 11999 r13 = t; 12000 }; 12001 if (r3 >= r11) { 12002 ulong const t = r3; 12003 r3 = r11; 12004 r11 = t; 12005 }; 12006 if (r7 >= r15) { 12007 ulong const t = r7; 12008 r7 = r15; 12009 r15 = t; 12010 }; 12011 if (r3 >= r7) { 12012 ulong const t = r3; 12013 r3 = r7; 12014 r7 = t; 12015 }; 12016 if (r11 >= r15) { 12017 ulong const t = r11; 12018 r11 = r15; 12019 r15 = t; 12020 }; 12021 if (r1 >= r3) { 12022 ulong const t = r1; 12023 r1 = r3; 12024 r3 = t; 12025 }; 12026 if (r5 >= r7) { 12027 ulong const t = r5; 12028 r5 = r7; 12029 r7 = t; 12030 }; 12031 if (r9 >= r11) { 12032 ulong const t = r9; 12033 r9 = r11; 12034 r11 = t; 12035 }; 12036 if (r13 >= r15) { 12037 ulong const t = r13; 12038 r13 = r15; 12039 r15 = t; 12040 }; 12041 if (r2 >= r10) { 12042 ulong const t = r2; 12043 r2 = r10; 12044 r10 = t; 12045 }; 12046 if (r6 >= r14) { 12047 ulong const t = 
r6; 12048 r6 = r14; 12049 r14 = t; 12050 }; 12051 if (r2 >= r6) { 12052 ulong const t = r2; 12053 r2 = r6; 12054 r6 = t; 12055 }; 12056 if (r10 >= r14) { 12057 ulong const t = r10; 12058 r10 = r14; 12059 r14 = t; 12060 }; 12061 if (r4 >= r12) { 12062 ulong const t = r4; 12063 r4 = r12; 12064 r12 = t; 12065 }; 12066 if (r8 >= r16) { 12067 ulong const t = r8; 12068 r8 = r16; 12069 r16 = t; 12070 }; 12071 if (r4 >= r8) { 12072 ulong const t = r4; 12073 r4 = r8; 12074 r8 = t; 12075 }; 12076 if (r12 >= r16) { 12077 ulong const t = r12; 12078 r12 = r16; 12079 r16 = t; 12080 }; 12081 if (r2 >= r4) { 12082 ulong const t = r2; 12083 r2 = r4; 12084 r4 = t; 12085 }; 12086 if (r6 >= r8) { 12087 ulong const t = r6; 12088 r6 = r8; 12089 r8 = t; 12090 }; 12091 if (r10 >= r12) { 12092 ulong const t = r10; 12093 r10 = r12; 12094 r12 = t; 12095 }; 12096 if (r14 >= r16) { 12097 ulong const t = r14; 12098 r14 = r16; 12099 r16 = t; 12100 }; 12101 if (r1 >= r2) { 12102 ulong const t = r1; 12103 r1 = r2; 12104 r2 = t; 12105 }; 12106 if (r3 >= r4) { 12107 ulong const t = r3; 12108 r3 = r4; 12109 r4 = t; 12110 }; 12111 if (r5 >= r6) { 12112 ulong const t = r5; 12113 r5 = r6; 12114 r6 = t; 12115 }; 12116 if (r7 >= r8) { 12117 ulong const t = r7; 12118 r7 = r8; 12119 r8 = t; 12120 }; 12121 if (r9 >= r10) { 12122 ulong const t = r9; 12123 r9 = r10; 12124 r10 = t; 12125 }; 12126 if (r11 >= r12) { 12127 ulong const t = r11; 12128 r11 = r12; 12129 r12 = t; 12130 }; 12131 if (r13 >= r14) { 12132 ulong const t = r13; 12133 r13 = r14; 12134 r14 = t; 12135 }; 12136 if (r15 >= r16) { 12137 ulong const t = r15; 12138 r15 = r16; 12139 r16 = t; 12140 }; 12141 } 12142 vout[gmem_idx + (1 << 3) * 0] = r1; 12143 vout[gmem_idx + (1 << 3) * 1] = r2; 12144 vout[gmem_idx + (1 << 3) * 2] = r3; 12145 vout[gmem_idx + (1 << 3) * 3] = r4; 12146 vout[gmem_idx + (1 << 3) * 4] = r5; 12147 vout[gmem_idx + (1 << 3) * 5] = r6; 12148 vout[gmem_idx + (1 << 3) * 6] = r7; 12149 vout[gmem_idx + (1 << 3) * 7] = r8; 12150 
vout[gmem_idx + (1 << 3) * 8] = r9; 12151 vout[gmem_idx + (1 << 3) * 9] = r10; 12152 vout[gmem_idx + (1 << 3) * 10] = r11; 12153 vout[gmem_idx + (1 << 3) * 11] = r12; 12154 vout[gmem_idx + (1 << 3) * 12] = r13; 12155 vout[gmem_idx + (1 << 3) * 13] = r14; 12156 vout[gmem_idx + (1 << 3) * 14] = r15; 12157 vout[gmem_idx + (1 << 3) * 15] = r16; 12158} 12159 /* hs_kernel_bc_2: runs with a required work-group size of 32 and a required sub-group size of 8, and operates in place on vout. Phase 1 loads 16 values per work-item from global memory in four groups of four, compare-exchanges each group, and stages the results in local memory. Phase 2, after a barrier, re-reads 16 values per work-item from local memory, exchanges them across lanes and then across registers, and writes them back to vout. NOTE(review): this looks like a generated merge/clean stage of a larger sorting network -- confirm against the generator. */ 12160__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) 12161__attribute__((reqd_work_group_size((1 << 3) * 4, 1, 1))) void 12162hs_kernel_bc_2(__global ulong* const restrict vout) 12163{ /* Local staging buffer: 32 x 16 = 512 ulong slots. */ 12164 __local struct 12165 { 12166 ulong m[32 * 16]; 12167 } shared; 12168 /* gmem_idx: base for the final stores -- (global id with its low 3 bits cleared) times 16, plus the low 3 bits of the local id. */ 12169 uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 + 12170 (get_local_id(0) & ((1 << 3) - 1)); /* gmem_l_idx: base for the initial loads -- (global id with its low 5 bits cleared) times 16, plus the local id. */ 12171 uint const gmem_l_idx = 12172 (get_global_id(0) & ~((1 << 3) * 4 - 1)) * 16 + get_local_id(0); /* smem_l_idx: base for the local-memory stores -- sub-group id times 32, plus the lane id within the sub-group. */ 12173 uint const smem_l_idx = 12174 get_sub_group_id() * ((1 << 3) * 4) + get_sub_group_local_id(); /* Phase 1: four groups. Each group loads four elements 8 x 16 apart starting from gmem_l_idx, compare-exchanges pairs (1,3) (2,4) and then (1,2) (3,4) so that the lower-numbered register ends up with the smaller value, and stores the four results 8 slots apart in shared.m. */ 12175 { 12176 { 12177 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 0)]; 12178 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 16)]; 12179 ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 32)]; 12180 ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 48)]; 12181 if (r0_1 >= r0_3) { 12182 ulong const t = r0_1; 12183 r0_1 = r0_3; 12184 r0_3 = t; 12185 }; 12186 if (r0_2 >= r0_4) { 12187 ulong const t = r0_2; 12188 r0_2 = r0_4; 12189 r0_4 = t; 12190 }; 12191 if (r0_1 >= r0_2) { 12192 ulong const t = r0_1; 12193 r0_1 = r0_2; 12194 r0_2 = t; 12195 }; 12196 if (r0_3 >= r0_4) { 12197 ulong const t = r0_3; 12198 r0_3 = r0_4; 12199 r0_4 = t; 12200 }; 12201 shared.m[smem_l_idx + (0)] = r0_1; 12202 shared.m[smem_l_idx + (8)] = r0_2; 12203 shared.m[smem_l_idx + (16)] = r0_3; 12204 shared.m[smem_l_idx + (24)] = r0_4; 12205 } 12206 { 12207 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 4)]; 12208 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 20)]; 12209 ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 36)]; 12210 ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 52)]; 12211 if (r0_1 >= r0_3) {
12212 ulong const t = r0_1; 12213 r0_1 = r0_3; 12214 r0_3 = t; 12215 }; 12216 if (r0_2 >= r0_4) { 12217 ulong const t = r0_2; 12218 r0_2 = r0_4; 12219 r0_4 = t; 12220 }; 12221 if (r0_1 >= r0_2) { 12222 ulong const t = r0_1; 12223 r0_1 = r0_2; 12224 r0_2 = t; 12225 }; 12226 if (r0_3 >= r0_4) { 12227 ulong const t = r0_3; 12228 r0_3 = r0_4; 12229 r0_4 = t; 12230 }; 12231 shared.m[smem_l_idx + (128)] = r0_1; 12232 shared.m[smem_l_idx + (136)] = r0_2; 12233 shared.m[smem_l_idx + (144)] = r0_3; 12234 shared.m[smem_l_idx + (152)] = r0_4; 12235 } 12236 { 12237 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 8)]; 12238 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 24)]; 12239 ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 40)]; 12240 ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 56)]; 12241 if (r0_1 >= r0_3) { 12242 ulong const t = r0_1; 12243 r0_1 = r0_3; 12244 r0_3 = t; 12245 }; 12246 if (r0_2 >= r0_4) { 12247 ulong const t = r0_2; 12248 r0_2 = r0_4; 12249 r0_4 = t; 12250 }; 12251 if (r0_1 >= r0_2) { 12252 ulong const t = r0_1; 12253 r0_1 = r0_2; 12254 r0_2 = t; 12255 }; 12256 if (r0_3 >= r0_4) { 12257 ulong const t = r0_3; 12258 r0_3 = r0_4; 12259 r0_4 = t; 12260 }; 12261 shared.m[smem_l_idx + (256)] = r0_1; 12262 shared.m[smem_l_idx + (264)] = r0_2; 12263 shared.m[smem_l_idx + (272)] = r0_3; 12264 shared.m[smem_l_idx + (280)] = r0_4; 12265 } 12266 { 12267 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 12)]; 12268 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 28)]; 12269 ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 44)]; 12270 ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 60)]; 12271 if (r0_1 >= r0_3) { 12272 ulong const t = r0_1; 12273 r0_1 = r0_3; 12274 r0_3 = t; 12275 }; 12276 if (r0_2 >= r0_4) { 12277 ulong const t = r0_2; 12278 r0_2 = r0_4; 12279 r0_4 = t; 12280 }; 12281 if (r0_1 >= r0_2) { 12282 ulong const t = r0_1; 12283 r0_1 = r0_2; 12284 r0_2 = t; 12285 }; 12286 if (r0_3 >= r0_4) { 12287 ulong const t = r0_3; 12288 r0_3 = r0_4; 12289 r0_4 = t; 12290 }; 12291 shared.m[smem_l_idx + (384)]
= r0_1; 12292 shared.m[smem_l_idx + (392)] = r0_2; 12293 shared.m[smem_l_idx + (400)] = r0_3; 12294 shared.m[smem_l_idx + (408)] = r0_4; 12295 } 12296 } /* All phase-1 stores to shared.m must complete before any work-item reads the buffer below. */ 12297 barrier(CLK_LOCAL_MEM_FENCE); /* Phase 2: re-read 16 values per work-item from shared.m, 32 slots apart, starting at the local id. */ 12298 ulong r1 = shared.m[get_local_id(0) + (4 * (1 << 3) * 0)]; 12299 ulong r2 = shared.m[get_local_id(0) + (4 * (1 << 3) * 1)]; 12300 ulong r3 = shared.m[get_local_id(0) + (4 * (1 << 3) * 2)]; 12301 ulong r4 = shared.m[get_local_id(0) + (4 * (1 << 3) * 3)]; 12302 ulong r5 = shared.m[get_local_id(0) + (4 * (1 << 3) * 4)]; 12303 ulong r6 = shared.m[get_local_id(0) + (4 * (1 << 3) * 5)]; 12304 ulong r7 = shared.m[get_local_id(0) + (4 * (1 << 3) * 6)]; 12305 ulong r8 = shared.m[get_local_id(0) + (4 * (1 << 3) * 7)]; 12306 ulong r9 = shared.m[get_local_id(0) + (4 * (1 << 3) * 8)]; 12307 ulong r10 = shared.m[get_local_id(0) + (4 * (1 << 3) * 9)]; 12308 ulong r11 = shared.m[get_local_id(0) + (4 * (1 << 3) * 10)]; 12309 ulong r12 = shared.m[get_local_id(0) + (4 * (1 << 3) * 11)]; 12310 ulong r13 = shared.m[get_local_id(0) + (4 * (1 << 3) * 12)]; 12311 ulong r14 = shared.m[get_local_id(0) + (4 * (1 << 3) * 13)]; 12312 ulong r15 = shared.m[get_local_id(0) + (4 * (1 << 3) * 14)]; 12313 ulong r16 = shared.m[get_local_id(0) + (4 * (1 << 3) * 15)]; /* Cross-lane stages: each register is exchanged with the lane whose id differs in bit 2, then bit 1, then bit 0. Within each lane pair the lower-numbered lane keeps the smaller value and the higher-numbered lane keeps the larger. */ 12314 { 12315 { 12316 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 12317 int const t_lt = get_sub_group_local_id() < half_lane_idx; 12318 ; 12319 { 12320 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 12321 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 12322 }; 12323 { 12324 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 12325 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 12326 }; 12327 { 12328 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 12329 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 12330 }; 12331 { 12332 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 12333 r4 = ((r4 <= ta) ^ t_lt) ?
ta : r4; 12334 }; 12335 { 12336 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 12337 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 12338 }; 12339 { 12340 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 12341 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 12342 }; 12343 { 12344 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 12345 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 12346 }; 12347 { 12348 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 12349 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 12350 }; 12351 { 12352 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 12353 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 12354 }; 12355 { 12356 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 12357 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 12358 }; 12359 { 12360 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 12361 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 12362 }; 12363 { 12364 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 12365 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 12366 }; 12367 { 12368 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 12369 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 12370 }; 12371 { 12372 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 12373 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 12374 }; 12375 { 12376 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 12377 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 12378 }; 12379 { 12380 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 12381 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 12382 }; 12383 } 12384 { 12385 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 12386 int const t_lt = get_sub_group_local_id() < half_lane_idx; 12387 ; 12388 { 12389 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 12390 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 12391 }; 12392 { 12393 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 12394 r2 = ((r2 <= ta) ^ t_lt) ?
ta : r2; 12395 }; 12396 { 12397 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 12398 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 12399 }; 12400 { 12401 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 12402 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 12403 }; 12404 { 12405 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 12406 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 12407 }; 12408 { 12409 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 12410 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 12411 }; 12412 { 12413 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 12414 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 12415 }; 12416 { 12417 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 12418 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 12419 }; 12420 { 12421 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 12422 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 12423 }; 12424 { 12425 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 12426 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 12427 }; 12428 { 12429 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 12430 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 12431 }; 12432 { 12433 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 12434 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 12435 }; 12436 { 12437 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 12438 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 12439 }; 12440 { 12441 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 12442 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 12443 }; 12444 { 12445 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 12446 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 12447 }; 12448 { 12449 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 12450 r16 = ((r16 <= ta) ^ t_lt) ?
ta : r16; 12451 }; 12452 } 12453 { 12454 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 12455 int const t_lt = get_sub_group_local_id() < half_lane_idx; 12456 ; 12457 { 12458 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 12459 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 12460 }; 12461 { 12462 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 12463 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 12464 }; 12465 { 12466 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 12467 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 12468 }; 12469 { 12470 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 12471 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 12472 }; 12473 { 12474 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 12475 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 12476 }; 12477 { 12478 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 12479 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 12480 }; 12481 { 12482 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 12483 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 12484 }; 12485 { 12486 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 12487 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 12488 }; 12489 { 12490 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 12491 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 12492 }; 12493 { 12494 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 12495 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 12496 }; 12497 { 12498 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 12499 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 12500 }; 12501 { 12502 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 12503 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 12504 }; 12505 { 12506 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 12507 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 12508 }; 12509 { 12510 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 12511 r14 = ((r14 <= ta) ^ t_lt) ?
ta : r14; 12512 }; 12513 { 12514 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 12515 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 12516 }; 12517 { 12518 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 12519 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 12520 }; 12521 } /* In-register stages: compare-exchange at register distance 8, 4 and 2 (odd-numbered registers first, then even-numbered), followed by adjacent pairs. Every exchange leaves the smaller value in the lower-numbered register. */ 12522 if (r1 >= r9) { 12523 ulong const t = r1; 12524 r1 = r9; 12525 r9 = t; 12526 }; 12527 if (r5 >= r13) { 12528 ulong const t = r5; 12529 r5 = r13; 12530 r13 = t; 12531 }; 12532 if (r1 >= r5) { 12533 ulong const t = r1; 12534 r1 = r5; 12535 r5 = t; 12536 }; 12537 if (r9 >= r13) { 12538 ulong const t = r9; 12539 r9 = r13; 12540 r13 = t; 12541 }; 12542 if (r3 >= r11) { 12543 ulong const t = r3; 12544 r3 = r11; 12545 r11 = t; 12546 }; 12547 if (r7 >= r15) { 12548 ulong const t = r7; 12549 r7 = r15; 12550 r15 = t; 12551 }; 12552 if (r3 >= r7) { 12553 ulong const t = r3; 12554 r3 = r7; 12555 r7 = t; 12556 }; 12557 if (r11 >= r15) { 12558 ulong const t = r11; 12559 r11 = r15; 12560 r15 = t; 12561 }; 12562 if (r1 >= r3) { 12563 ulong const t = r1; 12564 r1 = r3; 12565 r3 = t; 12566 }; 12567 if (r5 >= r7) { 12568 ulong const t = r5; 12569 r5 = r7; 12570 r7 = t; 12571 }; 12572 if (r9 >= r11) { 12573 ulong const t = r9; 12574 r9 = r11; 12575 r11 = t; 12576 }; 12577 if (r13 >= r15) { 12578 ulong const t = r13; 12579 r13 = r15; 12580 r15 = t; 12581 }; 12582 if (r2 >= r10) { 12583 ulong const t = r2; 12584 r2 = r10; 12585 r10 = t; 12586 }; 12587 if (r6 >= r14) { 12588 ulong const t = r6; 12589 r6 = r14; 12590 r14 = t; 12591 }; 12592 if (r2 >= r6) { 12593 ulong const t = r2; 12594 r2 = r6; 12595 r6 = t; 12596 }; 12597 if (r10 >= r14) { 12598 ulong const t = r10; 12599 r10 = r14; 12600 r14 = t; 12601 }; 12602 if (r4 >= r12) { 12603 ulong const t = r4; 12604 r4 = r12; 12605 r12 = t; 12606 }; 12607 if (r8 >= r16) { 12608 ulong const t = r8; 12609 r8 = r16; 12610 r16 = t; 12611 }; 12612 if (r4 >= r8) { 12613 ulong const t = r4; 12614 r4 = r8; 12615 r8 = t; 12616 }; 12617 if (r12 >= r16) { 12618
ulong const t = r12; 12619 r12 = r16; 12620 r16 = t; 12621 }; 12622 if (r2 >= r4) { 12623 ulong const t = r2; 12624 r2 = r4; 12625 r4 = t; 12626 }; 12627 if (r6 >= r8) { 12628 ulong const t = r6; 12629 r6 = r8; 12630 r8 = t; 12631 }; 12632 if (r10 >= r12) { 12633 ulong const t = r10; 12634 r10 = r12; 12635 r12 = t; 12636 }; 12637 if (r14 >= r16) { 12638 ulong const t = r14; 12639 r14 = r16; 12640 r16 = t; 12641 }; 12642 if (r1 >= r2) { 12643 ulong const t = r1; 12644 r1 = r2; 12645 r2 = t; 12646 }; 12647 if (r3 >= r4) { 12648 ulong const t = r3; 12649 r3 = r4; 12650 r4 = t; 12651 }; 12652 if (r5 >= r6) { 12653 ulong const t = r5; 12654 r5 = r6; 12655 r6 = t; 12656 }; 12657 if (r7 >= r8) { 12658 ulong const t = r7; 12659 r7 = r8; 12660 r8 = t; 12661 }; 12662 if (r9 >= r10) { 12663 ulong const t = r9; 12664 r9 = r10; 12665 r10 = t; 12666 }; 12667 if (r11 >= r12) { 12668 ulong const t = r11; 12669 r11 = r12; 12670 r12 = t; 12671 }; 12672 if (r13 >= r14) { 12673 ulong const t = r13; 12674 r13 = r14; 12675 r14 = t; 12676 }; 12677 if (r15 >= r16) { 12678 ulong const t = r15; 12679 r15 = r16; 12680 r16 = t; 12681 }; 12682 } /* Write the 16 registers back to vout, 8 elements apart, starting at gmem_idx. */ 12683 vout[gmem_idx + (1 << 3) * 0] = r1; 12684 vout[gmem_idx + (1 << 3) * 1] = r2; 12685 vout[gmem_idx + (1 << 3) * 2] = r3; 12686 vout[gmem_idx + (1 << 3) * 3] = r4; 12687 vout[gmem_idx + (1 << 3) * 4] = r5; 12688 vout[gmem_idx + (1 << 3) * 5] = r6; 12689 vout[gmem_idx + (1 << 3) * 6] = r7; 12690 vout[gmem_idx + (1 << 3) * 7] = r8; 12691 vout[gmem_idx + (1 << 3) * 8] = r9; 12692 vout[gmem_idx + (1 << 3) * 9] = r10; 12693 vout[gmem_idx + (1 << 3) * 10] = r11; 12694 vout[gmem_idx + (1 << 3) * 11] = r12; 12695 vout[gmem_idx + (1 << 3) * 12] = r13; 12696 vout[gmem_idx + (1 << 3) * 13] = r14; 12697 vout[gmem_idx + (1 << 3) * 14] = r15; 12698 vout[gmem_idx + (1 << 3) * 15] = r16; 12699} 12700 12701__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) 12702__attribute__((reqd_work_group_size((1 << 3) * 8, 1, 1))) void 12703hs_kernel_bc_3(__global
ulong* const restrict vout) 12704{ /* hs_kernel_bc_3 body: operates in place on vout. Phase 1 loads 16 values per work-item from global memory in two groups of eight, compare-exchanges each group, and stages the results in local memory. Phase 2, after a barrier, re-reads 16 values per work-item from local memory, exchanges them across lanes and then across registers, and writes them back to vout. NOTE(review): this looks like a generated merge/clean stage of a larger sorting network -- confirm against the generator. */ /* Local staging buffer: 64 x 16 = 1024 ulong slots. */ 12705 __local struct 12706 { 12707 ulong m[64 * 16]; 12708 } shared; 12709 /* gmem_idx: base for the final stores -- (global id with its low 3 bits cleared) times 16, plus the low 3 bits of the local id. */ 12710 uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 + 12711 (get_local_id(0) & ((1 << 3) - 1)); /* gmem_l_idx: base for the initial loads -- (global id with its low 6 bits cleared) times 16, plus the local id. */ 12712 uint const gmem_l_idx = 12713 (get_global_id(0) & ~((1 << 3) * 8 - 1)) * 16 + get_local_id(0); /* smem_l_idx: base for the local-memory stores -- sub-group id times 64, plus the lane id within the sub-group. */ 12714 uint const smem_l_idx = 12715 get_sub_group_id() * ((1 << 3) * 8) + get_sub_group_local_id(); /* Phase 1: two groups. Each group loads eight elements 8 x 16 apart starting from gmem_l_idx, compare-exchanges at register distance 4 and 2 (odd-numbered registers first, then even-numbered) and then adjacent pairs so that the lower-numbered register ends up with the smaller value, and stores the eight results 8 slots apart in shared.m. */ 12716 { 12717 { 12718 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 0)]; 12719 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 16)]; 12720 ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 32)]; 12721 ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 48)]; 12722 ulong r0_5 = vout[gmem_l_idx + ((1 << 3) * 64)]; 12723 ulong r0_6 = vout[gmem_l_idx + ((1 << 3) * 80)]; 12724 ulong r0_7 = vout[gmem_l_idx + ((1 << 3) * 96)]; 12725 ulong r0_8 = vout[gmem_l_idx + ((1 << 3) * 112)]; 12726 if (r0_1 >= r0_5) { 12727 ulong const t = r0_1; 12728 r0_1 = r0_5; 12729 r0_5 = t; 12730 }; 12731 if (r0_3 >= r0_7) { 12732 ulong const t = r0_3; 12733 r0_3 = r0_7; 12734 r0_7 = t; 12735 }; 12736 if (r0_1 >= r0_3) { 12737 ulong const t = r0_1; 12738 r0_1 = r0_3; 12739 r0_3 = t; 12740 }; 12741 if (r0_5 >= r0_7) { 12742 ulong const t = r0_5; 12743 r0_5 = r0_7; 12744 r0_7 = t; 12745 }; 12746 if (r0_2 >= r0_6) { 12747 ulong const t = r0_2; 12748 r0_2 = r0_6; 12749 r0_6 = t; 12750 }; 12751 if (r0_4 >= r0_8) { 12752 ulong const t = r0_4; 12753 r0_4 = r0_8; 12754 r0_8 = t; 12755 }; 12756 if (r0_2 >= r0_4) { 12757 ulong const t = r0_2; 12758 r0_2 = r0_4; 12759 r0_4 = t; 12760 }; 12761 if (r0_6 >= r0_8) { 12762 ulong const t = r0_6; 12763 r0_6 = r0_8; 12764 r0_8 = t; 12765 }; 12766 if (r0_1 >= r0_2) { 12767 ulong const t = r0_1; 12768 r0_1 = r0_2; 12769 r0_2 = t; 12770 }; 12771 if (r0_3 >= r0_4) { 12772 ulong const t = r0_3; 12773 r0_3 = r0_4; 12774 r0_4 = t; 12775 }; 12776 if (r0_5 >= r0_6) { 12777 ulong const t = r0_5; 12778 r0_5 = r0_6; 12779 r0_6 = t; 12780 }; 12781 if (r0_7 >= r0_8) { 12782 ulong
const t = r0_7; 12783 r0_7 = r0_8; 12784 r0_8 = t; 12785 }; 12786 shared.m[smem_l_idx + (0)] = r0_1; 12787 shared.m[smem_l_idx + (8)] = r0_2; 12788 shared.m[smem_l_idx + (16)] = r0_3; 12789 shared.m[smem_l_idx + (24)] = r0_4; 12790 shared.m[smem_l_idx + (32)] = r0_5; 12791 shared.m[smem_l_idx + (40)] = r0_6; 12792 shared.m[smem_l_idx + (48)] = r0_7; 12793 shared.m[smem_l_idx + (56)] = r0_8; 12794 } 12795 { 12796 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 8)]; 12797 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 24)]; 12798 ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 40)]; 12799 ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 56)]; 12800 ulong r0_5 = vout[gmem_l_idx + ((1 << 3) * 72)]; 12801 ulong r0_6 = vout[gmem_l_idx + ((1 << 3) * 88)]; 12802 ulong r0_7 = vout[gmem_l_idx + ((1 << 3) * 104)]; 12803 ulong r0_8 = vout[gmem_l_idx + ((1 << 3) * 120)]; 12804 if (r0_1 >= r0_5) { 12805 ulong const t = r0_1; 12806 r0_1 = r0_5; 12807 r0_5 = t; 12808 }; 12809 if (r0_3 >= r0_7) { 12810 ulong const t = r0_3; 12811 r0_3 = r0_7; 12812 r0_7 = t; 12813 }; 12814 if (r0_1 >= r0_3) { 12815 ulong const t = r0_1; 12816 r0_1 = r0_3; 12817 r0_3 = t; 12818 }; 12819 if (r0_5 >= r0_7) { 12820 ulong const t = r0_5; 12821 r0_5 = r0_7; 12822 r0_7 = t; 12823 }; 12824 if (r0_2 >= r0_6) { 12825 ulong const t = r0_2; 12826 r0_2 = r0_6; 12827 r0_6 = t; 12828 }; 12829 if (r0_4 >= r0_8) { 12830 ulong const t = r0_4; 12831 r0_4 = r0_8; 12832 r0_8 = t; 12833 }; 12834 if (r0_2 >= r0_4) { 12835 ulong const t = r0_2; 12836 r0_2 = r0_4; 12837 r0_4 = t; 12838 }; 12839 if (r0_6 >= r0_8) { 12840 ulong const t = r0_6; 12841 r0_6 = r0_8; 12842 r0_8 = t; 12843 }; 12844 if (r0_1 >= r0_2) { 12845 ulong const t = r0_1; 12846 r0_1 = r0_2; 12847 r0_2 = t; 12848 }; 12849 if (r0_3 >= r0_4) { 12850 ulong const t = r0_3; 12851 r0_3 = r0_4; 12852 r0_4 = t; 12853 }; 12854 if (r0_5 >= r0_6) { 12855 ulong const t = r0_5; 12856 r0_5 = r0_6; 12857 r0_6 = t; 12858 }; 12859 if (r0_7 >= r0_8) { 12860 ulong const t = r0_7; 12861 r0_7 =
r0_8; 12862 r0_8 = t; 12863 }; 12864 shared.m[smem_l_idx + (512)] = r0_1; 12865 shared.m[smem_l_idx + (520)] = r0_2; 12866 shared.m[smem_l_idx + (528)] = r0_3; 12867 shared.m[smem_l_idx + (536)] = r0_4; 12868 shared.m[smem_l_idx + (544)] = r0_5; 12869 shared.m[smem_l_idx + (552)] = r0_6; 12870 shared.m[smem_l_idx + (560)] = r0_7; 12871 shared.m[smem_l_idx + (568)] = r0_8; 12872 } 12873 } /* All phase-1 stores to shared.m must complete before any work-item reads the buffer below. */ 12874 barrier(CLK_LOCAL_MEM_FENCE); /* Phase 2: re-read 16 values per work-item from shared.m, 64 slots apart, starting at the local id. */ 12875 ulong r1 = shared.m[get_local_id(0) + (8 * (1 << 3) * 0)]; 12876 ulong r2 = shared.m[get_local_id(0) + (8 * (1 << 3) * 1)]; 12877 ulong r3 = shared.m[get_local_id(0) + (8 * (1 << 3) * 2)]; 12878 ulong r4 = shared.m[get_local_id(0) + (8 * (1 << 3) * 3)]; 12879 ulong r5 = shared.m[get_local_id(0) + (8 * (1 << 3) * 4)]; 12880 ulong r6 = shared.m[get_local_id(0) + (8 * (1 << 3) * 5)]; 12881 ulong r7 = shared.m[get_local_id(0) + (8 * (1 << 3) * 6)]; 12882 ulong r8 = shared.m[get_local_id(0) + (8 * (1 << 3) * 7)]; 12883 ulong r9 = shared.m[get_local_id(0) + (8 * (1 << 3) * 8)]; 12884 ulong r10 = shared.m[get_local_id(0) + (8 * (1 << 3) * 9)]; 12885 ulong r11 = shared.m[get_local_id(0) + (8 * (1 << 3) * 10)]; 12886 ulong r12 = shared.m[get_local_id(0) + (8 * (1 << 3) * 11)]; 12887 ulong r13 = shared.m[get_local_id(0) + (8 * (1 << 3) * 12)]; 12888 ulong r14 = shared.m[get_local_id(0) + (8 * (1 << 3) * 13)]; 12889 ulong r15 = shared.m[get_local_id(0) + (8 * (1 << 3) * 14)]; 12890 ulong r16 = shared.m[get_local_id(0) + (8 * (1 << 3) * 15)]; /* Cross-lane stages: each register is exchanged with the lane whose id differs in bit 2, then bit 1, then bit 0. Within each lane pair the lower-numbered lane keeps the smaller value and the higher-numbered lane keeps the larger. */ 12891 { 12892 { 12893 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 12894 int const t_lt = get_sub_group_local_id() < half_lane_idx; 12895 ; 12896 { 12897 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 12898 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 12899 }; 12900 { 12901 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 12902 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 12903 }; 12904 { 12905 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 12906 r3 = ((r3 <= ta) ^ t_lt) ?
ta : r3; 12907 }; 12908 { 12909 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 12910 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 12911 }; 12912 { 12913 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 12914 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 12915 }; 12916 { 12917 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 12918 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 12919 }; 12920 { 12921 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 12922 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 12923 }; 12924 { 12925 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 12926 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 12927 }; 12928 { 12929 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 12930 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 12931 }; 12932 { 12933 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 12934 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 12935 }; 12936 { 12937 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 12938 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 12939 }; 12940 { 12941 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 12942 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 12943 }; 12944 { 12945 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 12946 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 12947 }; 12948 { 12949 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 12950 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 12951 }; 12952 { 12953 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 12954 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 12955 }; 12956 { 12957 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 12958 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 12959 }; 12960 } 12961 { 12962 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 12963 int const t_lt = get_sub_group_local_id() < half_lane_idx; 12964 ; 12965 { 12966 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 12967 r1 = ((r1 <= ta) ^ t_lt) ?
ta : r1; 12968 }; 12969 { 12970 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 12971 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 12972 }; 12973 { 12974 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 12975 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 12976 }; 12977 { 12978 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 12979 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 12980 }; 12981 { 12982 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 12983 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 12984 }; 12985 { 12986 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 12987 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 12988 }; 12989 { 12990 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 12991 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 12992 }; 12993 { 12994 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 12995 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 12996 }; 12997 { 12998 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 12999 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 13000 }; 13001 { 13002 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 13003 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 13004 }; 13005 { 13006 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 13007 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 13008 }; 13009 { 13010 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 13011 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 13012 }; 13013 { 13014 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 13015 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 13016 }; 13017 { 13018 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 13019 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 13020 }; 13021 { 13022 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 13023 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 13024 }; 13025 { 13026 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 13027 r16 = ((r16 <= ta) ^ t_lt) ?
ta : r16; 13028 }; 13029 } 13030 { 13031 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 13032 int const t_lt = get_sub_group_local_id() < half_lane_idx; 13033 ; 13034 { 13035 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 13036 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 13037 }; 13038 { 13039 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 13040 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 13041 }; 13042 { 13043 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 13044 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 13045 }; 13046 { 13047 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 13048 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 13049 }; 13050 { 13051 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 13052 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 13053 }; 13054 { 13055 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 13056 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 13057 }; 13058 { 13059 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 13060 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 13061 }; 13062 { 13063 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 13064 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 13065 }; 13066 { 13067 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 13068 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 13069 }; 13070 { 13071 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 13072 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 13073 }; 13074 { 13075 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 13076 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 13077 }; 13078 { 13079 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 13080 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 13081 }; 13082 { 13083 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 13084 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 13085 }; 13086 { 13087 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 13088 r14 = ((r14 <= ta) ^ t_lt) ?
ta : r14; 13089 }; 13090 { 13091 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 13092 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 13093 }; 13094 { 13095 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 13096 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 13097 }; 13098 } /* In-register stages: compare-exchange at register distance 8, 4 and 2 (odd-numbered registers first, then even-numbered), followed by adjacent pairs. Every exchange leaves the smaller value in the lower-numbered register. */ 13099 if (r1 >= r9) { 13100 ulong const t = r1; 13101 r1 = r9; 13102 r9 = t; 13103 }; 13104 if (r5 >= r13) { 13105 ulong const t = r5; 13106 r5 = r13; 13107 r13 = t; 13108 }; 13109 if (r1 >= r5) { 13110 ulong const t = r1; 13111 r1 = r5; 13112 r5 = t; 13113 }; 13114 if (r9 >= r13) { 13115 ulong const t = r9; 13116 r9 = r13; 13117 r13 = t; 13118 }; 13119 if (r3 >= r11) { 13120 ulong const t = r3; 13121 r3 = r11; 13122 r11 = t; 13123 }; 13124 if (r7 >= r15) { 13125 ulong const t = r7; 13126 r7 = r15; 13127 r15 = t; 13128 }; 13129 if (r3 >= r7) { 13130 ulong const t = r3; 13131 r3 = r7; 13132 r7 = t; 13133 }; 13134 if (r11 >= r15) { 13135 ulong const t = r11; 13136 r11 = r15; 13137 r15 = t; 13138 }; 13139 if (r1 >= r3) { 13140 ulong const t = r1; 13141 r1 = r3; 13142 r3 = t; 13143 }; 13144 if (r5 >= r7) { 13145 ulong const t = r5; 13146 r5 = r7; 13147 r7 = t; 13148 }; 13149 if (r9 >= r11) { 13150 ulong const t = r9; 13151 r9 = r11; 13152 r11 = t; 13153 }; 13154 if (r13 >= r15) { 13155 ulong const t = r13; 13156 r13 = r15; 13157 r15 = t; 13158 }; 13159 if (r2 >= r10) { 13160 ulong const t = r2; 13161 r2 = r10; 13162 r10 = t; 13163 }; 13164 if (r6 >= r14) { 13165 ulong const t = r6; 13166 r6 = r14; 13167 r14 = t; 13168 }; 13169 if (r2 >= r6) { 13170 ulong const t = r2; 13171 r2 = r6; 13172 r6 = t; 13173 }; 13174 if (r10 >= r14) { 13175 ulong const t = r10; 13176 r10 = r14; 13177 r14 = t; 13178 }; 13179 if (r4 >= r12) { 13180 ulong const t = r4; 13181 r4 = r12; 13182 r12 = t; 13183 }; 13184 if (r8 >= r16) { 13185 ulong const t = r8; 13186 r8 = r16; 13187 r16 = t; 13188 }; 13189 if (r4 >= r8) { 13190 ulong const t = r4; 13191 r4 = r8; 13192 r8 = t; 13193 }; 13194 if (r12 >= r16) { 13195
ulong const t = r12; 13196 r12 = r16; 13197 r16 = t; 13198 }; 13199 if (r2 >= r4) { 13200 ulong const t = r2; 13201 r2 = r4; 13202 r4 = t; 13203 }; 13204 if (r6 >= r8) { 13205 ulong const t = r6; 13206 r6 = r8; 13207 r8 = t; 13208 }; 13209 if (r10 >= r12) { 13210 ulong const t = r10; 13211 r10 = r12; 13212 r12 = t; 13213 }; 13214 if (r14 >= r16) { 13215 ulong const t = r14; 13216 r14 = r16; 13217 r16 = t; 13218 }; 13219 if (r1 >= r2) { 13220 ulong const t = r1; 13221 r1 = r2; 13222 r2 = t; 13223 }; 13224 if (r3 >= r4) { 13225 ulong const t = r3; 13226 r3 = r4; 13227 r4 = t; 13228 }; 13229 if (r5 >= r6) { 13230 ulong const t = r5; 13231 r5 = r6; 13232 r6 = t; 13233 }; 13234 if (r7 >= r8) { 13235 ulong const t = r7; 13236 r7 = r8; 13237 r8 = t; 13238 }; 13239 if (r9 >= r10) { 13240 ulong const t = r9; 13241 r9 = r10; 13242 r10 = t; 13243 }; 13244 if (r11 >= r12) { 13245 ulong const t = r11; 13246 r11 = r12; 13247 r12 = t; 13248 }; 13249 if (r13 >= r14) { 13250 ulong const t = r13; 13251 r13 = r14; 13252 r14 = t; 13253 }; 13254 if (r15 >= r16) { 13255 ulong const t = r15; 13256 r15 = r16; 13257 r16 = t; 13258 }; 13259 } /* Write the 16 registers back to vout, 8 elements apart, starting at gmem_idx. */ 13260 vout[gmem_idx + (1 << 3) * 0] = r1; 13261 vout[gmem_idx + (1 << 3) * 1] = r2; 13262 vout[gmem_idx + (1 << 3) * 2] = r3; 13263 vout[gmem_idx + (1 << 3) * 3] = r4; 13264 vout[gmem_idx + (1 << 3) * 4] = r5; 13265 vout[gmem_idx + (1 << 3) * 5] = r6; 13266 vout[gmem_idx + (1 << 3) * 6] = r7; 13267 vout[gmem_idx + (1 << 3) * 7] = r8; 13268 vout[gmem_idx + (1 << 3) * 8] = r9; 13269 vout[gmem_idx + (1 << 3) * 9] = r10; 13270 vout[gmem_idx + (1 << 3) * 10] = r11; 13271 vout[gmem_idx + (1 << 3) * 11] = r12; 13272 vout[gmem_idx + (1 << 3) * 12] = r13; 13273 vout[gmem_idx + (1 << 3) * 13] = r14; 13274 vout[gmem_idx + (1 << 3) * 14] = r15; 13275 vout[gmem_idx + (1 << 3) * 15] = r16; 13276} 13277 13278__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) 13279__attribute__((reqd_work_group_size((1 << 3) * 16, 1, 1))) void 13280hs_kernel_bc_4(__global
ulong* const restrict vout) 13281{ 13282 __local struct 13283 { 13284 ulong m[128 * 16]; 13285 } shared; 13286 13287 uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 + 13288 (get_local_id(0) & ((1 << 3) - 1)); 13289 uint const gmem_l_idx = 13290 (get_global_id(0) & ~((1 << 3) * 16 - 1)) * 16 + get_local_id(0); 13291 uint const smem_l_idx = 13292 get_sub_group_id() * ((1 << 3) * 16) + get_sub_group_local_id(); 13293 { 13294 { 13295 ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 0)]; 13296 ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 16)]; 13297 ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 32)]; 13298 ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 48)]; 13299 ulong r0_5 = vout[gmem_l_idx + ((1 << 3) * 64)]; 13300 ulong r0_6 = vout[gmem_l_idx + ((1 << 3) * 80)]; 13301 ulong r0_7 = vout[gmem_l_idx + ((1 << 3) * 96)]; 13302 ulong r0_8 = vout[gmem_l_idx + ((1 << 3) * 112)]; 13303 ulong r0_9 = vout[gmem_l_idx + ((1 << 3) * 128)]; 13304 ulong r0_10 = vout[gmem_l_idx + ((1 << 3) * 144)]; 13305 ulong r0_11 = vout[gmem_l_idx + ((1 << 3) * 160)]; 13306 ulong r0_12 = vout[gmem_l_idx + ((1 << 3) * 176)]; 13307 ulong r0_13 = vout[gmem_l_idx + ((1 << 3) * 192)]; 13308 ulong r0_14 = vout[gmem_l_idx + ((1 << 3) * 208)]; 13309 ulong r0_15 = vout[gmem_l_idx + ((1 << 3) * 224)]; 13310 ulong r0_16 = vout[gmem_l_idx + ((1 << 3) * 240)]; 13311 if (r0_1 >= r0_9) { 13312 ulong const t = r0_1; 13313 r0_1 = r0_9; 13314 r0_9 = t; 13315 }; 13316 if (r0_5 >= r0_13) { 13317 ulong const t = r0_5; 13318 r0_5 = r0_13; 13319 r0_13 = t; 13320 }; 13321 if (r0_1 >= r0_5) { 13322 ulong const t = r0_1; 13323 r0_1 = r0_5; 13324 r0_5 = t; 13325 }; 13326 if (r0_9 >= r0_13) { 13327 ulong const t = r0_9; 13328 r0_9 = r0_13; 13329 r0_13 = t; 13330 }; 13331 if (r0_3 >= r0_11) { 13332 ulong const t = r0_3; 13333 r0_3 = r0_11; 13334 r0_11 = t; 13335 }; 13336 if (r0_7 >= r0_15) { 13337 ulong const t = r0_7; 13338 r0_7 = r0_15; 13339 r0_15 = t; 13340 }; 13341 if (r0_3 >= r0_7) { 13342 ulong const t = r0_3; 13343 
r0_3 = r0_7; 13344 r0_7 = t; 13345 }; 13346 if (r0_11 >= r0_15) { 13347 ulong const t = r0_11; 13348 r0_11 = r0_15; 13349 r0_15 = t; 13350 }; 13351 if (r0_1 >= r0_3) { 13352 ulong const t = r0_1; 13353 r0_1 = r0_3; 13354 r0_3 = t; 13355 }; 13356 if (r0_5 >= r0_7) { 13357 ulong const t = r0_5; 13358 r0_5 = r0_7; 13359 r0_7 = t; 13360 }; 13361 if (r0_9 >= r0_11) { 13362 ulong const t = r0_9; 13363 r0_9 = r0_11; 13364 r0_11 = t; 13365 }; 13366 if (r0_13 >= r0_15) { 13367 ulong const t = r0_13; 13368 r0_13 = r0_15; 13369 r0_15 = t; 13370 }; 13371 if (r0_2 >= r0_10) { 13372 ulong const t = r0_2; 13373 r0_2 = r0_10; 13374 r0_10 = t; 13375 }; 13376 if (r0_6 >= r0_14) { 13377 ulong const t = r0_6; 13378 r0_6 = r0_14; 13379 r0_14 = t; 13380 }; 13381 if (r0_2 >= r0_6) { 13382 ulong const t = r0_2; 13383 r0_2 = r0_6; 13384 r0_6 = t; 13385 }; 13386 if (r0_10 >= r0_14) { 13387 ulong const t = r0_10; 13388 r0_10 = r0_14; 13389 r0_14 = t; 13390 }; 13391 if (r0_4 >= r0_12) { 13392 ulong const t = r0_4; 13393 r0_4 = r0_12; 13394 r0_12 = t; 13395 }; 13396 if (r0_8 >= r0_16) { 13397 ulong const t = r0_8; 13398 r0_8 = r0_16; 13399 r0_16 = t; 13400 }; 13401 if (r0_4 >= r0_8) { 13402 ulong const t = r0_4; 13403 r0_4 = r0_8; 13404 r0_8 = t; 13405 }; 13406 if (r0_12 >= r0_16) { 13407 ulong const t = r0_12; 13408 r0_12 = r0_16; 13409 r0_16 = t; 13410 }; 13411 if (r0_2 >= r0_4) { 13412 ulong const t = r0_2; 13413 r0_2 = r0_4; 13414 r0_4 = t; 13415 }; 13416 if (r0_6 >= r0_8) { 13417 ulong const t = r0_6; 13418 r0_6 = r0_8; 13419 r0_8 = t; 13420 }; 13421 if (r0_10 >= r0_12) { 13422 ulong const t = r0_10; 13423 r0_10 = r0_12; 13424 r0_12 = t; 13425 }; 13426 if (r0_14 >= r0_16) { 13427 ulong const t = r0_14; 13428 r0_14 = r0_16; 13429 r0_16 = t; 13430 }; 13431 if (r0_1 >= r0_2) { 13432 ulong const t = r0_1; 13433 r0_1 = r0_2; 13434 r0_2 = t; 13435 }; 13436 if (r0_3 >= r0_4) { 13437 ulong const t = r0_3; 13438 r0_3 = r0_4; 13439 r0_4 = t; 13440 }; 13441 if (r0_5 >= r0_6) { 13442 ulong const t = 
r0_5; 13443 r0_5 = r0_6; 13444 r0_6 = t; 13445 }; 13446 if (r0_7 >= r0_8) { 13447 ulong const t = r0_7; 13448 r0_7 = r0_8; 13449 r0_8 = t; 13450 }; 13451 if (r0_9 >= r0_10) { 13452 ulong const t = r0_9; 13453 r0_9 = r0_10; 13454 r0_10 = t; 13455 }; 13456 if (r0_11 >= r0_12) { 13457 ulong const t = r0_11; 13458 r0_11 = r0_12; 13459 r0_12 = t; 13460 }; 13461 if (r0_13 >= r0_14) { 13462 ulong const t = r0_13; 13463 r0_13 = r0_14; 13464 r0_14 = t; 13465 }; 13466 if (r0_15 >= r0_16) { 13467 ulong const t = r0_15; 13468 r0_15 = r0_16; 13469 r0_16 = t; 13470 }; 13471 shared.m[smem_l_idx + (0)] = r0_1; 13472 shared.m[smem_l_idx + (8)] = r0_2; 13473 shared.m[smem_l_idx + (16)] = r0_3; 13474 shared.m[smem_l_idx + (24)] = r0_4; 13475 shared.m[smem_l_idx + (32)] = r0_5; 13476 shared.m[smem_l_idx + (40)] = r0_6; 13477 shared.m[smem_l_idx + (48)] = r0_7; 13478 shared.m[smem_l_idx + (56)] = r0_8; 13479 shared.m[smem_l_idx + (64)] = r0_9; 13480 shared.m[smem_l_idx + (72)] = r0_10; 13481 shared.m[smem_l_idx + (80)] = r0_11; 13482 shared.m[smem_l_idx + (88)] = r0_12; 13483 shared.m[smem_l_idx + (96)] = r0_13; 13484 shared.m[smem_l_idx + (104)] = r0_14; 13485 shared.m[smem_l_idx + (112)] = r0_15; 13486 shared.m[smem_l_idx + (120)] = r0_16; 13487 } 13488 } 13489 barrier(CLK_LOCAL_MEM_FENCE); 13490 ulong r1 = shared.m[get_local_id(0) + (16 * (1 << 3) * 0)]; 13491 ulong r2 = shared.m[get_local_id(0) + (16 * (1 << 3) * 1)]; 13492 ulong r3 = shared.m[get_local_id(0) + (16 * (1 << 3) * 2)]; 13493 ulong r4 = shared.m[get_local_id(0) + (16 * (1 << 3) * 3)]; 13494 ulong r5 = shared.m[get_local_id(0) + (16 * (1 << 3) * 4)]; 13495 ulong r6 = shared.m[get_local_id(0) + (16 * (1 << 3) * 5)]; 13496 ulong r7 = shared.m[get_local_id(0) + (16 * (1 << 3) * 6)]; 13497 ulong r8 = shared.m[get_local_id(0) + (16 * (1 << 3) * 7)]; 13498 ulong r9 = shared.m[get_local_id(0) + (16 * (1 << 3) * 8)]; 13499 ulong r10 = shared.m[get_local_id(0) + (16 * (1 << 3) * 9)]; 13500 ulong r11 = shared.m[get_local_id(0) + 
(16 * (1 << 3) * 10)]; 13501 ulong r12 = shared.m[get_local_id(0) + (16 * (1 << 3) * 11)]; 13502 ulong r13 = shared.m[get_local_id(0) + (16 * (1 << 3) * 12)]; 13503 ulong r14 = shared.m[get_local_id(0) + (16 * (1 << 3) * 13)]; 13504 ulong r15 = shared.m[get_local_id(0) + (16 * (1 << 3) * 14)]; 13505 ulong r16 = shared.m[get_local_id(0) + (16 * (1 << 3) * 15)]; 13506 { 13507 { 13508 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 13509 int const t_lt = get_sub_group_local_id() < half_lane_idx; 13510 ; 13511 { 13512 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 13513 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 13514 }; 13515 { 13516 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 13517 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 13518 }; 13519 { 13520 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 13521 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 13522 }; 13523 { 13524 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 13525 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 13526 }; 13527 { 13528 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 13529 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 13530 }; 13531 { 13532 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 13533 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 13534 }; 13535 { 13536 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 13537 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 13538 }; 13539 { 13540 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 13541 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 13542 }; 13543 { 13544 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 13545 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 13546 }; 13547 { 13548 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 13549 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 13550 }; 13551 { 13552 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 13553 r11 = ((r11 <= ta) ^ t_lt) ? 
ta : r11; 13554 }; 13555 { 13556 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 13557 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 13558 }; 13559 { 13560 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 13561 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 13562 }; 13563 { 13564 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 13565 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 13566 }; 13567 { 13568 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 13569 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 13570 }; 13571 { 13572 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 13573 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 13574 }; 13575 } 13576 { 13577 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 13578 int const t_lt = get_sub_group_local_id() < half_lane_idx; 13579 ; 13580 { 13581 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 13582 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 13583 }; 13584 { 13585 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 13586 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 13587 }; 13588 { 13589 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 13590 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 13591 }; 13592 { 13593 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 13594 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 13595 }; 13596 { 13597 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 13598 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 13599 }; 13600 { 13601 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 13602 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 13603 }; 13604 { 13605 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 13606 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 13607 }; 13608 { 13609 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 13610 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 13611 }; 13612 { 13613 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 13614 r9 = ((r9 <= ta) ^ t_lt) ? 
ta : r9; 13615 }; 13616 { 13617 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 13618 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 13619 }; 13620 { 13621 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 13622 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 13623 }; 13624 { 13625 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 13626 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 13627 }; 13628 { 13629 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 13630 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 13631 }; 13632 { 13633 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 13634 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 13635 }; 13636 { 13637 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 13638 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 13639 }; 13640 { 13641 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 13642 r16 = ((r16 <= ta) ^ t_lt) ? ta : r16; 13643 }; 13644 } 13645 { 13646 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 13647 int const t_lt = get_sub_group_local_id() < half_lane_idx; 13648 ; 13649 { 13650 ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx); 13651 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 13652 }; 13653 { 13654 ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx); 13655 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 13656 }; 13657 { 13658 ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx); 13659 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 13660 }; 13661 { 13662 ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx); 13663 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 13664 }; 13665 { 13666 ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx); 13667 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 13668 }; 13669 { 13670 ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx); 13671 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 13672 }; 13673 { 13674 ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx); 13675 r7 = ((r7 <= ta) ^ t_lt) ? 
ta : r7; 13676 }; 13677 { 13678 ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx); 13679 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 13680 }; 13681 { 13682 ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx); 13683 r9 = ((r9 <= ta) ^ t_lt) ? ta : r9; 13684 }; 13685 { 13686 ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx); 13687 r10 = ((r10 <= ta) ^ t_lt) ? ta : r10; 13688 }; 13689 { 13690 ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx); 13691 r11 = ((r11 <= ta) ^ t_lt) ? ta : r11; 13692 }; 13693 { 13694 ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx); 13695 r12 = ((r12 <= ta) ^ t_lt) ? ta : r12; 13696 }; 13697 { 13698 ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx); 13699 r13 = ((r13 <= ta) ^ t_lt) ? ta : r13; 13700 }; 13701 { 13702 ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx); 13703 r14 = ((r14 <= ta) ^ t_lt) ? ta : r14; 13704 }; 13705 { 13706 ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx); 13707 r15 = ((r15 <= ta) ^ t_lt) ? ta : r15; 13708 }; 13709 { 13710 ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx); 13711 r16 = ((r16 <= ta) ^ t_lt) ? 
ta : r16; 13712 }; 13713 } 13714 if (r1 >= r9) { 13715 ulong const t = r1; 13716 r1 = r9; 13717 r9 = t; 13718 }; 13719 if (r5 >= r13) { 13720 ulong const t = r5; 13721 r5 = r13; 13722 r13 = t; 13723 }; 13724 if (r1 >= r5) { 13725 ulong const t = r1; 13726 r1 = r5; 13727 r5 = t; 13728 }; 13729 if (r9 >= r13) { 13730 ulong const t = r9; 13731 r9 = r13; 13732 r13 = t; 13733 }; 13734 if (r3 >= r11) { 13735 ulong const t = r3; 13736 r3 = r11; 13737 r11 = t; 13738 }; 13739 if (r7 >= r15) { 13740 ulong const t = r7; 13741 r7 = r15; 13742 r15 = t; 13743 }; 13744 if (r3 >= r7) { 13745 ulong const t = r3; 13746 r3 = r7; 13747 r7 = t; 13748 }; 13749 if (r11 >= r15) { 13750 ulong const t = r11; 13751 r11 = r15; 13752 r15 = t; 13753 }; 13754 if (r1 >= r3) { 13755 ulong const t = r1; 13756 r1 = r3; 13757 r3 = t; 13758 }; 13759 if (r5 >= r7) { 13760 ulong const t = r5; 13761 r5 = r7; 13762 r7 = t; 13763 }; 13764 if (r9 >= r11) { 13765 ulong const t = r9; 13766 r9 = r11; 13767 r11 = t; 13768 }; 13769 if (r13 >= r15) { 13770 ulong const t = r13; 13771 r13 = r15; 13772 r15 = t; 13773 }; 13774 if (r2 >= r10) { 13775 ulong const t = r2; 13776 r2 = r10; 13777 r10 = t; 13778 }; 13779 if (r6 >= r14) { 13780 ulong const t = r6; 13781 r6 = r14; 13782 r14 = t; 13783 }; 13784 if (r2 >= r6) { 13785 ulong const t = r2; 13786 r2 = r6; 13787 r6 = t; 13788 }; 13789 if (r10 >= r14) { 13790 ulong const t = r10; 13791 r10 = r14; 13792 r14 = t; 13793 }; 13794 if (r4 >= r12) { 13795 ulong const t = r4; 13796 r4 = r12; 13797 r12 = t; 13798 }; 13799 if (r8 >= r16) { 13800 ulong const t = r8; 13801 r8 = r16; 13802 r16 = t; 13803 }; 13804 if (r4 >= r8) { 13805 ulong const t = r4; 13806 r4 = r8; 13807 r8 = t; 13808 }; 13809 if (r12 >= r16) { 13810 ulong const t = r12; 13811 r12 = r16; 13812 r16 = t; 13813 }; 13814 if (r2 >= r4) { 13815 ulong const t = r2; 13816 r2 = r4; 13817 r4 = t; 13818 }; 13819 if (r6 >= r8) { 13820 ulong const t = r6; 13821 r6 = r8; 13822 r8 = t; 13823 }; 13824 if (r10 >= r12) { 13825 
ulong const t = r10; 13826 r10 = r12; 13827 r12 = t; 13828 }; 13829 if (r14 >= r16) { 13830 ulong const t = r14; 13831 r14 = r16; 13832 r16 = t; 13833 }; 13834 if (r1 >= r2) { 13835 ulong const t = r1; 13836 r1 = r2; 13837 r2 = t; 13838 }; 13839 if (r3 >= r4) { 13840 ulong const t = r3; 13841 r3 = r4; 13842 r4 = t; 13843 }; 13844 if (r5 >= r6) { 13845 ulong const t = r5; 13846 r5 = r6; 13847 r6 = t; 13848 }; 13849 if (r7 >= r8) { 13850 ulong const t = r7; 13851 r7 = r8; 13852 r8 = t; 13853 }; 13854 if (r9 >= r10) { 13855 ulong const t = r9; 13856 r9 = r10; 13857 r10 = t; 13858 }; 13859 if (r11 >= r12) { 13860 ulong const t = r11; 13861 r11 = r12; 13862 r12 = t; 13863 }; 13864 if (r13 >= r14) { 13865 ulong const t = r13; 13866 r13 = r14; 13867 r14 = t; 13868 }; 13869 if (r15 >= r16) { 13870 ulong const t = r15; 13871 r15 = r16; 13872 r16 = t; 13873 }; 13874 } 13875 vout[gmem_idx + (1 << 3) * 0] = r1; 13876 vout[gmem_idx + (1 << 3) * 1] = r2; 13877 vout[gmem_idx + (1 << 3) * 2] = r3; 13878 vout[gmem_idx + (1 << 3) * 3] = r4; 13879 vout[gmem_idx + (1 << 3) * 4] = r5; 13880 vout[gmem_idx + (1 << 3) * 5] = r6; 13881 vout[gmem_idx + (1 << 3) * 6] = r7; 13882 vout[gmem_idx + (1 << 3) * 7] = r8; 13883 vout[gmem_idx + (1 << 3) * 8] = r9; 13884 vout[gmem_idx + (1 << 3) * 9] = r10; 13885 vout[gmem_idx + (1 << 3) * 10] = r11; 13886 vout[gmem_idx + (1 << 3) * 11] = r12; 13887 vout[gmem_idx + (1 << 3) * 12] = r13; 13888 vout[gmem_idx + (1 << 3) * 13] = r14; 13889 vout[gmem_idx + (1 << 3) * 14] = r15; 13890 vout[gmem_idx + (1 << 3) * 15] = r16; 13891} 13892 13893__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void 13894hs_kernel_fm_1_0(__global ulong* const restrict vout) 13895{ 13896 uint const span_idx = get_global_id(1); 13897 uint const span_stride = get_global_size(0); 13898 uint const span_size = span_stride * 16 * 2; 13899 uint const span_base = span_idx * span_size; 13900 uint const span_off = get_global_id(0); 13901 uint const span_l = span_base + span_off; 
13902 uint const span_r = span_base + span_stride * (16 + 1) - span_off - 1; 13903 ulong r1 = vout[span_l + span_stride * 0]; 13904 ulong r2 = vout[span_l + span_stride * 1]; 13905 ulong r3 = vout[span_l + span_stride * 2]; 13906 ulong r4 = vout[span_l + span_stride * 3]; 13907 ulong r5 = vout[span_l + span_stride * 4]; 13908 ulong r6 = vout[span_l + span_stride * 5]; 13909 ulong r7 = vout[span_l + span_stride * 6]; 13910 ulong r8 = vout[span_l + span_stride * 7]; 13911 ulong r9 = vout[span_l + span_stride * 8]; 13912 ulong r10 = vout[span_l + span_stride * 9]; 13913 ulong r11 = vout[span_l + span_stride * 10]; 13914 ulong r12 = vout[span_l + span_stride * 11]; 13915 ulong r13 = vout[span_l + span_stride * 12]; 13916 ulong r14 = vout[span_l + span_stride * 13]; 13917 ulong r15 = vout[span_l + span_stride * 14]; 13918 ulong r16 = vout[span_l + span_stride * 15]; 13919 ulong r17 = vout[span_r + span_stride * 0]; 13920 if (r16 >= r17) { 13921 ulong const t = r16; 13922 r16 = r17; 13923 r17 = t; 13924 }; 13925 if (r1 >= r9) { 13926 ulong const t = r1; 13927 r1 = r9; 13928 r9 = t; 13929 }; 13930 if (r5 >= r13) { 13931 ulong const t = r5; 13932 r5 = r13; 13933 r13 = t; 13934 }; 13935 if (r1 >= r5) { 13936 ulong const t = r1; 13937 r1 = r5; 13938 r5 = t; 13939 }; 13940 if (r9 >= r13) { 13941 ulong const t = r9; 13942 r9 = r13; 13943 r13 = t; 13944 }; 13945 if (r3 >= r11) { 13946 ulong const t = r3; 13947 r3 = r11; 13948 r11 = t; 13949 }; 13950 if (r7 >= r15) { 13951 ulong const t = r7; 13952 r7 = r15; 13953 r15 = t; 13954 }; 13955 if (r3 >= r7) { 13956 ulong const t = r3; 13957 r3 = r7; 13958 r7 = t; 13959 }; 13960 if (r11 >= r15) { 13961 ulong const t = r11; 13962 r11 = r15; 13963 r15 = t; 13964 }; 13965 if (r1 >= r3) { 13966 ulong const t = r1; 13967 r1 = r3; 13968 r3 = t; 13969 }; 13970 if (r5 >= r7) { 13971 ulong const t = r5; 13972 r5 = r7; 13973 r7 = t; 13974 }; 13975 if (r9 >= r11) { 13976 ulong const t = r9; 13977 r9 = r11; 13978 r11 = t; 13979 }; 13980 if (r13 >= 
r15) { 13981 ulong const t = r13; 13982 r13 = r15; 13983 r15 = t; 13984 }; 13985 if (r2 >= r10) { 13986 ulong const t = r2; 13987 r2 = r10; 13988 r10 = t; 13989 }; 13990 if (r6 >= r14) { 13991 ulong const t = r6; 13992 r6 = r14; 13993 r14 = t; 13994 }; 13995 if (r2 >= r6) { 13996 ulong const t = r2; 13997 r2 = r6; 13998 r6 = t; 13999 }; 14000 if (r10 >= r14) { 14001 ulong const t = r10; 14002 r10 = r14; 14003 r14 = t; 14004 }; 14005 if (r4 >= r12) { 14006 ulong const t = r4; 14007 r4 = r12; 14008 r12 = t; 14009 }; 14010 if (r8 >= r16) { 14011 ulong const t = r8; 14012 r8 = r16; 14013 r16 = t; 14014 }; 14015 if (r4 >= r8) { 14016 ulong const t = r4; 14017 r4 = r8; 14018 r8 = t; 14019 }; 14020 if (r12 >= r16) { 14021 ulong const t = r12; 14022 r12 = r16; 14023 r16 = t; 14024 }; 14025 if (r2 >= r4) { 14026 ulong const t = r2; 14027 r2 = r4; 14028 r4 = t; 14029 }; 14030 if (r6 >= r8) { 14031 ulong const t = r6; 14032 r6 = r8; 14033 r8 = t; 14034 }; 14035 if (r10 >= r12) { 14036 ulong const t = r10; 14037 r10 = r12; 14038 r12 = t; 14039 }; 14040 if (r14 >= r16) { 14041 ulong const t = r14; 14042 r14 = r16; 14043 r16 = t; 14044 }; 14045 if (r1 >= r2) { 14046 ulong const t = r1; 14047 r1 = r2; 14048 r2 = t; 14049 }; 14050 if (r3 >= r4) { 14051 ulong const t = r3; 14052 r3 = r4; 14053 r4 = t; 14054 }; 14055 if (r5 >= r6) { 14056 ulong const t = r5; 14057 r5 = r6; 14058 r6 = t; 14059 }; 14060 if (r7 >= r8) { 14061 ulong const t = r7; 14062 r7 = r8; 14063 r8 = t; 14064 }; 14065 if (r9 >= r10) { 14066 ulong const t = r9; 14067 r9 = r10; 14068 r10 = t; 14069 }; 14070 if (r11 >= r12) { 14071 ulong const t = r11; 14072 r11 = r12; 14073 r12 = t; 14074 }; 14075 if (r13 >= r14) { 14076 ulong const t = r13; 14077 r13 = r14; 14078 r14 = t; 14079 }; 14080 if (r15 >= r16) { 14081 ulong const t = r15; 14082 r15 = r16; 14083 r16 = t; 14084 }; 14085 vout[span_l + span_stride * 0] = r1; 14086 vout[span_l + span_stride * 1] = r2; 14087 vout[span_l + span_stride * 2] = r3; 14088 vout[span_l 
+ span_stride * 3] = r4; 14089 vout[span_l + span_stride * 4] = r5; 14090 vout[span_l + span_stride * 5] = r6; 14091 vout[span_l + span_stride * 6] = r7; 14092 vout[span_l + span_stride * 7] = r8; 14093 vout[span_l + span_stride * 8] = r9; 14094 vout[span_l + span_stride * 9] = r10; 14095 vout[span_l + span_stride * 10] = r11; 14096 vout[span_l + span_stride * 11] = r12; 14097 vout[span_l + span_stride * 12] = r13; 14098 vout[span_l + span_stride * 13] = r14; 14099 vout[span_l + span_stride * 14] = r15; 14100 vout[span_l + span_stride * 15] = r16; 14101 vout[span_r + span_stride * 0] = r17; 14102} 14103 14104__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void 14105hs_kernel_fm_1_1(__global ulong* const restrict vout) 14106{ 14107 uint const span_idx = get_global_id(1); 14108 uint const span_stride = get_global_size(0); 14109 uint const span_size = span_stride * 16 * 2; 14110 uint const span_base = span_idx * span_size; 14111 uint const span_off = get_global_id(0); 14112 uint const span_l = span_base + span_off; 14113 uint const span_r = span_base + span_stride * (16 + 1) - span_off - 1; 14114 ulong r1 = vout[span_l + span_stride * 0]; 14115 ulong r2 = vout[span_l + span_stride * 1]; 14116 ulong r3 = vout[span_l + span_stride * 2]; 14117 ulong r4 = vout[span_l + span_stride * 3]; 14118 ulong r5 = vout[span_l + span_stride * 4]; 14119 ulong r6 = vout[span_l + span_stride * 5]; 14120 ulong r7 = vout[span_l + span_stride * 6]; 14121 ulong r8 = vout[span_l + span_stride * 7]; 14122 ulong r9 = vout[span_l + span_stride * 8]; 14123 ulong r10 = vout[span_l + span_stride * 9]; 14124 ulong r11 = vout[span_l + span_stride * 10]; 14125 ulong r12 = vout[span_l + span_stride * 11]; 14126 ulong r13 = vout[span_l + span_stride * 12]; 14127 ulong r14 = vout[span_l + span_stride * 13]; 14128 ulong r15 = vout[span_l + span_stride * 14]; 14129 ulong r16 = vout[span_l + span_stride * 15]; 14130 ulong r17 = vout[span_r + span_stride * 0]; 14131 ulong r18 = vout[span_r + 
span_stride * 1]; 14132 if (r16 >= r17) { 14133 ulong const t = r16; 14134 r16 = r17; 14135 r17 = t; 14136 }; 14137 if (r15 >= r18) { 14138 ulong const t = r15; 14139 r15 = r18; 14140 r18 = t; 14141 }; 14142 if (r1 >= r9) { 14143 ulong const t = r1; 14144 r1 = r9; 14145 r9 = t; 14146 }; 14147 if (r5 >= r13) { 14148 ulong const t = r5; 14149 r5 = r13; 14150 r13 = t; 14151 }; 14152 if (r1 >= r5) { 14153 ulong const t = r1; 14154 r1 = r5; 14155 r5 = t; 14156 }; 14157 if (r9 >= r13) { 14158 ulong const t = r9; 14159 r9 = r13; 14160 r13 = t; 14161 }; 14162 if (r3 >= r11) { 14163 ulong const t = r3; 14164 r3 = r11; 14165 r11 = t; 14166 }; 14167 if (r7 >= r15) { 14168 ulong const t = r7; 14169 r7 = r15; 14170 r15 = t; 14171 }; 14172 if (r3 >= r7) { 14173 ulong const t = r3; 14174 r3 = r7; 14175 r7 = t; 14176 }; 14177 if (r11 >= r15) { 14178 ulong const t = r11; 14179 r11 = r15; 14180 r15 = t; 14181 }; 14182 if (r1 >= r3) { 14183 ulong const t = r1; 14184 r1 = r3; 14185 r3 = t; 14186 }; 14187 if (r5 >= r7) { 14188 ulong const t = r5; 14189 r5 = r7; 14190 r7 = t; 14191 }; 14192 if (r9 >= r11) { 14193 ulong const t = r9; 14194 r9 = r11; 14195 r11 = t; 14196 }; 14197 if (r13 >= r15) { 14198 ulong const t = r13; 14199 r13 = r15; 14200 r15 = t; 14201 }; 14202 if (r2 >= r10) { 14203 ulong const t = r2; 14204 r2 = r10; 14205 r10 = t; 14206 }; 14207 if (r6 >= r14) { 14208 ulong const t = r6; 14209 r6 = r14; 14210 r14 = t; 14211 }; 14212 if (r2 >= r6) { 14213 ulong const t = r2; 14214 r2 = r6; 14215 r6 = t; 14216 }; 14217 if (r10 >= r14) { 14218 ulong const t = r10; 14219 r10 = r14; 14220 r14 = t; 14221 }; 14222 if (r4 >= r12) { 14223 ulong const t = r4; 14224 r4 = r12; 14225 r12 = t; 14226 }; 14227 if (r8 >= r16) { 14228 ulong const t = r8; 14229 r8 = r16; 14230 r16 = t; 14231 }; 14232 if (r4 >= r8) { 14233 ulong const t = r4; 14234 r4 = r8; 14235 r8 = t; 14236 }; 14237 if (r12 >= r16) { 14238 ulong const t = r12; 14239 r12 = r16; 14240 r16 = t; 14241 }; 14242 if (r2 >= r4) { 
14243 ulong const t = r2; 14244 r2 = r4; 14245 r4 = t; 14246 }; 14247 if (r6 >= r8) { 14248 ulong const t = r6; 14249 r6 = r8; 14250 r8 = t; 14251 }; 14252 if (r10 >= r12) { 14253 ulong const t = r10; 14254 r10 = r12; 14255 r12 = t; 14256 }; 14257 if (r14 >= r16) { 14258 ulong const t = r14; 14259 r14 = r16; 14260 r16 = t; 14261 }; 14262 if (r1 >= r2) { 14263 ulong const t = r1; 14264 r1 = r2; 14265 r2 = t; 14266 }; 14267 if (r3 >= r4) { 14268 ulong const t = r3; 14269 r3 = r4; 14270 r4 = t; 14271 }; 14272 if (r5 >= r6) { 14273 ulong const t = r5; 14274 r5 = r6; 14275 r6 = t; 14276 }; 14277 if (r7 >= r8) { 14278 ulong const t = r7; 14279 r7 = r8; 14280 r8 = t; 14281 }; 14282 if (r9 >= r10) { 14283 ulong const t = r9; 14284 r9 = r10; 14285 r10 = t; 14286 }; 14287 if (r11 >= r12) { 14288 ulong const t = r11; 14289 r11 = r12; 14290 r12 = t; 14291 }; 14292 if (r13 >= r14) { 14293 ulong const t = r13; 14294 r13 = r14; 14295 r14 = t; 14296 }; 14297 if (r15 >= r16) { 14298 ulong const t = r15; 14299 r15 = r16; 14300 r16 = t; 14301 }; 14302 if (r17 >= r18) { 14303 ulong const t = r17; 14304 r17 = r18; 14305 r18 = t; 14306 }; 14307 vout[span_l + span_stride * 0] = r1; 14308 vout[span_l + span_stride * 1] = r2; 14309 vout[span_l + span_stride * 2] = r3; 14310 vout[span_l + span_stride * 3] = r4; 14311 vout[span_l + span_stride * 4] = r5; 14312 vout[span_l + span_stride * 5] = r6; 14313 vout[span_l + span_stride * 6] = r7; 14314 vout[span_l + span_stride * 7] = r8; 14315 vout[span_l + span_stride * 8] = r9; 14316 vout[span_l + span_stride * 9] = r10; 14317 vout[span_l + span_stride * 10] = r11; 14318 vout[span_l + span_stride * 11] = r12; 14319 vout[span_l + span_stride * 12] = r13; 14320 vout[span_l + span_stride * 13] = r14; 14321 vout[span_l + span_stride * 14] = r15; 14322 vout[span_l + span_stride * 15] = r16; 14323 vout[span_r + span_stride * 0] = r17; 14324 vout[span_r + span_stride * 1] = r18; 14325} 14326 14327__kernel __attribute__((intel_reqd_sub_group_size((1 << 
3)))) void 14328hs_kernel_fm_1_2(__global ulong* const restrict vout) 14329{ 14330 uint const span_idx = get_global_id(1); 14331 uint const span_stride = get_global_size(0); 14332 uint const span_size = span_stride * 16 * 2; 14333 uint const span_base = span_idx * span_size; 14334 uint const span_off = get_global_id(0); 14335 uint const span_l = span_base + span_off; 14336 uint const span_r = span_base + span_stride * (16 + 1) - span_off - 1; 14337 ulong r1 = vout[span_l + span_stride * 0]; 14338 ulong r2 = vout[span_l + span_stride * 1]; 14339 ulong r3 = vout[span_l + span_stride * 2]; 14340 ulong r4 = vout[span_l + span_stride * 3]; 14341 ulong r5 = vout[span_l + span_stride * 4]; 14342 ulong r6 = vout[span_l + span_stride * 5]; 14343 ulong r7 = vout[span_l + span_stride * 6]; 14344 ulong r8 = vout[span_l + span_stride * 7]; 14345 ulong r9 = vout[span_l + span_stride * 8]; 14346 ulong r10 = vout[span_l + span_stride * 9]; 14347 ulong r11 = vout[span_l + span_stride * 10]; 14348 ulong r12 = vout[span_l + span_stride * 11]; 14349 ulong r13 = vout[span_l + span_stride * 12]; 14350 ulong r14 = vout[span_l + span_stride * 13]; 14351 ulong r15 = vout[span_l + span_stride * 14]; 14352 ulong r16 = vout[span_l + span_stride * 15]; 14353 ulong r17 = vout[span_r + span_stride * 0]; 14354 ulong r18 = vout[span_r + span_stride * 1]; 14355 ulong r19 = vout[span_r + span_stride * 2]; 14356 ulong r20 = vout[span_r + span_stride * 3]; 14357 if (r16 >= r17) { 14358 ulong const t = r16; 14359 r16 = r17; 14360 r17 = t; 14361 }; 14362 if (r15 >= r18) { 14363 ulong const t = r15; 14364 r15 = r18; 14365 r18 = t; 14366 }; 14367 if (r14 >= r19) { 14368 ulong const t = r14; 14369 r14 = r19; 14370 r19 = t; 14371 }; 14372 if (r13 >= r20) { 14373 ulong const t = r13; 14374 r13 = r20; 14375 r20 = t; 14376 }; 14377 if (r1 >= r9) { 14378 ulong const t = r1; 14379 r1 = r9; 14380 r9 = t; 14381 }; 14382 if (r5 >= r13) { 14383 ulong const t = r5; 14384 r5 = r13; 14385 r13 = t; 14386 }; 14387 if (r1 
>= r5) { 14388 ulong const t = r1; 14389 r1 = r5; 14390 r5 = t; 14391 }; 14392 if (r9 >= r13) { 14393 ulong const t = r9; 14394 r9 = r13; 14395 r13 = t; 14396 }; 14397 if (r3 >= r11) { 14398 ulong const t = r3; 14399 r3 = r11; 14400 r11 = t; 14401 }; 14402 if (r7 >= r15) { 14403 ulong const t = r7; 14404 r7 = r15; 14405 r15 = t; 14406 }; 14407 if (r3 >= r7) { 14408 ulong const t = r3; 14409 r3 = r7; 14410 r7 = t; 14411 }; 14412 if (r11 >= r15) { 14413 ulong const t = r11; 14414 r11 = r15; 14415 r15 = t; 14416 }; 14417 if (r1 >= r3) { 14418 ulong const t = r1; 14419 r1 = r3; 14420 r3 = t; 14421 }; 14422 if (r5 >= r7) { 14423 ulong const t = r5; 14424 r5 = r7; 14425 r7 = t; 14426 }; 14427 if (r9 >= r11) { 14428 ulong const t = r9; 14429 r9 = r11; 14430 r11 = t; 14431 }; 14432 if (r13 >= r15) { 14433 ulong const t = r13; 14434 r13 = r15; 14435 r15 = t; 14436 }; 14437 if (r2 >= r10) { 14438 ulong const t = r2; 14439 r2 = r10; 14440 r10 = t; 14441 }; 14442 if (r6 >= r14) { 14443 ulong const t = r6; 14444 r6 = r14; 14445 r14 = t; 14446 }; 14447 if (r2 >= r6) { 14448 ulong const t = r2; 14449 r2 = r6; 14450 r6 = t; 14451 }; 14452 if (r10 >= r14) { 14453 ulong const t = r10; 14454 r10 = r14; 14455 r14 = t; 14456 }; 14457 if (r4 >= r12) { 14458 ulong const t = r4; 14459 r4 = r12; 14460 r12 = t; 14461 }; 14462 if (r8 >= r16) { 14463 ulong const t = r8; 14464 r8 = r16; 14465 r16 = t; 14466 }; 14467 if (r4 >= r8) { 14468 ulong const t = r4; 14469 r4 = r8; 14470 r8 = t; 14471 }; 14472 if (r12 >= r16) { 14473 ulong const t = r12; 14474 r12 = r16; 14475 r16 = t; 14476 }; 14477 if (r2 >= r4) { 14478 ulong const t = r2; 14479 r2 = r4; 14480 r4 = t; 14481 }; 14482 if (r6 >= r8) { 14483 ulong const t = r6; 14484 r6 = r8; 14485 r8 = t; 14486 }; 14487 if (r10 >= r12) { 14488 ulong const t = r10; 14489 r10 = r12; 14490 r12 = t; 14491 }; 14492 if (r14 >= r16) { 14493 ulong const t = r14; 14494 r14 = r16; 14495 r16 = t; 14496 }; 14497 if (r1 >= r2) { 14498 ulong const t = r1; 14499 r1 = 
r2; 14500 r2 = t; 14501 }; 14502 if (r3 >= r4) { 14503 ulong const t = r3; 14504 r3 = r4; 14505 r4 = t; 14506 }; 14507 if (r5 >= r6) { 14508 ulong const t = r5; 14509 r5 = r6; 14510 r6 = t; 14511 }; 14512 if (r7 >= r8) { 14513 ulong const t = r7; 14514 r7 = r8; 14515 r8 = t; 14516 }; 14517 if (r9 >= r10) { 14518 ulong const t = r9; 14519 r9 = r10; 14520 r10 = t; 14521 }; 14522 if (r11 >= r12) { 14523 ulong const t = r11; 14524 r11 = r12; 14525 r12 = t; 14526 }; 14527 if (r13 >= r14) { 14528 ulong const t = r13; 14529 r13 = r14; 14530 r14 = t; 14531 }; 14532 if (r15 >= r16) { 14533 ulong const t = r15; 14534 r15 = r16; 14535 r16 = t; 14536 }; 14537 if (r17 >= r19) { 14538 ulong const t = r17; 14539 r17 = r19; 14540 r19 = t; 14541 }; 14542 if (r18 >= r20) { 14543 ulong const t = r18; 14544 r18 = r20; 14545 r20 = t; 14546 }; 14547 if (r17 >= r18) { 14548 ulong const t = r17; 14549 r17 = r18; 14550 r18 = t; 14551 }; 14552 if (r19 >= r20) { 14553 ulong const t = r19; 14554 r19 = r20; 14555 r20 = t; 14556 }; 14557 vout[span_l + span_stride * 0] = r1; 14558 vout[span_l + span_stride * 1] = r2; 14559 vout[span_l + span_stride * 2] = r3; 14560 vout[span_l + span_stride * 3] = r4; 14561 vout[span_l + span_stride * 4] = r5; 14562 vout[span_l + span_stride * 5] = r6; 14563 vout[span_l + span_stride * 6] = r7; 14564 vout[span_l + span_stride * 7] = r8; 14565 vout[span_l + span_stride * 8] = r9; 14566 vout[span_l + span_stride * 9] = r10; 14567 vout[span_l + span_stride * 10] = r11; 14568 vout[span_l + span_stride * 11] = r12; 14569 vout[span_l + span_stride * 12] = r13; 14570 vout[span_l + span_stride * 13] = r14; 14571 vout[span_l + span_stride * 14] = r15; 14572 vout[span_l + span_stride * 15] = r16; 14573 vout[span_r + span_stride * 0] = r17; 14574 vout[span_r + span_stride * 1] = r18; 14575 vout[span_r + span_stride * 2] = r19; 14576 vout[span_r + span_stride * 3] = r20; 14577} 14578

/*
 * Compare-exchange of two register keys.  When `lo >= hi` the two values are
 * swapped, so afterwards `lo` holds the smaller key and `hi` the larger one.
 * Both arguments must be plain lvalues; each is evaluated more than once.
 */
#define HS_FM_CMP_XCHG(lo, hi)       \
  do {                               \
    if ((lo) >= (hi)) {              \
      ulong const hs_swap_ = (lo);   \
      (lo) = (hi);                   \
      (hi) = hs_swap_;               \
    }                                \
  } while (0)

/*
 * hs_kernel_fm_1_3 -- in-place compare-exchange pass over `vout`.
 *
 * Work-item (get_global_id(0), get_global_id(1)) addresses one slab of
 * `stride * 16 * 2` keys, where `stride` is get_global_size(0).  It reads
 * 16 keys from its own column (`left`, one key every `stride` elements) and
 * 8 keys from column `stride - 1 - col`, starting at row 16 (`right`).
 * Keys r16..r9 are compare-exchanged against r17..r24, after which r1..r16
 * and r17..r24 each pass through a fixed compare-exchange network.  Every
 * key is then written back to the slot it was loaded from.
 *
 * NOTE(review): "fm" presumably means "flip merge" and the result is only
 * ordered if the caller supplies suitably pre-sorted rows -- confirm
 * against the host-side dispatcher.
 */
__kernel __attribute__((intel_reqd_sub_group_size(1 << 3))) void
hs_kernel_fm_1_3(__global ulong* const restrict vout)
{
  uint const slab          = get_global_id(1);
  uint const stride        = get_global_size(0);
  uint const keys_per_slab = stride * 16 * 2;
  uint const slab_base     = slab * keys_per_slab;
  uint const col           = get_global_id(0);
  uint const left          = slab_base + col;
  uint const right         = slab_base + stride * (16 + 1) - col - 1;

  /* Left column: rows 0..15. */
  ulong r1  = vout[left + stride * 0];
  ulong r2  = vout[left + stride * 1];
  ulong r3  = vout[left + stride * 2];
  ulong r4  = vout[left + stride * 3];
  ulong r5  = vout[left + stride * 4];
  ulong r6  = vout[left + stride * 5];
  ulong r7  = vout[left + stride * 6];
  ulong r8  = vout[left + stride * 7];
  ulong r9  = vout[left + stride * 8];
  ulong r10 = vout[left + stride * 9];
  ulong r11 = vout[left + stride * 10];
  ulong r12 = vout[left + stride * 11];
  ulong r13 = vout[left + stride * 12];
  ulong r14 = vout[left + stride * 13];
  ulong r15 = vout[left + stride * 14];
  ulong r16 = vout[left + stride * 15];

  /* Right column: 8 rows. */
  ulong r17 = vout[right + stride * 0];
  ulong r18 = vout[right + stride * 1];
  ulong r19 = vout[right + stride * 2];
  ulong r20 = vout[right + stride * 3];
  ulong r21 = vout[right + stride * 4];
  ulong r22 = vout[right + stride * 5];
  ulong r23 = vout[right + stride * 6];
  ulong r24 = vout[right + stride * 7];

  /* Cross step: pair the left keys, walking down from r16, with r17..r24. */
  HS_FM_CMP_XCHG(r16, r17); HS_FM_CMP_XCHG(r15, r18);
  HS_FM_CMP_XCHG(r14, r19); HS_FM_CMP_XCHG(r13, r20);
  HS_FM_CMP_XCHG(r12, r21); HS_FM_CMP_XCHG(r11, r22);
  HS_FM_CMP_XCHG(r10, r23); HS_FM_CMP_XCHG(r9, r24);

  /* Network over r1..r16: odd-numbered registers. */
  HS_FM_CMP_XCHG(r1, r9);   HS_FM_CMP_XCHG(r5, r13);
  HS_FM_CMP_XCHG(r1, r5);   HS_FM_CMP_XCHG(r9, r13);
  HS_FM_CMP_XCHG(r3, r11);  HS_FM_CMP_XCHG(r7, r15);
  HS_FM_CMP_XCHG(r3, r7);   HS_FM_CMP_XCHG(r11, r15);
  HS_FM_CMP_XCHG(r1, r3);   HS_FM_CMP_XCHG(r5, r7);
  HS_FM_CMP_XCHG(r9, r11);  HS_FM_CMP_XCHG(r13, r15);

  /* Network over r1..r16: even-numbered registers. */
  HS_FM_CMP_XCHG(r2, r10);  HS_FM_CMP_XCHG(r6, r14);
  HS_FM_CMP_XCHG(r2, r6);   HS_FM_CMP_XCHG(r10, r14);
  HS_FM_CMP_XCHG(r4, r12);  HS_FM_CMP_XCHG(r8, r16);
  HS_FM_CMP_XCHG(r4, r8);   HS_FM_CMP_XCHG(r12, r16);
  HS_FM_CMP_XCHG(r2, r4);   HS_FM_CMP_XCHG(r6, r8);
  HS_FM_CMP_XCHG(r10, r12); HS_FM_CMP_XCHG(r14, r16);

  /* Network over r1..r16: adjacent pairs. */
  HS_FM_CMP_XCHG(r1, r2);   HS_FM_CMP_XCHG(r3, r4);
  HS_FM_CMP_XCHG(r5, r6);   HS_FM_CMP_XCHG(r7, r8);
  HS_FM_CMP_XCHG(r9, r10);  HS_FM_CMP_XCHG(r11, r12);
  HS_FM_CMP_XCHG(r13, r14); HS_FM_CMP_XCHG(r15, r16);

  /* Network over r17..r24. */
  HS_FM_CMP_XCHG(r17, r21); HS_FM_CMP_XCHG(r19, r23);
  HS_FM_CMP_XCHG(r17, r19); HS_FM_CMP_XCHG(r21, r23);
  HS_FM_CMP_XCHG(r18, r22); HS_FM_CMP_XCHG(r20, r24);
  HS_FM_CMP_XCHG(r18, r20); HS_FM_CMP_XCHG(r22, r24);
  HS_FM_CMP_XCHG(r17, r18); HS_FM_CMP_XCHG(r19, r20);
  HS_FM_CMP_XCHG(r21, r22); HS_FM_CMP_XCHG(r23, r24);

  /* Write every key back to the slot it was read from. */
  vout[left + stride * 0]  = r1;
  vout[left + stride * 1]  = r2;
  vout[left + stride * 2]  = r3;
  vout[left + stride * 3]  = r4;
  vout[left + stride * 4]  = r5;
  vout[left + stride * 5]  = r6;
  vout[left + stride * 6]  = r7;
  vout[left + stride * 7]  = r8;
  vout[left + stride * 8]  = r9;
  vout[left + stride * 9]  = r10;
  vout[left + stride * 10] = r11;
  vout[left + stride * 11] = r12;
  vout[left + stride * 12] = r13;
  vout[left + stride * 13] = r14;
  vout[left + stride * 14] = r15;
  vout[left + stride * 15] = r16;
  vout[right + stride * 0] = r17;
  vout[right + stride * 1] = r18;
  vout[right + stride * 2] = r19;
  vout[right + stride * 3] = r20;
  vout[right + stride * 4] = r21;
  vout[right + stride * 5] = r22;
  vout[right + stride * 6] = r23;
  vout[right + stride * 7] = r24;
}

/*
 * hs_kernel_fm_1_4 -- in-place compare-exchange pass over `vout`.
 *
 * Same slab addressing as hs_kernel_fm_1_3, but the right column supplies a
 * full 16 keys (rows 16..31 of column `stride - 1 - col`).  Keys r16..r1 are
 * compare-exchanged against r17..r32, then r1..r16 and r17..r32 each pass
 * through a fixed 16-key compare-exchange network, and all 32 keys are
 * written back in place.
 *
 * NOTE(review): ordering guarantees on the output depend on what the caller
 * has already sorted -- confirm against the host-side dispatcher.
 */
__kernel __attribute__((intel_reqd_sub_group_size(1 << 3))) void
hs_kernel_fm_1_4(__global ulong* const restrict vout)
{
  uint const slab          = get_global_id(1);
  uint const stride        = get_global_size(0);
  uint const keys_per_slab = stride * 16 * 2;
  uint const slab_base     = slab * keys_per_slab;
  uint const col           = get_global_id(0);
  uint const left          = slab_base + col;
  uint const right         = slab_base + stride * (16 + 1) - col - 1;

  /* Left column: rows 0..15. */
  ulong r1  = vout[left + stride * 0];
  ulong r2  = vout[left + stride * 1];
  ulong r3  = vout[left + stride * 2];
  ulong r4  = vout[left + stride * 3];
  ulong r5  = vout[left + stride * 4];
  ulong r6  = vout[left + stride * 5];
  ulong r7  = vout[left + stride * 6];
  ulong r8  = vout[left + stride * 7];
  ulong r9  = vout[left + stride * 8];
  ulong r10 = vout[left + stride * 9];
  ulong r11 = vout[left + stride * 10];
  ulong r12 = vout[left + stride * 11];
  ulong r13 = vout[left + stride * 12];
  ulong r14 = vout[left + stride * 13];
  ulong r15 = vout[left + stride * 14];
  ulong r16 = vout[left + stride * 15];

  /* Right column: 16 rows. */
  ulong r17 = vout[right + stride * 0];
  ulong r18 = vout[right + stride * 1];
  ulong r19 = vout[right + stride * 2];
  ulong r20 = vout[right + stride * 3];
  ulong r21 = vout[right + stride * 4];
  ulong r22 = vout[right + stride * 5];
  ulong r23 = vout[right + stride * 6];
  ulong r24 = vout[right + stride * 7];
  ulong r25 = vout[right + stride * 8];
  ulong r26 = vout[right + stride * 9];
  ulong r27 = vout[right + stride * 10];
  ulong r28 = vout[right + stride * 11];
  ulong r29 = vout[right + stride * 12];
  ulong r30 = vout[right + stride * 13];
  ulong r31 = vout[right + stride * 14];
  ulong r32 = vout[right + stride * 15];

  /* Cross step: pair the left keys, walking down from r16, with r17..r32. */
  HS_FM_CMP_XCHG(r16, r17); HS_FM_CMP_XCHG(r15, r18);
  HS_FM_CMP_XCHG(r14, r19); HS_FM_CMP_XCHG(r13, r20);
  HS_FM_CMP_XCHG(r12, r21); HS_FM_CMP_XCHG(r11, r22);
  HS_FM_CMP_XCHG(r10, r23); HS_FM_CMP_XCHG(r9, r24);
  HS_FM_CMP_XCHG(r8, r25);  HS_FM_CMP_XCHG(r7, r26);
  HS_FM_CMP_XCHG(r6, r27);  HS_FM_CMP_XCHG(r5, r28);
  HS_FM_CMP_XCHG(r4, r29);  HS_FM_CMP_XCHG(r3, r30);
  HS_FM_CMP_XCHG(r2, r31);  HS_FM_CMP_XCHG(r1, r32);

  /* Network over r1..r16: odd-numbered registers. */
  HS_FM_CMP_XCHG(r1, r9);   HS_FM_CMP_XCHG(r5, r13);
  HS_FM_CMP_XCHG(r1, r5);   HS_FM_CMP_XCHG(r9, r13);
  HS_FM_CMP_XCHG(r3, r11);  HS_FM_CMP_XCHG(r7, r15);
  HS_FM_CMP_XCHG(r3, r7);   HS_FM_CMP_XCHG(r11, r15);
  HS_FM_CMP_XCHG(r1, r3);   HS_FM_CMP_XCHG(r5, r7);
  HS_FM_CMP_XCHG(r9, r11);  HS_FM_CMP_XCHG(r13, r15);

  /* Network over r1..r16: even-numbered registers. */
  HS_FM_CMP_XCHG(r2, r10);  HS_FM_CMP_XCHG(r6, r14);
  HS_FM_CMP_XCHG(r2, r6);   HS_FM_CMP_XCHG(r10, r14);
  HS_FM_CMP_XCHG(r4, r12);  HS_FM_CMP_XCHG(r8, r16);
  HS_FM_CMP_XCHG(r4, r8);   HS_FM_CMP_XCHG(r12, r16);
  HS_FM_CMP_XCHG(r2, r4);   HS_FM_CMP_XCHG(r6, r8);
  HS_FM_CMP_XCHG(r10, r12); HS_FM_CMP_XCHG(r14, r16);

  /* Network over r1..r16: adjacent pairs. */
  HS_FM_CMP_XCHG(r1, r2);   HS_FM_CMP_XCHG(r3, r4);
  HS_FM_CMP_XCHG(r5, r6);   HS_FM_CMP_XCHG(r7, r8);
  HS_FM_CMP_XCHG(r9, r10);  HS_FM_CMP_XCHG(r11, r12);
  HS_FM_CMP_XCHG(r13, r14); HS_FM_CMP_XCHG(r15, r16);

  /* Network over r17..r32: odd-numbered registers. */
  HS_FM_CMP_XCHG(r17, r25); HS_FM_CMP_XCHG(r21, r29);
  HS_FM_CMP_XCHG(r17, r21); HS_FM_CMP_XCHG(r25, r29);
  HS_FM_CMP_XCHG(r19, r27); HS_FM_CMP_XCHG(r23, r31);
  HS_FM_CMP_XCHG(r19, r23); HS_FM_CMP_XCHG(r27, r31);
  HS_FM_CMP_XCHG(r17, r19); HS_FM_CMP_XCHG(r21, r23);
  HS_FM_CMP_XCHG(r25, r27); HS_FM_CMP_XCHG(r29, r31);

  /* Network over r17..r32: even-numbered registers. */
  HS_FM_CMP_XCHG(r18, r26); HS_FM_CMP_XCHG(r22, r30);
  HS_FM_CMP_XCHG(r18, r22); HS_FM_CMP_XCHG(r26, r30);
  HS_FM_CMP_XCHG(r20, r28); HS_FM_CMP_XCHG(r24, r32);
  HS_FM_CMP_XCHG(r20, r24); HS_FM_CMP_XCHG(r28, r32);
  HS_FM_CMP_XCHG(r18, r20); HS_FM_CMP_XCHG(r22, r24);
  HS_FM_CMP_XCHG(r26, r28); HS_FM_CMP_XCHG(r30, r32);

  /* Network over r17..r32: adjacent pairs. */
  HS_FM_CMP_XCHG(r17, r18); HS_FM_CMP_XCHG(r19, r20);
  HS_FM_CMP_XCHG(r21, r22); HS_FM_CMP_XCHG(r23, r24);
  HS_FM_CMP_XCHG(r25, r26); HS_FM_CMP_XCHG(r27, r28);
  HS_FM_CMP_XCHG(r29, r30); HS_FM_CMP_XCHG(r31, r32);

  /* Write every key back to the slot it was read from. */
  vout[left + stride * 0]   = r1;
  vout[left + stride * 1]   = r2;
  vout[left + stride * 2]   = r3;
  vout[left + stride * 3]   = r4;
  vout[left + stride * 4]   = r5;
  vout[left + stride * 5]   = r6;
  vout[left + stride * 6]   = r7;
  vout[left + stride * 7]   = r8;
  vout[left + stride * 8]   = r9;
  vout[left + stride * 9]   = r10;
  vout[left + stride * 10]  = r11;
  vout[left + stride * 11]  = r12;
  vout[left + stride * 12]  = r13;
  vout[left + stride * 13]  = r14;
  vout[left + stride * 14]  = r15;
  vout[left + stride * 15]  = r16;
  vout[right + stride * 0]  = r17;
  vout[right + stride * 1]  = r18;
  vout[right + stride * 2]  = r19;
  vout[right + stride * 3]  = r20;
  vout[right + stride * 4]  = r21;
  vout[right + stride * 5]  = r22;
  vout[right + stride * 6]  = r23;
  vout[right + stride * 7]  = r24;
  vout[right + stride * 8]  = r25;
  vout[right + stride * 9]  = r26;
  vout[right + stride * 10] = r27;
  vout[right + stride * 11] = r28;
  vout[right + stride * 12] = r29;
  vout[right + stride * 13] = r30;
  vout[right + stride * 14] = r31;
  vout[right + stride * 15] = r32;
}

/*
 * hs_kernel_hm_1 -- in-place compare-exchange pass over `vout`.
 *
 * Each work-item reads 32 keys from a single column of its slab (rows 0..31,
 * one key every `stride` elements, `stride` being get_global_size(0)), runs
 * them through a fixed 32-key compare-exchange network and stores them back
 * to the same slots.  Unlike the hs_kernel_fm_* kernels there is no mirrored
 * right-hand column.
 *
 * NOTE(review): "hm" presumably means "half merge"; the network only yields
 * ordered output for suitably structured input -- confirm with the caller.
 */
__kernel __attribute__((intel_reqd_sub_group_size(1 << 3))) void
hs_kernel_hm_1(__global ulong* const restrict vout)
{
  uint const slab          = get_global_id(1);
  uint const stride        = get_global_size(0);
  uint const keys_per_slab = stride * 16 * 2;
  uint const slab_base     = slab * keys_per_slab;
  uint const col           = get_global_id(0);
  uint const left          = slab_base + col;

  /* One column, rows 0..31. */
  ulong r1  = vout[left + stride * 0];
  ulong r2  = vout[left + stride * 1];
  ulong r3  = vout[left + stride * 2];
  ulong r4  = vout[left + stride * 3];
  ulong r5  = vout[left + stride * 4];
  ulong r6  = vout[left + stride * 5];
  ulong r7  = vout[left + stride * 6];
  ulong r8  = vout[left + stride * 7];
  ulong r9  = vout[left + stride * 8];
  ulong r10 = vout[left + stride * 9];
  ulong r11 = vout[left + stride * 10];
  ulong r12 = vout[left + stride * 11];
  ulong r13 = vout[left + stride * 12];
  ulong r14 = vout[left + stride * 13];
  ulong r15 = vout[left + stride * 14];
  ulong r16 = vout[left + stride * 15];
  ulong r17 = vout[left + stride * 16];
  ulong r18 = vout[left + stride * 17];
  ulong r19 = vout[left + stride * 18];
  ulong r20 = vout[left + stride * 19];
  ulong r21 = vout[left + stride * 20];
  ulong r22 = vout[left + stride * 21];
  ulong r23 = vout[left + stride * 22];
  ulong r24 = vout[left + stride * 23];
  ulong r25 = vout[left + stride * 24];
  ulong r26 = vout[left + stride * 25];
  ulong r27 = vout[left + stride * 26];
  ulong r28 = vout[left + stride * 27];
  ulong r29 = vout[left + stride * 28];
  ulong r30 = vout[left + stride * 29];
  ulong r31 = vout[left + stride * 30];
  ulong r32 = vout[left + stride * 31];

  /* Odd-numbered registers: r1, r5, r9, ... then r3, r7, r11, ... */
  HS_FM_CMP_XCHG(r1, r17);  HS_FM_CMP_XCHG(r9, r25);
  HS_FM_CMP_XCHG(r1, r9);   HS_FM_CMP_XCHG(r17, r25);
  HS_FM_CMP_XCHG(r5, r21);  HS_FM_CMP_XCHG(r13, r29);
  HS_FM_CMP_XCHG(r5, r13);  HS_FM_CMP_XCHG(r21, r29);
  HS_FM_CMP_XCHG(r1, r5);   HS_FM_CMP_XCHG(r9, r13);
  HS_FM_CMP_XCHG(r17, r21); HS_FM_CMP_XCHG(r25, r29);
  HS_FM_CMP_XCHG(r3, r19);  HS_FM_CMP_XCHG(r11, r27);
  HS_FM_CMP_XCHG(r3, r11);  HS_FM_CMP_XCHG(r19, r27);
  HS_FM_CMP_XCHG(r7, r23);  HS_FM_CMP_XCHG(r15, r31);
  HS_FM_CMP_XCHG(r7, r15);  HS_FM_CMP_XCHG(r23, r31);
  HS_FM_CMP_XCHG(r3, r7);   HS_FM_CMP_XCHG(r11, r15);
  HS_FM_CMP_XCHG(r19, r23); HS_FM_CMP_XCHG(r27, r31);
  HS_FM_CMP_XCHG(r1, r3);   HS_FM_CMP_XCHG(r5, r7);
  HS_FM_CMP_XCHG(r9, r11);  HS_FM_CMP_XCHG(r13, r15);
  HS_FM_CMP_XCHG(r17, r19); HS_FM_CMP_XCHG(r21, r23);
  HS_FM_CMP_XCHG(r25, r27); HS_FM_CMP_XCHG(r29, r31);

  /* Even-numbered registers: r2, r6, r10, ... then r4, r8, r12, ... */
  HS_FM_CMP_XCHG(r2, r18);  HS_FM_CMP_XCHG(r10, r26);
  HS_FM_CMP_XCHG(r2, r10);  HS_FM_CMP_XCHG(r18, r26);
  HS_FM_CMP_XCHG(r6, r22);  HS_FM_CMP_XCHG(r14, r30);
  HS_FM_CMP_XCHG(r6, r14);  HS_FM_CMP_XCHG(r22, r30);
  HS_FM_CMP_XCHG(r2, r6);   HS_FM_CMP_XCHG(r10, r14);
  HS_FM_CMP_XCHG(r18, r22); HS_FM_CMP_XCHG(r26, r30);
  HS_FM_CMP_XCHG(r4, r20);  HS_FM_CMP_XCHG(r12, r28);
  HS_FM_CMP_XCHG(r4, r12);  HS_FM_CMP_XCHG(r20, r28);
  HS_FM_CMP_XCHG(r8, r24);  HS_FM_CMP_XCHG(r16, r32);
  HS_FM_CMP_XCHG(r8, r16);  HS_FM_CMP_XCHG(r24, r32);
  HS_FM_CMP_XCHG(r4, r8);   HS_FM_CMP_XCHG(r12, r16);
  HS_FM_CMP_XCHG(r20, r24); HS_FM_CMP_XCHG(r28, r32);
  HS_FM_CMP_XCHG(r2, r4);   HS_FM_CMP_XCHG(r6, r8);
  HS_FM_CMP_XCHG(r10, r12); HS_FM_CMP_XCHG(r14, r16);
  HS_FM_CMP_XCHG(r18, r20); HS_FM_CMP_XCHG(r22, r24);
  HS_FM_CMP_XCHG(r26, r28); HS_FM_CMP_XCHG(r30, r32);

  /* Adjacent pairs. */
  HS_FM_CMP_XCHG(r1, r2);   HS_FM_CMP_XCHG(r3, r4);
  HS_FM_CMP_XCHG(r5, r6);   HS_FM_CMP_XCHG(r7, r8);
  HS_FM_CMP_XCHG(r9, r10);  HS_FM_CMP_XCHG(r11, r12);
  HS_FM_CMP_XCHG(r13, r14); HS_FM_CMP_XCHG(r15, r16);
  HS_FM_CMP_XCHG(r17, r18); HS_FM_CMP_XCHG(r19, r20);
  HS_FM_CMP_XCHG(r21, r22); HS_FM_CMP_XCHG(r23, r24);
  HS_FM_CMP_XCHG(r25, r26); HS_FM_CMP_XCHG(r27, r28);
  HS_FM_CMP_XCHG(r29, r30); HS_FM_CMP_XCHG(r31, r32);

  /* Write every key back to the slot it was read from. */
  vout[left + stride * 0]  = r1;
  vout[left + stride * 1]  = r2;
  vout[left + stride * 2]  = r3;
  vout[left + stride * 3]  = r4;
  vout[left + stride * 4]  = r5;
  vout[left + stride * 5]  = r6;
  vout[left + stride * 6]  = r7;
  vout[left + stride * 7]  = r8;
  vout[left + stride * 8]  = r9;
  vout[left + stride * 9]  = r10;
  vout[left + stride * 10] = r11;
  vout[left + stride * 11] = r12;
  vout[left + stride * 12] = r13;
  vout[left + stride * 13] = r14;
  vout[left + stride * 14] = r15;
  vout[left + stride * 15] = r16;
  vout[left + stride * 16] = r17;
  vout[left + stride * 17] = r18;
  vout[left + stride * 18] = r19;
  vout[left + stride * 19] = r20;
  vout[left + stride * 20] = r21;
  vout[left + stride * 21] = r22;
  vout[left + stride * 22] = r23;
  vout[left + stride * 23] = r24;
  vout[left + stride * 24] = r25;
  vout[left + stride * 25] = r26;
  vout[left + stride * 26] = r27;
  vout[left + stride * 27] = r28;
  vout[left + stride * 28] = r29;
  vout[left + stride * 29] = r30;
  vout[left + stride * 30] = r31;
  vout[left + stride * 31] = r32;
}

#undef HS_FM_CMP_XCHG

15849 15850__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void 15851hs_kernel_transpose(__global ulong* const restrict vout) 15852{ 15853 uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 + 15854 (get_local_id(0) & ((1 << 3) - 1)); 15855 ulong r1 = vout[gmem_idx + (1 << 3) * 0]; 15856 ulong r2 = vout[gmem_idx + (1 << 3) * 1]; 15857 ulong r3 = vout[gmem_idx + (1 << 3) * 2]; 15858 ulong r4 = vout[gmem_idx + (1 << 3) * 3]; 15859 ulong r5 = vout[gmem_idx + (1 << 3) * 4]; 15860 ulong r6 = vout[gmem_idx + (1 << 3) * 5]; 15861 ulong r7 = vout[gmem_idx + (1 << 3) * 6]; 15862 ulong r8 = vout[gmem_idx + (1 << 3) * 7]; 15863 ulong r9 = vout[gmem_idx + (1 << 3) * 8]; 15864 ulong r10 = vout[gmem_idx + (1 << 3) * 9]; 15865 ulong r11 = vout[gmem_idx + (1 << 3) * 10]; 15866 ulong r12 = vout[gmem_idx + (1 << 3) * 11]; 15867 ulong r13 = vout[gmem_idx + (1 << 3) * 12]; 15868 ulong r14 = vout[gmem_idx + (1 << 3) * 13]; 15869 ulong r15 = vout[gmem_idx + (1 << 3) * 14]; 15870 ulong r16 = vout[gmem_idx + (1 << 3) * 15]; 15871 bool const is_lo_1 = (get_sub_group_local_id() & (1 << (1 - 1))) == 0; 15872 bool const is_lo_2 
= (get_sub_group_local_id() & (1 << (2 - 1))) == 0; 15873 bool const is_lo_3 = (get_sub_group_local_id() & (1 << (3 - 1))) == 0; 15874 ulong const s2_1 = 15875 intel_sub_group_shuffle_xor(is_lo_1 ? r2 : r1, 1 << (1 - 1)); 15876 ulong const s2 = is_lo_1 ? s2_1 : r2; 15877 ulong const s1 = is_lo_1 ? r1 : s2_1; 15878 ulong const s4_3 = 15879 intel_sub_group_shuffle_xor(is_lo_1 ? r4 : r3, 1 << (1 - 1)); 15880 ulong const s4 = is_lo_1 ? s4_3 : r4; 15881 ulong const s3 = is_lo_1 ? r3 : s4_3; 15882 ulong const s6_5 = 15883 intel_sub_group_shuffle_xor(is_lo_1 ? r6 : r5, 1 << (1 - 1)); 15884 ulong const s6 = is_lo_1 ? s6_5 : r6; 15885 ulong const s5 = is_lo_1 ? r5 : s6_5; 15886 ulong const s8_7 = 15887 intel_sub_group_shuffle_xor(is_lo_1 ? r8 : r7, 1 << (1 - 1)); 15888 ulong const s8 = is_lo_1 ? s8_7 : r8; 15889 ulong const s7 = is_lo_1 ? r7 : s8_7; 15890 ulong const s10_9 = 15891 intel_sub_group_shuffle_xor(is_lo_1 ? r10 : r9, 1 << (1 - 1)); 15892 ulong const s10 = is_lo_1 ? s10_9 : r10; 15893 ulong const s9 = is_lo_1 ? r9 : s10_9; 15894 ulong const s12_11 = 15895 intel_sub_group_shuffle_xor(is_lo_1 ? r12 : r11, 1 << (1 - 1)); 15896 ulong const s12 = is_lo_1 ? s12_11 : r12; 15897 ulong const s11 = is_lo_1 ? r11 : s12_11; 15898 ulong const s14_13 = 15899 intel_sub_group_shuffle_xor(is_lo_1 ? r14 : r13, 1 << (1 - 1)); 15900 ulong const s14 = is_lo_1 ? s14_13 : r14; 15901 ulong const s13 = is_lo_1 ? r13 : s14_13; 15902 ulong const s16_15 = 15903 intel_sub_group_shuffle_xor(is_lo_1 ? r16 : r15, 1 << (1 - 1)); 15904 ulong const s16 = is_lo_1 ? s16_15 : r16; 15905 ulong const s15 = is_lo_1 ? r15 : s16_15; 15906 ulong const t3_1 = 15907 intel_sub_group_shuffle_xor(is_lo_2 ? s3 : s1, 1 << (2 - 1)); 15908 ulong const t3 = is_lo_2 ? t3_1 : s3; 15909 ulong const t1 = is_lo_2 ? s1 : t3_1; 15910 ulong const t4_2 = 15911 intel_sub_group_shuffle_xor(is_lo_2 ? s4 : s2, 1 << (2 - 1)); 15912 ulong const t4 = is_lo_2 ? t4_2 : s4; 15913 ulong const t2 = is_lo_2 ? 
s2 : t4_2; 15914 ulong const t7_5 = 15915 intel_sub_group_shuffle_xor(is_lo_2 ? s7 : s5, 1 << (2 - 1)); 15916 ulong const t7 = is_lo_2 ? t7_5 : s7; 15917 ulong const t5 = is_lo_2 ? s5 : t7_5; 15918 ulong const t8_6 = 15919 intel_sub_group_shuffle_xor(is_lo_2 ? s8 : s6, 1 << (2 - 1)); 15920 ulong const t8 = is_lo_2 ? t8_6 : s8; 15921 ulong const t6 = is_lo_2 ? s6 : t8_6; 15922 ulong const t11_9 = 15923 intel_sub_group_shuffle_xor(is_lo_2 ? s11 : s9, 1 << (2 - 1)); 15924 ulong const t11 = is_lo_2 ? t11_9 : s11; 15925 ulong const t9 = is_lo_2 ? s9 : t11_9; 15926 ulong const t12_10 = 15927 intel_sub_group_shuffle_xor(is_lo_2 ? s12 : s10, 1 << (2 - 1)); 15928 ulong const t12 = is_lo_2 ? t12_10 : s12; 15929 ulong const t10 = is_lo_2 ? s10 : t12_10; 15930 ulong const t15_13 = 15931 intel_sub_group_shuffle_xor(is_lo_2 ? s15 : s13, 1 << (2 - 1)); 15932 ulong const t15 = is_lo_2 ? t15_13 : s15; 15933 ulong const t13 = is_lo_2 ? s13 : t15_13; 15934 ulong const t16_14 = 15935 intel_sub_group_shuffle_xor(is_lo_2 ? s16 : s14, 1 << (2 - 1)); 15936 ulong const t16 = is_lo_2 ? t16_14 : s16; 15937 ulong const t14 = is_lo_2 ? s14 : t16_14; 15938 ulong const u5_1 = 15939 intel_sub_group_shuffle_xor(is_lo_3 ? t5 : t1, 1 << (3 - 1)); 15940 ulong const u5 = is_lo_3 ? u5_1 : t5; 15941 ulong const u1 = is_lo_3 ? t1 : u5_1; 15942 ulong const u6_2 = 15943 intel_sub_group_shuffle_xor(is_lo_3 ? t6 : t2, 1 << (3 - 1)); 15944 ulong const u6 = is_lo_3 ? u6_2 : t6; 15945 ulong const u2 = is_lo_3 ? t2 : u6_2; 15946 ulong const u7_3 = 15947 intel_sub_group_shuffle_xor(is_lo_3 ? t7 : t3, 1 << (3 - 1)); 15948 ulong const u7 = is_lo_3 ? u7_3 : t7; 15949 ulong const u3 = is_lo_3 ? t3 : u7_3; 15950 ulong const u8_4 = 15951 intel_sub_group_shuffle_xor(is_lo_3 ? t8 : t4, 1 << (3 - 1)); 15952 ulong const u8 = is_lo_3 ? u8_4 : t8; 15953 ulong const u4 = is_lo_3 ? t4 : u8_4; 15954 ulong const u13_9 = 15955 intel_sub_group_shuffle_xor(is_lo_3 ? t13 : t9, 1 << (3 - 1)); 15956 ulong const u13 = is_lo_3 ? 
u13_9 : t13; 15957 ulong const u9 = is_lo_3 ? t9 : u13_9; 15958 ulong const u14_10 = 15959 intel_sub_group_shuffle_xor(is_lo_3 ? t14 : t10, 1 << (3 - 1)); 15960 ulong const u14 = is_lo_3 ? u14_10 : t14; 15961 ulong const u10 = is_lo_3 ? t10 : u14_10; 15962 ulong const u15_11 = 15963 intel_sub_group_shuffle_xor(is_lo_3 ? t15 : t11, 1 << (3 - 1)); 15964 ulong const u15 = is_lo_3 ? u15_11 : t15; 15965 ulong const u11 = is_lo_3 ? t11 : u15_11; 15966 ulong const u16_12 = 15967 intel_sub_group_shuffle_xor(is_lo_3 ? t16 : t12, 1 << (3 - 1)); 15968 ulong const u16 = is_lo_3 ? u16_12 : t16; 15969 ulong const u12 = is_lo_3 ? t12 : u16_12; 15970 vout[gmem_idx + ((1 - 1) << 3)] = u1; 15971 vout[gmem_idx + ((3 - 1) << 3)] = u2; 15972 vout[gmem_idx + ((5 - 1) << 3)] = u3; 15973 vout[gmem_idx + ((7 - 1) << 3)] = u4; 15974 vout[gmem_idx + ((9 - 1) << 3)] = u5; 15975 vout[gmem_idx + ((11 - 1) << 3)] = u6; 15976 vout[gmem_idx + ((13 - 1) << 3)] = u7; 15977 vout[gmem_idx + ((15 - 1) << 3)] = u8; 15978 vout[gmem_idx + ((2 - 1) << 3)] = u9; 15979 vout[gmem_idx + ((4 - 1) << 3)] = u10; 15980 vout[gmem_idx + ((6 - 1) << 3)] = u11; 15981 vout[gmem_idx + ((8 - 1) << 3)] = u12; 15982 vout[gmem_idx + ((10 - 1) << 3)] = u13; 15983 vout[gmem_idx + ((12 - 1) << 3)] = u14; 15984 vout[gmem_idx + ((14 - 1) << 3)] = u15; 15985 vout[gmem_idx + ((16 - 1) << 3)] = u16; 15986} 15987