1 2 3__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) 4__attribute__((reqd_work_group_size((1 << 4) * 1, 1, 1))) void 5hs_kernel_bs_0(__global uint const* const restrict vin, 6 __global uint* const restrict vout) 7{ 8 uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 + 9 (get_local_id(0) & ((1 << 4) - 1)); 10 uint r1 = vin[gmem_idx + (1 << 4) * 0]; 11 uint r2 = vin[gmem_idx + (1 << 4) * 1]; 12 uint r3 = vin[gmem_idx + (1 << 4) * 2]; 13 uint r4 = vin[gmem_idx + (1 << 4) * 3]; 14 uint r5 = vin[gmem_idx + (1 << 4) * 4]; 15 uint r6 = vin[gmem_idx + (1 << 4) * 5]; 16 uint r7 = vin[gmem_idx + (1 << 4) * 6]; 17 uint r8 = vin[gmem_idx + (1 << 4) * 7]; 18 { 19 uint const t = min(r1, r5); 20 r5 = max(r1, r5); 21 r1 = t; 22 }; 23 { 24 uint const t = min(r2, r6); 25 r6 = max(r2, r6); 26 r2 = t; 27 }; 28 { 29 uint const t = min(r3, r7); 30 r7 = max(r3, r7); 31 r3 = t; 32 }; 33 { 34 uint const t = min(r4, r8); 35 r8 = max(r4, r8); 36 r4 = t; 37 }; 38 { 39 uint const t = min(r1, r3); 40 r3 = max(r1, r3); 41 r1 = t; 42 }; 43 { 44 uint const t = min(r2, r4); 45 r4 = max(r2, r4); 46 r2 = t; 47 }; 48 { 49 uint const t = min(r5, r7); 50 r7 = max(r5, r7); 51 r5 = t; 52 }; 53 { 54 uint const t = min(r6, r8); 55 r8 = max(r6, r8); 56 r6 = t; 57 }; 58 { 59 uint const t = min(r3, r5); 60 r5 = max(r3, r5); 61 r3 = t; 62 }; 63 { 64 uint const t = min(r4, r6); 65 r6 = max(r4, r6); 66 r4 = t; 67 }; 68 { 69 uint const t = min(r1, r2); 70 r2 = max(r1, r2); 71 r1 = t; 72 }; 73 { 74 uint const t = min(r3, r4); 75 r4 = max(r3, r4); 76 r3 = t; 77 }; 78 { 79 uint const t = min(r5, r6); 80 r6 = max(r5, r6); 81 r5 = t; 82 }; 83 { 84 uint const t = min(r7, r8); 85 r8 = max(r7, r8); 86 r7 = t; 87 }; 88 { 89 uint const t = min(r2, r5); 90 r5 = max(r2, r5); 91 r2 = t; 92 }; 93 { 94 uint const t = min(r4, r7); 95 r7 = max(r4, r7); 96 r4 = t; 97 }; 98 { 99 uint const t = min(r2, r3); 100 r3 = max(r2, r3); 101 r2 = t; 102 }; 103 { 104 uint const t = min(r4, r5); 105 r5 = max(r4, r5); 
106 r4 = t; 107 }; 108 { 109 uint const t = min(r6, r7); 110 r7 = max(r6, r7); 111 r6 = t; 112 }; 113 { 114 uint const flip_lane_idx = get_sub_group_local_id() ^ 1; 115 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 116 ; 117 { 118 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 119 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 120 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 121 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 122 }; 123 { 124 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 125 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 126 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 127 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 128 }; 129 { 130 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 131 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 132 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 133 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 134 }; 135 { 136 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 137 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 138 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 139 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 140 }; 141 } 142 { 143 uint const t = min(r1, r5); 144 r5 = max(r1, r5); 145 r1 = t; 146 }; 147 { 148 uint const t = min(r3, r7); 149 r7 = max(r3, r7); 150 r3 = t; 151 }; 152 { 153 uint const t = min(r1, r3); 154 r3 = max(r1, r3); 155 r1 = t; 156 }; 157 { 158 uint const t = min(r5, r7); 159 r7 = max(r5, r7); 160 r5 = t; 161 }; 162 { 163 uint const t = min(r2, r6); 164 r6 = max(r2, r6); 165 r2 = t; 166 }; 167 { 168 uint const t = min(r4, r8); 169 r8 = max(r4, r8); 170 r4 = t; 171 }; 172 { 173 uint const t = min(r2, r4); 174 r4 = max(r2, r4); 175 r2 = t; 176 }; 177 { 178 uint const t = min(r6, r8); 179 r8 = max(r6, r8); 180 r6 = t; 181 }; 182 { 183 uint const t = min(r1, r2); 184 r2 = max(r1, r2); 185 r1 = t; 186 }; 187 { 188 uint const t = min(r3, r4); 189 r4 = max(r3, r4); 190 r3 = t; 191 }; 192 { 193 uint const t = min(r5, r6); 194 r6 = max(r5, r6); 195 r5 = t; 196 }; 197 { 198 uint const t = min(r7, r8); 199 r8 = max(r7, r8); 200 r7 = t; 201 }; 202 { 203 uint const flip_lane_idx = get_sub_group_local_id() ^ 3; 204 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 205 ; 206 { 207 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 208 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 209 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 210 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 211 }; 212 { 213 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 214 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 215 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 216 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 217 }; 218 { 219 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 220 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 221 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 222 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 223 }; 224 { 225 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 226 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 227 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 228 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 229 }; 230 } 231 { 232 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 233 int const t_lt = get_sub_group_local_id() < half_lane_idx; 234 ; 235 { 236 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 237 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 238 }; 239 { 240 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 241 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 242 }; 243 { 244 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 245 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 246 }; 247 { 248 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 249 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 250 }; 251 { 252 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 253 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 254 }; 255 { 256 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 257 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 258 }; 259 { 260 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 261 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 262 }; 263 { 264 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 265 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 266 }; 267 } 268 { 269 uint const t = min(r1, r5); 270 r5 = max(r1, r5); 271 r1 = t; 272 }; 273 { 274 uint const t = min(r3, r7); 275 r7 = max(r3, r7); 276 r3 = t; 277 }; 278 { 279 uint const t = min(r1, r3); 280 r3 = max(r1, r3); 281 r1 = t; 282 }; 283 { 284 uint const t = min(r5, r7); 285 r7 = max(r5, r7); 286 r5 = t; 287 }; 288 { 289 uint const t = min(r2, r6); 290 r6 = max(r2, r6); 291 r2 = t; 292 }; 293 { 294 uint const t = min(r4, r8); 295 r8 = max(r4, r8); 296 r4 = t; 297 }; 298 { 299 uint const t = min(r2, r4); 300 r4 = max(r2, r4); 301 r2 = t; 302 }; 303 { 304 uint const t = min(r6, r8); 305 r8 = max(r6, r8); 306 r6 = t; 307 }; 308 { 309 uint const t = min(r1, r2); 310 r2 = max(r1, r2); 311 r1 = t; 312 }; 313 { 314 uint const t = min(r3, r4); 315 r4 = max(r3, r4); 316 r3 = t; 317 }; 318 { 319 uint const t = min(r5, r6); 320 r6 = max(r5, r6); 321 r5 = t; 322 }; 323 { 324 uint const t = min(r7, r8); 325 r8 = max(r7, r8); 326 r7 = t; 327 }; 328 { 329 uint const flip_lane_idx = get_sub_group_local_id() ^ 7; 330 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 331 ; 332 { 333 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 334 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 335 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 336 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 337 }; 338 { 339 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 340 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 341 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 342 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 343 }; 344 { 345 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 346 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 347 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 348 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 349 }; 350 { 351 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 352 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 353 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 354 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 355 }; 356 } 357 { 358 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 359 int const t_lt = get_sub_group_local_id() < half_lane_idx; 360 ; 361 { 362 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 363 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 364 }; 365 { 366 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 367 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 368 }; 369 { 370 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 371 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 372 }; 373 { 374 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 375 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 376 }; 377 { 378 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 379 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 380 }; 381 { 382 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 383 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 384 }; 385 { 386 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 387 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 388 }; 389 { 390 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 391 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 392 }; 393 } 394 { 395 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 396 int const t_lt = get_sub_group_local_id() < half_lane_idx; 397 ; 398 { 399 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 400 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 401 }; 402 { 403 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 404 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 405 }; 406 { 407 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 408 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 409 }; 410 { 411 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 412 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 413 }; 414 { 415 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 416 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 417 }; 418 { 419 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 420 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 421 }; 422 { 423 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 424 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 425 }; 426 { 427 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 428 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 429 }; 430 } 431 { 432 uint const t = min(r1, r5); 433 r5 = max(r1, r5); 434 r1 = t; 435 }; 436 { 437 uint const t = min(r3, r7); 438 r7 = max(r3, r7); 439 r3 = t; 440 }; 441 { 442 uint const t = min(r1, r3); 443 r3 = max(r1, r3); 444 r1 = t; 445 }; 446 { 447 uint const t = min(r5, r7); 448 r7 = max(r5, r7); 449 r5 = t; 450 }; 451 { 452 uint const t = min(r2, r6); 453 r6 = max(r2, r6); 454 r2 = t; 455 }; 456 { 457 uint const t = min(r4, r8); 458 r8 = max(r4, r8); 459 r4 = t; 460 }; 461 { 462 uint const t = min(r2, r4); 463 r4 = max(r2, r4); 464 r2 = t; 465 }; 466 { 467 uint const t = min(r6, r8); 468 r8 = max(r6, r8); 469 r6 = t; 470 }; 471 { 472 uint const t = min(r1, r2); 473 r2 = max(r1, r2); 474 r1 = t; 475 }; 476 { 477 uint const t = min(r3, r4); 478 r4 = max(r3, r4); 479 r3 = t; 480 }; 481 { 482 uint const t = min(r5, r6); 483 r6 = max(r5, r6); 484 r5 = t; 485 }; 486 { 487 uint const t = min(r7, r8); 488 r8 = max(r7, r8); 489 r7 = t; 490 }; 491 { 492 uint const flip_lane_idx = get_sub_group_local_id() ^ 15; 493 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 494 ; 495 { 496 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 497 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 498 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 499 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 500 }; 501 { 502 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 503 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 504 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 505 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 506 }; 507 { 508 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 509 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 510 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 511 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 512 }; 513 { 514 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 515 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 516 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 517 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 518 }; 519 } 520 { 521 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 522 int const t_lt = get_sub_group_local_id() < half_lane_idx; 523 ; 524 { 525 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 526 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 527 }; 528 { 529 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 530 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 531 }; 532 { 533 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 534 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 535 }; 536 { 537 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 538 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 539 }; 540 { 541 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 542 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 543 }; 544 { 545 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 546 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 547 }; 548 { 549 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 550 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 551 }; 552 { 553 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 554 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 555 }; 556 } 557 { 558 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 559 int const t_lt = get_sub_group_local_id() < half_lane_idx; 560 ; 561 { 562 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 563 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 564 }; 565 { 566 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 567 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 568 }; 569 { 570 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 571 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 572 }; 573 { 574 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 575 r4 = ((r4 <= ta) ^ t_lt) ? 
ta : r4; 576 }; 577 { 578 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 579 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 580 }; 581 { 582 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 583 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 584 }; 585 { 586 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 587 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 588 }; 589 { 590 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 591 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 592 }; 593 } 594 { 595 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 596 int const t_lt = get_sub_group_local_id() < half_lane_idx; 597 ; 598 { 599 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 600 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 601 }; 602 { 603 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 604 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 605 }; 606 { 607 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 608 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 609 }; 610 { 611 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 612 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 613 }; 614 { 615 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 616 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 617 }; 618 { 619 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 620 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 621 }; 622 { 623 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 624 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 625 }; 626 { 627 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 628 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 629 }; 630 } 631 { 632 uint const t = min(r1, r5); 633 r5 = max(r1, r5); 634 r1 = t; 635 }; 636 { 637 uint const t = min(r3, r7); 638 r7 = max(r3, r7); 639 r3 = t; 640 }; 641 { 642 uint const t = min(r1, r3); 643 r3 = max(r1, r3); 644 r1 = t; 645 }; 646 { 647 uint const t = min(r5, r7); 648 r7 = max(r5, r7); 649 r5 = t; 650 }; 651 { 652 uint const t = min(r2, r6); 653 r6 = max(r2, r6); 654 r2 = t; 655 }; 656 { 657 uint const t = min(r4, r8); 658 r8 = max(r4, r8); 659 r4 = t; 660 }; 661 { 662 uint const t = min(r2, r4); 663 r4 = max(r2, r4); 664 r2 = t; 665 }; 666 { 667 uint const t = min(r6, r8); 668 r8 = max(r6, r8); 669 r6 = t; 670 }; 671 { 672 uint const t = min(r1, r2); 673 r2 = max(r1, r2); 674 r1 = t; 675 }; 676 { 677 uint const t = min(r3, r4); 678 r4 = max(r3, r4); 679 r3 = t; 680 }; 681 { 682 uint const t = min(r5, r6); 683 r6 = max(r5, r6); 684 r5 = t; 685 }; 686 { 687 uint const t = min(r7, r8); 688 r8 = max(r7, r8); 689 r7 = t; 690 }; 691 vout[gmem_idx + (1 << 4) * 0] = r1; 692 vout[gmem_idx + (1 << 4) * 1] = r2; 693 vout[gmem_idx + (1 << 4) * 2] = r3; 694 vout[gmem_idx + (1 << 4) * 3] = r4; 695 vout[gmem_idx + (1 << 4) * 4] = r5; 696 vout[gmem_idx + (1 << 4) * 5] = r6; 697 vout[gmem_idx + (1 << 4) * 6] = r7; 698 vout[gmem_idx + (1 << 4) * 7] = r8; 699} 700 701__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) 702__attribute__((reqd_work_group_size((1 << 4) * 2, 1, 1))) void 703hs_kernel_bs_1(__global uint const* const restrict vin, 704 __global uint* const restrict vout) 705{ 706 __local struct 707 { 708 uint m[32 * 8]; 709 } shared; 710 711 uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 + 712 (get_local_id(0) & ((1 << 4) - 1)); 713 uint r1 = vin[gmem_idx + (1 << 4) * 0]; 714 uint r2 = vin[gmem_idx + (1 << 4) * 1]; 715 uint r3 = vin[gmem_idx + (1 << 4) * 2]; 716 uint r4 = vin[gmem_idx + (1 << 4) * 3]; 717 uint r5 = vin[gmem_idx + (1 << 4) * 4]; 718 uint r6 = vin[gmem_idx + (1 << 4) * 5]; 719 uint r7 = vin[gmem_idx + 
(1 << 4) * 6]; 720 uint r8 = vin[gmem_idx + (1 << 4) * 7]; 721 { 722 uint const t = min(r1, r5); 723 r5 = max(r1, r5); 724 r1 = t; 725 }; 726 { 727 uint const t = min(r2, r6); 728 r6 = max(r2, r6); 729 r2 = t; 730 }; 731 { 732 uint const t = min(r3, r7); 733 r7 = max(r3, r7); 734 r3 = t; 735 }; 736 { 737 uint const t = min(r4, r8); 738 r8 = max(r4, r8); 739 r4 = t; 740 }; 741 { 742 uint const t = min(r1, r3); 743 r3 = max(r1, r3); 744 r1 = t; 745 }; 746 { 747 uint const t = min(r2, r4); 748 r4 = max(r2, r4); 749 r2 = t; 750 }; 751 { 752 uint const t = min(r5, r7); 753 r7 = max(r5, r7); 754 r5 = t; 755 }; 756 { 757 uint const t = min(r6, r8); 758 r8 = max(r6, r8); 759 r6 = t; 760 }; 761 { 762 uint const t = min(r3, r5); 763 r5 = max(r3, r5); 764 r3 = t; 765 }; 766 { 767 uint const t = min(r4, r6); 768 r6 = max(r4, r6); 769 r4 = t; 770 }; 771 { 772 uint const t = min(r1, r2); 773 r2 = max(r1, r2); 774 r1 = t; 775 }; 776 { 777 uint const t = min(r3, r4); 778 r4 = max(r3, r4); 779 r3 = t; 780 }; 781 { 782 uint const t = min(r5, r6); 783 r6 = max(r5, r6); 784 r5 = t; 785 }; 786 { 787 uint const t = min(r7, r8); 788 r8 = max(r7, r8); 789 r7 = t; 790 }; 791 { 792 uint const t = min(r2, r5); 793 r5 = max(r2, r5); 794 r2 = t; 795 }; 796 { 797 uint const t = min(r4, r7); 798 r7 = max(r4, r7); 799 r4 = t; 800 }; 801 { 802 uint const t = min(r2, r3); 803 r3 = max(r2, r3); 804 r2 = t; 805 }; 806 { 807 uint const t = min(r4, r5); 808 r5 = max(r4, r5); 809 r4 = t; 810 }; 811 { 812 uint const t = min(r6, r7); 813 r7 = max(r6, r7); 814 r6 = t; 815 }; 816 { 817 uint const flip_lane_idx = get_sub_group_local_id() ^ 1; 818 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 819 ; 820 { 821 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 822 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 823 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 824 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 825 }; 826 { 827 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 828 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 829 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 830 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 831 }; 832 { 833 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 834 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 835 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 836 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 837 }; 838 { 839 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 840 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 841 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 842 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 843 }; 844 } 845 { 846 uint const t = min(r1, r5); 847 r5 = max(r1, r5); 848 r1 = t; 849 }; 850 { 851 uint const t = min(r3, r7); 852 r7 = max(r3, r7); 853 r3 = t; 854 }; 855 { 856 uint const t = min(r1, r3); 857 r3 = max(r1, r3); 858 r1 = t; 859 }; 860 { 861 uint const t = min(r5, r7); 862 r7 = max(r5, r7); 863 r5 = t; 864 }; 865 { 866 uint const t = min(r2, r6); 867 r6 = max(r2, r6); 868 r2 = t; 869 }; 870 { 871 uint const t = min(r4, r8); 872 r8 = max(r4, r8); 873 r4 = t; 874 }; 875 { 876 uint const t = min(r2, r4); 877 r4 = max(r2, r4); 878 r2 = t; 879 }; 880 { 881 uint const t = min(r6, r8); 882 r8 = max(r6, r8); 883 r6 = t; 884 }; 885 { 886 uint const t = min(r1, r2); 887 r2 = max(r1, r2); 888 r1 = t; 889 }; 890 { 891 uint const t = min(r3, r4); 892 r4 = max(r3, r4); 893 r3 = t; 894 }; 895 { 896 uint const t = min(r5, r6); 897 r6 = max(r5, r6); 898 r5 = t; 899 }; 900 { 901 uint const t = min(r7, r8); 902 r8 = max(r7, r8); 903 r7 = t; 904 }; 905 { 906 uint const flip_lane_idx = get_sub_group_local_id() ^ 3; 907 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 908 ; 909 { 910 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 911 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 912 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 913 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 914 }; 915 { 916 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 917 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 918 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 919 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 920 }; 921 { 922 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 923 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 924 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 925 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 926 }; 927 { 928 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 929 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 930 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 931 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 932 }; 933 } 934 { 935 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 936 int const t_lt = get_sub_group_local_id() < half_lane_idx; 937 ; 938 { 939 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 940 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 941 }; 942 { 943 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 944 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 945 }; 946 { 947 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 948 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 949 }; 950 { 951 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 952 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 953 }; 954 { 955 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 956 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 957 }; 958 { 959 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 960 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 961 }; 962 { 963 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 964 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 965 }; 966 { 967 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 968 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 969 }; 970 } 971 { 972 uint const t = min(r1, r5); 973 r5 = max(r1, r5); 974 r1 = t; 975 }; 976 { 977 uint const t = min(r3, r7); 978 r7 = max(r3, r7); 979 r3 = t; 980 }; 981 { 982 uint const t = min(r1, r3); 983 r3 = max(r1, r3); 984 r1 = t; 985 }; 986 { 987 uint const t = min(r5, r7); 988 r7 = max(r5, r7); 989 r5 = t; 990 }; 991 { 992 uint const t = min(r2, r6); 993 r6 = max(r2, r6); 994 r2 = t; 995 }; 996 { 997 uint const t = min(r4, r8); 998 r8 = max(r4, r8); 999 r4 = t; 1000 }; 1001 { 1002 uint const t = min(r2, r4); 1003 r4 = max(r2, r4); 1004 r2 = t; 1005 }; 1006 { 1007 uint const t = min(r6, r8); 1008 r8 = max(r6, r8); 1009 r6 = t; 1010 }; 1011 { 1012 uint const t = min(r1, r2); 1013 r2 = max(r1, r2); 1014 r1 = t; 1015 }; 1016 { 1017 uint const t = min(r3, r4); 1018 r4 = max(r3, r4); 1019 r3 = t; 1020 }; 1021 { 1022 uint const t = min(r5, r6); 1023 r6 = max(r5, r6); 1024 r5 = t; 1025 }; 1026 { 1027 uint const t = min(r7, r8); 1028 r8 = max(r7, r8); 1029 r7 = t; 1030 }; 1031 { 1032 uint const flip_lane_idx = get_sub_group_local_id() ^ 7; 1033 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 1034 ; 1035 { 1036 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 1037 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 1038 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 1039 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1040 }; 1041 { 1042 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 1043 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 1044 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 1045 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1046 }; 1047 { 1048 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 1049 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 1050 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 1051 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1052 }; 1053 { 1054 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 1055 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 1056 r4 = ((r4 <= tb) ^ t_lt) ? 
tb : r4; 1057 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1058 }; 1059 } 1060 { 1061 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 1062 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1063 ; 1064 { 1065 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1066 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1067 }; 1068 { 1069 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1070 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1071 }; 1072 { 1073 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1074 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1075 }; 1076 { 1077 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1078 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1079 }; 1080 { 1081 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1082 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1083 }; 1084 { 1085 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1086 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1087 }; 1088 { 1089 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1090 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1091 }; 1092 { 1093 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1094 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1095 }; 1096 } 1097 { 1098 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 1099 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1100 ; 1101 { 1102 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1103 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1104 }; 1105 { 1106 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1107 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1108 }; 1109 { 1110 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1111 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1112 }; 1113 { 1114 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1115 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1116 }; 1117 { 1118 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1119 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 1120 }; 1121 { 1122 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1123 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1124 }; 1125 { 1126 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1127 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1128 }; 1129 { 1130 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1131 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1132 }; 1133 } 1134 { 1135 uint const t = min(r1, r5); 1136 r5 = max(r1, r5); 1137 r1 = t; 1138 }; 1139 { 1140 uint const t = min(r3, r7); 1141 r7 = max(r3, r7); 1142 r3 = t; 1143 }; 1144 { 1145 uint const t = min(r1, r3); 1146 r3 = max(r1, r3); 1147 r1 = t; 1148 }; 1149 { 1150 uint const t = min(r5, r7); 1151 r7 = max(r5, r7); 1152 r5 = t; 1153 }; 1154 { 1155 uint const t = min(r2, r6); 1156 r6 = max(r2, r6); 1157 r2 = t; 1158 }; 1159 { 1160 uint const t = min(r4, r8); 1161 r8 = max(r4, r8); 1162 r4 = t; 1163 }; 1164 { 1165 uint const t = min(r2, r4); 1166 r4 = max(r2, r4); 1167 r2 = t; 1168 }; 1169 { 1170 uint const t = min(r6, r8); 1171 r8 = max(r6, r8); 1172 r6 = t; 1173 }; 1174 { 1175 uint const t = min(r1, r2); 1176 r2 = max(r1, r2); 1177 r1 = t; 1178 }; 1179 { 1180 uint const t = min(r3, r4); 1181 r4 = max(r3, r4); 1182 r3 = t; 1183 }; 1184 { 1185 uint const t = min(r5, r6); 1186 r6 = max(r5, r6); 1187 r5 = t; 1188 }; 1189 { 1190 uint const t = min(r7, r8); 1191 r8 = max(r7, r8); 1192 r7 = t; 1193 }; 1194 { 1195 uint const flip_lane_idx = get_sub_group_local_id() ^ 15; 1196 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 1197 ; 1198 { 1199 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 1200 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 1201 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 1202 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1203 }; 1204 { 1205 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 1206 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 1207 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 1208 r7 = ((r7 <= ta) ^ t_lt) ? 
ta : r7; 1209 }; 1210 { 1211 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 1212 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 1213 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 1214 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1215 }; 1216 { 1217 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 1218 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 1219 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 1220 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1221 }; 1222 } 1223 { 1224 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 1225 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1226 ; 1227 { 1228 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1229 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1230 }; 1231 { 1232 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1233 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1234 }; 1235 { 1236 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1237 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1238 }; 1239 { 1240 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1241 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1242 }; 1243 { 1244 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1245 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1246 }; 1247 { 1248 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1249 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1250 }; 1251 { 1252 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1253 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1254 }; 1255 { 1256 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1257 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1258 }; 1259 } 1260 { 1261 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 1262 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1263 ; 1264 { 1265 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1266 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1267 }; 1268 { 1269 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1270 r2 = ((r2 <= ta) ^ t_lt) ? 
ta : r2; 1271 }; 1272 { 1273 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1274 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1275 }; 1276 { 1277 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1278 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1279 }; 1280 { 1281 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1282 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1283 }; 1284 { 1285 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1286 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1287 }; 1288 { 1289 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1290 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1291 }; 1292 { 1293 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1294 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1295 }; 1296 } 1297 { 1298 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 1299 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1300 ; 1301 { 1302 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1303 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1304 }; 1305 { 1306 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1307 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1308 }; 1309 { 1310 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1311 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1312 }; 1313 { 1314 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1315 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1316 }; 1317 { 1318 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1319 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1320 }; 1321 { 1322 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1323 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1324 }; 1325 { 1326 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1327 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1328 }; 1329 { 1330 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1331 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 1332 }; 1333 } 1334 { 1335 uint const t = min(r1, r5); 1336 r5 = max(r1, r5); 1337 r1 = t; 1338 }; 1339 { 1340 uint const t = min(r3, r7); 1341 r7 = max(r3, r7); 1342 r3 = t; 1343 }; 1344 { 1345 uint const t = min(r1, r3); 1346 r3 = max(r1, r3); 1347 r1 = t; 1348 }; 1349 { 1350 uint const t = min(r5, r7); 1351 r7 = max(r5, r7); 1352 r5 = t; 1353 }; 1354 { 1355 uint const t = min(r2, r6); 1356 r6 = max(r2, r6); 1357 r2 = t; 1358 }; 1359 { 1360 uint const t = min(r4, r8); 1361 r8 = max(r4, r8); 1362 r4 = t; 1363 }; 1364 { 1365 uint const t = min(r2, r4); 1366 r4 = max(r2, r4); 1367 r2 = t; 1368 }; 1369 { 1370 uint const t = min(r6, r8); 1371 r8 = max(r6, r8); 1372 r6 = t; 1373 }; 1374 { 1375 uint const t = min(r1, r2); 1376 r2 = max(r1, r2); 1377 r1 = t; 1378 }; 1379 { 1380 uint const t = min(r3, r4); 1381 r4 = max(r3, r4); 1382 r3 = t; 1383 }; 1384 { 1385 uint const t = min(r5, r6); 1386 r6 = max(r5, r6); 1387 r5 = t; 1388 }; 1389 { 1390 uint const t = min(r7, r8); 1391 r8 = max(r7, r8); 1392 r7 = t; 1393 }; 1394 uint const smem_l_idx = 1395 get_sub_group_id() * ((1 << 4) * 2) + get_sub_group_local_id(); 1396 uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 2) + 1397 (get_sub_group_local_id() ^ ((1 << 4) - 1)); 1398 shared.m[get_local_id(0) + (2 * (1 << 4) * 0)] = r1; 1399 shared.m[get_local_id(0) + (2 * (1 << 4) * 1)] = r8; 1400 shared.m[get_local_id(0) + (2 * (1 << 4) * 2)] = r2; 1401 shared.m[get_local_id(0) + (2 * (1 << 4) * 3)] = r7; 1402 shared.m[get_local_id(0) + (2 * (1 << 4) * 4)] = r3; 1403 shared.m[get_local_id(0) + (2 * (1 << 4) * 5)] = r6; 1404 shared.m[get_local_id(0) + (2 * (1 << 4) * 6)] = r4; 1405 shared.m[get_local_id(0) + (2 * (1 << 4) * 7)] = r5; 1406 barrier(CLK_LOCAL_MEM_FENCE); 1407 { 1408 { 1409 uint r0_1 = shared.m[smem_l_idx + (0)]; 1410 uint r0_2 = shared.m[smem_r_idx + (16)]; 1411 { 1412 uint const t = min(r0_1, r0_2); 1413 r0_2 = max(r0_1, r0_2); 1414 r0_1 = t; 1415 }; 1416 shared.m[smem_l_idx + (0)] = r0_1; 1417 
shared.m[smem_r_idx + (16)] = r0_2; 1418 } 1419 { 1420 uint r0_1 = shared.m[smem_l_idx + (64)]; 1421 uint r0_2 = shared.m[smem_r_idx + (80)]; 1422 { 1423 uint const t = min(r0_1, r0_2); 1424 r0_2 = max(r0_1, r0_2); 1425 r0_1 = t; 1426 }; 1427 shared.m[smem_l_idx + (64)] = r0_1; 1428 shared.m[smem_r_idx + (80)] = r0_2; 1429 } 1430 { 1431 uint r0_1 = shared.m[smem_l_idx + (128)]; 1432 uint r0_2 = shared.m[smem_r_idx + (144)]; 1433 { 1434 uint const t = min(r0_1, r0_2); 1435 r0_2 = max(r0_1, r0_2); 1436 r0_1 = t; 1437 }; 1438 shared.m[smem_l_idx + (128)] = r0_1; 1439 shared.m[smem_r_idx + (144)] = r0_2; 1440 } 1441 { 1442 uint r0_1 = shared.m[smem_l_idx + (192)]; 1443 uint r0_2 = shared.m[smem_r_idx + (208)]; 1444 { 1445 uint const t = min(r0_1, r0_2); 1446 r0_2 = max(r0_1, r0_2); 1447 r0_1 = t; 1448 }; 1449 shared.m[smem_l_idx + (192)] = r0_1; 1450 shared.m[smem_r_idx + (208)] = r0_2; 1451 } 1452 } 1453 barrier(CLK_LOCAL_MEM_FENCE); 1454 r1 = shared.m[get_local_id(0) + (2 * (1 << 4) * 0)]; 1455 r8 = shared.m[get_local_id(0) + (2 * (1 << 4) * 1)]; 1456 r2 = shared.m[get_local_id(0) + (2 * (1 << 4) * 2)]; 1457 r7 = shared.m[get_local_id(0) + (2 * (1 << 4) * 3)]; 1458 r3 = shared.m[get_local_id(0) + (2 * (1 << 4) * 4)]; 1459 r6 = shared.m[get_local_id(0) + (2 * (1 << 4) * 5)]; 1460 r4 = shared.m[get_local_id(0) + (2 * (1 << 4) * 6)]; 1461 r5 = shared.m[get_local_id(0) + (2 * (1 << 4) * 7)]; 1462 { 1463 { 1464 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 1465 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1466 ; 1467 { 1468 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1469 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1470 }; 1471 { 1472 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1473 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1474 }; 1475 { 1476 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1477 r3 = ((r3 <= ta) ^ t_lt) ? 
ta : r3; 1478 }; 1479 { 1480 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1481 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1482 }; 1483 { 1484 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1485 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1486 }; 1487 { 1488 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1489 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1490 }; 1491 { 1492 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1493 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1494 }; 1495 { 1496 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1497 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1498 }; 1499 } 1500 { 1501 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 1502 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1503 ; 1504 { 1505 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1506 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1507 }; 1508 { 1509 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1510 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1511 }; 1512 { 1513 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1514 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1515 }; 1516 { 1517 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1518 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1519 }; 1520 { 1521 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1522 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1523 }; 1524 { 1525 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1526 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1527 }; 1528 { 1529 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1530 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1531 }; 1532 { 1533 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1534 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1535 }; 1536 } 1537 { 1538 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 1539 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1540 ; 1541 { 1542 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1543 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 1544 }; 1545 { 1546 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1547 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1548 }; 1549 { 1550 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1551 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1552 }; 1553 { 1554 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1555 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1556 }; 1557 { 1558 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1559 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1560 }; 1561 { 1562 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1563 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1564 }; 1565 { 1566 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1567 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1568 }; 1569 { 1570 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1571 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1572 }; 1573 } 1574 { 1575 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 1576 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1577 ; 1578 { 1579 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1580 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1581 }; 1582 { 1583 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1584 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1585 }; 1586 { 1587 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1588 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1589 }; 1590 { 1591 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1592 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1593 }; 1594 { 1595 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1596 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1597 }; 1598 { 1599 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1600 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1601 }; 1602 { 1603 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1604 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1605 }; 1606 { 1607 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1608 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 1609 }; 1610 } 1611 { 1612 uint const t = min(r1, r5); 1613 r5 = max(r1, r5); 1614 r1 = t; 1615 }; 1616 { 1617 uint const t = min(r3, r7); 1618 r7 = max(r3, r7); 1619 r3 = t; 1620 }; 1621 { 1622 uint const t = min(r1, r3); 1623 r3 = max(r1, r3); 1624 r1 = t; 1625 }; 1626 { 1627 uint const t = min(r5, r7); 1628 r7 = max(r5, r7); 1629 r5 = t; 1630 }; 1631 { 1632 uint const t = min(r2, r6); 1633 r6 = max(r2, r6); 1634 r2 = t; 1635 }; 1636 { 1637 uint const t = min(r4, r8); 1638 r8 = max(r4, r8); 1639 r4 = t; 1640 }; 1641 { 1642 uint const t = min(r2, r4); 1643 r4 = max(r2, r4); 1644 r2 = t; 1645 }; 1646 { 1647 uint const t = min(r6, r8); 1648 r8 = max(r6, r8); 1649 r6 = t; 1650 }; 1651 { 1652 uint const t = min(r1, r2); 1653 r2 = max(r1, r2); 1654 r1 = t; 1655 }; 1656 { 1657 uint const t = min(r3, r4); 1658 r4 = max(r3, r4); 1659 r3 = t; 1660 }; 1661 { 1662 uint const t = min(r5, r6); 1663 r6 = max(r5, r6); 1664 r5 = t; 1665 }; 1666 { 1667 uint const t = min(r7, r8); 1668 r8 = max(r7, r8); 1669 r7 = t; 1670 }; 1671 } 1672 vout[gmem_idx + (1 << 4) * 0] = r1; 1673 vout[gmem_idx + (1 << 4) * 1] = r2; 1674 vout[gmem_idx + (1 << 4) * 2] = r3; 1675 vout[gmem_idx + (1 << 4) * 3] = r4; 1676 vout[gmem_idx + (1 << 4) * 4] = r5; 1677 vout[gmem_idx + (1 << 4) * 5] = r6; 1678 vout[gmem_idx + (1 << 4) * 6] = r7; 1679 vout[gmem_idx + (1 << 4) * 7] = r8; 1680} 1681 1682__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) 1683__attribute__((reqd_work_group_size((1 << 4) * 4, 1, 1))) void 1684hs_kernel_bs_2(__global uint const* const restrict vin, 1685 __global uint* const restrict vout) 1686{ 1687 __local struct 1688 { 1689 uint m[64 * 8]; 1690 } shared; 1691 1692 uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 + 1693 (get_local_id(0) & ((1 << 4) - 1)); 1694 uint r1 = vin[gmem_idx + (1 << 4) * 0]; 1695 uint r2 = vin[gmem_idx + (1 << 4) * 1]; 1696 uint r3 = vin[gmem_idx + (1 << 4) * 2]; 1697 uint r4 = vin[gmem_idx + (1 << 4) * 3]; 1698 uint r5 = 
vin[gmem_idx + (1 << 4) * 4]; 1699 uint r6 = vin[gmem_idx + (1 << 4) * 5]; 1700 uint r7 = vin[gmem_idx + (1 << 4) * 6]; 1701 uint r8 = vin[gmem_idx + (1 << 4) * 7]; 1702 { 1703 uint const t = min(r1, r5); 1704 r5 = max(r1, r5); 1705 r1 = t; 1706 }; 1707 { 1708 uint const t = min(r2, r6); 1709 r6 = max(r2, r6); 1710 r2 = t; 1711 }; 1712 { 1713 uint const t = min(r3, r7); 1714 r7 = max(r3, r7); 1715 r3 = t; 1716 }; 1717 { 1718 uint const t = min(r4, r8); 1719 r8 = max(r4, r8); 1720 r4 = t; 1721 }; 1722 { 1723 uint const t = min(r1, r3); 1724 r3 = max(r1, r3); 1725 r1 = t; 1726 }; 1727 { 1728 uint const t = min(r2, r4); 1729 r4 = max(r2, r4); 1730 r2 = t; 1731 }; 1732 { 1733 uint const t = min(r5, r7); 1734 r7 = max(r5, r7); 1735 r5 = t; 1736 }; 1737 { 1738 uint const t = min(r6, r8); 1739 r8 = max(r6, r8); 1740 r6 = t; 1741 }; 1742 { 1743 uint const t = min(r3, r5); 1744 r5 = max(r3, r5); 1745 r3 = t; 1746 }; 1747 { 1748 uint const t = min(r4, r6); 1749 r6 = max(r4, r6); 1750 r4 = t; 1751 }; 1752 { 1753 uint const t = min(r1, r2); 1754 r2 = max(r1, r2); 1755 r1 = t; 1756 }; 1757 { 1758 uint const t = min(r3, r4); 1759 r4 = max(r3, r4); 1760 r3 = t; 1761 }; 1762 { 1763 uint const t = min(r5, r6); 1764 r6 = max(r5, r6); 1765 r5 = t; 1766 }; 1767 { 1768 uint const t = min(r7, r8); 1769 r8 = max(r7, r8); 1770 r7 = t; 1771 }; 1772 { 1773 uint const t = min(r2, r5); 1774 r5 = max(r2, r5); 1775 r2 = t; 1776 }; 1777 { 1778 uint const t = min(r4, r7); 1779 r7 = max(r4, r7); 1780 r4 = t; 1781 }; 1782 { 1783 uint const t = min(r2, r3); 1784 r3 = max(r2, r3); 1785 r2 = t; 1786 }; 1787 { 1788 uint const t = min(r4, r5); 1789 r5 = max(r4, r5); 1790 r4 = t; 1791 }; 1792 { 1793 uint const t = min(r6, r7); 1794 r7 = max(r6, r7); 1795 r6 = t; 1796 }; 1797 { 1798 uint const flip_lane_idx = get_sub_group_local_id() ^ 1; 1799 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 1800 ; 1801 { 1802 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 1803 uint const tb = 
intel_sub_group_shuffle(r8, flip_lane_idx); 1804 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 1805 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1806 }; 1807 { 1808 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 1809 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 1810 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 1811 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1812 }; 1813 { 1814 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 1815 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 1816 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 1817 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1818 }; 1819 { 1820 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 1821 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 1822 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 1823 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1824 }; 1825 } 1826 { 1827 uint const t = min(r1, r5); 1828 r5 = max(r1, r5); 1829 r1 = t; 1830 }; 1831 { 1832 uint const t = min(r3, r7); 1833 r7 = max(r3, r7); 1834 r3 = t; 1835 }; 1836 { 1837 uint const t = min(r1, r3); 1838 r3 = max(r1, r3); 1839 r1 = t; 1840 }; 1841 { 1842 uint const t = min(r5, r7); 1843 r7 = max(r5, r7); 1844 r5 = t; 1845 }; 1846 { 1847 uint const t = min(r2, r6); 1848 r6 = max(r2, r6); 1849 r2 = t; 1850 }; 1851 { 1852 uint const t = min(r4, r8); 1853 r8 = max(r4, r8); 1854 r4 = t; 1855 }; 1856 { 1857 uint const t = min(r2, r4); 1858 r4 = max(r2, r4); 1859 r2 = t; 1860 }; 1861 { 1862 uint const t = min(r6, r8); 1863 r8 = max(r6, r8); 1864 r6 = t; 1865 }; 1866 { 1867 uint const t = min(r1, r2); 1868 r2 = max(r1, r2); 1869 r1 = t; 1870 }; 1871 { 1872 uint const t = min(r3, r4); 1873 r4 = max(r3, r4); 1874 r3 = t; 1875 }; 1876 { 1877 uint const t = min(r5, r6); 1878 r6 = max(r5, r6); 1879 r5 = t; 1880 }; 1881 { 1882 uint const t = min(r7, r8); 1883 r8 = max(r7, r8); 1884 r7 = t; 1885 }; 1886 { 1887 uint const flip_lane_idx = get_sub_group_local_id() ^ 3; 1888 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 1889 ; 1890 { 1891 uint const ta = 
intel_sub_group_shuffle(r1, flip_lane_idx); 1892 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 1893 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 1894 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 1895 }; 1896 { 1897 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 1898 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 1899 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 1900 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1901 }; 1902 { 1903 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 1904 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 1905 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 1906 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1907 }; 1908 { 1909 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 1910 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 1911 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 1912 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1913 }; 1914 } 1915 { 1916 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 1917 int const t_lt = get_sub_group_local_id() < half_lane_idx; 1918 ; 1919 { 1920 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 1921 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 1922 }; 1923 { 1924 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 1925 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 1926 }; 1927 { 1928 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 1929 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 1930 }; 1931 { 1932 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 1933 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 1934 }; 1935 { 1936 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 1937 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 1938 }; 1939 { 1940 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 1941 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 1942 }; 1943 { 1944 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 1945 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 1946 }; 1947 { 1948 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 1949 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 1950 }; 1951 } 1952 { 1953 uint const t = min(r1, r5); 1954 r5 = max(r1, r5); 1955 r1 = t; 1956 }; 1957 { 1958 uint const t = min(r3, r7); 1959 r7 = max(r3, r7); 1960 r3 = t; 1961 }; 1962 { 1963 uint const t = min(r1, r3); 1964 r3 = max(r1, r3); 1965 r1 = t; 1966 }; 1967 { 1968 uint const t = min(r5, r7); 1969 r7 = max(r5, r7); 1970 r5 = t; 1971 }; 1972 { 1973 uint const t = min(r2, r6); 1974 r6 = max(r2, r6); 1975 r2 = t; 1976 }; 1977 { 1978 uint const t = min(r4, r8); 1979 r8 = max(r4, r8); 1980 r4 = t; 1981 }; 1982 { 1983 uint const t = min(r2, r4); 1984 r4 = max(r2, r4); 1985 r2 = t; 1986 }; 1987 { 1988 uint const t = min(r6, r8); 1989 r8 = max(r6, r8); 1990 r6 = t; 1991 }; 1992 { 1993 uint const t = min(r1, r2); 1994 r2 = max(r1, r2); 1995 r1 = t; 1996 }; 1997 { 1998 uint const t = min(r3, r4); 1999 r4 = max(r3, r4); 2000 r3 = t; 2001 }; 2002 { 2003 uint const t = min(r5, r6); 2004 r6 = max(r5, r6); 2005 r5 = t; 2006 }; 2007 { 2008 uint const t = min(r7, r8); 2009 r8 = max(r7, r8); 2010 r7 = t; 2011 }; 2012 { 2013 uint const flip_lane_idx = get_sub_group_local_id() ^ 7; 2014 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 2015 ; 2016 { 2017 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 2018 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 2019 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 2020 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2021 }; 2022 { 2023 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 2024 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 2025 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 2026 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2027 }; 2028 { 2029 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 2030 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 2031 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 2032 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 2033 }; 2034 { 2035 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 2036 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 2037 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 2038 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2039 }; 2040 } 2041 { 2042 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 2043 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2044 ; 2045 { 2046 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2047 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2048 }; 2049 { 2050 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2051 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2052 }; 2053 { 2054 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2055 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2056 }; 2057 { 2058 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2059 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2060 }; 2061 { 2062 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2063 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2064 }; 2065 { 2066 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2067 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2068 }; 2069 { 2070 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2071 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2072 }; 2073 { 2074 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2075 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2076 }; 2077 } 2078 { 2079 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 2080 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2081 ; 2082 { 2083 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2084 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2085 }; 2086 { 2087 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2088 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2089 }; 2090 { 2091 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2092 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2093 }; 2094 { 2095 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2096 r4 = ((r4 <= ta) ^ t_lt) ? 
ta : r4; 2097 }; 2098 { 2099 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2100 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2101 }; 2102 { 2103 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2104 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2105 }; 2106 { 2107 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2108 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2109 }; 2110 { 2111 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2112 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2113 }; 2114 } 2115 { 2116 uint const t = min(r1, r5); 2117 r5 = max(r1, r5); 2118 r1 = t; 2119 }; 2120 { 2121 uint const t = min(r3, r7); 2122 r7 = max(r3, r7); 2123 r3 = t; 2124 }; 2125 { 2126 uint const t = min(r1, r3); 2127 r3 = max(r1, r3); 2128 r1 = t; 2129 }; 2130 { 2131 uint const t = min(r5, r7); 2132 r7 = max(r5, r7); 2133 r5 = t; 2134 }; 2135 { 2136 uint const t = min(r2, r6); 2137 r6 = max(r2, r6); 2138 r2 = t; 2139 }; 2140 { 2141 uint const t = min(r4, r8); 2142 r8 = max(r4, r8); 2143 r4 = t; 2144 }; 2145 { 2146 uint const t = min(r2, r4); 2147 r4 = max(r2, r4); 2148 r2 = t; 2149 }; 2150 { 2151 uint const t = min(r6, r8); 2152 r8 = max(r6, r8); 2153 r6 = t; 2154 }; 2155 { 2156 uint const t = min(r1, r2); 2157 r2 = max(r1, r2); 2158 r1 = t; 2159 }; 2160 { 2161 uint const t = min(r3, r4); 2162 r4 = max(r3, r4); 2163 r3 = t; 2164 }; 2165 { 2166 uint const t = min(r5, r6); 2167 r6 = max(r5, r6); 2168 r5 = t; 2169 }; 2170 { 2171 uint const t = min(r7, r8); 2172 r8 = max(r7, r8); 2173 r7 = t; 2174 }; 2175 { 2176 uint const flip_lane_idx = get_sub_group_local_id() ^ 15; 2177 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 2178 ; 2179 { 2180 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 2181 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 2182 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 2183 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 2184 }; 2185 { 2186 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 2187 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 2188 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 2189 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2190 }; 2191 { 2192 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 2193 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 2194 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 2195 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2196 }; 2197 { 2198 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 2199 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 2200 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 2201 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2202 }; 2203 } 2204 { 2205 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 2206 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2207 ; 2208 { 2209 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2210 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2211 }; 2212 { 2213 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2214 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2215 }; 2216 { 2217 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2218 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2219 }; 2220 { 2221 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2222 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2223 }; 2224 { 2225 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2226 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2227 }; 2228 { 2229 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2230 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2231 }; 2232 { 2233 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2234 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2235 }; 2236 { 2237 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2238 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 2239 }; 2240 } 2241 { 2242 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 2243 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2244 ; 2245 { 2246 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2247 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2248 }; 2249 { 2250 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2251 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2252 }; 2253 { 2254 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2255 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2256 }; 2257 { 2258 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2259 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2260 }; 2261 { 2262 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2263 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2264 }; 2265 { 2266 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2267 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2268 }; 2269 { 2270 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2271 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2272 }; 2273 { 2274 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2275 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2276 }; 2277 } 2278 { 2279 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 2280 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2281 ; 2282 { 2283 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2284 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2285 }; 2286 { 2287 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2288 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2289 }; 2290 { 2291 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2292 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2293 }; 2294 { 2295 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2296 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2297 }; 2298 { 2299 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2300 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2301 }; 2302 { 2303 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2304 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 2305 }; 2306 { 2307 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2308 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2309 }; 2310 { 2311 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2312 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2313 }; 2314 } 2315 { 2316 uint const t = min(r1, r5); 2317 r5 = max(r1, r5); 2318 r1 = t; 2319 }; 2320 { 2321 uint const t = min(r3, r7); 2322 r7 = max(r3, r7); 2323 r3 = t; 2324 }; 2325 { 2326 uint const t = min(r1, r3); 2327 r3 = max(r1, r3); 2328 r1 = t; 2329 }; 2330 { 2331 uint const t = min(r5, r7); 2332 r7 = max(r5, r7); 2333 r5 = t; 2334 }; 2335 { 2336 uint const t = min(r2, r6); 2337 r6 = max(r2, r6); 2338 r2 = t; 2339 }; 2340 { 2341 uint const t = min(r4, r8); 2342 r8 = max(r4, r8); 2343 r4 = t; 2344 }; 2345 { 2346 uint const t = min(r2, r4); 2347 r4 = max(r2, r4); 2348 r2 = t; 2349 }; 2350 { 2351 uint const t = min(r6, r8); 2352 r8 = max(r6, r8); 2353 r6 = t; 2354 }; 2355 { 2356 uint const t = min(r1, r2); 2357 r2 = max(r1, r2); 2358 r1 = t; 2359 }; 2360 { 2361 uint const t = min(r3, r4); 2362 r4 = max(r3, r4); 2363 r3 = t; 2364 }; 2365 { 2366 uint const t = min(r5, r6); 2367 r6 = max(r5, r6); 2368 r5 = t; 2369 }; 2370 { 2371 uint const t = min(r7, r8); 2372 r8 = max(r7, r8); 2373 r7 = t; 2374 }; 2375 uint const smem_l_idx = 2376 get_sub_group_id() * ((1 << 4) * 4) + get_sub_group_local_id(); 2377 uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 4) + 2378 (get_sub_group_local_id() ^ ((1 << 4) - 1)); 2379 shared.m[get_local_id(0) + (4 * (1 << 4) * 0)] = r1; 2380 shared.m[get_local_id(0) + (4 * (1 << 4) * 1)] = r8; 2381 shared.m[get_local_id(0) + (4 * (1 << 4) * 2)] = r2; 2382 shared.m[get_local_id(0) + (4 * (1 << 4) * 3)] = r7; 2383 shared.m[get_local_id(0) + (4 * (1 << 4) * 4)] = r3; 2384 shared.m[get_local_id(0) + (4 * (1 << 4) * 5)] = r6; 2385 shared.m[get_local_id(0) + (4 * (1 << 4) * 6)] = r4; 2386 shared.m[get_local_id(0) + (4 * (1 << 4) * 7)] = r5; 2387 barrier(CLK_LOCAL_MEM_FENCE); 2388 
{ 2389 { 2390 uint r0_1 = shared.m[smem_l_idx + (0)]; 2391 uint r0_2 = shared.m[smem_r_idx + (16)]; 2392 { 2393 uint const t = min(r0_1, r0_2); 2394 r0_2 = max(r0_1, r0_2); 2395 r0_1 = t; 2396 }; 2397 shared.m[smem_l_idx + (0)] = r0_1; 2398 shared.m[smem_r_idx + (16)] = r0_2; 2399 } 2400 { 2401 uint r1_1 = shared.m[smem_l_idx + (32)]; 2402 uint r1_2 = shared.m[smem_r_idx + (48)]; 2403 { 2404 uint const t = min(r1_1, r1_2); 2405 r1_2 = max(r1_1, r1_2); 2406 r1_1 = t; 2407 }; 2408 shared.m[smem_l_idx + (32)] = r1_1; 2409 shared.m[smem_r_idx + (48)] = r1_2; 2410 } 2411 { 2412 uint r0_1 = shared.m[smem_l_idx + (256)]; 2413 uint r0_2 = shared.m[smem_r_idx + (272)]; 2414 { 2415 uint const t = min(r0_1, r0_2); 2416 r0_2 = max(r0_1, r0_2); 2417 r0_1 = t; 2418 }; 2419 shared.m[smem_l_idx + (256)] = r0_1; 2420 shared.m[smem_r_idx + (272)] = r0_2; 2421 } 2422 { 2423 uint r1_1 = shared.m[smem_l_idx + (288)]; 2424 uint r1_2 = shared.m[smem_r_idx + (304)]; 2425 { 2426 uint const t = min(r1_1, r1_2); 2427 r1_2 = max(r1_1, r1_2); 2428 r1_1 = t; 2429 }; 2430 shared.m[smem_l_idx + (288)] = r1_1; 2431 shared.m[smem_r_idx + (304)] = r1_2; 2432 } 2433 } 2434 barrier(CLK_LOCAL_MEM_FENCE); 2435 r1 = shared.m[get_local_id(0) + (4 * (1 << 4) * 0)]; 2436 r8 = shared.m[get_local_id(0) + (4 * (1 << 4) * 1)]; 2437 r2 = shared.m[get_local_id(0) + (4 * (1 << 4) * 2)]; 2438 r7 = shared.m[get_local_id(0) + (4 * (1 << 4) * 3)]; 2439 r3 = shared.m[get_local_id(0) + (4 * (1 << 4) * 4)]; 2440 r6 = shared.m[get_local_id(0) + (4 * (1 << 4) * 5)]; 2441 r4 = shared.m[get_local_id(0) + (4 * (1 << 4) * 6)]; 2442 r5 = shared.m[get_local_id(0) + (4 * (1 << 4) * 7)]; 2443 { 2444 { 2445 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 2446 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2447 ; 2448 { 2449 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2450 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 2451 }; 2452 { 2453 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2454 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2455 }; 2456 { 2457 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2458 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2459 }; 2460 { 2461 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2462 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2463 }; 2464 { 2465 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2466 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2467 }; 2468 { 2469 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2470 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2471 }; 2472 { 2473 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2474 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2475 }; 2476 { 2477 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2478 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2479 }; 2480 } 2481 { 2482 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 2483 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2484 ; 2485 { 2486 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2487 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2488 }; 2489 { 2490 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2491 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2492 }; 2493 { 2494 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2495 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2496 }; 2497 { 2498 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2499 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2500 }; 2501 { 2502 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2503 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2504 }; 2505 { 2506 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2507 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2508 }; 2509 { 2510 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2511 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2512 }; 2513 { 2514 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2515 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 2516 }; 2517 } 2518 { 2519 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 2520 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2521 ; 2522 { 2523 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2524 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2525 }; 2526 { 2527 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2528 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2529 }; 2530 { 2531 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2532 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2533 }; 2534 { 2535 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2536 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2537 }; 2538 { 2539 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2540 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2541 }; 2542 { 2543 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2544 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2545 }; 2546 { 2547 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2548 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2549 }; 2550 { 2551 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2552 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2553 }; 2554 } 2555 { 2556 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 2557 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2558 ; 2559 { 2560 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2561 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2562 }; 2563 { 2564 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2565 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2566 }; 2567 { 2568 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2569 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2570 }; 2571 { 2572 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2573 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2574 }; 2575 { 2576 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2577 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2578 }; 2579 { 2580 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2581 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 2582 }; 2583 { 2584 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2585 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2586 }; 2587 { 2588 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2589 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2590 }; 2591 } 2592 { 2593 uint const t = min(r1, r5); 2594 r5 = max(r1, r5); 2595 r1 = t; 2596 }; 2597 { 2598 uint const t = min(r3, r7); 2599 r7 = max(r3, r7); 2600 r3 = t; 2601 }; 2602 { 2603 uint const t = min(r1, r3); 2604 r3 = max(r1, r3); 2605 r1 = t; 2606 }; 2607 { 2608 uint const t = min(r5, r7); 2609 r7 = max(r5, r7); 2610 r5 = t; 2611 }; 2612 { 2613 uint const t = min(r2, r6); 2614 r6 = max(r2, r6); 2615 r2 = t; 2616 }; 2617 { 2618 uint const t = min(r4, r8); 2619 r8 = max(r4, r8); 2620 r4 = t; 2621 }; 2622 { 2623 uint const t = min(r2, r4); 2624 r4 = max(r2, r4); 2625 r2 = t; 2626 }; 2627 { 2628 uint const t = min(r6, r8); 2629 r8 = max(r6, r8); 2630 r6 = t; 2631 }; 2632 { 2633 uint const t = min(r1, r2); 2634 r2 = max(r1, r2); 2635 r1 = t; 2636 }; 2637 { 2638 uint const t = min(r3, r4); 2639 r4 = max(r3, r4); 2640 r3 = t; 2641 }; 2642 { 2643 uint const t = min(r5, r6); 2644 r6 = max(r5, r6); 2645 r5 = t; 2646 }; 2647 { 2648 uint const t = min(r7, r8); 2649 r8 = max(r7, r8); 2650 r7 = t; 2651 }; 2652 } 2653 shared.m[get_local_id(0) + (4 * (1 << 4) * 0)] = r1; 2654 shared.m[get_local_id(0) + (4 * (1 << 4) * 1)] = r8; 2655 shared.m[get_local_id(0) + (4 * (1 << 4) * 2)] = r2; 2656 shared.m[get_local_id(0) + (4 * (1 << 4) * 3)] = r7; 2657 shared.m[get_local_id(0) + (4 * (1 << 4) * 4)] = r3; 2658 shared.m[get_local_id(0) + (4 * (1 << 4) * 5)] = r6; 2659 shared.m[get_local_id(0) + (4 * (1 << 4) * 6)] = r4; 2660 shared.m[get_local_id(0) + (4 * (1 << 4) * 7)] = r5; 2661 barrier(CLK_LOCAL_MEM_FENCE); 2662 { 2663 { 2664 uint r0_1 = shared.m[smem_l_idx + (0)]; 2665 uint r0_2 = shared.m[smem_l_idx + (16)]; 2666 uint r0_3 = shared.m[smem_r_idx + (32)]; 2667 uint r0_4 = shared.m[smem_r_idx + (48)]; 2668 { 2669 uint const 
t = min(r0_2, r0_3); 2670 r0_3 = max(r0_2, r0_3); 2671 r0_2 = t; 2672 }; 2673 { 2674 uint const t = min(r0_1, r0_4); 2675 r0_4 = max(r0_1, r0_4); 2676 r0_1 = t; 2677 }; 2678 { 2679 uint const t = min(r0_3, r0_4); 2680 r0_4 = max(r0_3, r0_4); 2681 r0_3 = t; 2682 }; 2683 { 2684 uint const t = min(r0_1, r0_2); 2685 r0_2 = max(r0_1, r0_2); 2686 r0_1 = t; 2687 }; 2688 shared.m[smem_l_idx + (0)] = r0_1; 2689 shared.m[smem_l_idx + (16)] = r0_2; 2690 shared.m[smem_r_idx + (32)] = r0_3; 2691 shared.m[smem_r_idx + (48)] = r0_4; 2692 } 2693 { 2694 uint r0_1 = shared.m[smem_l_idx + (256)]; 2695 uint r0_2 = shared.m[smem_l_idx + (272)]; 2696 uint r0_3 = shared.m[smem_r_idx + (288)]; 2697 uint r0_4 = shared.m[smem_r_idx + (304)]; 2698 { 2699 uint const t = min(r0_2, r0_3); 2700 r0_3 = max(r0_2, r0_3); 2701 r0_2 = t; 2702 }; 2703 { 2704 uint const t = min(r0_1, r0_4); 2705 r0_4 = max(r0_1, r0_4); 2706 r0_1 = t; 2707 }; 2708 { 2709 uint const t = min(r0_3, r0_4); 2710 r0_4 = max(r0_3, r0_4); 2711 r0_3 = t; 2712 }; 2713 { 2714 uint const t = min(r0_1, r0_2); 2715 r0_2 = max(r0_1, r0_2); 2716 r0_1 = t; 2717 }; 2718 shared.m[smem_l_idx + (256)] = r0_1; 2719 shared.m[smem_l_idx + (272)] = r0_2; 2720 shared.m[smem_r_idx + (288)] = r0_3; 2721 shared.m[smem_r_idx + (304)] = r0_4; 2722 } 2723 } 2724 barrier(CLK_LOCAL_MEM_FENCE); 2725 r1 = shared.m[get_local_id(0) + (4 * (1 << 4) * 0)]; 2726 r8 = shared.m[get_local_id(0) + (4 * (1 << 4) * 1)]; 2727 r2 = shared.m[get_local_id(0) + (4 * (1 << 4) * 2)]; 2728 r7 = shared.m[get_local_id(0) + (4 * (1 << 4) * 3)]; 2729 r3 = shared.m[get_local_id(0) + (4 * (1 << 4) * 4)]; 2730 r6 = shared.m[get_local_id(0) + (4 * (1 << 4) * 5)]; 2731 r4 = shared.m[get_local_id(0) + (4 * (1 << 4) * 6)]; 2732 r5 = shared.m[get_local_id(0) + (4 * (1 << 4) * 7)]; 2733 { 2734 { 2735 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 2736 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2737 ; 2738 { 2739 uint const ta = intel_sub_group_shuffle(r1, 
half_lane_idx); 2740 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2741 }; 2742 { 2743 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2744 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2745 }; 2746 { 2747 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2748 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2749 }; 2750 { 2751 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2752 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2753 }; 2754 { 2755 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2756 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2757 }; 2758 { 2759 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2760 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2761 }; 2762 { 2763 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2764 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2765 }; 2766 { 2767 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2768 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2769 }; 2770 } 2771 { 2772 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 2773 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2774 ; 2775 { 2776 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2777 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2778 }; 2779 { 2780 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2781 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2782 }; 2783 { 2784 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2785 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2786 }; 2787 { 2788 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2789 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2790 }; 2791 { 2792 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2793 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2794 }; 2795 { 2796 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2797 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2798 }; 2799 { 2800 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2801 r7 = ((r7 <= ta) ^ t_lt) ? 
ta : r7; 2802 }; 2803 { 2804 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2805 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2806 }; 2807 } 2808 { 2809 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 2810 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2811 ; 2812 { 2813 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2814 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2815 }; 2816 { 2817 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2818 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2819 }; 2820 { 2821 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2822 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2823 }; 2824 { 2825 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2826 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2827 }; 2828 { 2829 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2830 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 2831 }; 2832 { 2833 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2834 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2835 }; 2836 { 2837 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2838 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2839 }; 2840 { 2841 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2842 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2843 }; 2844 } 2845 { 2846 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 2847 int const t_lt = get_sub_group_local_id() < half_lane_idx; 2848 ; 2849 { 2850 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 2851 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 2852 }; 2853 { 2854 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 2855 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 2856 }; 2857 { 2858 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 2859 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 2860 }; 2861 { 2862 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 2863 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 2864 }; 2865 { 2866 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 2867 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 2868 }; 2869 { 2870 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 2871 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 2872 }; 2873 { 2874 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 2875 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 2876 }; 2877 { 2878 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 2879 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 2880 }; 2881 } 2882 { 2883 uint const t = min(r1, r5); 2884 r5 = max(r1, r5); 2885 r1 = t; 2886 }; 2887 { 2888 uint const t = min(r3, r7); 2889 r7 = max(r3, r7); 2890 r3 = t; 2891 }; 2892 { 2893 uint const t = min(r1, r3); 2894 r3 = max(r1, r3); 2895 r1 = t; 2896 }; 2897 { 2898 uint const t = min(r5, r7); 2899 r7 = max(r5, r7); 2900 r5 = t; 2901 }; 2902 { 2903 uint const t = min(r2, r6); 2904 r6 = max(r2, r6); 2905 r2 = t; 2906 }; 2907 { 2908 uint const t = min(r4, r8); 2909 r8 = max(r4, r8); 2910 r4 = t; 2911 }; 2912 { 2913 uint const t = min(r2, r4); 2914 r4 = max(r2, r4); 2915 r2 = t; 2916 }; 2917 { 2918 uint const t = min(r6, r8); 2919 r8 = max(r6, r8); 2920 r6 = t; 2921 }; 2922 { 2923 uint const t = min(r1, r2); 2924 r2 = max(r1, r2); 2925 r1 = t; 2926 }; 2927 { 2928 uint const t = min(r3, r4); 2929 r4 = max(r3, r4); 2930 r3 = t; 2931 }; 2932 { 2933 uint const t = min(r5, r6); 2934 r6 = max(r5, r6); 2935 r5 = t; 2936 }; 2937 { 2938 uint const t = min(r7, r8); 2939 r8 = max(r7, r8); 2940 r7 = t; 2941 }; 2942 } 2943 vout[gmem_idx + (1 << 4) * 0] = r1; 2944 vout[gmem_idx + (1 << 4) * 1] = r2; 2945 vout[gmem_idx + (1 << 4) * 2] = r3; 2946 vout[gmem_idx + (1 << 4) * 3] = r4; 2947 vout[gmem_idx + (1 << 4) * 4] = r5; 2948 vout[gmem_idx + (1 << 4) * 5] = r6; 2949 vout[gmem_idx + (1 << 4) * 6] = r7; 2950 vout[gmem_idx + (1 << 4) * 7] = r8; 2951} 2952 2953__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) 2954__attribute__((reqd_work_group_size((1 << 4) * 8, 1, 1))) void 2955hs_kernel_bs_3(__global uint const* const restrict vin, 2956 __global uint* const restrict vout) 2957{ 2958 __local 
struct 2959 { 2960 uint m[128 * 8]; 2961 } shared; 2962 2963 uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 + 2964 (get_local_id(0) & ((1 << 4) - 1)); 2965 uint r1 = vin[gmem_idx + (1 << 4) * 0]; 2966 uint r2 = vin[gmem_idx + (1 << 4) * 1]; 2967 uint r3 = vin[gmem_idx + (1 << 4) * 2]; 2968 uint r4 = vin[gmem_idx + (1 << 4) * 3]; 2969 uint r5 = vin[gmem_idx + (1 << 4) * 4]; 2970 uint r6 = vin[gmem_idx + (1 << 4) * 5]; 2971 uint r7 = vin[gmem_idx + (1 << 4) * 6]; 2972 uint r8 = vin[gmem_idx + (1 << 4) * 7]; 2973 { 2974 uint const t = min(r1, r5); 2975 r5 = max(r1, r5); 2976 r1 = t; 2977 }; 2978 { 2979 uint const t = min(r2, r6); 2980 r6 = max(r2, r6); 2981 r2 = t; 2982 }; 2983 { 2984 uint const t = min(r3, r7); 2985 r7 = max(r3, r7); 2986 r3 = t; 2987 }; 2988 { 2989 uint const t = min(r4, r8); 2990 r8 = max(r4, r8); 2991 r4 = t; 2992 }; 2993 { 2994 uint const t = min(r1, r3); 2995 r3 = max(r1, r3); 2996 r1 = t; 2997 }; 2998 { 2999 uint const t = min(r2, r4); 3000 r4 = max(r2, r4); 3001 r2 = t; 3002 }; 3003 { 3004 uint const t = min(r5, r7); 3005 r7 = max(r5, r7); 3006 r5 = t; 3007 }; 3008 { 3009 uint const t = min(r6, r8); 3010 r8 = max(r6, r8); 3011 r6 = t; 3012 }; 3013 { 3014 uint const t = min(r3, r5); 3015 r5 = max(r3, r5); 3016 r3 = t; 3017 }; 3018 { 3019 uint const t = min(r4, r6); 3020 r6 = max(r4, r6); 3021 r4 = t; 3022 }; 3023 { 3024 uint const t = min(r1, r2); 3025 r2 = max(r1, r2); 3026 r1 = t; 3027 }; 3028 { 3029 uint const t = min(r3, r4); 3030 r4 = max(r3, r4); 3031 r3 = t; 3032 }; 3033 { 3034 uint const t = min(r5, r6); 3035 r6 = max(r5, r6); 3036 r5 = t; 3037 }; 3038 { 3039 uint const t = min(r7, r8); 3040 r8 = max(r7, r8); 3041 r7 = t; 3042 }; 3043 { 3044 uint const t = min(r2, r5); 3045 r5 = max(r2, r5); 3046 r2 = t; 3047 }; 3048 { 3049 uint const t = min(r4, r7); 3050 r7 = max(r4, r7); 3051 r4 = t; 3052 }; 3053 { 3054 uint const t = min(r2, r3); 3055 r3 = max(r2, r3); 3056 r2 = t; 3057 }; 3058 { 3059 uint const t = min(r4, r5); 3060 
r5 = max(r4, r5); 3061 r4 = t; 3062 }; 3063 { 3064 uint const t = min(r6, r7); 3065 r7 = max(r6, r7); 3066 r6 = t; 3067 }; 3068 { 3069 uint const flip_lane_idx = get_sub_group_local_id() ^ 1; 3070 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 3071 ; 3072 { 3073 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 3074 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 3075 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 3076 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3077 }; 3078 { 3079 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 3080 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 3081 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 3082 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3083 }; 3084 { 3085 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 3086 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 3087 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 3088 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3089 }; 3090 { 3091 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 3092 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 3093 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 3094 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 3095 }; 3096 } 3097 { 3098 uint const t = min(r1, r5); 3099 r5 = max(r1, r5); 3100 r1 = t; 3101 }; 3102 { 3103 uint const t = min(r3, r7); 3104 r7 = max(r3, r7); 3105 r3 = t; 3106 }; 3107 { 3108 uint const t = min(r1, r3); 3109 r3 = max(r1, r3); 3110 r1 = t; 3111 }; 3112 { 3113 uint const t = min(r5, r7); 3114 r7 = max(r5, r7); 3115 r5 = t; 3116 }; 3117 { 3118 uint const t = min(r2, r6); 3119 r6 = max(r2, r6); 3120 r2 = t; 3121 }; 3122 { 3123 uint const t = min(r4, r8); 3124 r8 = max(r4, r8); 3125 r4 = t; 3126 }; 3127 { 3128 uint const t = min(r2, r4); 3129 r4 = max(r2, r4); 3130 r2 = t; 3131 }; 3132 { 3133 uint const t = min(r6, r8); 3134 r8 = max(r6, r8); 3135 r6 = t; 3136 }; 3137 { 3138 uint const t = min(r1, r2); 3139 r2 = max(r1, r2); 3140 r1 = t; 3141 }; 3142 { 3143 uint const t = min(r3, r4); 3144 r4 = max(r3, r4); 3145 r3 = t; 3146 }; 3147 { 3148 uint const t = min(r5, r6); 3149 r6 = max(r5, r6); 3150 r5 = t; 3151 }; 3152 { 3153 uint const t = min(r7, r8); 3154 r8 = max(r7, r8); 3155 r7 = t; 3156 }; 3157 { 3158 uint const flip_lane_idx = get_sub_group_local_id() ^ 3; 3159 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 3160 ; 3161 { 3162 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 3163 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 3164 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 3165 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3166 }; 3167 { 3168 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 3169 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 3170 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 3171 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3172 }; 3173 { 3174 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 3175 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 3176 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 3177 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 3178 }; 3179 { 3180 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 3181 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 3182 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 3183 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3184 }; 3185 } 3186 { 3187 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 3188 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3189 ; 3190 { 3191 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3192 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3193 }; 3194 { 3195 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3196 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3197 }; 3198 { 3199 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3200 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3201 }; 3202 { 3203 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3204 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3205 }; 3206 { 3207 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3208 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3209 }; 3210 { 3211 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3212 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3213 }; 3214 { 3215 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3216 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3217 }; 3218 { 3219 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3220 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 3221 }; 3222 } 3223 { 3224 uint const t = min(r1, r5); 3225 r5 = max(r1, r5); 3226 r1 = t; 3227 }; 3228 { 3229 uint const t = min(r3, r7); 3230 r7 = max(r3, r7); 3231 r3 = t; 3232 }; 3233 { 3234 uint const t = min(r1, r3); 3235 r3 = max(r1, r3); 3236 r1 = t; 3237 }; 3238 { 3239 uint const t = min(r5, r7); 3240 r7 = max(r5, r7); 3241 r5 = t; 3242 }; 3243 { 3244 uint const t = min(r2, r6); 3245 r6 = max(r2, r6); 3246 r2 = t; 3247 }; 3248 { 3249 uint const t = min(r4, r8); 3250 r8 = max(r4, r8); 3251 r4 = t; 3252 }; 3253 { 3254 uint const t = min(r2, r4); 3255 r4 = max(r2, r4); 3256 r2 = t; 3257 }; 3258 { 3259 uint const t = min(r6, r8); 3260 r8 = max(r6, r8); 3261 r6 = t; 3262 }; 3263 { 3264 uint const t = min(r1, r2); 3265 r2 = max(r1, r2); 3266 r1 = t; 3267 }; 3268 { 3269 uint const t = min(r3, r4); 3270 r4 = max(r3, r4); 3271 r3 = t; 3272 }; 3273 { 3274 uint const t = min(r5, r6); 3275 r6 = max(r5, r6); 3276 r5 = t; 3277 }; 3278 { 3279 uint const t = min(r7, r8); 3280 r8 = max(r7, r8); 3281 r7 = t; 3282 }; 3283 { 3284 uint const flip_lane_idx = get_sub_group_local_id() ^ 7; 3285 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 3286 ; 3287 { 3288 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 3289 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 3290 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 3291 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3292 }; 3293 { 3294 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 3295 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 3296 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 3297 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3298 }; 3299 { 3300 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 3301 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 3302 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 3303 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 3304 }; 3305 { 3306 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 3307 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 3308 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 3309 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3310 }; 3311 } 3312 { 3313 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 3314 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3315 ; 3316 { 3317 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3318 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3319 }; 3320 { 3321 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3322 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3323 }; 3324 { 3325 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3326 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3327 }; 3328 { 3329 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3330 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3331 }; 3332 { 3333 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3334 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3335 }; 3336 { 3337 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3338 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3339 }; 3340 { 3341 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3342 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3343 }; 3344 { 3345 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3346 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3347 }; 3348 } 3349 { 3350 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 3351 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3352 ; 3353 { 3354 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3355 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3356 }; 3357 { 3358 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3359 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3360 }; 3361 { 3362 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3363 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3364 }; 3365 { 3366 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3367 r4 = ((r4 <= ta) ^ t_lt) ? 
ta : r4; 3368 }; 3369 { 3370 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3371 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3372 }; 3373 { 3374 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3375 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3376 }; 3377 { 3378 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3379 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3380 }; 3381 { 3382 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3383 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3384 }; 3385 } 3386 { 3387 uint const t = min(r1, r5); 3388 r5 = max(r1, r5); 3389 r1 = t; 3390 }; 3391 { 3392 uint const t = min(r3, r7); 3393 r7 = max(r3, r7); 3394 r3 = t; 3395 }; 3396 { 3397 uint const t = min(r1, r3); 3398 r3 = max(r1, r3); 3399 r1 = t; 3400 }; 3401 { 3402 uint const t = min(r5, r7); 3403 r7 = max(r5, r7); 3404 r5 = t; 3405 }; 3406 { 3407 uint const t = min(r2, r6); 3408 r6 = max(r2, r6); 3409 r2 = t; 3410 }; 3411 { 3412 uint const t = min(r4, r8); 3413 r8 = max(r4, r8); 3414 r4 = t; 3415 }; 3416 { 3417 uint const t = min(r2, r4); 3418 r4 = max(r2, r4); 3419 r2 = t; 3420 }; 3421 { 3422 uint const t = min(r6, r8); 3423 r8 = max(r6, r8); 3424 r6 = t; 3425 }; 3426 { 3427 uint const t = min(r1, r2); 3428 r2 = max(r1, r2); 3429 r1 = t; 3430 }; 3431 { 3432 uint const t = min(r3, r4); 3433 r4 = max(r3, r4); 3434 r3 = t; 3435 }; 3436 { 3437 uint const t = min(r5, r6); 3438 r6 = max(r5, r6); 3439 r5 = t; 3440 }; 3441 { 3442 uint const t = min(r7, r8); 3443 r8 = max(r7, r8); 3444 r7 = t; 3445 }; 3446 { 3447 uint const flip_lane_idx = get_sub_group_local_id() ^ 15; 3448 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 3449 ; 3450 { 3451 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 3452 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 3453 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 3454 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 3455 }; 3456 { 3457 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 3458 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 3459 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 3460 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3461 }; 3462 { 3463 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 3464 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 3465 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 3466 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3467 }; 3468 { 3469 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 3470 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 3471 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 3472 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3473 }; 3474 } 3475 { 3476 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 3477 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3478 ; 3479 { 3480 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3481 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3482 }; 3483 { 3484 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3485 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3486 }; 3487 { 3488 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3489 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3490 }; 3491 { 3492 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3493 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3494 }; 3495 { 3496 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3497 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3498 }; 3499 { 3500 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3501 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3502 }; 3503 { 3504 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3505 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3506 }; 3507 { 3508 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3509 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 3510 }; 3511 } 3512 { 3513 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 3514 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3515 ; 3516 { 3517 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3518 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3519 }; 3520 { 3521 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3522 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3523 }; 3524 { 3525 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3526 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3527 }; 3528 { 3529 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3530 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3531 }; 3532 { 3533 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3534 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3535 }; 3536 { 3537 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3538 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3539 }; 3540 { 3541 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3542 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3543 }; 3544 { 3545 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3546 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3547 }; 3548 } 3549 { 3550 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 3551 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3552 ; 3553 { 3554 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3555 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3556 }; 3557 { 3558 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3559 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3560 }; 3561 { 3562 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3563 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3564 }; 3565 { 3566 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3567 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3568 }; 3569 { 3570 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3571 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3572 }; 3573 { 3574 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3575 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 3576 }; 3577 { 3578 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3579 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3580 }; 3581 { 3582 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3583 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3584 }; 3585 } 3586 { 3587 uint const t = min(r1, r5); 3588 r5 = max(r1, r5); 3589 r1 = t; 3590 }; 3591 { 3592 uint const t = min(r3, r7); 3593 r7 = max(r3, r7); 3594 r3 = t; 3595 }; 3596 { 3597 uint const t = min(r1, r3); 3598 r3 = max(r1, r3); 3599 r1 = t; 3600 }; 3601 { 3602 uint const t = min(r5, r7); 3603 r7 = max(r5, r7); 3604 r5 = t; 3605 }; 3606 { 3607 uint const t = min(r2, r6); 3608 r6 = max(r2, r6); 3609 r2 = t; 3610 }; 3611 { 3612 uint const t = min(r4, r8); 3613 r8 = max(r4, r8); 3614 r4 = t; 3615 }; 3616 { 3617 uint const t = min(r2, r4); 3618 r4 = max(r2, r4); 3619 r2 = t; 3620 }; 3621 { 3622 uint const t = min(r6, r8); 3623 r8 = max(r6, r8); 3624 r6 = t; 3625 }; 3626 { 3627 uint const t = min(r1, r2); 3628 r2 = max(r1, r2); 3629 r1 = t; 3630 }; 3631 { 3632 uint const t = min(r3, r4); 3633 r4 = max(r3, r4); 3634 r3 = t; 3635 }; 3636 { 3637 uint const t = min(r5, r6); 3638 r6 = max(r5, r6); 3639 r5 = t; 3640 }; 3641 { 3642 uint const t = min(r7, r8); 3643 r8 = max(r7, r8); 3644 r7 = t; 3645 }; 3646 uint const smem_l_idx = 3647 get_sub_group_id() * ((1 << 4) * 8) + get_sub_group_local_id(); 3648 uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 8) + 3649 (get_sub_group_local_id() ^ ((1 << 4) - 1)); 3650 shared.m[get_local_id(0) + (8 * (1 << 4) * 0)] = r1; 3651 shared.m[get_local_id(0) + (8 * (1 << 4) * 1)] = r8; 3652 shared.m[get_local_id(0) + (8 * (1 << 4) * 2)] = r2; 3653 shared.m[get_local_id(0) + (8 * (1 << 4) * 3)] = r7; 3654 shared.m[get_local_id(0) + (8 * (1 << 4) * 4)] = r3; 3655 shared.m[get_local_id(0) + (8 * (1 << 4) * 5)] = r6; 3656 shared.m[get_local_id(0) + (8 * (1 << 4) * 6)] = r4; 3657 shared.m[get_local_id(0) + (8 * (1 << 4) * 7)] = r5; 3658 barrier(CLK_LOCAL_MEM_FENCE); 3659 
{ 3660 { 3661 uint r0_1 = shared.m[smem_l_idx + (0)]; 3662 uint r0_2 = shared.m[smem_r_idx + (16)]; 3663 { 3664 uint const t = min(r0_1, r0_2); 3665 r0_2 = max(r0_1, r0_2); 3666 r0_1 = t; 3667 }; 3668 shared.m[smem_l_idx + (0)] = r0_1; 3669 shared.m[smem_r_idx + (16)] = r0_2; 3670 } 3671 { 3672 uint r1_1 = shared.m[smem_l_idx + (32)]; 3673 uint r1_2 = shared.m[smem_r_idx + (48)]; 3674 { 3675 uint const t = min(r1_1, r1_2); 3676 r1_2 = max(r1_1, r1_2); 3677 r1_1 = t; 3678 }; 3679 shared.m[smem_l_idx + (32)] = r1_1; 3680 shared.m[smem_r_idx + (48)] = r1_2; 3681 } 3682 { 3683 uint r2_1 = shared.m[smem_l_idx + (64)]; 3684 uint r2_2 = shared.m[smem_r_idx + (80)]; 3685 { 3686 uint const t = min(r2_1, r2_2); 3687 r2_2 = max(r2_1, r2_2); 3688 r2_1 = t; 3689 }; 3690 shared.m[smem_l_idx + (64)] = r2_1; 3691 shared.m[smem_r_idx + (80)] = r2_2; 3692 } 3693 { 3694 uint r3_1 = shared.m[smem_l_idx + (96)]; 3695 uint r3_2 = shared.m[smem_r_idx + (112)]; 3696 { 3697 uint const t = min(r3_1, r3_2); 3698 r3_2 = max(r3_1, r3_2); 3699 r3_1 = t; 3700 }; 3701 shared.m[smem_l_idx + (96)] = r3_1; 3702 shared.m[smem_r_idx + (112)] = r3_2; 3703 } 3704 } 3705 barrier(CLK_LOCAL_MEM_FENCE); 3706 r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)]; 3707 r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)]; 3708 r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)]; 3709 r7 = shared.m[get_local_id(0) + (8 * (1 << 4) * 3)]; 3710 r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)]; 3711 r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)]; 3712 r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)]; 3713 r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)]; 3714 { 3715 { 3716 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 3717 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3718 ; 3719 { 3720 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3721 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 3722 }; 3723 { 3724 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3725 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3726 }; 3727 { 3728 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3729 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3730 }; 3731 { 3732 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3733 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3734 }; 3735 { 3736 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3737 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3738 }; 3739 { 3740 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3741 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3742 }; 3743 { 3744 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3745 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3746 }; 3747 { 3748 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3749 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3750 }; 3751 } 3752 { 3753 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 3754 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3755 ; 3756 { 3757 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3758 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3759 }; 3760 { 3761 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3762 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3763 }; 3764 { 3765 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3766 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3767 }; 3768 { 3769 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3770 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3771 }; 3772 { 3773 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3774 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3775 }; 3776 { 3777 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3778 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3779 }; 3780 { 3781 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3782 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3783 }; 3784 { 3785 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3786 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 3787 }; 3788 } 3789 { 3790 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 3791 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3792 ; 3793 { 3794 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3795 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3796 }; 3797 { 3798 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3799 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3800 }; 3801 { 3802 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3803 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3804 }; 3805 { 3806 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3807 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3808 }; 3809 { 3810 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3811 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3812 }; 3813 { 3814 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3815 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 3816 }; 3817 { 3818 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3819 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3820 }; 3821 { 3822 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3823 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3824 }; 3825 } 3826 { 3827 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 3828 int const t_lt = get_sub_group_local_id() < half_lane_idx; 3829 ; 3830 { 3831 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 3832 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 3833 }; 3834 { 3835 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 3836 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 3837 }; 3838 { 3839 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 3840 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 3841 }; 3842 { 3843 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 3844 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 3845 }; 3846 { 3847 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 3848 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 3849 }; 3850 { 3851 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 3852 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 3853 }; 3854 { 3855 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 3856 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 3857 }; 3858 { 3859 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 3860 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 3861 }; 3862 } 3863 { 3864 uint const t = min(r1, r5); 3865 r5 = max(r1, r5); 3866 r1 = t; 3867 }; 3868 { 3869 uint const t = min(r3, r7); 3870 r7 = max(r3, r7); 3871 r3 = t; 3872 }; 3873 { 3874 uint const t = min(r1, r3); 3875 r3 = max(r1, r3); 3876 r1 = t; 3877 }; 3878 { 3879 uint const t = min(r5, r7); 3880 r7 = max(r5, r7); 3881 r5 = t; 3882 }; 3883 { 3884 uint const t = min(r2, r6); 3885 r6 = max(r2, r6); 3886 r2 = t; 3887 }; 3888 { 3889 uint const t = min(r4, r8); 3890 r8 = max(r4, r8); 3891 r4 = t; 3892 }; 3893 { 3894 uint const t = min(r2, r4); 3895 r4 = max(r2, r4); 3896 r2 = t; 3897 }; 3898 { 3899 uint const t = min(r6, r8); 3900 r8 = max(r6, r8); 3901 r6 = t; 3902 }; 3903 { 3904 uint const t = min(r1, r2); 3905 r2 = max(r1, r2); 3906 r1 = t; 3907 }; 3908 { 3909 uint const t = min(r3, r4); 3910 r4 = max(r3, r4); 3911 r3 = t; 3912 }; 3913 { 3914 uint const t = min(r5, r6); 3915 r6 = max(r5, r6); 3916 r5 = t; 3917 }; 3918 { 3919 uint const t = min(r7, r8); 3920 r8 = max(r7, r8); 3921 r7 = t; 3922 }; 3923 } 3924 shared.m[get_local_id(0) + (8 * (1 << 4) * 0)] = r1; 3925 shared.m[get_local_id(0) + (8 * (1 << 4) * 1)] = r8; 3926 shared.m[get_local_id(0) + (8 * (1 << 4) * 2)] = r2; 3927 shared.m[get_local_id(0) + (8 * (1 << 4) * 3)] = r7; 3928 shared.m[get_local_id(0) + (8 * (1 << 4) * 4)] = r3; 3929 shared.m[get_local_id(0) + (8 * (1 << 4) * 5)] = r6; 3930 shared.m[get_local_id(0) + (8 * (1 << 4) * 6)] = r4; 3931 shared.m[get_local_id(0) + (8 * (1 << 4) * 7)] = r5; 3932 barrier(CLK_LOCAL_MEM_FENCE); 3933 { 3934 { 3935 uint r0_1 = shared.m[smem_l_idx + (0)]; 3936 uint r0_2 = shared.m[smem_l_idx + (16)]; 3937 uint r0_3 = shared.m[smem_r_idx + (32)]; 3938 uint r0_4 = shared.m[smem_r_idx + (48)]; 3939 { 3940 uint const 
t = min(r0_2, r0_3); 3941 r0_3 = max(r0_2, r0_3); 3942 r0_2 = t; 3943 }; 3944 { 3945 uint const t = min(r0_1, r0_4); 3946 r0_4 = max(r0_1, r0_4); 3947 r0_1 = t; 3948 }; 3949 { 3950 uint const t = min(r0_3, r0_4); 3951 r0_4 = max(r0_3, r0_4); 3952 r0_3 = t; 3953 }; 3954 { 3955 uint const t = min(r0_1, r0_2); 3956 r0_2 = max(r0_1, r0_2); 3957 r0_1 = t; 3958 }; 3959 shared.m[smem_l_idx + (0)] = r0_1; 3960 shared.m[smem_l_idx + (16)] = r0_2; 3961 shared.m[smem_r_idx + (32)] = r0_3; 3962 shared.m[smem_r_idx + (48)] = r0_4; 3963 } 3964 { 3965 uint r1_1 = shared.m[smem_l_idx + (64)]; 3966 uint r1_2 = shared.m[smem_l_idx + (80)]; 3967 uint r1_3 = shared.m[smem_r_idx + (96)]; 3968 uint r1_4 = shared.m[smem_r_idx + (112)]; 3969 { 3970 uint const t = min(r1_2, r1_3); 3971 r1_3 = max(r1_2, r1_3); 3972 r1_2 = t; 3973 }; 3974 { 3975 uint const t = min(r1_1, r1_4); 3976 r1_4 = max(r1_1, r1_4); 3977 r1_1 = t; 3978 }; 3979 { 3980 uint const t = min(r1_3, r1_4); 3981 r1_4 = max(r1_3, r1_4); 3982 r1_3 = t; 3983 }; 3984 { 3985 uint const t = min(r1_1, r1_2); 3986 r1_2 = max(r1_1, r1_2); 3987 r1_1 = t; 3988 }; 3989 shared.m[smem_l_idx + (64)] = r1_1; 3990 shared.m[smem_l_idx + (80)] = r1_2; 3991 shared.m[smem_r_idx + (96)] = r1_3; 3992 shared.m[smem_r_idx + (112)] = r1_4; 3993 } 3994 } 3995 barrier(CLK_LOCAL_MEM_FENCE); 3996 r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)]; 3997 r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)]; 3998 r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)]; 3999 r7 = shared.m[get_local_id(0) + (8 * (1 << 4) * 3)]; 4000 r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)]; 4001 r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)]; 4002 r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)]; 4003 r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)]; 4004 { 4005 { 4006 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 4007 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4008 ; 4009 { 4010 uint const ta = intel_sub_group_shuffle(r1, 
half_lane_idx); 4011 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4012 }; 4013 { 4014 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4015 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4016 }; 4017 { 4018 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4019 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4020 }; 4021 { 4022 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4023 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4024 }; 4025 { 4026 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4027 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4028 }; 4029 { 4030 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4031 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4032 }; 4033 { 4034 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4035 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4036 }; 4037 { 4038 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4039 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4040 }; 4041 } 4042 { 4043 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 4044 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4045 ; 4046 { 4047 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4048 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4049 }; 4050 { 4051 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4052 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4053 }; 4054 { 4055 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4056 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4057 }; 4058 { 4059 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4060 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4061 }; 4062 { 4063 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4064 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4065 }; 4066 { 4067 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4068 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4069 }; 4070 { 4071 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4072 r7 = ((r7 <= ta) ^ t_lt) ? 
ta : r7; 4073 }; 4074 { 4075 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4076 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4077 }; 4078 } 4079 { 4080 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 4081 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4082 ; 4083 { 4084 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4085 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4086 }; 4087 { 4088 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4089 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4090 }; 4091 { 4092 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4093 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4094 }; 4095 { 4096 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4097 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4098 }; 4099 { 4100 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4101 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4102 }; 4103 { 4104 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4105 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4106 }; 4107 { 4108 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4109 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4110 }; 4111 { 4112 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4113 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4114 }; 4115 } 4116 { 4117 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 4118 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4119 ; 4120 { 4121 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4122 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4123 }; 4124 { 4125 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4126 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4127 }; 4128 { 4129 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4130 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4131 }; 4132 { 4133 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4134 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4135 }; 4136 { 4137 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4138 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 4139 }; 4140 { 4141 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4142 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4143 }; 4144 { 4145 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4146 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4147 }; 4148 { 4149 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4150 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4151 }; 4152 } 4153 { 4154 uint const t = min(r1, r5); 4155 r5 = max(r1, r5); 4156 r1 = t; 4157 }; 4158 { 4159 uint const t = min(r3, r7); 4160 r7 = max(r3, r7); 4161 r3 = t; 4162 }; 4163 { 4164 uint const t = min(r1, r3); 4165 r3 = max(r1, r3); 4166 r1 = t; 4167 }; 4168 { 4169 uint const t = min(r5, r7); 4170 r7 = max(r5, r7); 4171 r5 = t; 4172 }; 4173 { 4174 uint const t = min(r2, r6); 4175 r6 = max(r2, r6); 4176 r2 = t; 4177 }; 4178 { 4179 uint const t = min(r4, r8); 4180 r8 = max(r4, r8); 4181 r4 = t; 4182 }; 4183 { 4184 uint const t = min(r2, r4); 4185 r4 = max(r2, r4); 4186 r2 = t; 4187 }; 4188 { 4189 uint const t = min(r6, r8); 4190 r8 = max(r6, r8); 4191 r6 = t; 4192 }; 4193 { 4194 uint const t = min(r1, r2); 4195 r2 = max(r1, r2); 4196 r1 = t; 4197 }; 4198 { 4199 uint const t = min(r3, r4); 4200 r4 = max(r3, r4); 4201 r3 = t; 4202 }; 4203 { 4204 uint const t = min(r5, r6); 4205 r6 = max(r5, r6); 4206 r5 = t; 4207 }; 4208 { 4209 uint const t = min(r7, r8); 4210 r8 = max(r7, r8); 4211 r7 = t; 4212 }; 4213 } 4214 shared.m[get_local_id(0) + (8 * (1 << 4) * 0)] = r1; 4215 shared.m[get_local_id(0) + (8 * (1 << 4) * 1)] = r8; 4216 shared.m[get_local_id(0) + (8 * (1 << 4) * 2)] = r2; 4217 shared.m[get_local_id(0) + (8 * (1 << 4) * 3)] = r7; 4218 shared.m[get_local_id(0) + (8 * (1 << 4) * 4)] = r3; 4219 shared.m[get_local_id(0) + (8 * (1 << 4) * 5)] = r6; 4220 shared.m[get_local_id(0) + (8 * (1 << 4) * 6)] = r4; 4221 shared.m[get_local_id(0) + (8 * (1 << 4) * 7)] = r5; 4222 barrier(CLK_LOCAL_MEM_FENCE); 4223 { 4224 { 4225 uint r0_1 = shared.m[smem_l_idx + (0)]; 4226 uint r0_2 = shared.m[smem_l_idx + 
(16)]; 4227 uint r0_3 = shared.m[smem_l_idx + (32)]; 4228 uint r0_4 = shared.m[smem_l_idx + (48)]; 4229 uint r0_5 = shared.m[smem_r_idx + (64)]; 4230 uint r0_6 = shared.m[smem_r_idx + (80)]; 4231 uint r0_7 = shared.m[smem_r_idx + (96)]; 4232 uint r0_8 = shared.m[smem_r_idx + (112)]; 4233 { 4234 uint const t = min(r0_4, r0_5); 4235 r0_5 = max(r0_4, r0_5); 4236 r0_4 = t; 4237 }; 4238 { 4239 uint const t = min(r0_3, r0_6); 4240 r0_6 = max(r0_3, r0_6); 4241 r0_3 = t; 4242 }; 4243 { 4244 uint const t = min(r0_2, r0_7); 4245 r0_7 = max(r0_2, r0_7); 4246 r0_2 = t; 4247 }; 4248 { 4249 uint const t = min(r0_1, r0_8); 4250 r0_8 = max(r0_1, r0_8); 4251 r0_1 = t; 4252 }; 4253 { 4254 uint const t = min(r0_5, r0_7); 4255 r0_7 = max(r0_5, r0_7); 4256 r0_5 = t; 4257 }; 4258 { 4259 uint const t = min(r0_6, r0_8); 4260 r0_8 = max(r0_6, r0_8); 4261 r0_6 = t; 4262 }; 4263 { 4264 uint const t = min(r0_5, r0_6); 4265 r0_6 = max(r0_5, r0_6); 4266 r0_5 = t; 4267 }; 4268 { 4269 uint const t = min(r0_7, r0_8); 4270 r0_8 = max(r0_7, r0_8); 4271 r0_7 = t; 4272 }; 4273 { 4274 uint const t = min(r0_1, r0_3); 4275 r0_3 = max(r0_1, r0_3); 4276 r0_1 = t; 4277 }; 4278 { 4279 uint const t = min(r0_2, r0_4); 4280 r0_4 = max(r0_2, r0_4); 4281 r0_2 = t; 4282 }; 4283 { 4284 uint const t = min(r0_1, r0_2); 4285 r0_2 = max(r0_1, r0_2); 4286 r0_1 = t; 4287 }; 4288 { 4289 uint const t = min(r0_3, r0_4); 4290 r0_4 = max(r0_3, r0_4); 4291 r0_3 = t; 4292 }; 4293 shared.m[smem_l_idx + (0)] = r0_1; 4294 shared.m[smem_l_idx + (16)] = r0_2; 4295 shared.m[smem_l_idx + (32)] = r0_3; 4296 shared.m[smem_l_idx + (48)] = r0_4; 4297 shared.m[smem_r_idx + (64)] = r0_5; 4298 shared.m[smem_r_idx + (80)] = r0_6; 4299 shared.m[smem_r_idx + (96)] = r0_7; 4300 shared.m[smem_r_idx + (112)] = r0_8; 4301 } 4302 } 4303 barrier(CLK_LOCAL_MEM_FENCE); 4304 r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)]; 4305 r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)]; 4306 r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)]; 4307 r7 = 
shared.m[get_local_id(0) + (8 * (1 << 4) * 3)]; 4308 r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)]; 4309 r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)]; 4310 r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)]; 4311 r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)]; 4312 { 4313 { 4314 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 4315 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4316 ; 4317 { 4318 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4319 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4320 }; 4321 { 4322 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4323 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4324 }; 4325 { 4326 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4327 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4328 }; 4329 { 4330 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4331 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4332 }; 4333 { 4334 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4335 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4336 }; 4337 { 4338 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4339 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4340 }; 4341 { 4342 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4343 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4344 }; 4345 { 4346 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4347 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4348 }; 4349 } 4350 { 4351 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 4352 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4353 ; 4354 { 4355 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4356 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4357 }; 4358 { 4359 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4360 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4361 }; 4362 { 4363 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4364 r3 = ((r3 <= ta) ^ t_lt) ? 
ta : r3; 4365 }; 4366 { 4367 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4368 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4369 }; 4370 { 4371 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4372 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4373 }; 4374 { 4375 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4376 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4377 }; 4378 { 4379 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4380 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4381 }; 4382 { 4383 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4384 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4385 }; 4386 } 4387 { 4388 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 4389 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4390 ; 4391 { 4392 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4393 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4394 }; 4395 { 4396 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4397 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4398 }; 4399 { 4400 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4401 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4402 }; 4403 { 4404 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4405 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4406 }; 4407 { 4408 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4409 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4410 }; 4411 { 4412 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4413 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4414 }; 4415 { 4416 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4417 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4418 }; 4419 { 4420 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4421 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4422 }; 4423 } 4424 { 4425 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 4426 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4427 ; 4428 { 4429 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4430 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 4431 }; 4432 { 4433 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4434 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4435 }; 4436 { 4437 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4438 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4439 }; 4440 { 4441 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4442 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4443 }; 4444 { 4445 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4446 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4447 }; 4448 { 4449 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4450 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4451 }; 4452 { 4453 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4454 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4455 }; 4456 { 4457 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4458 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4459 }; 4460 } 4461 { 4462 uint const t = min(r1, r5); 4463 r5 = max(r1, r5); 4464 r1 = t; 4465 }; 4466 { 4467 uint const t = min(r3, r7); 4468 r7 = max(r3, r7); 4469 r3 = t; 4470 }; 4471 { 4472 uint const t = min(r1, r3); 4473 r3 = max(r1, r3); 4474 r1 = t; 4475 }; 4476 { 4477 uint const t = min(r5, r7); 4478 r7 = max(r5, r7); 4479 r5 = t; 4480 }; 4481 { 4482 uint const t = min(r2, r6); 4483 r6 = max(r2, r6); 4484 r2 = t; 4485 }; 4486 { 4487 uint const t = min(r4, r8); 4488 r8 = max(r4, r8); 4489 r4 = t; 4490 }; 4491 { 4492 uint const t = min(r2, r4); 4493 r4 = max(r2, r4); 4494 r2 = t; 4495 }; 4496 { 4497 uint const t = min(r6, r8); 4498 r8 = max(r6, r8); 4499 r6 = t; 4500 }; 4501 { 4502 uint const t = min(r1, r2); 4503 r2 = max(r1, r2); 4504 r1 = t; 4505 }; 4506 { 4507 uint const t = min(r3, r4); 4508 r4 = max(r3, r4); 4509 r3 = t; 4510 }; 4511 { 4512 uint const t = min(r5, r6); 4513 r6 = max(r5, r6); 4514 r5 = t; 4515 }; 4516 { 4517 uint const t = min(r7, r8); 4518 r8 = max(r7, r8); 4519 r7 = t; 4520 }; 4521 } 4522 vout[gmem_idx + (1 << 4) * 0] = r1; 4523 vout[gmem_idx + (1 << 4) * 1] = r2; 4524 vout[gmem_idx + (1 << 4) * 
2] = r3; 4525 vout[gmem_idx + (1 << 4) * 3] = r4; 4526 vout[gmem_idx + (1 << 4) * 4] = r5; 4527 vout[gmem_idx + (1 << 4) * 5] = r6; 4528 vout[gmem_idx + (1 << 4) * 6] = r7; 4529 vout[gmem_idx + (1 << 4) * 7] = r8; 4530} 4531 4532__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) 4533__attribute__((reqd_work_group_size((1 << 4) * 16, 1, 1))) void 4534hs_kernel_bs_4(__global uint const* const restrict vin, 4535 __global uint* const restrict vout) 4536{ 4537 __local struct 4538 { 4539 uint m[256 * 8]; 4540 } shared; 4541 4542 uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 + 4543 (get_local_id(0) & ((1 << 4) - 1)); 4544 uint r1 = vin[gmem_idx + (1 << 4) * 0]; 4545 uint r2 = vin[gmem_idx + (1 << 4) * 1]; 4546 uint r3 = vin[gmem_idx + (1 << 4) * 2]; 4547 uint r4 = vin[gmem_idx + (1 << 4) * 3]; 4548 uint r5 = vin[gmem_idx + (1 << 4) * 4]; 4549 uint r6 = vin[gmem_idx + (1 << 4) * 5]; 4550 uint r7 = vin[gmem_idx + (1 << 4) * 6]; 4551 uint r8 = vin[gmem_idx + (1 << 4) * 7]; 4552 { 4553 uint const t = min(r1, r5); 4554 r5 = max(r1, r5); 4555 r1 = t; 4556 }; 4557 { 4558 uint const t = min(r2, r6); 4559 r6 = max(r2, r6); 4560 r2 = t; 4561 }; 4562 { 4563 uint const t = min(r3, r7); 4564 r7 = max(r3, r7); 4565 r3 = t; 4566 }; 4567 { 4568 uint const t = min(r4, r8); 4569 r8 = max(r4, r8); 4570 r4 = t; 4571 }; 4572 { 4573 uint const t = min(r1, r3); 4574 r3 = max(r1, r3); 4575 r1 = t; 4576 }; 4577 { 4578 uint const t = min(r2, r4); 4579 r4 = max(r2, r4); 4580 r2 = t; 4581 }; 4582 { 4583 uint const t = min(r5, r7); 4584 r7 = max(r5, r7); 4585 r5 = t; 4586 }; 4587 { 4588 uint const t = min(r6, r8); 4589 r8 = max(r6, r8); 4590 r6 = t; 4591 }; 4592 { 4593 uint const t = min(r3, r5); 4594 r5 = max(r3, r5); 4595 r3 = t; 4596 }; 4597 { 4598 uint const t = min(r4, r6); 4599 r6 = max(r4, r6); 4600 r4 = t; 4601 }; 4602 { 4603 uint const t = min(r1, r2); 4604 r2 = max(r1, r2); 4605 r1 = t; 4606 }; 4607 { 4608 uint const t = min(r3, r4); 4609 r4 = max(r3, r4); 4610 
r3 = t; 4611 }; 4612 { 4613 uint const t = min(r5, r6); 4614 r6 = max(r5, r6); 4615 r5 = t; 4616 }; 4617 { 4618 uint const t = min(r7, r8); 4619 r8 = max(r7, r8); 4620 r7 = t; 4621 }; 4622 { 4623 uint const t = min(r2, r5); 4624 r5 = max(r2, r5); 4625 r2 = t; 4626 }; 4627 { 4628 uint const t = min(r4, r7); 4629 r7 = max(r4, r7); 4630 r4 = t; 4631 }; 4632 { 4633 uint const t = min(r2, r3); 4634 r3 = max(r2, r3); 4635 r2 = t; 4636 }; 4637 { 4638 uint const t = min(r4, r5); 4639 r5 = max(r4, r5); 4640 r4 = t; 4641 }; 4642 { 4643 uint const t = min(r6, r7); 4644 r7 = max(r6, r7); 4645 r6 = t; 4646 }; 4647 { 4648 uint const flip_lane_idx = get_sub_group_local_id() ^ 1; 4649 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 4650 ; 4651 { 4652 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 4653 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 4654 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 4655 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4656 }; 4657 { 4658 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 4659 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 4660 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 4661 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4662 }; 4663 { 4664 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 4665 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 4666 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 4667 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4668 }; 4669 { 4670 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 4671 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 4672 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 4673 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 4674 }; 4675 } 4676 { 4677 uint const t = min(r1, r5); 4678 r5 = max(r1, r5); 4679 r1 = t; 4680 }; 4681 { 4682 uint const t = min(r3, r7); 4683 r7 = max(r3, r7); 4684 r3 = t; 4685 }; 4686 { 4687 uint const t = min(r1, r3); 4688 r3 = max(r1, r3); 4689 r1 = t; 4690 }; 4691 { 4692 uint const t = min(r5, r7); 4693 r7 = max(r5, r7); 4694 r5 = t; 4695 }; 4696 { 4697 uint const t = min(r2, r6); 4698 r6 = max(r2, r6); 4699 r2 = t; 4700 }; 4701 { 4702 uint const t = min(r4, r8); 4703 r8 = max(r4, r8); 4704 r4 = t; 4705 }; 4706 { 4707 uint const t = min(r2, r4); 4708 r4 = max(r2, r4); 4709 r2 = t; 4710 }; 4711 { 4712 uint const t = min(r6, r8); 4713 r8 = max(r6, r8); 4714 r6 = t; 4715 }; 4716 { 4717 uint const t = min(r1, r2); 4718 r2 = max(r1, r2); 4719 r1 = t; 4720 }; 4721 { 4722 uint const t = min(r3, r4); 4723 r4 = max(r3, r4); 4724 r3 = t; 4725 }; 4726 { 4727 uint const t = min(r5, r6); 4728 r6 = max(r5, r6); 4729 r5 = t; 4730 }; 4731 { 4732 uint const t = min(r7, r8); 4733 r8 = max(r7, r8); 4734 r7 = t; 4735 }; 4736 { 4737 uint const flip_lane_idx = get_sub_group_local_id() ^ 3; 4738 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 4739 ; 4740 { 4741 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 4742 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 4743 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 4744 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4745 }; 4746 { 4747 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 4748 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 4749 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 4750 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4751 }; 4752 { 4753 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 4754 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 4755 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 4756 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 4757 }; 4758 { 4759 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 4760 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 4761 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 4762 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4763 }; 4764 } 4765 { 4766 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 4767 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4768 ; 4769 { 4770 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4771 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4772 }; 4773 { 4774 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4775 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4776 }; 4777 { 4778 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4779 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4780 }; 4781 { 4782 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4783 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4784 }; 4785 { 4786 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4787 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4788 }; 4789 { 4790 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4791 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4792 }; 4793 { 4794 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4795 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4796 }; 4797 { 4798 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4799 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 4800 }; 4801 } 4802 { 4803 uint const t = min(r1, r5); 4804 r5 = max(r1, r5); 4805 r1 = t; 4806 }; 4807 { 4808 uint const t = min(r3, r7); 4809 r7 = max(r3, r7); 4810 r3 = t; 4811 }; 4812 { 4813 uint const t = min(r1, r3); 4814 r3 = max(r1, r3); 4815 r1 = t; 4816 }; 4817 { 4818 uint const t = min(r5, r7); 4819 r7 = max(r5, r7); 4820 r5 = t; 4821 }; 4822 { 4823 uint const t = min(r2, r6); 4824 r6 = max(r2, r6); 4825 r2 = t; 4826 }; 4827 { 4828 uint const t = min(r4, r8); 4829 r8 = max(r4, r8); 4830 r4 = t; 4831 }; 4832 { 4833 uint const t = min(r2, r4); 4834 r4 = max(r2, r4); 4835 r2 = t; 4836 }; 4837 { 4838 uint const t = min(r6, r8); 4839 r8 = max(r6, r8); 4840 r6 = t; 4841 }; 4842 { 4843 uint const t = min(r1, r2); 4844 r2 = max(r1, r2); 4845 r1 = t; 4846 }; 4847 { 4848 uint const t = min(r3, r4); 4849 r4 = max(r3, r4); 4850 r3 = t; 4851 }; 4852 { 4853 uint const t = min(r5, r6); 4854 r6 = max(r5, r6); 4855 r5 = t; 4856 }; 4857 { 4858 uint const t = min(r7, r8); 4859 r8 = max(r7, r8); 4860 r7 = t; 4861 }; 4862 { 4863 uint const flip_lane_idx = get_sub_group_local_id() ^ 7; 4864 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 4865 ; 4866 { 4867 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 4868 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 4869 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 4870 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4871 }; 4872 { 4873 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 4874 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 4875 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 4876 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4877 }; 4878 { 4879 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 4880 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 4881 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 4882 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 4883 }; 4884 { 4885 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 4886 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 4887 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 4888 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4889 }; 4890 } 4891 { 4892 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 4893 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4894 ; 4895 { 4896 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4897 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4898 }; 4899 { 4900 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4901 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4902 }; 4903 { 4904 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4905 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4906 }; 4907 { 4908 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4909 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 4910 }; 4911 { 4912 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4913 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4914 }; 4915 { 4916 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4917 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4918 }; 4919 { 4920 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4921 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4922 }; 4923 { 4924 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4925 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4926 }; 4927 } 4928 { 4929 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 4930 int const t_lt = get_sub_group_local_id() < half_lane_idx; 4931 ; 4932 { 4933 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 4934 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 4935 }; 4936 { 4937 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 4938 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 4939 }; 4940 { 4941 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 4942 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 4943 }; 4944 { 4945 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 4946 r4 = ((r4 <= ta) ^ t_lt) ? 
ta : r4; 4947 }; 4948 { 4949 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 4950 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 4951 }; 4952 { 4953 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 4954 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 4955 }; 4956 { 4957 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 4958 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 4959 }; 4960 { 4961 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 4962 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 4963 }; 4964 } 4965 { 4966 uint const t = min(r1, r5); 4967 r5 = max(r1, r5); 4968 r1 = t; 4969 }; 4970 { 4971 uint const t = min(r3, r7); 4972 r7 = max(r3, r7); 4973 r3 = t; 4974 }; 4975 { 4976 uint const t = min(r1, r3); 4977 r3 = max(r1, r3); 4978 r1 = t; 4979 }; 4980 { 4981 uint const t = min(r5, r7); 4982 r7 = max(r5, r7); 4983 r5 = t; 4984 }; 4985 { 4986 uint const t = min(r2, r6); 4987 r6 = max(r2, r6); 4988 r2 = t; 4989 }; 4990 { 4991 uint const t = min(r4, r8); 4992 r8 = max(r4, r8); 4993 r4 = t; 4994 }; 4995 { 4996 uint const t = min(r2, r4); 4997 r4 = max(r2, r4); 4998 r2 = t; 4999 }; 5000 { 5001 uint const t = min(r6, r8); 5002 r8 = max(r6, r8); 5003 r6 = t; 5004 }; 5005 { 5006 uint const t = min(r1, r2); 5007 r2 = max(r1, r2); 5008 r1 = t; 5009 }; 5010 { 5011 uint const t = min(r3, r4); 5012 r4 = max(r3, r4); 5013 r3 = t; 5014 }; 5015 { 5016 uint const t = min(r5, r6); 5017 r6 = max(r5, r6); 5018 r5 = t; 5019 }; 5020 { 5021 uint const t = min(r7, r8); 5022 r8 = max(r7, r8); 5023 r7 = t; 5024 }; 5025 { 5026 uint const flip_lane_idx = get_sub_group_local_id() ^ 15; 5027 int const t_lt = get_sub_group_local_id() < flip_lane_idx; 5028 ; 5029 { 5030 uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx); 5031 uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx); 5032 r1 = ((r1 <= tb) ^ t_lt) ? tb : r1; 5033 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 5034 }; 5035 { 5036 uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx); 5037 uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx); 5038 r2 = ((r2 <= tb) ^ t_lt) ? tb : r2; 5039 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5040 }; 5041 { 5042 uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx); 5043 uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx); 5044 r3 = ((r3 <= tb) ^ t_lt) ? tb : r3; 5045 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5046 }; 5047 { 5048 uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx); 5049 uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx); 5050 r4 = ((r4 <= tb) ^ t_lt) ? tb : r4; 5051 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5052 }; 5053 } 5054 { 5055 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 5056 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5057 ; 5058 { 5059 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5060 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5061 }; 5062 { 5063 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5064 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5065 }; 5066 { 5067 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5068 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5069 }; 5070 { 5071 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5072 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5073 }; 5074 { 5075 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5076 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5077 }; 5078 { 5079 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5080 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5081 }; 5082 { 5083 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5084 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5085 }; 5086 { 5087 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5088 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 5089 }; 5090 } 5091 { 5092 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 5093 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5094 ; 5095 { 5096 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5097 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5098 }; 5099 { 5100 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5101 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5102 }; 5103 { 5104 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5105 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5106 }; 5107 { 5108 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5109 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5110 }; 5111 { 5112 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5113 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5114 }; 5115 { 5116 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5117 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5118 }; 5119 { 5120 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5121 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5122 }; 5123 { 5124 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5125 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 5126 }; 5127 } 5128 { 5129 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 5130 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5131 ; 5132 { 5133 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5134 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5135 }; 5136 { 5137 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5138 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5139 }; 5140 { 5141 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5142 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5143 }; 5144 { 5145 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5146 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5147 }; 5148 { 5149 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5150 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5151 }; 5152 { 5153 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5154 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 5155 }; 5156 { 5157 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5158 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5159 }; 5160 { 5161 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5162 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 5163 }; 5164 } 5165 { 5166 uint const t = min(r1, r5); 5167 r5 = max(r1, r5); 5168 r1 = t; 5169 }; 5170 { 5171 uint const t = min(r3, r7); 5172 r7 = max(r3, r7); 5173 r3 = t; 5174 }; 5175 { 5176 uint const t = min(r1, r3); 5177 r3 = max(r1, r3); 5178 r1 = t; 5179 }; 5180 { 5181 uint const t = min(r5, r7); 5182 r7 = max(r5, r7); 5183 r5 = t; 5184 }; 5185 { 5186 uint const t = min(r2, r6); 5187 r6 = max(r2, r6); 5188 r2 = t; 5189 }; 5190 { 5191 uint const t = min(r4, r8); 5192 r8 = max(r4, r8); 5193 r4 = t; 5194 }; 5195 { 5196 uint const t = min(r2, r4); 5197 r4 = max(r2, r4); 5198 r2 = t; 5199 }; 5200 { 5201 uint const t = min(r6, r8); 5202 r8 = max(r6, r8); 5203 r6 = t; 5204 }; 5205 { 5206 uint const t = min(r1, r2); 5207 r2 = max(r1, r2); 5208 r1 = t; 5209 }; 5210 { 5211 uint const t = min(r3, r4); 5212 r4 = max(r3, r4); 5213 r3 = t; 5214 }; 5215 { 5216 uint const t = min(r5, r6); 5217 r6 = max(r5, r6); 5218 r5 = t; 5219 }; 5220 { 5221 uint const t = min(r7, r8); 5222 r8 = max(r7, r8); 5223 r7 = t; 5224 }; 5225 uint const smem_l_idx = 5226 get_sub_group_id() * ((1 << 4) * 16) + get_sub_group_local_id(); 5227 uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 16) + 5228 (get_sub_group_local_id() ^ ((1 << 4) - 1)); 5229 shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1; 5230 shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8; 5231 shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2; 5232 shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7; 5233 shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3; 5234 shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6; 5235 shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4; 5236 shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5; 5237 
barrier(CLK_LOCAL_MEM_FENCE); 5238 if (get_sub_group_id() < 8) { 5239 { 5240 uint r0_1 = shared.m[smem_l_idx + (0)]; 5241 uint r0_2 = shared.m[smem_r_idx + (16)]; 5242 { 5243 uint const t = min(r0_1, r0_2); 5244 r0_2 = max(r0_1, r0_2); 5245 r0_1 = t; 5246 }; 5247 shared.m[smem_l_idx + (0)] = r0_1; 5248 shared.m[smem_r_idx + (16)] = r0_2; 5249 } 5250 { 5251 uint r1_1 = shared.m[smem_l_idx + (32)]; 5252 uint r1_2 = shared.m[smem_r_idx + (48)]; 5253 { 5254 uint const t = min(r1_1, r1_2); 5255 r1_2 = max(r1_1, r1_2); 5256 r1_1 = t; 5257 }; 5258 shared.m[smem_l_idx + (32)] = r1_1; 5259 shared.m[smem_r_idx + (48)] = r1_2; 5260 } 5261 { 5262 uint r2_1 = shared.m[smem_l_idx + (64)]; 5263 uint r2_2 = shared.m[smem_r_idx + (80)]; 5264 { 5265 uint const t = min(r2_1, r2_2); 5266 r2_2 = max(r2_1, r2_2); 5267 r2_1 = t; 5268 }; 5269 shared.m[smem_l_idx + (64)] = r2_1; 5270 shared.m[smem_r_idx + (80)] = r2_2; 5271 } 5272 { 5273 uint r3_1 = shared.m[smem_l_idx + (96)]; 5274 uint r3_2 = shared.m[smem_r_idx + (112)]; 5275 { 5276 uint const t = min(r3_1, r3_2); 5277 r3_2 = max(r3_1, r3_2); 5278 r3_1 = t; 5279 }; 5280 shared.m[smem_l_idx + (96)] = r3_1; 5281 shared.m[smem_r_idx + (112)] = r3_2; 5282 } 5283 { 5284 uint r4_1 = shared.m[smem_l_idx + (128)]; 5285 uint r4_2 = shared.m[smem_r_idx + (144)]; 5286 { 5287 uint const t = min(r4_1, r4_2); 5288 r4_2 = max(r4_1, r4_2); 5289 r4_1 = t; 5290 }; 5291 shared.m[smem_l_idx + (128)] = r4_1; 5292 shared.m[smem_r_idx + (144)] = r4_2; 5293 } 5294 { 5295 uint r5_1 = shared.m[smem_l_idx + (160)]; 5296 uint r5_2 = shared.m[smem_r_idx + (176)]; 5297 { 5298 uint const t = min(r5_1, r5_2); 5299 r5_2 = max(r5_1, r5_2); 5300 r5_1 = t; 5301 }; 5302 shared.m[smem_l_idx + (160)] = r5_1; 5303 shared.m[smem_r_idx + (176)] = r5_2; 5304 } 5305 { 5306 uint r6_1 = shared.m[smem_l_idx + (192)]; 5307 uint r6_2 = shared.m[smem_r_idx + (208)]; 5308 { 5309 uint const t = min(r6_1, r6_2); 5310 r6_2 = max(r6_1, r6_2); 5311 r6_1 = t; 5312 }; 5313 shared.m[smem_l_idx 
+ (192)] = r6_1; 5314 shared.m[smem_r_idx + (208)] = r6_2; 5315 } 5316 { 5317 uint r7_1 = shared.m[smem_l_idx + (224)]; 5318 uint r7_2 = shared.m[smem_r_idx + (240)]; 5319 { 5320 uint const t = min(r7_1, r7_2); 5321 r7_2 = max(r7_1, r7_2); 5322 r7_1 = t; 5323 }; 5324 shared.m[smem_l_idx + (224)] = r7_1; 5325 shared.m[smem_r_idx + (240)] = r7_2; 5326 } 5327 } 5328 barrier(CLK_LOCAL_MEM_FENCE); 5329 r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)]; 5330 r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)]; 5331 r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)]; 5332 r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)]; 5333 r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)]; 5334 r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)]; 5335 r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)]; 5336 r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)]; 5337 { 5338 { 5339 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 5340 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5341 ; 5342 { 5343 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5344 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5345 }; 5346 { 5347 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5348 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5349 }; 5350 { 5351 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5352 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5353 }; 5354 { 5355 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5356 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5357 }; 5358 { 5359 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5360 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5361 }; 5362 { 5363 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5364 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5365 }; 5366 { 5367 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5368 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5369 }; 5370 { 5371 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5372 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 5373 }; 5374 } 5375 { 5376 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 5377 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5378 ; 5379 { 5380 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5381 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5382 }; 5383 { 5384 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5385 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5386 }; 5387 { 5388 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5389 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5390 }; 5391 { 5392 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5393 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5394 }; 5395 { 5396 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5397 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5398 }; 5399 { 5400 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5401 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5402 }; 5403 { 5404 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5405 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5406 }; 5407 { 5408 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5409 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 5410 }; 5411 } 5412 { 5413 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 5414 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5415 ; 5416 { 5417 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5418 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5419 }; 5420 { 5421 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5422 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5423 }; 5424 { 5425 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5426 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5427 }; 5428 { 5429 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5430 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5431 }; 5432 { 5433 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5434 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5435 }; 5436 { 5437 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5438 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 5439 }; 5440 { 5441 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5442 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5443 }; 5444 { 5445 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5446 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 5447 }; 5448 } 5449 { 5450 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 5451 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5452 ; 5453 { 5454 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5455 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5456 }; 5457 { 5458 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5459 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5460 }; 5461 { 5462 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5463 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5464 }; 5465 { 5466 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5467 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5468 }; 5469 { 5470 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5471 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5472 }; 5473 { 5474 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5475 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5476 }; 5477 { 5478 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5479 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5480 }; 5481 { 5482 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5483 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 5484 }; 5485 } 5486 { 5487 uint const t = min(r1, r5); 5488 r5 = max(r1, r5); 5489 r1 = t; 5490 }; 5491 { 5492 uint const t = min(r3, r7); 5493 r7 = max(r3, r7); 5494 r3 = t; 5495 }; 5496 { 5497 uint const t = min(r1, r3); 5498 r3 = max(r1, r3); 5499 r1 = t; 5500 }; 5501 { 5502 uint const t = min(r5, r7); 5503 r7 = max(r5, r7); 5504 r5 = t; 5505 }; 5506 { 5507 uint const t = min(r2, r6); 5508 r6 = max(r2, r6); 5509 r2 = t; 5510 }; 5511 { 5512 uint const t = min(r4, r8); 5513 r8 = max(r4, r8); 5514 r4 = t; 5515 }; 5516 { 5517 uint const t = min(r2, r4); 5518 r4 = max(r2, r4); 5519 r2 = t; 5520 }; 5521 { 5522 uint const t = min(r6, r8); 5523 r8 = max(r6, r8); 5524 r6 = t; 5525 }; 5526 { 5527 uint const t = min(r1, r2); 5528 r2 = max(r1, r2); 5529 r1 = t; 5530 }; 5531 { 5532 uint const t = min(r3, r4); 5533 r4 = max(r3, r4); 5534 r3 = t; 5535 }; 5536 { 5537 uint const t = min(r5, r6); 5538 r6 = max(r5, r6); 5539 r5 = t; 5540 }; 5541 { 5542 uint const t = min(r7, r8); 5543 r8 = max(r7, r8); 5544 r7 = t; 5545 }; 5546 } 5547 shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1; 5548 shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8; 5549 shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2; 5550 shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7; 5551 shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3; 5552 shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6; 5553 shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4; 5554 shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5; 5555 barrier(CLK_LOCAL_MEM_FENCE); 5556 if (get_sub_group_id() < 8) { 5557 { 5558 uint r0_1 = shared.m[smem_l_idx + (0)]; 5559 uint r0_2 = shared.m[smem_l_idx + (16)]; 5560 uint r0_3 = shared.m[smem_r_idx + (32)]; 5561 uint r0_4 = shared.m[smem_r_idx + (48)]; 5562 { 5563 uint const t = min(r0_2, r0_3); 5564 r0_3 = max(r0_2, r0_3); 5565 r0_2 = t; 5566 }; 5567 { 5568 uint const t = min(r0_1, r0_4); 5569 r0_4 = max(r0_1, r0_4); 5570 r0_1 = t; 5571 }; 5572 { 5573 uint const t = min(r0_3, 
r0_4); 5574 r0_4 = max(r0_3, r0_4); 5575 r0_3 = t; 5576 }; 5577 { 5578 uint const t = min(r0_1, r0_2); 5579 r0_2 = max(r0_1, r0_2); 5580 r0_1 = t; 5581 }; 5582 shared.m[smem_l_idx + (0)] = r0_1; 5583 shared.m[smem_l_idx + (16)] = r0_2; 5584 shared.m[smem_r_idx + (32)] = r0_3; 5585 shared.m[smem_r_idx + (48)] = r0_4; 5586 } 5587 { 5588 uint r1_1 = shared.m[smem_l_idx + (64)]; 5589 uint r1_2 = shared.m[smem_l_idx + (80)]; 5590 uint r1_3 = shared.m[smem_r_idx + (96)]; 5591 uint r1_4 = shared.m[smem_r_idx + (112)]; 5592 { 5593 uint const t = min(r1_2, r1_3); 5594 r1_3 = max(r1_2, r1_3); 5595 r1_2 = t; 5596 }; 5597 { 5598 uint const t = min(r1_1, r1_4); 5599 r1_4 = max(r1_1, r1_4); 5600 r1_1 = t; 5601 }; 5602 { 5603 uint const t = min(r1_3, r1_4); 5604 r1_4 = max(r1_3, r1_4); 5605 r1_3 = t; 5606 }; 5607 { 5608 uint const t = min(r1_1, r1_2); 5609 r1_2 = max(r1_1, r1_2); 5610 r1_1 = t; 5611 }; 5612 shared.m[smem_l_idx + (64)] = r1_1; 5613 shared.m[smem_l_idx + (80)] = r1_2; 5614 shared.m[smem_r_idx + (96)] = r1_3; 5615 shared.m[smem_r_idx + (112)] = r1_4; 5616 } 5617 { 5618 uint r2_1 = shared.m[smem_l_idx + (128)]; 5619 uint r2_2 = shared.m[smem_l_idx + (144)]; 5620 uint r2_3 = shared.m[smem_r_idx + (160)]; 5621 uint r2_4 = shared.m[smem_r_idx + (176)]; 5622 { 5623 uint const t = min(r2_2, r2_3); 5624 r2_3 = max(r2_2, r2_3); 5625 r2_2 = t; 5626 }; 5627 { 5628 uint const t = min(r2_1, r2_4); 5629 r2_4 = max(r2_1, r2_4); 5630 r2_1 = t; 5631 }; 5632 { 5633 uint const t = min(r2_3, r2_4); 5634 r2_4 = max(r2_3, r2_4); 5635 r2_3 = t; 5636 }; 5637 { 5638 uint const t = min(r2_1, r2_2); 5639 r2_2 = max(r2_1, r2_2); 5640 r2_1 = t; 5641 }; 5642 shared.m[smem_l_idx + (128)] = r2_1; 5643 shared.m[smem_l_idx + (144)] = r2_2; 5644 shared.m[smem_r_idx + (160)] = r2_3; 5645 shared.m[smem_r_idx + (176)] = r2_4; 5646 } 5647 { 5648 uint r3_1 = shared.m[smem_l_idx + (192)]; 5649 uint r3_2 = shared.m[smem_l_idx + (208)]; 5650 uint r3_3 = shared.m[smem_r_idx + (224)]; 5651 uint r3_4 = 
shared.m[smem_r_idx + (240)]; 5652 { 5653 uint const t = min(r3_2, r3_3); 5654 r3_3 = max(r3_2, r3_3); 5655 r3_2 = t; 5656 }; 5657 { 5658 uint const t = min(r3_1, r3_4); 5659 r3_4 = max(r3_1, r3_4); 5660 r3_1 = t; 5661 }; 5662 { 5663 uint const t = min(r3_3, r3_4); 5664 r3_4 = max(r3_3, r3_4); 5665 r3_3 = t; 5666 }; 5667 { 5668 uint const t = min(r3_1, r3_2); 5669 r3_2 = max(r3_1, r3_2); 5670 r3_1 = t; 5671 }; 5672 shared.m[smem_l_idx + (192)] = r3_1; 5673 shared.m[smem_l_idx + (208)] = r3_2; 5674 shared.m[smem_r_idx + (224)] = r3_3; 5675 shared.m[smem_r_idx + (240)] = r3_4; 5676 } 5677 } 5678 barrier(CLK_LOCAL_MEM_FENCE); 5679 r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)]; 5680 r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)]; 5681 r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)]; 5682 r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)]; 5683 r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)]; 5684 r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)]; 5685 r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)]; 5686 r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)]; 5687 { 5688 { 5689 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 5690 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5691 ; 5692 { 5693 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5694 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5695 }; 5696 { 5697 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5698 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5699 }; 5700 { 5701 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5702 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5703 }; 5704 { 5705 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5706 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5707 }; 5708 { 5709 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5710 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5711 }; 5712 { 5713 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5714 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 5715 }; 5716 { 5717 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5718 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5719 }; 5720 { 5721 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5722 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 5723 }; 5724 } 5725 { 5726 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 5727 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5728 ; 5729 { 5730 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5731 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5732 }; 5733 { 5734 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5735 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5736 }; 5737 { 5738 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5739 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5740 }; 5741 { 5742 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5743 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5744 }; 5745 { 5746 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5747 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5748 }; 5749 { 5750 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5751 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5752 }; 5753 { 5754 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5755 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5756 }; 5757 { 5758 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5759 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 5760 }; 5761 } 5762 { 5763 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 5764 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5765 ; 5766 { 5767 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5768 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5769 }; 5770 { 5771 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5772 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5773 }; 5774 { 5775 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5776 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5777 }; 5778 { 5779 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5780 r4 = ((r4 <= ta) ^ t_lt) ? 
ta : r4; 5781 }; 5782 { 5783 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5784 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5785 }; 5786 { 5787 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5788 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5789 }; 5790 { 5791 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5792 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5793 }; 5794 { 5795 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5796 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 5797 }; 5798 } 5799 { 5800 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 5801 int const t_lt = get_sub_group_local_id() < half_lane_idx; 5802 ; 5803 { 5804 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 5805 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 5806 }; 5807 { 5808 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 5809 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 5810 }; 5811 { 5812 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 5813 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 5814 }; 5815 { 5816 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 5817 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 5818 }; 5819 { 5820 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 5821 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 5822 }; 5823 { 5824 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 5825 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 5826 }; 5827 { 5828 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 5829 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 5830 }; 5831 { 5832 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 5833 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 5834 }; 5835 } 5836 { 5837 uint const t = min(r1, r5); 5838 r5 = max(r1, r5); 5839 r1 = t; 5840 }; 5841 { 5842 uint const t = min(r3, r7); 5843 r7 = max(r3, r7); 5844 r3 = t; 5845 }; 5846 { 5847 uint const t = min(r1, r3); 5848 r3 = max(r1, r3); 5849 r1 = t; 5850 }; 5851 { 5852 uint const t = min(r5, r7); 5853 r7 = max(r5, r7); 5854 r5 = t; 5855 }; 5856 { 5857 uint const t = min(r2, r6); 5858 r6 = max(r2, r6); 5859 r2 = t; 5860 }; 5861 { 5862 uint const t = min(r4, r8); 5863 r8 = max(r4, r8); 5864 r4 = t; 5865 }; 5866 { 5867 uint const t = min(r2, r4); 5868 r4 = max(r2, r4); 5869 r2 = t; 5870 }; 5871 { 5872 uint const t = min(r6, r8); 5873 r8 = max(r6, r8); 5874 r6 = t; 5875 }; 5876 { 5877 uint const t = min(r1, r2); 5878 r2 = max(r1, r2); 5879 r1 = t; 5880 }; 5881 { 5882 uint const t = min(r3, r4); 5883 r4 = max(r3, r4); 5884 r3 = t; 5885 }; 5886 { 5887 uint const t = min(r5, r6); 5888 r6 = max(r5, r6); 5889 r5 = t; 5890 }; 5891 { 5892 uint const t = min(r7, r8); 5893 r8 = max(r7, r8); 5894 r7 = t; 5895 }; 5896 } 5897 shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1; 5898 shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8; 5899 shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2; 5900 shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7; 5901 shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3; 5902 shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6; 5903 shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4; 5904 shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5; 5905 barrier(CLK_LOCAL_MEM_FENCE); 5906 if (get_sub_group_id() < 8) { 5907 { 5908 uint r0_1 = shared.m[smem_l_idx + (0)]; 5909 uint r0_2 = shared.m[smem_l_idx + (16)]; 5910 uint r0_3 = shared.m[smem_l_idx + (32)]; 5911 uint r0_4 = shared.m[smem_l_idx + (48)]; 5912 uint r0_5 = shared.m[smem_r_idx + (64)]; 5913 uint r0_6 = shared.m[smem_r_idx + (80)]; 5914 uint r0_7 = shared.m[smem_r_idx + (96)]; 5915 uint r0_8 = shared.m[smem_r_idx + (112)]; 5916 { 5917 uint const t = min(r0_4, r0_5); 
5918 r0_5 = max(r0_4, r0_5); 5919 r0_4 = t; 5920 }; 5921 { 5922 uint const t = min(r0_3, r0_6); 5923 r0_6 = max(r0_3, r0_6); 5924 r0_3 = t; 5925 }; 5926 { 5927 uint const t = min(r0_2, r0_7); 5928 r0_7 = max(r0_2, r0_7); 5929 r0_2 = t; 5930 }; 5931 { 5932 uint const t = min(r0_1, r0_8); 5933 r0_8 = max(r0_1, r0_8); 5934 r0_1 = t; 5935 }; 5936 { 5937 uint const t = min(r0_5, r0_7); 5938 r0_7 = max(r0_5, r0_7); 5939 r0_5 = t; 5940 }; 5941 { 5942 uint const t = min(r0_6, r0_8); 5943 r0_8 = max(r0_6, r0_8); 5944 r0_6 = t; 5945 }; 5946 { 5947 uint const t = min(r0_5, r0_6); 5948 r0_6 = max(r0_5, r0_6); 5949 r0_5 = t; 5950 }; 5951 { 5952 uint const t = min(r0_7, r0_8); 5953 r0_8 = max(r0_7, r0_8); 5954 r0_7 = t; 5955 }; 5956 { 5957 uint const t = min(r0_1, r0_3); 5958 r0_3 = max(r0_1, r0_3); 5959 r0_1 = t; 5960 }; 5961 { 5962 uint const t = min(r0_2, r0_4); 5963 r0_4 = max(r0_2, r0_4); 5964 r0_2 = t; 5965 }; 5966 { 5967 uint const t = min(r0_1, r0_2); 5968 r0_2 = max(r0_1, r0_2); 5969 r0_1 = t; 5970 }; 5971 { 5972 uint const t = min(r0_3, r0_4); 5973 r0_4 = max(r0_3, r0_4); 5974 r0_3 = t; 5975 }; 5976 shared.m[smem_l_idx + (0)] = r0_1; 5977 shared.m[smem_l_idx + (16)] = r0_2; 5978 shared.m[smem_l_idx + (32)] = r0_3; 5979 shared.m[smem_l_idx + (48)] = r0_4; 5980 shared.m[smem_r_idx + (64)] = r0_5; 5981 shared.m[smem_r_idx + (80)] = r0_6; 5982 shared.m[smem_r_idx + (96)] = r0_7; 5983 shared.m[smem_r_idx + (112)] = r0_8; 5984 } 5985 { 5986 uint r1_1 = shared.m[smem_l_idx + (128)]; 5987 uint r1_2 = shared.m[smem_l_idx + (144)]; 5988 uint r1_3 = shared.m[smem_l_idx + (160)]; 5989 uint r1_4 = shared.m[smem_l_idx + (176)]; 5990 uint r1_5 = shared.m[smem_r_idx + (192)]; 5991 uint r1_6 = shared.m[smem_r_idx + (208)]; 5992 uint r1_7 = shared.m[smem_r_idx + (224)]; 5993 uint r1_8 = shared.m[smem_r_idx + (240)]; 5994 { 5995 uint const t = min(r1_4, r1_5); 5996 r1_5 = max(r1_4, r1_5); 5997 r1_4 = t; 5998 }; 5999 { 6000 uint const t = min(r1_3, r1_6); 6001 r1_6 = max(r1_3, r1_6); 6002 
r1_3 = t; 6003 }; 6004 { 6005 uint const t = min(r1_2, r1_7); 6006 r1_7 = max(r1_2, r1_7); 6007 r1_2 = t; 6008 }; 6009 { 6010 uint const t = min(r1_1, r1_8); 6011 r1_8 = max(r1_1, r1_8); 6012 r1_1 = t; 6013 }; 6014 { 6015 uint const t = min(r1_5, r1_7); 6016 r1_7 = max(r1_5, r1_7); 6017 r1_5 = t; 6018 }; 6019 { 6020 uint const t = min(r1_6, r1_8); 6021 r1_8 = max(r1_6, r1_8); 6022 r1_6 = t; 6023 }; 6024 { 6025 uint const t = min(r1_5, r1_6); 6026 r1_6 = max(r1_5, r1_6); 6027 r1_5 = t; 6028 }; 6029 { 6030 uint const t = min(r1_7, r1_8); 6031 r1_8 = max(r1_7, r1_8); 6032 r1_7 = t; 6033 }; 6034 { 6035 uint const t = min(r1_1, r1_3); 6036 r1_3 = max(r1_1, r1_3); 6037 r1_1 = t; 6038 }; 6039 { 6040 uint const t = min(r1_2, r1_4); 6041 r1_4 = max(r1_2, r1_4); 6042 r1_2 = t; 6043 }; 6044 { 6045 uint const t = min(r1_1, r1_2); 6046 r1_2 = max(r1_1, r1_2); 6047 r1_1 = t; 6048 }; 6049 { 6050 uint const t = min(r1_3, r1_4); 6051 r1_4 = max(r1_3, r1_4); 6052 r1_3 = t; 6053 }; 6054 shared.m[smem_l_idx + (128)] = r1_1; 6055 shared.m[smem_l_idx + (144)] = r1_2; 6056 shared.m[smem_l_idx + (160)] = r1_3; 6057 shared.m[smem_l_idx + (176)] = r1_4; 6058 shared.m[smem_r_idx + (192)] = r1_5; 6059 shared.m[smem_r_idx + (208)] = r1_6; 6060 shared.m[smem_r_idx + (224)] = r1_7; 6061 shared.m[smem_r_idx + (240)] = r1_8; 6062 } 6063 } 6064 barrier(CLK_LOCAL_MEM_FENCE); 6065 r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)]; 6066 r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)]; 6067 r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)]; 6068 r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)]; 6069 r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)]; 6070 r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)]; 6071 r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)]; 6072 r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)]; 6073 { 6074 { 6075 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 6076 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6077 ; 6078 { 6079 uint 
const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6080 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6081 }; 6082 { 6083 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6084 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6085 }; 6086 { 6087 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6088 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6089 }; 6090 { 6091 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6092 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6093 }; 6094 { 6095 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6096 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6097 }; 6098 { 6099 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6100 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6101 }; 6102 { 6103 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6104 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6105 }; 6106 { 6107 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6108 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6109 }; 6110 } 6111 { 6112 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 6113 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6114 ; 6115 { 6116 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6117 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6118 }; 6119 { 6120 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6121 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6122 }; 6123 { 6124 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6125 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6126 }; 6127 { 6128 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6129 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6130 }; 6131 { 6132 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6133 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6134 }; 6135 { 6136 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6137 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6138 }; 6139 { 6140 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6141 r7 = ((r7 <= ta) ^ t_lt) ? 
ta : r7; 6142 }; 6143 { 6144 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6145 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6146 }; 6147 } 6148 { 6149 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 6150 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6151 ; 6152 { 6153 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6154 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6155 }; 6156 { 6157 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6158 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6159 }; 6160 { 6161 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6162 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6163 }; 6164 { 6165 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6166 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6167 }; 6168 { 6169 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6170 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6171 }; 6172 { 6173 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6174 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6175 }; 6176 { 6177 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6178 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6179 }; 6180 { 6181 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6182 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6183 }; 6184 } 6185 { 6186 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 6187 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6188 ; 6189 { 6190 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6191 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6192 }; 6193 { 6194 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6195 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6196 }; 6197 { 6198 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6199 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6200 }; 6201 { 6202 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6203 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6204 }; 6205 { 6206 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6207 r5 = ((r5 <= ta) ^ t_lt) ? 
ta : r5; 6208 }; 6209 { 6210 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6211 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6212 }; 6213 { 6214 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6215 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6216 }; 6217 { 6218 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6219 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6220 }; 6221 } 6222 { 6223 uint const t = min(r1, r5); 6224 r5 = max(r1, r5); 6225 r1 = t; 6226 }; 6227 { 6228 uint const t = min(r3, r7); 6229 r7 = max(r3, r7); 6230 r3 = t; 6231 }; 6232 { 6233 uint const t = min(r1, r3); 6234 r3 = max(r1, r3); 6235 r1 = t; 6236 }; 6237 { 6238 uint const t = min(r5, r7); 6239 r7 = max(r5, r7); 6240 r5 = t; 6241 }; 6242 { 6243 uint const t = min(r2, r6); 6244 r6 = max(r2, r6); 6245 r2 = t; 6246 }; 6247 { 6248 uint const t = min(r4, r8); 6249 r8 = max(r4, r8); 6250 r4 = t; 6251 }; 6252 { 6253 uint const t = min(r2, r4); 6254 r4 = max(r2, r4); 6255 r2 = t; 6256 }; 6257 { 6258 uint const t = min(r6, r8); 6259 r8 = max(r6, r8); 6260 r6 = t; 6261 }; 6262 { 6263 uint const t = min(r1, r2); 6264 r2 = max(r1, r2); 6265 r1 = t; 6266 }; 6267 { 6268 uint const t = min(r3, r4); 6269 r4 = max(r3, r4); 6270 r3 = t; 6271 }; 6272 { 6273 uint const t = min(r5, r6); 6274 r6 = max(r5, r6); 6275 r5 = t; 6276 }; 6277 { 6278 uint const t = min(r7, r8); 6279 r8 = max(r7, r8); 6280 r7 = t; 6281 }; 6282 } 6283 shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1; 6284 shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8; 6285 shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2; 6286 shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7; 6287 shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3; 6288 shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6; 6289 shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4; 6290 shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5; 6291 barrier(CLK_LOCAL_MEM_FENCE); 6292 if (get_sub_group_id() < 8) { 6293 { 6294 uint r0_1 = shared.m[smem_l_idx + (0)]; 
6295 uint r0_2 = shared.m[smem_l_idx + (16)]; 6296 uint r0_3 = shared.m[smem_l_idx + (32)]; 6297 uint r0_4 = shared.m[smem_l_idx + (48)]; 6298 uint r0_5 = shared.m[smem_l_idx + (64)]; 6299 uint r0_6 = shared.m[smem_l_idx + (80)]; 6300 uint r0_7 = shared.m[smem_l_idx + (96)]; 6301 uint r0_8 = shared.m[smem_l_idx + (112)]; 6302 uint r0_9 = shared.m[smem_r_idx + (128)]; 6303 uint r0_10 = shared.m[smem_r_idx + (144)]; 6304 uint r0_11 = shared.m[smem_r_idx + (160)]; 6305 uint r0_12 = shared.m[smem_r_idx + (176)]; 6306 uint r0_13 = shared.m[smem_r_idx + (192)]; 6307 uint r0_14 = shared.m[smem_r_idx + (208)]; 6308 uint r0_15 = shared.m[smem_r_idx + (224)]; 6309 uint r0_16 = shared.m[smem_r_idx + (240)]; 6310 { 6311 uint const t = min(r0_8, r0_9); 6312 r0_9 = max(r0_8, r0_9); 6313 r0_8 = t; 6314 }; 6315 { 6316 uint const t = min(r0_7, r0_10); 6317 r0_10 = max(r0_7, r0_10); 6318 r0_7 = t; 6319 }; 6320 { 6321 uint const t = min(r0_6, r0_11); 6322 r0_11 = max(r0_6, r0_11); 6323 r0_6 = t; 6324 }; 6325 { 6326 uint const t = min(r0_5, r0_12); 6327 r0_12 = max(r0_5, r0_12); 6328 r0_5 = t; 6329 }; 6330 { 6331 uint const t = min(r0_4, r0_13); 6332 r0_13 = max(r0_4, r0_13); 6333 r0_4 = t; 6334 }; 6335 { 6336 uint const t = min(r0_3, r0_14); 6337 r0_14 = max(r0_3, r0_14); 6338 r0_3 = t; 6339 }; 6340 { 6341 uint const t = min(r0_2, r0_15); 6342 r0_15 = max(r0_2, r0_15); 6343 r0_2 = t; 6344 }; 6345 { 6346 uint const t = min(r0_1, r0_16); 6347 r0_16 = max(r0_1, r0_16); 6348 r0_1 = t; 6349 }; 6350 { 6351 uint const t = min(r0_9, r0_13); 6352 r0_13 = max(r0_9, r0_13); 6353 r0_9 = t; 6354 }; 6355 { 6356 uint const t = min(r0_11, r0_15); 6357 r0_15 = max(r0_11, r0_15); 6358 r0_11 = t; 6359 }; 6360 { 6361 uint const t = min(r0_9, r0_11); 6362 r0_11 = max(r0_9, r0_11); 6363 r0_9 = t; 6364 }; 6365 { 6366 uint const t = min(r0_13, r0_15); 6367 r0_15 = max(r0_13, r0_15); 6368 r0_13 = t; 6369 }; 6370 { 6371 uint const t = min(r0_10, r0_14); 6372 r0_14 = max(r0_10, r0_14); 6373 r0_10 = t; 6374 }; 
6375 { 6376 uint const t = min(r0_12, r0_16); 6377 r0_16 = max(r0_12, r0_16); 6378 r0_12 = t; 6379 }; 6380 { 6381 uint const t = min(r0_10, r0_12); 6382 r0_12 = max(r0_10, r0_12); 6383 r0_10 = t; 6384 }; 6385 { 6386 uint const t = min(r0_14, r0_16); 6387 r0_16 = max(r0_14, r0_16); 6388 r0_14 = t; 6389 }; 6390 { 6391 uint const t = min(r0_9, r0_10); 6392 r0_10 = max(r0_9, r0_10); 6393 r0_9 = t; 6394 }; 6395 { 6396 uint const t = min(r0_11, r0_12); 6397 r0_12 = max(r0_11, r0_12); 6398 r0_11 = t; 6399 }; 6400 { 6401 uint const t = min(r0_13, r0_14); 6402 r0_14 = max(r0_13, r0_14); 6403 r0_13 = t; 6404 }; 6405 { 6406 uint const t = min(r0_15, r0_16); 6407 r0_16 = max(r0_15, r0_16); 6408 r0_15 = t; 6409 }; 6410 { 6411 uint const t = min(r0_1, r0_5); 6412 r0_5 = max(r0_1, r0_5); 6413 r0_1 = t; 6414 }; 6415 { 6416 uint const t = min(r0_3, r0_7); 6417 r0_7 = max(r0_3, r0_7); 6418 r0_3 = t; 6419 }; 6420 { 6421 uint const t = min(r0_1, r0_3); 6422 r0_3 = max(r0_1, r0_3); 6423 r0_1 = t; 6424 }; 6425 { 6426 uint const t = min(r0_5, r0_7); 6427 r0_7 = max(r0_5, r0_7); 6428 r0_5 = t; 6429 }; 6430 { 6431 uint const t = min(r0_2, r0_6); 6432 r0_6 = max(r0_2, r0_6); 6433 r0_2 = t; 6434 }; 6435 { 6436 uint const t = min(r0_4, r0_8); 6437 r0_8 = max(r0_4, r0_8); 6438 r0_4 = t; 6439 }; 6440 { 6441 uint const t = min(r0_2, r0_4); 6442 r0_4 = max(r0_2, r0_4); 6443 r0_2 = t; 6444 }; 6445 { 6446 uint const t = min(r0_6, r0_8); 6447 r0_8 = max(r0_6, r0_8); 6448 r0_6 = t; 6449 }; 6450 { 6451 uint const t = min(r0_1, r0_2); 6452 r0_2 = max(r0_1, r0_2); 6453 r0_1 = t; 6454 }; 6455 { 6456 uint const t = min(r0_3, r0_4); 6457 r0_4 = max(r0_3, r0_4); 6458 r0_3 = t; 6459 }; 6460 { 6461 uint const t = min(r0_5, r0_6); 6462 r0_6 = max(r0_5, r0_6); 6463 r0_5 = t; 6464 }; 6465 { 6466 uint const t = min(r0_7, r0_8); 6467 r0_8 = max(r0_7, r0_8); 6468 r0_7 = t; 6469 }; 6470 shared.m[smem_l_idx + (0)] = r0_1; 6471 shared.m[smem_l_idx + (16)] = r0_2; 6472 shared.m[smem_l_idx + (32)] = r0_3; 6473 
shared.m[smem_l_idx + (48)] = r0_4; 6474 shared.m[smem_l_idx + (64)] = r0_5; 6475 shared.m[smem_l_idx + (80)] = r0_6; 6476 shared.m[smem_l_idx + (96)] = r0_7; 6477 shared.m[smem_l_idx + (112)] = r0_8; 6478 shared.m[smem_r_idx + (128)] = r0_9; 6479 shared.m[smem_r_idx + (144)] = r0_10; 6480 shared.m[smem_r_idx + (160)] = r0_11; 6481 shared.m[smem_r_idx + (176)] = r0_12; 6482 shared.m[smem_r_idx + (192)] = r0_13; 6483 shared.m[smem_r_idx + (208)] = r0_14; 6484 shared.m[smem_r_idx + (224)] = r0_15; 6485 shared.m[smem_r_idx + (240)] = r0_16; 6486 } 6487 } 6488 barrier(CLK_LOCAL_MEM_FENCE); 6489 r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)]; 6490 r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)]; 6491 r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)]; 6492 r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)]; 6493 r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)]; 6494 r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)]; 6495 r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)]; 6496 r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)]; 6497 { 6498 { 6499 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 6500 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6501 ; 6502 { 6503 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6504 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6505 }; 6506 { 6507 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6508 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6509 }; 6510 { 6511 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6512 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6513 }; 6514 { 6515 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6516 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6517 }; 6518 { 6519 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6520 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6521 }; 6522 { 6523 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6524 r6 = ((r6 <= ta) ^ t_lt) ? 
ta : r6; 6525 }; 6526 { 6527 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6528 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6529 }; 6530 { 6531 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6532 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6533 }; 6534 } 6535 { 6536 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 6537 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6538 ; 6539 { 6540 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6541 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6542 }; 6543 { 6544 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6545 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6546 }; 6547 { 6548 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6549 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6550 }; 6551 { 6552 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6553 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6554 }; 6555 { 6556 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6557 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6558 }; 6559 { 6560 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6561 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6562 }; 6563 { 6564 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6565 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6566 }; 6567 { 6568 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6569 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6570 }; 6571 } 6572 { 6573 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 6574 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6575 ; 6576 { 6577 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6578 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6579 }; 6580 { 6581 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6582 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6583 }; 6584 { 6585 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6586 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6587 }; 6588 { 6589 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6590 r4 = ((r4 <= ta) ^ t_lt) ? 
ta : r4; 6591 }; 6592 { 6593 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6594 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6595 }; 6596 { 6597 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6598 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6599 }; 6600 { 6601 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6602 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6603 }; 6604 { 6605 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6606 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 6607 }; 6608 } 6609 { 6610 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 6611 int const t_lt = get_sub_group_local_id() < half_lane_idx; 6612 ; 6613 { 6614 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 6615 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 6616 }; 6617 { 6618 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 6619 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 6620 }; 6621 { 6622 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 6623 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 6624 }; 6625 { 6626 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 6627 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 6628 }; 6629 { 6630 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 6631 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 6632 }; 6633 { 6634 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 6635 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 6636 }; 6637 { 6638 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 6639 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 6640 }; 6641 { 6642 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 6643 r8 = ((r8 <= ta) ^ t_lt) ? 
ta : r8; 6644 }; 6645 } 6646 { 6647 uint const t = min(r1, r5); 6648 r5 = max(r1, r5); 6649 r1 = t; 6650 }; 6651 { 6652 uint const t = min(r3, r7); 6653 r7 = max(r3, r7); 6654 r3 = t; 6655 }; 6656 { 6657 uint const t = min(r1, r3); 6658 r3 = max(r1, r3); 6659 r1 = t; 6660 }; 6661 { 6662 uint const t = min(r5, r7); 6663 r7 = max(r5, r7); 6664 r5 = t; 6665 }; 6666 { 6667 uint const t = min(r2, r6); 6668 r6 = max(r2, r6); 6669 r2 = t; 6670 }; 6671 { 6672 uint const t = min(r4, r8); 6673 r8 = max(r4, r8); 6674 r4 = t; 6675 }; 6676 { 6677 uint const t = min(r2, r4); 6678 r4 = max(r2, r4); 6679 r2 = t; 6680 }; 6681 { 6682 uint const t = min(r6, r8); 6683 r8 = max(r6, r8); 6684 r6 = t; 6685 }; 6686 { 6687 uint const t = min(r1, r2); 6688 r2 = max(r1, r2); 6689 r1 = t; 6690 }; 6691 { 6692 uint const t = min(r3, r4); 6693 r4 = max(r3, r4); 6694 r3 = t; 6695 }; 6696 { 6697 uint const t = min(r5, r6); 6698 r6 = max(r5, r6); 6699 r5 = t; 6700 }; 6701 { 6702 uint const t = min(r7, r8); 6703 r8 = max(r7, r8); 6704 r7 = t; 6705 }; 6706 } 6707 vout[gmem_idx + (1 << 4) * 0] = r1; 6708 vout[gmem_idx + (1 << 4) * 1] = r2; 6709 vout[gmem_idx + (1 << 4) * 2] = r3; 6710 vout[gmem_idx + (1 << 4) * 3] = r4; 6711 vout[gmem_idx + (1 << 4) * 4] = r5; 6712 vout[gmem_idx + (1 << 4) * 5] = r6; 6713 vout[gmem_idx + (1 << 4) * 6] = r7; 6714 vout[gmem_idx + (1 << 4) * 7] = r8; 6715} 6716

/*
 * hs_kernel_bc_0
 *
 * Each work-item reads eight uints from vout (16 elements apart, starting at
 * its own base index), runs them through a fixed compare/exchange network and
 * writes the eight results back to the same eight locations.
 *
 * The network has two parts:
 *   1. four cross-lane steps that pair each lane with lane^8, lane^4, lane^2
 *      and lane^1 of its sub-group;
 *   2. three in-lane steps that pair the work-item's own values at distance
 *      4, 2 and 1.
 *
 * NOTE(review): this looks like the merge half of a bitonic sorting network --
 * confirm against the generator that produced this file.
 * NOTE(review): every loop below has a constant trip count; confirm the target
 * compiler fully unrolls them so that key[] is kept in registers.
 */
__kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
__attribute__((reqd_work_group_size((1 << 4) * 1, 1, 1))) void
hs_kernel_bc_0(__global uint* const restrict vout)
{
  uint const lane = get_sub_group_local_id();
  uint const base = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
                    (get_local_id(0) & ((1 << 4) - 1));
  uint key[8];

  /* Gather this work-item's eight values. */
  for (uint k = 0; k < 8; k++) {
    key[k] = vout[base + (1 << 4) * k];
  }

  /*
   * Cross-lane steps. After each step the lower-numbered lane of a pair holds
   * the smaller of the two values and the higher-numbered lane the larger.
   */
  for (uint bit = 8; bit != 0; bit >>= 1) {
    uint const partner  = lane ^ bit;
    int const  is_lower = lane < partner;

    for (uint k = 0; k < 8; k++) {
      uint const theirs = intel_sub_group_shuffle(key[k], partner);

      if ((key[k] <= theirs) ^ is_lower) {
        key[k] = theirs;
      }
    }
  }

  /*
   * In-lane steps: for each distance, order every pair (lo, lo + span) so the
   * minimum ends up in the lower slot. Pairs within one distance touch
   * disjoint slots, so their relative order does not affect the result.
   */
  for (uint span = 4; span != 0; span >>= 1) {
    for (uint lo = 0; lo < 8; lo++) {
      if ((lo & span) != 0) {
        continue;
      }

      uint const hi      = lo + span;
      uint const smaller = min(key[lo], key[hi]);

      key[hi] = max(key[lo], key[hi]);
      key[lo] = smaller;
    }
  }

  /* Scatter the results back to where they were read from. */
  for (uint k = 0; k < 8; k++) {
    vout[base + (1 << 4) * k] = key[k];
  }
}

6950 6951__kernel
__attribute__((intel_reqd_sub_group_size((1 << 4)))) 6952__attribute__((reqd_work_group_size((1 << 4) * 2, 1, 1))) void 6953hs_kernel_bc_1(__global uint* const restrict vout) 6954{ 6955 __local struct 6956 { 6957 uint m[32 * 8]; 6958 } shared; 6959 6960 uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 + 6961 (get_local_id(0) & ((1 << 4) - 1)); 6962 uint const gmem_l_idx = 6963 (get_global_id(0) & ~((1 << 4) * 2 - 1)) * 8 + get_local_id(0); 6964 uint const smem_l_idx = 6965 get_sub_group_id() * ((1 << 4) * 2) + get_sub_group_local_id(); 6966 { 6967 { 6968 uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 0)]; 6969 uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 8)]; 6970 { 6971 uint const t = min(r0_1, r0_2); 6972 r0_2 = max(r0_1, r0_2); 6973 r0_1 = t; 6974 }; 6975 shared.m[smem_l_idx + (0)] = r0_1; 6976 shared.m[smem_l_idx + (16)] = r0_2; 6977 } 6978 { 6979 uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 2)]; 6980 uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 10)]; 6981 { 6982 uint const t = min(r0_1, r0_2); 6983 r0_2 = max(r0_1, r0_2); 6984 r0_1 = t; 6985 }; 6986 shared.m[smem_l_idx + (64)] = r0_1; 6987 shared.m[smem_l_idx + (80)] = r0_2; 6988 } 6989 { 6990 uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 4)]; 6991 uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 12)]; 6992 { 6993 uint const t = min(r0_1, r0_2); 6994 r0_2 = max(r0_1, r0_2); 6995 r0_1 = t; 6996 }; 6997 shared.m[smem_l_idx + (128)] = r0_1; 6998 shared.m[smem_l_idx + (144)] = r0_2; 6999 } 7000 { 7001 uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 6)]; 7002 uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 14)]; 7003 { 7004 uint const t = min(r0_1, r0_2); 7005 r0_2 = max(r0_1, r0_2); 7006 r0_1 = t; 7007 }; 7008 shared.m[smem_l_idx + (192)] = r0_1; 7009 shared.m[smem_l_idx + (208)] = r0_2; 7010 } 7011 } 7012 barrier(CLK_LOCAL_MEM_FENCE); 7013 uint r1 = shared.m[get_local_id(0) + (2 * (1 << 4) * 0)]; 7014 uint r2 = shared.m[get_local_id(0) + (2 * (1 << 4) * 1)]; 7015 uint r3 = shared.m[get_local_id(0) + (2 * (1 << 4) * 2)]; 7016 uint r4 = 
shared.m[get_local_id(0) + (2 * (1 << 4) * 3)]; 7017 uint r5 = shared.m[get_local_id(0) + (2 * (1 << 4) * 4)]; 7018 uint r6 = shared.m[get_local_id(0) + (2 * (1 << 4) * 5)]; 7019 uint r7 = shared.m[get_local_id(0) + (2 * (1 << 4) * 6)]; 7020 uint r8 = shared.m[get_local_id(0) + (2 * (1 << 4) * 7)]; 7021 { 7022 { 7023 uint const half_lane_idx = get_sub_group_local_id() ^ 8; 7024 int const t_lt = get_sub_group_local_id() < half_lane_idx; 7025 ; 7026 { 7027 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 7028 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 7029 }; 7030 { 7031 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 7032 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 7033 }; 7034 { 7035 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 7036 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 7037 }; 7038 { 7039 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 7040 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 7041 }; 7042 { 7043 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 7044 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 7045 }; 7046 { 7047 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 7048 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 7049 }; 7050 { 7051 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 7052 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 7053 }; 7054 { 7055 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 7056 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 7057 }; 7058 } 7059 { 7060 uint const half_lane_idx = get_sub_group_local_id() ^ 4; 7061 int const t_lt = get_sub_group_local_id() < half_lane_idx; 7062 ; 7063 { 7064 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 7065 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 7066 }; 7067 { 7068 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 7069 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 7070 }; 7071 { 7072 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 7073 r3 = ((r3 <= ta) ^ t_lt) ? 
ta : r3; 7074 }; 7075 { 7076 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 7077 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 7078 }; 7079 { 7080 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 7081 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 7082 }; 7083 { 7084 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 7085 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 7086 }; 7087 { 7088 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 7089 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 7090 }; 7091 { 7092 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 7093 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 7094 }; 7095 } 7096 { 7097 uint const half_lane_idx = get_sub_group_local_id() ^ 2; 7098 int const t_lt = get_sub_group_local_id() < half_lane_idx; 7099 ; 7100 { 7101 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 7102 r1 = ((r1 <= ta) ^ t_lt) ? ta : r1; 7103 }; 7104 { 7105 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 7106 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 7107 }; 7108 { 7109 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 7110 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 7111 }; 7112 { 7113 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 7114 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 7115 }; 7116 { 7117 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 7118 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 7119 }; 7120 { 7121 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 7122 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 7123 }; 7124 { 7125 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 7126 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 7127 }; 7128 { 7129 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 7130 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 7131 }; 7132 } 7133 { 7134 uint const half_lane_idx = get_sub_group_local_id() ^ 1; 7135 int const t_lt = get_sub_group_local_id() < half_lane_idx; 7136 ; 7137 { 7138 uint const ta = intel_sub_group_shuffle(r1, half_lane_idx); 7139 r1 = ((r1 <= ta) ^ t_lt) ? 
ta : r1; 7140 }; 7141 { 7142 uint const ta = intel_sub_group_shuffle(r2, half_lane_idx); 7143 r2 = ((r2 <= ta) ^ t_lt) ? ta : r2; 7144 }; 7145 { 7146 uint const ta = intel_sub_group_shuffle(r3, half_lane_idx); 7147 r3 = ((r3 <= ta) ^ t_lt) ? ta : r3; 7148 }; 7149 { 7150 uint const ta = intel_sub_group_shuffle(r4, half_lane_idx); 7151 r4 = ((r4 <= ta) ^ t_lt) ? ta : r4; 7152 }; 7153 { 7154 uint const ta = intel_sub_group_shuffle(r5, half_lane_idx); 7155 r5 = ((r5 <= ta) ^ t_lt) ? ta : r5; 7156 }; 7157 { 7158 uint const ta = intel_sub_group_shuffle(r6, half_lane_idx); 7159 r6 = ((r6 <= ta) ^ t_lt) ? ta : r6; 7160 }; 7161 { 7162 uint const ta = intel_sub_group_shuffle(r7, half_lane_idx); 7163 r7 = ((r7 <= ta) ^ t_lt) ? ta : r7; 7164 }; 7165 { 7166 uint const ta = intel_sub_group_shuffle(r8, half_lane_idx); 7167 r8 = ((r8 <= ta) ^ t_lt) ? ta : r8; 7168 }; 7169 } 7170 { 7171 uint const t = min(r1, r5); 7172 r5 = max(r1, r5); 7173 r1 = t; 7174 }; 7175 { 7176 uint const t = min(r3, r7); 7177 r7 = max(r3, r7); 7178 r3 = t; 7179 }; 7180 { 7181 uint const t = min(r1, r3); 7182 r3 = max(r1, r3); 7183 r1 = t; 7184 }; 7185 { 7186 uint const t = min(r5, r7); 7187 r7 = max(r5, r7); 7188 r5 = t; 7189 }; 7190 { 7191 uint const t = min(r2, r6); 7192 r6 = max(r2, r6); 7193 r2 = t; 7194 }; 7195 { 7196 uint const t = min(r4, r8); 7197 r8 = max(r4, r8); 7198 r4 = t; 7199 }; 7200 { 7201 uint const t = min(r2, r4); 7202 r4 = max(r2, r4); 7203 r2 = t; 7204 }; 7205 { 7206 uint const t = min(r6, r8); 7207 r8 = max(r6, r8); 7208 r6 = t; 7209 }; 7210 { 7211 uint const t = min(r1, r2); 7212 r2 = max(r1, r2); 7213 r1 = t; 7214 }; 7215 { 7216 uint const t = min(r3, r4); 7217 r4 = max(r3, r4); 7218 r3 = t; 7219 }; 7220 { 7221 uint const t = min(r5, r6); 7222 r6 = max(r5, r6); 7223 r5 = t; 7224 }; 7225 { 7226 uint const t = min(r7, r8); 7227 r8 = max(r7, r8); 7228 r7 = t; 7229 }; 7230 } 7231 vout[gmem_idx + (1 << 4) * 0] = r1; 7232 vout[gmem_idx + (1 << 4) * 1] = r2; 7233 vout[gmem_idx + (1 << 4) * 
2] = r3; 7234 vout[gmem_idx + (1 << 4) * 3] = r4; 7235 vout[gmem_idx + (1 << 4) * 4] = r5; 7236 vout[gmem_idx + (1 << 4) * 5] = r6; 7237 vout[gmem_idx + (1 << 4) * 6] = r7; 7238 vout[gmem_idx + (1 << 4) * 7] = r8; 7239}

// ---------------------------------------------------------------------------
// Helper macros shared by the six kernels below. They expand to exactly the
// statements the kernels previously spelled out by hand and are #undef'd
// again after the last kernel that uses them.
// ---------------------------------------------------------------------------

// Sub-group size requested by every kernel through intel_reqd_sub_group_size.
#define HS_SUBGROUP_SIZE (1 << 4)

// Compare-exchange: afterwards `a` holds min(a, b) and `b` holds max(a, b).
#define HS_CMP_XCHG(a, b)                                                      \
  {                                                                            \
    uint const lo_ = min(a, b);                                                \
    b              = max(a, b);                                                \
    a              = lo_;                                                      \
  }

// 4-input network: (a,c) (b,d) (a,b) (c,d).
#define HS_MERGE_4(a, b, c, d)                                                 \
  {                                                                            \
    HS_CMP_XCHG(a, c);                                                         \
    HS_CMP_XCHG(b, d);                                                         \
    HS_CMP_XCHG(a, b);                                                         \
    HS_CMP_XCHG(c, d);                                                         \
  }

// 8-input network: (a,e) (c,g) (a,c) (e,g) (b,f) (d,h) (b,d) (f,h)
//                  (a,b) (c,d) (e,f) (g,h).
#define HS_MERGE_8(a, b, c, d, e, f, g, h)                                     \
  {                                                                            \
    HS_CMP_XCHG(a, e);                                                         \
    HS_CMP_XCHG(c, g);                                                         \
    HS_CMP_XCHG(a, c);                                                         \
    HS_CMP_XCHG(e, g);                                                         \
    HS_CMP_XCHG(b, f);                                                         \
    HS_CMP_XCHG(d, h);                                                         \
    HS_CMP_XCHG(b, d);                                                         \
    HS_CMP_XCHG(f, h);                                                         \
    HS_CMP_XCHG(a, b);                                                         \
    HS_CMP_XCHG(c, d);                                                         \
    HS_CMP_XCHG(e, f);                                                         \
    HS_CMP_XCHG(g, h);                                                         \
  }

// Cross-lane compare-exchange of one register: fetch the partner lane's copy
// of `r`; the lower-numbered lane of the pair ends up with the smaller value,
// the higher-numbered lane with the larger one.
#define HS_LANE_XCHG(r, partner, is_lower)                                     \
  {                                                                            \
    uint const other_ = intel_sub_group_shuffle(r, partner);                   \
    r                 = ((r <= other_) ^ is_lower) ? other_ : r;               \
  }

// One cross-lane stage over eight registers; the partner lane is this lane's
// sub-group-local id XOR `mask`.
#define HS_LANE_STAGE_8(mask, a, b, c, d, e, f, g, h)                          \
  {                                                                            \
    uint const partner_  = get_sub_group_local_id() ^ (mask);                  \
    int const  is_lower_ = get_sub_group_local_id() < partner_;                \
    HS_LANE_XCHG(a, partner_, is_lower_);                                      \
    HS_LANE_XCHG(b, partner_, is_lower_);                                      \
    HS_LANE_XCHG(c, partner_, is_lower_);                                      \
    HS_LANE_XCHG(d, partner_, is_lower_);                                      \
    HS_LANE_XCHG(e, partner_, is_lower_);                                      \
    HS_LANE_XCHG(f, partner_, is_lower_);                                      \
    HS_LANE_XCHG(g, partner_, is_lower_);                                      \
    HS_LANE_XCHG(h, partner_, is_lower_);                                      \
  }

// hs_kernel_bc_2
//
// Work-group of HS_SUBGROUP_SIZE * 4 work-items operating in place on `vout`.
//   1. Every work-item loads two groups of four keys from global memory, runs
//      the 4-input network on each group and parks the results in local
//      memory.
//   2. After a local-memory barrier each work-item reads eight keys back,
//      runs cross-lane stages with XOR masks 8, 4, 2, 1, then the 8-input
//      network, and writes the eight keys back to `vout`.
// NOTE(review): "bc" presumably stands for "bitonic clean" - confirm against
// the generator.
__kernel __attribute__((intel_reqd_sub_group_size(HS_SUBGROUP_SIZE)))
__attribute__((reqd_work_group_size(HS_SUBGROUP_SIZE * 4, 1, 1))) void
hs_kernel_bc_2(__global uint* const restrict vout)
{
  __local struct
  {
    uint m[64 * 8];
  } shared;

  // Index used for the final write-back (8 rows, HS_SUBGROUP_SIZE apart).
  uint const store_idx = (get_global_id(0) & ~(HS_SUBGROUP_SIZE - 1)) * 8 +
                         (get_local_id(0) & (HS_SUBGROUP_SIZE - 1));
  // Index used for the initial strided loads.
  uint const load_idx =
    (get_global_id(0) & ~(HS_SUBGROUP_SIZE * 4 - 1)) * 8 + get_local_id(0);
  // Base slot of this work-item inside the local-memory staging buffer.
  uint const smem_idx =
    get_sub_group_id() * (HS_SUBGROUP_SIZE * 4) + get_sub_group_local_id();

  {
    // First group of four keys.
    {
      uint r0_1 = vout[load_idx + (HS_SUBGROUP_SIZE * 0)];
      uint r0_2 = vout[load_idx + (HS_SUBGROUP_SIZE * 8)];
      uint r0_3 = vout[load_idx + (HS_SUBGROUP_SIZE * 16)];
      uint r0_4 = vout[load_idx + (HS_SUBGROUP_SIZE * 24)];

      HS_MERGE_4(r0_1, r0_2, r0_3, r0_4);

      shared.m[smem_idx + (0)]  = r0_1;
      shared.m[smem_idx + (16)] = r0_2;
      shared.m[smem_idx + (32)] = r0_3;
      shared.m[smem_idx + (48)] = r0_4;
    }
    // Second group of four keys.
    {
      uint r0_1 = vout[load_idx + (HS_SUBGROUP_SIZE * 4)];
      uint r0_2 = vout[load_idx + (HS_SUBGROUP_SIZE * 12)];
      uint r0_3 = vout[load_idx + (HS_SUBGROUP_SIZE * 20)];
      uint r0_4 = vout[load_idx + (HS_SUBGROUP_SIZE * 28)];

      HS_MERGE_4(r0_1, r0_2, r0_3, r0_4);

      shared.m[smem_idx + (256)] = r0_1;
      shared.m[smem_idx + (272)] = r0_2;
      shared.m[smem_idx + (288)] = r0_3;
      shared.m[smem_idx + (304)] = r0_4;
    }
  }

  // All staged keys must be visible before any work-item reads them back.
  barrier(CLK_LOCAL_MEM_FENCE);

  uint r1 = shared.m[get_local_id(0) + (4 * HS_SUBGROUP_SIZE * 0)];
  uint r2 = shared.m[get_local_id(0) + (4 * HS_SUBGROUP_SIZE * 1)];
  uint r3 = shared.m[get_local_id(0) + (4 * HS_SUBGROUP_SIZE * 2)];
  uint r4 = shared.m[get_local_id(0) + (4 * HS_SUBGROUP_SIZE * 3)];
  uint r5 = shared.m[get_local_id(0) + (4 * HS_SUBGROUP_SIZE * 4)];
  uint r6 = shared.m[get_local_id(0) + (4 * HS_SUBGROUP_SIZE * 5)];
  uint r7 = shared.m[get_local_id(0) + (4 * HS_SUBGROUP_SIZE * 6)];
  uint r8 = shared.m[get_local_id(0) + (4 * HS_SUBGROUP_SIZE * 7)];

  {
    HS_LANE_STAGE_8(8, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_LANE_STAGE_8(4, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_LANE_STAGE_8(2, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_LANE_STAGE_8(1, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_MERGE_8(r1, r2, r3, r4, r5, r6, r7, r8);
  }

  vout[store_idx + HS_SUBGROUP_SIZE * 0] = r1;
  vout[store_idx + HS_SUBGROUP_SIZE * 1] = r2;
  vout[store_idx + HS_SUBGROUP_SIZE * 2] = r3;
  vout[store_idx + HS_SUBGROUP_SIZE * 3] = r4;
  vout[store_idx + HS_SUBGROUP_SIZE * 4] = r5;
  vout[store_idx + HS_SUBGROUP_SIZE * 5] = r6;
  vout[store_idx + HS_SUBGROUP_SIZE * 6] = r7;
  vout[store_idx + HS_SUBGROUP_SIZE * 7] = r8;
}

// hs_kernel_bc_3
//
// Same two-phase scheme as hs_kernel_bc_2 with a work-group of
// HS_SUBGROUP_SIZE * 8 work-items: every work-item loads one group of eight
// keys, runs the 8-input network, stages the result in local memory, then
// (after the barrier) reads eight keys back, runs the cross-lane stages
// (masks 8, 4, 2, 1) plus the 8-input network and writes back to `vout`.
__kernel __attribute__((intel_reqd_sub_group_size(HS_SUBGROUP_SIZE)))
__attribute__((reqd_work_group_size(HS_SUBGROUP_SIZE * 8, 1, 1))) void
hs_kernel_bc_3(__global uint* const restrict vout)
{
  __local struct
  {
    uint m[128 * 8];
  } shared;

  uint const store_idx = (get_global_id(0) & ~(HS_SUBGROUP_SIZE - 1)) * 8 +
                         (get_local_id(0) & (HS_SUBGROUP_SIZE - 1));
  uint const load_idx =
    (get_global_id(0) & ~(HS_SUBGROUP_SIZE * 8 - 1)) * 8 + get_local_id(0);
  uint const smem_idx =
    get_sub_group_id() * (HS_SUBGROUP_SIZE * 8) + get_sub_group_local_id();

  {
    {
      uint r0_1 = vout[load_idx + (HS_SUBGROUP_SIZE * 0)];
      uint r0_2 = vout[load_idx + (HS_SUBGROUP_SIZE * 8)];
      uint r0_3 = vout[load_idx + (HS_SUBGROUP_SIZE * 16)];
      uint r0_4 = vout[load_idx + (HS_SUBGROUP_SIZE * 24)];
      uint r0_5 = vout[load_idx + (HS_SUBGROUP_SIZE * 32)];
      uint r0_6 = vout[load_idx + (HS_SUBGROUP_SIZE * 40)];
      uint r0_7 = vout[load_idx + (HS_SUBGROUP_SIZE * 48)];
      uint r0_8 = vout[load_idx + (HS_SUBGROUP_SIZE * 56)];

      HS_MERGE_8(r0_1, r0_2, r0_3, r0_4, r0_5, r0_6, r0_7, r0_8);

      shared.m[smem_idx + (0)]   = r0_1;
      shared.m[smem_idx + (16)]  = r0_2;
      shared.m[smem_idx + (32)]  = r0_3;
      shared.m[smem_idx + (48)]  = r0_4;
      shared.m[smem_idx + (64)]  = r0_5;
      shared.m[smem_idx + (80)]  = r0_6;
      shared.m[smem_idx + (96)]  = r0_7;
      shared.m[smem_idx + (112)] = r0_8;
    }
  }

  // All staged keys must be visible before any work-item reads them back.
  barrier(CLK_LOCAL_MEM_FENCE);

  uint r1 = shared.m[get_local_id(0) + (8 * HS_SUBGROUP_SIZE * 0)];
  uint r2 = shared.m[get_local_id(0) + (8 * HS_SUBGROUP_SIZE * 1)];
  uint r3 = shared.m[get_local_id(0) + (8 * HS_SUBGROUP_SIZE * 2)];
  uint r4 = shared.m[get_local_id(0) + (8 * HS_SUBGROUP_SIZE * 3)];
  uint r5 = shared.m[get_local_id(0) + (8 * HS_SUBGROUP_SIZE * 4)];
  uint r6 = shared.m[get_local_id(0) + (8 * HS_SUBGROUP_SIZE * 5)];
  uint r7 = shared.m[get_local_id(0) + (8 * HS_SUBGROUP_SIZE * 6)];
  uint r8 = shared.m[get_local_id(0) + (8 * HS_SUBGROUP_SIZE * 7)];

  {
    HS_LANE_STAGE_8(8, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_LANE_STAGE_8(4, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_LANE_STAGE_8(2, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_LANE_STAGE_8(1, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_MERGE_8(r1, r2, r3, r4, r5, r6, r7, r8);
  }

  vout[store_idx + HS_SUBGROUP_SIZE * 0] = r1;
  vout[store_idx + HS_SUBGROUP_SIZE * 1] = r2;
  vout[store_idx + HS_SUBGROUP_SIZE * 2] = r3;
  vout[store_idx + HS_SUBGROUP_SIZE * 3] = r4;
  vout[store_idx + HS_SUBGROUP_SIZE * 4] = r5;
  vout[store_idx + HS_SUBGROUP_SIZE * 5] = r6;
  vout[store_idx + HS_SUBGROUP_SIZE * 6] = r7;
  vout[store_idx + HS_SUBGROUP_SIZE * 7] = r8;
}

// hs_kernel_bc_4
//
// Same two-phase scheme with a work-group of HS_SUBGROUP_SIZE * 16
// work-items. Only sub-groups 0..7 take part in the first phase; each of
// their work-items loads sixteen keys, runs a 16-input network (the 8-input
// network on the odd-numbered registers, again on the even-numbered ones,
// then adjacent pairs) and stages the result in local memory. The barrier is
// outside the `if`, so every work-item reaches it. The second phase is
// identical to hs_kernel_bc_2 / hs_kernel_bc_3.
__kernel __attribute__((intel_reqd_sub_group_size(HS_SUBGROUP_SIZE)))
__attribute__((reqd_work_group_size(HS_SUBGROUP_SIZE * 16, 1, 1))) void
hs_kernel_bc_4(__global uint* const restrict vout)
{
  __local struct
  {
    uint m[256 * 8];
  } shared;

  uint const store_idx = (get_global_id(0) & ~(HS_SUBGROUP_SIZE - 1)) * 8 +
                         (get_local_id(0) & (HS_SUBGROUP_SIZE - 1));
  uint const load_idx =
    (get_global_id(0) & ~(HS_SUBGROUP_SIZE * 16 - 1)) * 8 + get_local_id(0);
  uint const smem_idx =
    get_sub_group_id() * (HS_SUBGROUP_SIZE * 16) + get_sub_group_local_id();

  if (get_sub_group_id() < 8) {
    {
      uint r0_1  = vout[load_idx + (HS_SUBGROUP_SIZE * 0)];
      uint r0_2  = vout[load_idx + (HS_SUBGROUP_SIZE * 8)];
      uint r0_3  = vout[load_idx + (HS_SUBGROUP_SIZE * 16)];
      uint r0_4  = vout[load_idx + (HS_SUBGROUP_SIZE * 24)];
      uint r0_5  = vout[load_idx + (HS_SUBGROUP_SIZE * 32)];
      uint r0_6  = vout[load_idx + (HS_SUBGROUP_SIZE * 40)];
      uint r0_7  = vout[load_idx + (HS_SUBGROUP_SIZE * 48)];
      uint r0_8  = vout[load_idx + (HS_SUBGROUP_SIZE * 56)];
      uint r0_9  = vout[load_idx + (HS_SUBGROUP_SIZE * 64)];
      uint r0_10 = vout[load_idx + (HS_SUBGROUP_SIZE * 72)];
      uint r0_11 = vout[load_idx + (HS_SUBGROUP_SIZE * 80)];
      uint r0_12 = vout[load_idx + (HS_SUBGROUP_SIZE * 88)];
      uint r0_13 = vout[load_idx + (HS_SUBGROUP_SIZE * 96)];
      uint r0_14 = vout[load_idx + (HS_SUBGROUP_SIZE * 104)];
      uint r0_15 = vout[load_idx + (HS_SUBGROUP_SIZE * 112)];
      uint r0_16 = vout[load_idx + (HS_SUBGROUP_SIZE * 120)];

      // Odd-numbered registers, then even-numbered registers.
      HS_MERGE_8(r0_1, r0_3, r0_5, r0_7, r0_9, r0_11, r0_13, r0_15);
      HS_MERGE_8(r0_2, r0_4, r0_6, r0_8, r0_10, r0_12, r0_14, r0_16);

      // Adjacent pairs.
      HS_CMP_XCHG(r0_1, r0_2);
      HS_CMP_XCHG(r0_3, r0_4);
      HS_CMP_XCHG(r0_5, r0_6);
      HS_CMP_XCHG(r0_7, r0_8);
      HS_CMP_XCHG(r0_9, r0_10);
      HS_CMP_XCHG(r0_11, r0_12);
      HS_CMP_XCHG(r0_13, r0_14);
      HS_CMP_XCHG(r0_15, r0_16);

      shared.m[smem_idx + (0)]   = r0_1;
      shared.m[smem_idx + (16)]  = r0_2;
      shared.m[smem_idx + (32)]  = r0_3;
      shared.m[smem_idx + (48)]  = r0_4;
      shared.m[smem_idx + (64)]  = r0_5;
      shared.m[smem_idx + (80)]  = r0_6;
      shared.m[smem_idx + (96)]  = r0_7;
      shared.m[smem_idx + (112)] = r0_8;
      shared.m[smem_idx + (128)] = r0_9;
      shared.m[smem_idx + (144)] = r0_10;
      shared.m[smem_idx + (160)] = r0_11;
      shared.m[smem_idx + (176)] = r0_12;
      shared.m[smem_idx + (192)] = r0_13;
      shared.m[smem_idx + (208)] = r0_14;
      shared.m[smem_idx + (224)] = r0_15;
      shared.m[smem_idx + (240)] = r0_16;
    }
  }

  // All staged keys must be visible before any work-item reads them back.
  barrier(CLK_LOCAL_MEM_FENCE);

  uint r1 = shared.m[get_local_id(0) + (16 * HS_SUBGROUP_SIZE * 0)];
  uint r2 = shared.m[get_local_id(0) + (16 * HS_SUBGROUP_SIZE * 1)];
  uint r3 = shared.m[get_local_id(0) + (16 * HS_SUBGROUP_SIZE * 2)];
  uint r4 = shared.m[get_local_id(0) + (16 * HS_SUBGROUP_SIZE * 3)];
  uint r5 = shared.m[get_local_id(0) + (16 * HS_SUBGROUP_SIZE * 4)];
  uint r6 = shared.m[get_local_id(0) + (16 * HS_SUBGROUP_SIZE * 5)];
  uint r7 = shared.m[get_local_id(0) + (16 * HS_SUBGROUP_SIZE * 6)];
  uint r8 = shared.m[get_local_id(0) + (16 * HS_SUBGROUP_SIZE * 7)];

  {
    HS_LANE_STAGE_8(8, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_LANE_STAGE_8(4, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_LANE_STAGE_8(2, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_LANE_STAGE_8(1, r1, r2, r3, r4, r5, r6, r7, r8);
    HS_MERGE_8(r1, r2, r3, r4, r5, r6, r7, r8);
  }

  vout[store_idx + HS_SUBGROUP_SIZE * 0] = r1;
  vout[store_idx + HS_SUBGROUP_SIZE * 1] = r2;
  vout[store_idx + HS_SUBGROUP_SIZE * 2] = r3;
  vout[store_idx + HS_SUBGROUP_SIZE * 3] = r4;
  vout[store_idx + HS_SUBGROUP_SIZE * 4] = r5;
  vout[store_idx + HS_SUBGROUP_SIZE * 5] = r6;
  vout[store_idx + HS_SUBGROUP_SIZE * 6] = r7;
  vout[store_idx + HS_SUBGROUP_SIZE * 7] = r8;
}

// hs_kernel_fm_0_0
//
// In-place merge step on `vout`. Dimension 1 of the NDRange selects a span of
// get_global_size(0) * 8 * 2 keys; dimension 0 selects the column. Eight keys
// are read at `left` (ascending rows, one stride apart) and one key at
// `right`, which mirrors the column index. r8 is compare-exchanged with r9,
// then the 8-input network runs on r1..r8 and everything is stored back.
// NOTE(review): "fm" presumably stands for "flip merge" - confirm against the
// generator.
__kernel __attribute__((intel_reqd_sub_group_size(HS_SUBGROUP_SIZE))) void
hs_kernel_fm_0_0(__global uint* const restrict vout)
{
  uint const span   = get_global_id(1);
  uint const stride = get_global_size(0);
  uint const base   = span * (stride * 8 * 2);
  uint const col    = get_global_id(0);
  uint const left   = base + col;
  uint const right  = base + stride * (8 + 1) - col - 1;

  uint r1 = vout[left + stride * 0];
  uint r2 = vout[left + stride * 1];
  uint r3 = vout[left + stride * 2];
  uint r4 = vout[left + stride * 3];
  uint r5 = vout[left + stride * 4];
  uint r6 = vout[left + stride * 5];
  uint r7 = vout[left + stride * 6];
  uint r8 = vout[left + stride * 7];
  uint r9 = vout[right + stride * 0];

  // Exchange across the left/right boundary.
  HS_CMP_XCHG(r8, r9);

  HS_MERGE_8(r1, r2, r3, r4, r5, r6, r7, r8);

  vout[left + stride * 0]  = r1;
  vout[left + stride * 1]  = r2;
  vout[left + stride * 2]  = r3;
  vout[left + stride * 3]  = r4;
  vout[left + stride * 4]  = r5;
  vout[left + stride * 5]  = r6;
  vout[left + stride * 6]  = r7;
  vout[left + stride * 7]  = r8;
  vout[right + stride * 0] = r9;
}

// hs_kernel_fm_0_1
//
// Same as hs_kernel_fm_0_0 but with two keys on the right side: r8/r9 and
// r7/r10 are exchanged across the boundary, the 8-input network runs on
// r1..r8, and r9/r10 get one final compare-exchange.
__kernel __attribute__((intel_reqd_sub_group_size(HS_SUBGROUP_SIZE))) void
hs_kernel_fm_0_1(__global uint* const restrict vout)
{
  uint const span   = get_global_id(1);
  uint const stride = get_global_size(0);
  uint const base   = span * (stride * 8 * 2);
  uint const col    = get_global_id(0);
  uint const left   = base + col;
  uint const right  = base + stride * (8 + 1) - col - 1;

  uint r1  = vout[left + stride * 0];
  uint r2  = vout[left + stride * 1];
  uint r3  = vout[left + stride * 2];
  uint r4  = vout[left + stride * 3];
  uint r5  = vout[left + stride * 4];
  uint r6  = vout[left + stride * 5];
  uint r7  = vout[left + stride * 6];
  uint r8  = vout[left + stride * 7];
  uint r9  = vout[right + stride * 0];
  uint r10 = vout[right + stride * 1];

  // Exchanges across the left/right boundary.
  HS_CMP_XCHG(r8, r9);
  HS_CMP_XCHG(r7, r10);

  HS_MERGE_8(r1, r2, r3, r4, r5, r6, r7, r8);

  HS_CMP_XCHG(r9, r10);

  vout[left + stride * 0]  = r1;
  vout[left + stride * 1]  = r2;
  vout[left + stride * 2]  = r3;
  vout[left + stride * 3]  = r4;
  vout[left + stride * 4]  = r5;
  vout[left + stride * 5]  = r6;
  vout[left + stride * 6]  = r7;
  vout[left + stride * 7]  = r8;
  vout[right + stride * 0] = r9;
  vout[right + stride * 1] = r10;
}

// hs_kernel_fm_0_2
//
// Same as hs_kernel_fm_0_0 but with four keys on the right side: r8/r9,
// r7/r10, r6/r11 and r5/r12 are exchanged across the boundary, the 8-input
// network runs on r1..r8, and the 4-input network runs on r9..r12.
__kernel __attribute__((intel_reqd_sub_group_size(HS_SUBGROUP_SIZE))) void
hs_kernel_fm_0_2(__global uint* const restrict vout)
{
  uint const span   = get_global_id(1);
  uint const stride = get_global_size(0);
  uint const base   = span * (stride * 8 * 2);
  uint const col    = get_global_id(0);
  uint const left   = base + col;
  uint const right  = base + stride * (8 + 1) - col - 1;

  uint r1  = vout[left + stride * 0];
  uint r2  = vout[left + stride * 1];
  uint r3  = vout[left + stride * 2];
  uint r4  = vout[left + stride * 3];
  uint r5  = vout[left + stride * 4];
  uint r6  = vout[left + stride * 5];
  uint r7  = vout[left + stride * 6];
  uint r8  = vout[left + stride * 7];
  uint r9  = vout[right + stride * 0];
  uint r10 = vout[right + stride * 1];
  uint r11 = vout[right + stride * 2];
  uint r12 = vout[right + stride * 3];

  // Exchanges across the left/right boundary.
  HS_CMP_XCHG(r8, r9);
  HS_CMP_XCHG(r7, r10);
  HS_CMP_XCHG(r6, r11);
  HS_CMP_XCHG(r5, r12);

  HS_MERGE_8(r1, r2, r3, r4, r5, r6, r7, r8);
  HS_MERGE_4(r9, r10, r11, r12);

  vout[left + stride * 0]  = r1;
  vout[left + stride * 1]  = r2;
  vout[left + stride * 2]  = r3;
  vout[left + stride * 3]  = r4;
  vout[left + stride * 4]  = r5;
  vout[left + stride * 5]  = r6;
  vout[left + stride * 6]  = r7;
  vout[left + stride * 7]  = r8;
  vout[right + stride * 0] = r9;
  vout[right + stride * 1] = r10;
  vout[right + stride * 2] = r11;
  vout[right + stride * 3] = r12;
}

#undef HS_LANE_STAGE_8
#undef HS_LANE_XCHG
#undef HS_MERGE_8
#undef HS_MERGE_4
#undef HS_CMP_XCHG
#undef HS_SUBGROUP_SIZE

8649__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void 8650hs_kernel_fm_0_3(__global uint* const restrict vout) 8651{ 8652 uint const span_idx = get_global_id(1); 8653 uint const span_stride = get_global_size(0); 8654 uint const
span_size = span_stride * 8 * 2; 8655 uint const span_base = span_idx * span_size; 8656 uint const span_off = get_global_id(0); 8657 uint const span_l = span_base + span_off; 8658 uint const span_r = span_base + span_stride * (8 + 1) - span_off - 1; 8659 uint r1 = vout[span_l + span_stride * 0]; 8660 uint r2 = vout[span_l + span_stride * 1]; 8661 uint r3 = vout[span_l + span_stride * 2]; 8662 uint r4 = vout[span_l + span_stride * 3]; 8663 uint r5 = vout[span_l + span_stride * 4]; 8664 uint r6 = vout[span_l + span_stride * 5]; 8665 uint r7 = vout[span_l + span_stride * 6]; 8666 uint r8 = vout[span_l + span_stride * 7]; 8667 uint r9 = vout[span_r + span_stride * 0]; 8668 uint r10 = vout[span_r + span_stride * 1]; 8669 uint r11 = vout[span_r + span_stride * 2]; 8670 uint r12 = vout[span_r + span_stride * 3]; 8671 uint r13 = vout[span_r + span_stride * 4]; 8672 uint r14 = vout[span_r + span_stride * 5]; 8673 uint r15 = vout[span_r + span_stride * 6]; 8674 uint r16 = vout[span_r + span_stride * 7]; 8675 { 8676 uint const t = min(r8, r9); 8677 r9 = max(r8, r9); 8678 r8 = t; 8679 }; 8680 { 8681 uint const t = min(r7, r10); 8682 r10 = max(r7, r10); 8683 r7 = t; 8684 }; 8685 { 8686 uint const t = min(r6, r11); 8687 r11 = max(r6, r11); 8688 r6 = t; 8689 }; 8690 { 8691 uint const t = min(r5, r12); 8692 r12 = max(r5, r12); 8693 r5 = t; 8694 }; 8695 { 8696 uint const t = min(r4, r13); 8697 r13 = max(r4, r13); 8698 r4 = t; 8699 }; 8700 { 8701 uint const t = min(r3, r14); 8702 r14 = max(r3, r14); 8703 r3 = t; 8704 }; 8705 { 8706 uint const t = min(r2, r15); 8707 r15 = max(r2, r15); 8708 r2 = t; 8709 }; 8710 { 8711 uint const t = min(r1, r16); 8712 r16 = max(r1, r16); 8713 r1 = t; 8714 }; 8715 { 8716 uint const t = min(r1, r5); 8717 r5 = max(r1, r5); 8718 r1 = t; 8719 }; 8720 { 8721 uint const t = min(r3, r7); 8722 r7 = max(r3, r7); 8723 r3 = t; 8724 }; 8725 { 8726 uint const t = min(r1, r3); 8727 r3 = max(r1, r3); 8728 r1 = t; 8729 }; 8730 { 8731 uint const t = min(r5, r7); 8732 
r7 = max(r5, r7); 8733 r5 = t; 8734 }; 8735 { 8736 uint const t = min(r2, r6); 8737 r6 = max(r2, r6); 8738 r2 = t; 8739 }; 8740 { 8741 uint const t = min(r4, r8); 8742 r8 = max(r4, r8); 8743 r4 = t; 8744 }; 8745 { 8746 uint const t = min(r2, r4); 8747 r4 = max(r2, r4); 8748 r2 = t; 8749 }; 8750 { 8751 uint const t = min(r6, r8); 8752 r8 = max(r6, r8); 8753 r6 = t; 8754 }; 8755 { 8756 uint const t = min(r1, r2); 8757 r2 = max(r1, r2); 8758 r1 = t; 8759 }; 8760 { 8761 uint const t = min(r3, r4); 8762 r4 = max(r3, r4); 8763 r3 = t; 8764 }; 8765 { 8766 uint const t = min(r5, r6); 8767 r6 = max(r5, r6); 8768 r5 = t; 8769 }; 8770 { 8771 uint const t = min(r7, r8); 8772 r8 = max(r7, r8); 8773 r7 = t; 8774 }; 8775 { 8776 uint const t = min(r9, r13); 8777 r13 = max(r9, r13); 8778 r9 = t; 8779 }; 8780 { 8781 uint const t = min(r11, r15); 8782 r15 = max(r11, r15); 8783 r11 = t; 8784 }; 8785 { 8786 uint const t = min(r9, r11); 8787 r11 = max(r9, r11); 8788 r9 = t; 8789 }; 8790 { 8791 uint const t = min(r13, r15); 8792 r15 = max(r13, r15); 8793 r13 = t; 8794 }; 8795 { 8796 uint const t = min(r10, r14); 8797 r14 = max(r10, r14); 8798 r10 = t; 8799 }; 8800 { 8801 uint const t = min(r12, r16); 8802 r16 = max(r12, r16); 8803 r12 = t; 8804 }; 8805 { 8806 uint const t = min(r10, r12); 8807 r12 = max(r10, r12); 8808 r10 = t; 8809 }; 8810 { 8811 uint const t = min(r14, r16); 8812 r16 = max(r14, r16); 8813 r14 = t; 8814 }; 8815 { 8816 uint const t = min(r9, r10); 8817 r10 = max(r9, r10); 8818 r9 = t; 8819 }; 8820 { 8821 uint const t = min(r11, r12); 8822 r12 = max(r11, r12); 8823 r11 = t; 8824 }; 8825 { 8826 uint const t = min(r13, r14); 8827 r14 = max(r13, r14); 8828 r13 = t; 8829 }; 8830 { 8831 uint const t = min(r15, r16); 8832 r16 = max(r15, r16); 8833 r15 = t; 8834 }; 8835 vout[span_l + span_stride * 0] = r1; 8836 vout[span_l + span_stride * 1] = r2; 8837 vout[span_l + span_stride * 2] = r3; 8838 vout[span_l + span_stride * 3] = r4; 8839 vout[span_l + span_stride * 4] = r5; 8840 
vout[span_l + span_stride * 5] = r6; 8841 vout[span_l + span_stride * 6] = r7; 8842 vout[span_l + span_stride * 7] = r8; 8843 vout[span_r + span_stride * 0] = r9; 8844 vout[span_r + span_stride * 1] = r10; 8845 vout[span_r + span_stride * 2] = r11; 8846 vout[span_r + span_stride * 3] = r12; 8847 vout[span_r + span_stride * 4] = r13; 8848 vout[span_r + span_stride * 5] = r14; 8849 vout[span_r + span_stride * 6] = r15; 8850 vout[span_r + span_stride * 7] = r16; 8851} 8852 8853__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void 8854hs_kernel_hm_0(__global uint* const restrict vout) 8855{ 8856 uint const span_idx = get_global_id(1); 8857 uint const span_stride = get_global_size(0); 8858 uint const span_size = span_stride * 8 * 2; 8859 uint const span_base = span_idx * span_size; 8860 uint const span_off = get_global_id(0); 8861 uint const span_l = span_base + span_off; 8862 uint r1 = vout[span_l + span_stride * 0]; 8863 uint r2 = vout[span_l + span_stride * 1]; 8864 uint r3 = vout[span_l + span_stride * 2]; 8865 uint r4 = vout[span_l + span_stride * 3]; 8866 uint r5 = vout[span_l + span_stride * 4]; 8867 uint r6 = vout[span_l + span_stride * 5]; 8868 uint r7 = vout[span_l + span_stride * 6]; 8869 uint r8 = vout[span_l + span_stride * 7]; 8870 uint r9 = vout[span_l + span_stride * 8]; 8871 uint r10 = vout[span_l + span_stride * 9]; 8872 uint r11 = vout[span_l + span_stride * 10]; 8873 uint r12 = vout[span_l + span_stride * 11]; 8874 uint r13 = vout[span_l + span_stride * 12]; 8875 uint r14 = vout[span_l + span_stride * 13]; 8876 uint r15 = vout[span_l + span_stride * 14]; 8877 uint r16 = vout[span_l + span_stride * 15]; 8878 { 8879 uint const t = min(r1, r9); 8880 r9 = max(r1, r9); 8881 r1 = t; 8882 }; 8883 { 8884 uint const t = min(r5, r13); 8885 r13 = max(r5, r13); 8886 r5 = t; 8887 }; 8888 { 8889 uint const t = min(r1, r5); 8890 r5 = max(r1, r5); 8891 r1 = t; 8892 }; 8893 { 8894 uint const t = min(r9, r13); 8895 r13 = max(r9, r13); 8896 r9 = t; 8897 }; 
8898 { 8899 uint const t = min(r3, r11); 8900 r11 = max(r3, r11); 8901 r3 = t; 8902 }; 8903 { 8904 uint const t = min(r7, r15); 8905 r15 = max(r7, r15); 8906 r7 = t; 8907 }; 8908 { 8909 uint const t = min(r3, r7); 8910 r7 = max(r3, r7); 8911 r3 = t; 8912 }; 8913 { 8914 uint const t = min(r11, r15); 8915 r15 = max(r11, r15); 8916 r11 = t; 8917 }; 8918 { 8919 uint const t = min(r1, r3); 8920 r3 = max(r1, r3); 8921 r1 = t; 8922 }; 8923 { 8924 uint const t = min(r5, r7); 8925 r7 = max(r5, r7); 8926 r5 = t; 8927 }; 8928 { 8929 uint const t = min(r9, r11); 8930 r11 = max(r9, r11); 8931 r9 = t; 8932 }; 8933 { 8934 uint const t = min(r13, r15); 8935 r15 = max(r13, r15); 8936 r13 = t; 8937 }; 8938 { 8939 uint const t = min(r2, r10); 8940 r10 = max(r2, r10); 8941 r2 = t; 8942 }; 8943 { 8944 uint const t = min(r6, r14); 8945 r14 = max(r6, r14); 8946 r6 = t; 8947 }; 8948 { 8949 uint const t = min(r2, r6); 8950 r6 = max(r2, r6); 8951 r2 = t; 8952 }; 8953 { 8954 uint const t = min(r10, r14); 8955 r14 = max(r10, r14); 8956 r10 = t; 8957 }; 8958 { 8959 uint const t = min(r4, r12); 8960 r12 = max(r4, r12); 8961 r4 = t; 8962 }; 8963 { 8964 uint const t = min(r8, r16); 8965 r16 = max(r8, r16); 8966 r8 = t; 8967 }; 8968 { 8969 uint const t = min(r4, r8); 8970 r8 = max(r4, r8); 8971 r4 = t; 8972 }; 8973 { 8974 uint const t = min(r12, r16); 8975 r16 = max(r12, r16); 8976 r12 = t; 8977 }; 8978 { 8979 uint const t = min(r2, r4); 8980 r4 = max(r2, r4); 8981 r2 = t; 8982 }; 8983 { 8984 uint const t = min(r6, r8); 8985 r8 = max(r6, r8); 8986 r6 = t; 8987 }; 8988 { 8989 uint const t = min(r10, r12); 8990 r12 = max(r10, r12); 8991 r10 = t; 8992 }; 8993 { 8994 uint const t = min(r14, r16); 8995 r16 = max(r14, r16); 8996 r14 = t; 8997 }; 8998 { 8999 uint const t = min(r1, r2); 9000 r2 = max(r1, r2); 9001 r1 = t; 9002 }; 9003 { 9004 uint const t = min(r3, r4); 9005 r4 = max(r3, r4); 9006 r3 = t; 9007 }; 9008 { 9009 uint const t = min(r5, r6); 9010 r6 = max(r5, r6); 9011 r5 = t; 9012 }; 9013 { 
9014 uint const t = min(r7, r8); 9015 r8 = max(r7, r8); 9016 r7 = t; 9017 }; 9018 { 9019 uint const t = min(r9, r10); 9020 r10 = max(r9, r10); 9021 r9 = t; 9022 }; 9023 { 9024 uint const t = min(r11, r12); 9025 r12 = max(r11, r12); 9026 r11 = t; 9027 }; 9028 { 9029 uint const t = min(r13, r14); 9030 r14 = max(r13, r14); 9031 r13 = t; 9032 }; 9033 { 9034 uint const t = min(r15, r16); 9035 r16 = max(r15, r16); 9036 r15 = t; 9037 }; 9038 vout[span_l + span_stride * 0] = r1; 9039 vout[span_l + span_stride * 1] = r2; 9040 vout[span_l + span_stride * 2] = r3; 9041 vout[span_l + span_stride * 3] = r4; 9042 vout[span_l + span_stride * 4] = r5; 9043 vout[span_l + span_stride * 5] = r6; 9044 vout[span_l + span_stride * 6] = r7; 9045 vout[span_l + span_stride * 7] = r8; 9046 vout[span_l + span_stride * 8] = r9; 9047 vout[span_l + span_stride * 9] = r10; 9048 vout[span_l + span_stride * 10] = r11; 9049 vout[span_l + span_stride * 11] = r12; 9050 vout[span_l + span_stride * 12] = r13; 9051 vout[span_l + span_stride * 13] = r14; 9052 vout[span_l + span_stride * 14] = r15; 9053 vout[span_l + span_stride * 15] = r16; 9054} 9055 9056__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void 9057hs_kernel_transpose(__global uint* const restrict vout) 9058{ 9059 uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 + 9060 (get_local_id(0) & ((1 << 4) - 1)); 9061 uint r1 = vout[gmem_idx + (1 << 4) * 0]; 9062 uint r2 = vout[gmem_idx + (1 << 4) * 1]; 9063 uint r3 = vout[gmem_idx + (1 << 4) * 2]; 9064 uint r4 = vout[gmem_idx + (1 << 4) * 3]; 9065 uint r5 = vout[gmem_idx + (1 << 4) * 4]; 9066 uint r6 = vout[gmem_idx + (1 << 4) * 5]; 9067 uint r7 = vout[gmem_idx + (1 << 4) * 6]; 9068 uint r8 = vout[gmem_idx + (1 << 4) * 7]; 9069 bool const is_lo_1 = (get_sub_group_local_id() & (1 << (1 - 1))) == 0; 9070 bool const is_lo_2 = (get_sub_group_local_id() & (1 << (2 - 1))) == 0; 9071 bool const is_lo_3 = (get_sub_group_local_id() & (1 << (3 - 1))) == 0; 9072 bool const is_lo_4 = 
(get_sub_group_local_id() & (1 << (4 - 1))) == 0; 9073 uint const s2_1 = 9074 intel_sub_group_shuffle_xor(is_lo_1 ? r2 : r1, 1 << (1 - 1)); 9075 uint const s2 = is_lo_1 ? s2_1 : r2; 9076 uint const s1 = is_lo_1 ? r1 : s2_1; 9077 uint const s4_3 = 9078 intel_sub_group_shuffle_xor(is_lo_1 ? r4 : r3, 1 << (1 - 1)); 9079 uint const s4 = is_lo_1 ? s4_3 : r4; 9080 uint const s3 = is_lo_1 ? r3 : s4_3; 9081 uint const s6_5 = 9082 intel_sub_group_shuffle_xor(is_lo_1 ? r6 : r5, 1 << (1 - 1)); 9083 uint const s6 = is_lo_1 ? s6_5 : r6; 9084 uint const s5 = is_lo_1 ? r5 : s6_5; 9085 uint const s8_7 = 9086 intel_sub_group_shuffle_xor(is_lo_1 ? r8 : r7, 1 << (1 - 1)); 9087 uint const s8 = is_lo_1 ? s8_7 : r8; 9088 uint const s7 = is_lo_1 ? r7 : s8_7; 9089 uint const t3_1 = 9090 intel_sub_group_shuffle_xor(is_lo_2 ? s3 : s1, 1 << (2 - 1)); 9091 uint const t3 = is_lo_2 ? t3_1 : s3; 9092 uint const t1 = is_lo_2 ? s1 : t3_1; 9093 uint const t4_2 = 9094 intel_sub_group_shuffle_xor(is_lo_2 ? s4 : s2, 1 << (2 - 1)); 9095 uint const t4 = is_lo_2 ? t4_2 : s4; 9096 uint const t2 = is_lo_2 ? s2 : t4_2; 9097 uint const t7_5 = 9098 intel_sub_group_shuffle_xor(is_lo_2 ? s7 : s5, 1 << (2 - 1)); 9099 uint const t7 = is_lo_2 ? t7_5 : s7; 9100 uint const t5 = is_lo_2 ? s5 : t7_5; 9101 uint const t8_6 = 9102 intel_sub_group_shuffle_xor(is_lo_2 ? s8 : s6, 1 << (2 - 1)); 9103 uint const t8 = is_lo_2 ? t8_6 : s8; 9104 uint const t6 = is_lo_2 ? s6 : t8_6; 9105 uint const u5_1 = 9106 intel_sub_group_shuffle_xor(is_lo_3 ? t5 : t1, 1 << (3 - 1)); 9107 uint const u5 = is_lo_3 ? u5_1 : t5; 9108 uint const u1 = is_lo_3 ? t1 : u5_1; 9109 uint const u6_2 = 9110 intel_sub_group_shuffle_xor(is_lo_3 ? t6 : t2, 1 << (3 - 1)); 9111 uint const u6 = is_lo_3 ? u6_2 : t6; 9112 uint const u2 = is_lo_3 ? t2 : u6_2; 9113 uint const u7_3 = 9114 intel_sub_group_shuffle_xor(is_lo_3 ? t7 : t3, 1 << (3 - 1)); 9115 uint const u7 = is_lo_3 ? u7_3 : t7; 9116 uint const u3 = is_lo_3 ? 
t3 : u7_3; 9117 uint const u8_4 = 9118 intel_sub_group_shuffle_xor(is_lo_3 ? t8 : t4, 1 << (3 - 1)); 9119 uint const u8 = is_lo_3 ? u8_4 : t8; 9120 uint const u4 = is_lo_3 ? t4 : u8_4; 9121 uint const v2_1 = 9122 intel_sub_group_shuffle_xor(is_lo_4 ? u2 : u1, 1 << (4 - 1)); 9123 uint const v2 = is_lo_4 ? v2_1 : u2; 9124 uint const v1 = is_lo_4 ? u1 : v2_1; 9125 uint const v4_3 = 9126 intel_sub_group_shuffle_xor(is_lo_4 ? u4 : u3, 1 << (4 - 1)); 9127 uint const v4 = is_lo_4 ? v4_3 : u4; 9128 uint const v3 = is_lo_4 ? u3 : v4_3; 9129 uint const v6_5 = 9130 intel_sub_group_shuffle_xor(is_lo_4 ? u6 : u5, 1 << (4 - 1)); 9131 uint const v6 = is_lo_4 ? v6_5 : u6; 9132 uint const v5 = is_lo_4 ? u5 : v6_5; 9133 uint const v8_7 = 9134 intel_sub_group_shuffle_xor(is_lo_4 ? u8 : u7, 1 << (4 - 1)); 9135 uint const v8 = is_lo_4 ? v8_7 : u8; 9136 uint const v7 = is_lo_4 ? u7 : v8_7; 9137 vout[gmem_idx + ((1 - 1) << 4)] = v1; 9138 vout[gmem_idx + ((5 - 1) << 4)] = v2; 9139 vout[gmem_idx + ((2 - 1) << 4)] = v3; 9140 vout[gmem_idx + ((6 - 1) << 4)] = v4; 9141 vout[gmem_idx + ((3 - 1) << 4)] = v5; 9142 vout[gmem_idx + ((7 - 1) << 4)] = v6; 9143 vout[gmem_idx + ((4 - 1) << 4)] = v7; 9144 vout[gmem_idx + ((8 - 1) << 4)] = v8; 9145} 9146