// // Copyright 2016 Google Inc. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // // target-specific config #include "hs_config.h" // arch/target-specific macros #include "hs_cl_macros.h" // // // /* Kernel (1, 0): loads eight keys r1..r8 from vin, runs HS_CMP_XCHG / HS_CMP_FLIP / HS_CMP_HALF stages on those registers, and stores r1..r8 back with HS_SLAB_GLOBAL_STORE. This variant uses no block-local memory or barrier macros. NOTE(review): across the kernels in this file the second prototype argument equals log2 of the first; the first presumably counts slabs per block - confirm in hs_cl_macros.h. */ HS_BS_KERNEL_PROTO(1, 0) { HS_SLAB_GLOBAL_PREAMBLE(); /* load one key per register from indices 0..7 of vin */ HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); /* 19 HS_CMP_XCHG steps over r1..r8; presumably an 8-input sorting network run independently in every lane - confirm against the HS_CMP_XCHG definition */ HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r3, r5); HS_CMP_XCHG(r4, r6); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_CMP_XCHG(r2, r5); HS_CMP_XCHG(r4, r7); HS_CMP_XCHG(r2, r3); HS_CMP_XCHG(r4, r5); HS_CMP_XCHG(r6, r7); /* Four rounds follow, with flip preamble arguments 1, 3, 7, 15. Each round is one flip stage pairing (r1,r8) (r2,r7) (r3,r6) (r4,r5), then half stages with descending power-of-two arguments, then the same 12-step HS_CMP_XCHG sequence. Presumably the flip/half macros exchange keys across lanes - TODO confirm. */ { HS_SLAB_FLIP_PREAMBLE(1); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(3); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, 
r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(7); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(15); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); /* store r1..r8 back at indices 0..7 */ HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); } /* Kernel (2, 1): prototype only on this line; the body, which adds one exchange round through block-local memory, follows on the next lines. */ HS_BS_KERNEL_PROTO(2, 1) { 
/* Body of HS_BS_KERNEL_PROTO(2, 1), whose prototype and opening brace are on the preceding line. It declares block-local memory with arguments (32, 8), loads eight keys r1..r8 from vin, runs the register-only HS_CMP_XCHG / flip / half stages, performs one exchange round through local memory, and stores r1..r8. */ HS_BLOCK_LOCAL_MEM_DECL(32, 8); HS_SLAB_GLOBAL_PREAMBLE(); HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); /* 19 HS_CMP_XCHG steps over r1..r8; presumably an 8-input sorting network per lane - confirm against the macro definition */ HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r3, r5); HS_CMP_XCHG(r4, r6); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_CMP_XCHG(r2, r5); HS_CMP_XCHG(r4, r7); HS_CMP_XCHG(r2, r3); HS_CMP_XCHG(r4, r5); HS_CMP_XCHG(r6, r7); /* four flip rounds (preamble arguments 1, 3, 7, 15), each followed by half stages with descending power-of-two arguments and a 12-step HS_CMP_XCHG sequence */ { HS_SLAB_FLIP_PREAMBLE(1); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(3); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(7); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); 
HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(15); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); /* Local-memory round: the registers are written to HS_BX_LOCAL_V slots 0..7 (scaled by 2 * HS_SLAB_THREADS) in the interleaved order r1, r8, r2, r7, r3, r6, r4, r5. After the barrier, four pairs read through HS_SLAB_LOCAL_L(x) / HS_SLAB_LOCAL_R(x + 16) are passed to HS_CMP_XCHG and written back in place. */ HS_BS_MERGE_H_PREAMBLE(2); HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4) = r3; HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16); 
HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_R(16) = r0_2; } { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(64); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(80); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(64) = r0_1; HS_SLAB_LOCAL_R(80) = r0_2; } { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(144); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(128) = r0_1; HS_SLAB_LOCAL_R(144) = r0_2; } { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(192); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(208); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(192) = r0_1; HS_SLAB_LOCAL_R(208) = r0_2; } } HS_BLOCK_BARRIER(); /* read the registers back from the same slots in the same interleaved order, then finish with half stages 8, 4, 2, 1 and the 12-step HS_CMP_XCHG sequence */ r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4); r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } 
/* store r1..r8 back at indices 0..7; this closes kernel (2, 1) */ HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); } /* Kernel (4, 2): declares block-local memory with arguments (64, 8) and starts with the register-only load and HS_CMP_XCHG / flip / half stages; the rest of its body continues on the following lines. */ HS_BS_KERNEL_PROTO(4, 2) { HS_BLOCK_LOCAL_MEM_DECL(64, 8); HS_SLAB_GLOBAL_PREAMBLE(); HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r3, r5); HS_CMP_XCHG(r4, r6); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_CMP_XCHG(r2, r5); HS_CMP_XCHG(r4, r7); HS_CMP_XCHG(r2, r3); HS_CMP_XCHG(r4, r5); HS_CMP_XCHG(r6, r7); { HS_SLAB_FLIP_PREAMBLE(1); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(3); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); 
/* Continuation of HS_BS_KERNEL_PROTO(4, 2), which opens on the preceding line: the remaining register-only flip / half rounds, then two exchange rounds through block-local memory, then the final store. */ HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(7); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(15); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); /* Local-memory round 1 (pairs): registers go to HS_BX_LOCAL_V slots 0..7 (scaled by 4 * HS_SLAB_THREADS) in the order r1, r8, r2, r7, r3, r6, r4, r5; after the barrier four L(x) / R(x + 16) pairs are passed to HS_CMP_XCHG and written back in place. */ HS_BS_MERGE_H_PREAMBLE(4); HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = 
r3; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_R(16) = r0_2; } { HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(48); HS_CMP_XCHG(r1_1, r1_2); HS_SLAB_LOCAL_L(32) = r1_1; HS_SLAB_LOCAL_R(48) = r1_2; } { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(272); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(256) = r0_1; HS_SLAB_LOCAL_R(272) = r0_2; } { HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(288); HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(304); HS_CMP_XCHG(r1_1, r1_2); HS_SLAB_LOCAL_L(288) = r1_1; HS_SLAB_LOCAL_R(304) = r1_2; } } HS_BLOCK_BARRIER(); /* read back in the same interleaved order, then half stages 8, 4, 2, 1 and the 12-step HS_CMP_XCHG sequence */ r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); 
HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } /* Local-memory round 2 (groups of four): same write-out, barrier, then two groups that each read two HS_SLAB_LOCAL_L and two HS_SLAB_LOCAL_R values, apply HS_CMP_XCHG on (2,3), (1,4), (3,4), (1,2) and write them back. */ HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(32); HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(48); HS_CMP_XCHG(r0_2, r0_3); HS_CMP_XCHG(r0_1, r0_4); HS_CMP_XCHG(r0_3, r0_4); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_L(16) = r0_2; HS_SLAB_LOCAL_R(32) = r0_3; HS_SLAB_LOCAL_R(48) = r0_4; } { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(272); HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(288); HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(304); HS_CMP_XCHG(r0_2, r0_3); HS_CMP_XCHG(r0_1, r0_4); HS_CMP_XCHG(r0_3, r0_4); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(256) = r0_1; HS_SLAB_LOCAL_L(272) = r0_2; HS_SLAB_LOCAL_R(288) = r0_3; HS_SLAB_LOCAL_R(304) = r0_4; } } HS_BLOCK_BARRIER(); r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); 
HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } /* store r1..r8 back at indices 0..7; this closes kernel (4, 2) */ HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); } /* Kernel (8, 3): declares block-local memory with arguments (128, 8) and starts with the register-only load and HS_CMP_XCHG / flip stages; the rest of its body continues on the following lines. */ HS_BS_KERNEL_PROTO(8, 3) { HS_BLOCK_LOCAL_MEM_DECL(128, 8); HS_SLAB_GLOBAL_PREAMBLE(); HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r3, r5); HS_CMP_XCHG(r4, r6); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_CMP_XCHG(r2, r5); HS_CMP_XCHG(r4, r7); HS_CMP_XCHG(r2, r3); HS_CMP_XCHG(r4, r5); HS_CMP_XCHG(r6, r7); { HS_SLAB_FLIP_PREAMBLE(1); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } 
/* Continuation of HS_BS_KERNEL_PROTO(8, 3), which opens on the preceding line: the remaining register-only flip / half rounds, then three exchange rounds through block-local memory (pairs, groups of four, one group of eight), then the final store. */ HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(3); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(7); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(15); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, 
r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); /* Local-memory round 1 (pairs): registers go to HS_BX_LOCAL_V slots 0..7 (scaled by 8 * HS_SLAB_THREADS) in the order r1, r8, r2, r7, r3, r6, r4, r5; after the barrier four L(x) / R(x + 16) pairs are passed to HS_CMP_XCHG and written back in place. */ HS_BS_MERGE_H_PREAMBLE(8); HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_R(16) = r0_2; } { HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(48); HS_CMP_XCHG(r1_1, r1_2); HS_SLAB_LOCAL_L(32) = r1_1; HS_SLAB_LOCAL_R(48) = r1_2; } { HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64); HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(80); HS_CMP_XCHG(r2_1, r2_2); HS_SLAB_LOCAL_L(64) = r2_1; HS_SLAB_LOCAL_R(80) = r2_2; } { HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96); HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(112); HS_CMP_XCHG(r3_1, r3_2); HS_SLAB_LOCAL_L(96) = r3_1; HS_SLAB_LOCAL_R(112) = r3_2; } } HS_BLOCK_BARRIER(); /* read back in the same interleaved order, then half stages 8, 4, 2, 1 and the 12-step HS_CMP_XCHG sequence */ r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); { { 
HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } /* Local-memory round 2 (groups of four): same write-out, barrier, then two groups that each read two HS_SLAB_LOCAL_L and two HS_SLAB_LOCAL_R values, apply HS_CMP_XCHG on (2,3), (1,4), (3,4), (1,2) and write them back. */ HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(32); HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(48); HS_CMP_XCHG(r0_2, r0_3); HS_CMP_XCHG(r0_1, r0_4); HS_CMP_XCHG(r0_3, r0_4); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_L(16) = r0_2; HS_SLAB_LOCAL_R(32) = r0_3; HS_SLAB_LOCAL_R(48) = r0_4; } { HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64); HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(80); HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(96); HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(112); HS_CMP_XCHG(r1_2, r1_3); HS_CMP_XCHG(r1_1, r1_4); HS_CMP_XCHG(r1_3, 
r1_4); HS_CMP_XCHG(r1_1, r1_2); HS_SLAB_LOCAL_L(64) = r1_1; HS_SLAB_LOCAL_L(80) = r1_2; HS_SLAB_LOCAL_R(96) = r1_3; HS_SLAB_LOCAL_R(112) = r1_4; } } HS_BLOCK_BARRIER(); r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } /* Local-memory round 3 (one group of eight): same write-out, barrier, then four HS_SLAB_LOCAL_L and four HS_SLAB_LOCAL_R values go through a 12-step HS_CMP_XCHG sequence and are written back. */ HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = 
HS_SLAB_LOCAL_L(16); HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(32); HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(48); HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(64); HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(80); HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(96); HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(112); HS_CMP_XCHG(r0_4, r0_5); HS_CMP_XCHG(r0_3, r0_6); HS_CMP_XCHG(r0_2, r0_7); HS_CMP_XCHG(r0_1, r0_8); HS_CMP_XCHG(r0_5, r0_7); HS_CMP_XCHG(r0_6, r0_8); HS_CMP_XCHG(r0_5, r0_6); HS_CMP_XCHG(r0_7, r0_8); HS_CMP_XCHG(r0_1, r0_3); HS_CMP_XCHG(r0_2, r0_4); HS_CMP_XCHG(r0_1, r0_2); HS_CMP_XCHG(r0_3, r0_4); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_L(16) = r0_2; HS_SLAB_LOCAL_L(32) = r0_3; HS_SLAB_LOCAL_L(48) = r0_4; HS_SLAB_LOCAL_R(64) = r0_5; HS_SLAB_LOCAL_R(80) = r0_6; HS_SLAB_LOCAL_R(96) = r0_7; HS_SLAB_LOCAL_R(112) = r0_8; } } HS_BLOCK_BARRIER(); r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); 
HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } /* store r1..r8 back at indices 0..7; this closes kernel (8, 3) */ HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); } /* Kernel (16, 4) opens here with block-local memory arguments (256, 8); only its first statements are on this line and the body continues on the following lines. */ HS_BS_KERNEL_PROTO(16, 4) { HS_BLOCK_LOCAL_MEM_DECL(256, 8); HS_SLAB_GLOBAL_PREAMBLE(); HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r3, r5); HS_CMP_XCHG(r4, r6); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_CMP_XCHG(r2, r5); HS_CMP_XCHG(r4, r7); HS_CMP_XCHG(r2, r3); HS_CMP_XCHG(r4, r5); HS_CMP_XCHG(r6, r7); { HS_SLAB_FLIP_PREAMBLE(1); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(3); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); 
HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(7); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); { HS_SLAB_FLIP_PREAMBLE(15); HS_CMP_FLIP(0, r1, r8); HS_CMP_FLIP(1, r2, r7); HS_CMP_FLIP(2, r3, r6); HS_CMP_FLIP(3, r4, r5); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_BS_MERGE_H_PREAMBLE(16); HS_BX_LOCAL_V(16 * 
HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); if (HS_SUBGROUP_ID() < 8) { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_R(16) = r0_2; } { HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(48); HS_CMP_XCHG(r1_1, r1_2); HS_SLAB_LOCAL_L(32) = r1_1; HS_SLAB_LOCAL_R(48) = r1_2; } { HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64); HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(80); HS_CMP_XCHG(r2_1, r2_2); HS_SLAB_LOCAL_L(64) = r2_1; HS_SLAB_LOCAL_R(80) = r2_2; } { HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96); HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(112); HS_CMP_XCHG(r3_1, r3_2); HS_SLAB_LOCAL_L(96) = r3_1; HS_SLAB_LOCAL_R(112) = r3_2; } { HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(128); HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(144); HS_CMP_XCHG(r4_1, r4_2); HS_SLAB_LOCAL_L(128) = r4_1; HS_SLAB_LOCAL_R(144) = r4_2; } { HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(160); HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(176); HS_CMP_XCHG(r5_1, r5_2); HS_SLAB_LOCAL_L(160) = r5_1; HS_SLAB_LOCAL_R(176) = r5_2; } { HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(192); HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(208); HS_CMP_XCHG(r6_1, r6_2); HS_SLAB_LOCAL_L(192) = r6_1; HS_SLAB_LOCAL_R(208) = r6_2; } { HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(224); HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(240); HS_CMP_XCHG(r7_1, r7_2); HS_SLAB_LOCAL_L(224) = r7_1; HS_SLAB_LOCAL_R(240) = r7_2; } } HS_BLOCK_BARRIER(); r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); r6 = 
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); if (HS_SUBGROUP_ID() < 8) { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(32); HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(48); HS_CMP_XCHG(r0_2, r0_3); HS_CMP_XCHG(r0_1, r0_4); HS_CMP_XCHG(r0_3, r0_4); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_L(16) = r0_2; HS_SLAB_LOCAL_R(32) = r0_3; HS_SLAB_LOCAL_R(48) = r0_4; } { HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64); HS_KEY_TYPE r1_2 = 
HS_SLAB_LOCAL_L(80); HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(96); HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(112); HS_CMP_XCHG(r1_2, r1_3); HS_CMP_XCHG(r1_1, r1_4); HS_CMP_XCHG(r1_3, r1_4); HS_CMP_XCHG(r1_1, r1_2); HS_SLAB_LOCAL_L(64) = r1_1; HS_SLAB_LOCAL_L(80) = r1_2; HS_SLAB_LOCAL_R(96) = r1_3; HS_SLAB_LOCAL_R(112) = r1_4; } { HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128); HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(144); HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(160); HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(176); HS_CMP_XCHG(r2_2, r2_3); HS_CMP_XCHG(r2_1, r2_4); HS_CMP_XCHG(r2_3, r2_4); HS_CMP_XCHG(r2_1, r2_2); HS_SLAB_LOCAL_L(128) = r2_1; HS_SLAB_LOCAL_L(144) = r2_2; HS_SLAB_LOCAL_R(160) = r2_3; HS_SLAB_LOCAL_R(176) = r2_4; } { HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192); HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(208); HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(224); HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(240); HS_CMP_XCHG(r3_2, r3_3); HS_CMP_XCHG(r3_1, r3_4); HS_CMP_XCHG(r3_3, r3_4); HS_CMP_XCHG(r3_1, r3_2); HS_SLAB_LOCAL_L(192) = r3_1; HS_SLAB_LOCAL_L(208) = r3_2; HS_SLAB_LOCAL_R(224) = r3_3; HS_SLAB_LOCAL_R(240) = r3_4; } } HS_BLOCK_BARRIER(); r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, 
r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); if (HS_SUBGROUP_ID() < 8) { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(32); HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(48); HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(64); HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(80); HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(96); HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(112); HS_CMP_XCHG(r0_4, r0_5); HS_CMP_XCHG(r0_3, r0_6); HS_CMP_XCHG(r0_2, r0_7); HS_CMP_XCHG(r0_1, r0_8); HS_CMP_XCHG(r0_5, r0_7); HS_CMP_XCHG(r0_6, r0_8); HS_CMP_XCHG(r0_5, r0_6); HS_CMP_XCHG(r0_7, r0_8); HS_CMP_XCHG(r0_1, r0_3); HS_CMP_XCHG(r0_2, r0_4); HS_CMP_XCHG(r0_1, r0_2); HS_CMP_XCHG(r0_3, r0_4); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_L(16) = r0_2; HS_SLAB_LOCAL_L(32) = r0_3; HS_SLAB_LOCAL_L(48) = r0_4; HS_SLAB_LOCAL_R(64) = r0_5; HS_SLAB_LOCAL_R(80) = r0_6; HS_SLAB_LOCAL_R(96) = r0_7; HS_SLAB_LOCAL_R(112) = r0_8; } { HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128); HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(144); HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(160); HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(176); HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(192); HS_KEY_TYPE r1_6 = 
HS_SLAB_LOCAL_R(208); HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(224); HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(240); HS_CMP_XCHG(r1_4, r1_5); HS_CMP_XCHG(r1_3, r1_6); HS_CMP_XCHG(r1_2, r1_7); HS_CMP_XCHG(r1_1, r1_8); HS_CMP_XCHG(r1_5, r1_7); HS_CMP_XCHG(r1_6, r1_8); HS_CMP_XCHG(r1_5, r1_6); HS_CMP_XCHG(r1_7, r1_8); HS_CMP_XCHG(r1_1, r1_3); HS_CMP_XCHG(r1_2, r1_4); HS_CMP_XCHG(r1_1, r1_2); HS_CMP_XCHG(r1_3, r1_4); HS_SLAB_LOCAL_L(128) = r1_1; HS_SLAB_LOCAL_L(144) = r1_2; HS_SLAB_LOCAL_L(160) = r1_3; HS_SLAB_LOCAL_L(176) = r1_4; HS_SLAB_LOCAL_R(192) = r1_5; HS_SLAB_LOCAL_R(208) = r1_6; HS_SLAB_LOCAL_R(224) = r1_7; HS_SLAB_LOCAL_R(240) = r1_8; } } HS_BLOCK_BARRIER(); r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); 
HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5; HS_BLOCK_BARRIER(); if (HS_SUBGROUP_ID() < 8) { { HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(32); HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(48); HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(64); HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(80); HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(96); HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(112); HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(128); HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(144); HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(160); HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(176); HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(192); HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(208); HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(224); HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(240); HS_CMP_XCHG(r0_8, r0_9); HS_CMP_XCHG(r0_7, r0_10); HS_CMP_XCHG(r0_6, r0_11); HS_CMP_XCHG(r0_5, r0_12); HS_CMP_XCHG(r0_4, r0_13); HS_CMP_XCHG(r0_3, r0_14); HS_CMP_XCHG(r0_2, r0_15); HS_CMP_XCHG(r0_1, r0_16); HS_CMP_XCHG(r0_9, r0_13); HS_CMP_XCHG(r0_11, r0_15); HS_CMP_XCHG(r0_9, r0_11); HS_CMP_XCHG(r0_13, r0_15); HS_CMP_XCHG(r0_10, r0_14); HS_CMP_XCHG(r0_12, r0_16); HS_CMP_XCHG(r0_10, r0_12); HS_CMP_XCHG(r0_14, r0_16); HS_CMP_XCHG(r0_9, r0_10); HS_CMP_XCHG(r0_11, r0_12); HS_CMP_XCHG(r0_13, r0_14); HS_CMP_XCHG(r0_15, r0_16); HS_CMP_XCHG(r0_1, r0_5); HS_CMP_XCHG(r0_3, r0_7); HS_CMP_XCHG(r0_1, r0_3); HS_CMP_XCHG(r0_5, r0_7); HS_CMP_XCHG(r0_2, r0_6); HS_CMP_XCHG(r0_4, r0_8); HS_CMP_XCHG(r0_2, r0_4); HS_CMP_XCHG(r0_6, r0_8); HS_CMP_XCHG(r0_1, r0_2); HS_CMP_XCHG(r0_3, r0_4); HS_CMP_XCHG(r0_5, r0_6); HS_CMP_XCHG(r0_7, r0_8); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_L(16) = r0_2; 
HS_SLAB_LOCAL_L(32) = r0_3; HS_SLAB_LOCAL_L(48) = r0_4; HS_SLAB_LOCAL_L(64) = r0_5; HS_SLAB_LOCAL_L(80) = r0_6; HS_SLAB_LOCAL_L(96) = r0_7; HS_SLAB_LOCAL_L(112) = r0_8; HS_SLAB_LOCAL_R(128) = r0_9; HS_SLAB_LOCAL_R(144) = r0_10; HS_SLAB_LOCAL_R(160) = r0_11; HS_SLAB_LOCAL_R(176) = r0_12; HS_SLAB_LOCAL_R(192) = r0_13; HS_SLAB_LOCAL_R(208) = r0_14; HS_SLAB_LOCAL_R(224) = r0_15; HS_SLAB_LOCAL_R(240) = r0_16; } } HS_BLOCK_BARRIER(); r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); 
HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); }
/*
 * HS_BC_KERNEL_PROTO(1, 0)
 *
 * Loads eight keys r1..r8 from `vout` via HS_SLAB_GLOBAL_LOAD(vout, 0..7),
 * applies HS_CMP_HALF passes under HS_SLAB_HALF_PREAMBLE(8), (4), (2), (1)
 * followed by a twelve-step HS_CMP_XCHG network on the registers, and
 * writes them back with HS_SLAB_GLOBAL_STORE(0..7). No local memory is
 * declared and no barrier is issued in this variant.
 * NOTE(review): the input is presumably already partially ordered by an
 * earlier kernel — confirm against the host-side dispatch.
 */
HS_BC_KERNEL_PROTO(1, 0) { HS_SLAB_GLOBAL_PREAMBLE(); HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0); HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1); HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2); HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3); HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4); HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5); HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6); HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); }
/*
 * HS_BC_KERNEL_PROTO(2, 1)
 *
 * Declares local memory with HS_BLOCK_LOCAL_MEM_DECL(32, 8). Four pairs
 * are read with HS_BC_GLOBAL_LOAD_L (indices i and i + 8 for
 * i = 0, 2, 4, 6), compare-exchanged with HS_CMP_XCHG, and written through
 * HS_SLAB_LOCAL_L. After HS_BLOCK_BARRIER() each work-item reads r1..r8
 * from HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * k), k = 0..7, runs the
 * HS_CMP_HALF (8, 4, 2, 1) passes plus the HS_CMP_XCHG network, and
 * stores the result with HS_SLAB_GLOBAL_STORE(0..7).
 * NOTE(review): how the HS_SLAB_LOCAL_L offsets (0/16, 64/80, 128/144,
 * 192/208) map onto the declared local-memory size depends on macro
 * definitions not visible here — confirm in hs_cl_macros.h.
 */
HS_BC_KERNEL_PROTO(2, 1) { HS_BLOCK_LOCAL_MEM_DECL(32, 8); HS_SLAB_GLOBAL_PREAMBLE(); HS_BC_MERGE_H_PREAMBLE(2); { { HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(0) = r0_1; 
HS_SLAB_LOCAL_L(16) = r0_2; } { HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(2); HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(10); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(64) = r0_1; HS_SLAB_LOCAL_L(80) = r0_2; } { HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4); HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(128) = r0_1; HS_SLAB_LOCAL_L(144) = r0_2; } { HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(6); HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(14); HS_CMP_XCHG(r0_1, r0_2); HS_SLAB_LOCAL_L(192) = r0_1; HS_SLAB_LOCAL_L(208) = r0_2; } } HS_BLOCK_BARRIER(); HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0); HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1); HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2); HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3); HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4); HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5); HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6); HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); 
HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); }
/*
 * HS_BC_KERNEL_PROTO(4, 2)
 *
 * Declares local memory with HS_BLOCK_LOCAL_MEM_DECL(64, 8). Two groups of
 * four keys are read with HS_BC_GLOBAL_LOAD_L (indices 0, 8, 16, 24 and
 * 4, 12, 20, 28), passed through a four-step HS_CMP_XCHG network
 * ((1,3), (2,4), (1,2), (3,4)) and written through HS_SLAB_LOCAL_L at
 * offsets 0..48 and 256..304. After HS_BLOCK_BARRIER() each work-item
 * reads r1..r8 from HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * k), k = 0..7,
 * runs the HS_CMP_HALF (8, 4, 2, 1) passes plus the HS_CMP_XCHG network,
 * and stores the result with HS_SLAB_GLOBAL_STORE(0..7).
 * NOTE(review): macro semantics are defined outside this chunk — confirm
 * the addressing in hs_cl_macros.h before changing any offset.
 */
HS_BC_KERNEL_PROTO(4, 2) { HS_BLOCK_LOCAL_MEM_DECL(64, 8); HS_SLAB_GLOBAL_PREAMBLE(); HS_BC_MERGE_H_PREAMBLE(4); { { HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8); HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16); HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24); HS_CMP_XCHG(r0_1, r0_3); HS_CMP_XCHG(r0_2, r0_4); HS_CMP_XCHG(r0_1, r0_2); HS_CMP_XCHG(r0_3, r0_4); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_L(16) = r0_2; HS_SLAB_LOCAL_L(32) = r0_3; HS_SLAB_LOCAL_L(48) = r0_4; } { HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4); HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12); HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(20); HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(28); HS_CMP_XCHG(r0_1, r0_3); HS_CMP_XCHG(r0_2, r0_4); HS_CMP_XCHG(r0_1, r0_2); HS_CMP_XCHG(r0_3, r0_4); HS_SLAB_LOCAL_L(256) = r0_1; HS_SLAB_LOCAL_L(272) = r0_2; HS_SLAB_LOCAL_L(288) = r0_3; HS_SLAB_LOCAL_L(304) = r0_4; } } HS_BLOCK_BARRIER(); HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); 
HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); }
/*
 * HS_BC_KERNEL_PROTO(8, 3)
 *
 * Declares local memory with HS_BLOCK_LOCAL_MEM_DECL(128, 8). One group of
 * eight keys is read with HS_BC_GLOBAL_LOAD_L (indices 0, 8, ..., 56),
 * passed through a twelve-step HS_CMP_XCHG network and written through
 * HS_SLAB_LOCAL_L at offsets 0, 16, ..., 112. After HS_BLOCK_BARRIER()
 * each work-item reads r1..r8 from HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * k),
 * k = 0..7, runs the HS_CMP_HALF (8, 4, 2, 1) passes plus the HS_CMP_XCHG
 * network, and stores the result with HS_SLAB_GLOBAL_STORE(0..7).
 * NOTE(review): macro semantics are defined outside this chunk — confirm
 * in hs_cl_macros.h.
 */
HS_BC_KERNEL_PROTO(8, 3) { HS_BLOCK_LOCAL_MEM_DECL(128, 8); HS_SLAB_GLOBAL_PREAMBLE(); HS_BC_MERGE_H_PREAMBLE(8); { { HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8); HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16); HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24); HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32); HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40); HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48); HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56); HS_CMP_XCHG(r0_1, r0_5); HS_CMP_XCHG(r0_3, r0_7); HS_CMP_XCHG(r0_1, r0_3); HS_CMP_XCHG(r0_5, r0_7); HS_CMP_XCHG(r0_2, r0_6); HS_CMP_XCHG(r0_4, r0_8); HS_CMP_XCHG(r0_2, r0_4); HS_CMP_XCHG(r0_6, r0_8); HS_CMP_XCHG(r0_1, r0_2); HS_CMP_XCHG(r0_3, r0_4); HS_CMP_XCHG(r0_5, r0_6); HS_CMP_XCHG(r0_7, r0_8); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_L(16) = r0_2; HS_SLAB_LOCAL_L(32) = r0_3; HS_SLAB_LOCAL_L(48) = r0_4; HS_SLAB_LOCAL_L(64) = r0_5; HS_SLAB_LOCAL_L(80) = r0_6; HS_SLAB_LOCAL_L(96) = r0_7; HS_SLAB_LOCAL_L(112) = r0_8; } } HS_BLOCK_BARRIER(); HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); HS_KEY_TYPE r2 = 
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); }
/*
 * HS_BC_KERNEL_PROTO(16, 4)
 *
 * Declares local memory with HS_BLOCK_LOCAL_MEM_DECL(256, 8). Only
 * subgroups with HS_SUBGROUP_ID() < 8 execute the first phase: sixteen
 * keys are read with HS_BC_GLOBAL_LOAD_L (indices 0, 8, ..., 120), passed
 * through a 32-step HS_CMP_XCHG network and written through
 * HS_SLAB_LOCAL_L at offsets 0, 16, ..., 240. HS_BLOCK_BARRIER() sits
 * outside that `if`, so it is reached by every subgroup. Each work-item
 * then reads r1..r8 from HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * k),
 * k = 0..7, runs the HS_CMP_HALF (8, 4, 2, 1) passes plus the HS_CMP_XCHG
 * network, and stores the result with HS_SLAB_GLOBAL_STORE(0..7).
 * NOTE(review): macro semantics are defined outside this chunk — confirm
 * in hs_cl_macros.h.
 */
HS_BC_KERNEL_PROTO(16, 4) { HS_BLOCK_LOCAL_MEM_DECL(256, 8); HS_SLAB_GLOBAL_PREAMBLE(); HS_BC_MERGE_H_PREAMBLE(16); if (HS_SUBGROUP_ID() < 8) { { HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8); HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16); HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24); HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32); 
HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40); HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48); HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56); HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(64); HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(72); HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(80); HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(88); HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(96); HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(104); HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(112); HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(120); HS_CMP_XCHG(r0_1, r0_9); HS_CMP_XCHG(r0_5, r0_13); HS_CMP_XCHG(r0_1, r0_5); HS_CMP_XCHG(r0_9, r0_13); HS_CMP_XCHG(r0_3, r0_11); HS_CMP_XCHG(r0_7, r0_15); HS_CMP_XCHG(r0_3, r0_7); HS_CMP_XCHG(r0_11, r0_15); HS_CMP_XCHG(r0_1, r0_3); HS_CMP_XCHG(r0_5, r0_7); HS_CMP_XCHG(r0_9, r0_11); HS_CMP_XCHG(r0_13, r0_15); HS_CMP_XCHG(r0_2, r0_10); HS_CMP_XCHG(r0_6, r0_14); HS_CMP_XCHG(r0_2, r0_6); HS_CMP_XCHG(r0_10, r0_14); HS_CMP_XCHG(r0_4, r0_12); HS_CMP_XCHG(r0_8, r0_16); HS_CMP_XCHG(r0_4, r0_8); HS_CMP_XCHG(r0_12, r0_16); HS_CMP_XCHG(r0_2, r0_4); HS_CMP_XCHG(r0_6, r0_8); HS_CMP_XCHG(r0_10, r0_12); HS_CMP_XCHG(r0_14, r0_16); HS_CMP_XCHG(r0_1, r0_2); HS_CMP_XCHG(r0_3, r0_4); HS_CMP_XCHG(r0_5, r0_6); HS_CMP_XCHG(r0_7, r0_8); HS_CMP_XCHG(r0_9, r0_10); HS_CMP_XCHG(r0_11, r0_12); HS_CMP_XCHG(r0_13, r0_14); HS_CMP_XCHG(r0_15, r0_16); HS_SLAB_LOCAL_L(0) = r0_1; HS_SLAB_LOCAL_L(16) = r0_2; HS_SLAB_LOCAL_L(32) = r0_3; HS_SLAB_LOCAL_L(48) = r0_4; HS_SLAB_LOCAL_L(64) = r0_5; HS_SLAB_LOCAL_L(80) = r0_6; HS_SLAB_LOCAL_L(96) = r0_7; HS_SLAB_LOCAL_L(112) = r0_8; HS_SLAB_LOCAL_L(128) = r0_9; HS_SLAB_LOCAL_L(144) = r0_10; HS_SLAB_LOCAL_L(160) = r0_11; HS_SLAB_LOCAL_L(176) = r0_12; HS_SLAB_LOCAL_L(192) = r0_13; HS_SLAB_LOCAL_L(208) = r0_14; HS_SLAB_LOCAL_L(224) = r0_15; HS_SLAB_LOCAL_L(240) = r0_16; } } HS_BLOCK_BARRIER(); HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); HS_KEY_TYPE r4 = 
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); { { HS_SLAB_HALF_PREAMBLE(8); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(4); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(2); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } { HS_SLAB_HALF_PREAMBLE(1); HS_CMP_HALF(0, r1); HS_CMP_HALF(1, r2); HS_CMP_HALF(2, r3); HS_CMP_HALF(3, r4); HS_CMP_HALF(4, r5); HS_CMP_HALF(5, r6); HS_CMP_HALF(6, r7); HS_CMP_HALF(7, r8); } HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); } HS_SLAB_GLOBAL_STORE(0, r1); HS_SLAB_GLOBAL_STORE(1, r2); HS_SLAB_GLOBAL_STORE(2, r3); HS_SLAB_GLOBAL_STORE(3, r4); HS_SLAB_GLOBAL_STORE(4, r5); HS_SLAB_GLOBAL_STORE(5, r6); HS_SLAB_GLOBAL_STORE(6, r7); HS_SLAB_GLOBAL_STORE(7, r8); }
/*
 * HS_FM_KERNEL_PROTO(0, 0)
 *
 * After HS_FM_PREAMBLE(8), loads eight "left" keys r1..r8 with
 * HS_XM_GLOBAL_LOAD_L(0..7) and one "right" key r9 with
 * HS_FM_GLOBAL_LOAD_R(0). The single cross step HS_CMP_XCHG(r8, r9) is
 * followed by a twelve-step HS_CMP_XCHG network on r1..r8; all nine
 * values are then written back with HS_XM_GLOBAL_STORE_L(0..7) and
 * HS_FM_GLOBAL_STORE_R(0).
 * NOTE(review): the relationship between the left and right rows is set
 * up by HS_FM_PREAMBLE, which is not visible here — confirm in
 * hs_cl_macros.h.
 */
HS_FM_KERNEL_PROTO(0, 0) { HS_FM_PREAMBLE(8); HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); HS_CMP_XCHG(r8, r9); HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, 
r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_XM_GLOBAL_STORE_L(0, r1); HS_XM_GLOBAL_STORE_L(1, r2); HS_XM_GLOBAL_STORE_L(2, r3); HS_XM_GLOBAL_STORE_L(3, r4); HS_XM_GLOBAL_STORE_L(4, r5); HS_XM_GLOBAL_STORE_L(5, r6); HS_XM_GLOBAL_STORE_L(6, r7); HS_XM_GLOBAL_STORE_L(7, r8); HS_FM_GLOBAL_STORE_R(0, r9); }
/*
 * HS_FM_KERNEL_PROTO(0, 1)
 *
 * After HS_FM_PREAMBLE(8), loads eight left keys r1..r8 with
 * HS_XM_GLOBAL_LOAD_L(0..7) and two right keys r9, r10 with
 * HS_FM_GLOBAL_LOAD_R(0..1). Cross steps pair the ends inward
 * ((r8, r9), (r7, r10)); a twelve-step HS_CMP_XCHG network then runs on
 * r1..r8 and a single HS_CMP_XCHG(r9, r10) on the right side. Results
 * are written back with HS_XM_GLOBAL_STORE_L(0..7) and
 * HS_FM_GLOBAL_STORE_R(0..1).
 */
HS_FM_KERNEL_PROTO(0, 1) { HS_FM_PREAMBLE(8); HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1); HS_CMP_XCHG(r8, r9); HS_CMP_XCHG(r7, r10); HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_CMP_XCHG(r9, r10); HS_XM_GLOBAL_STORE_L(0, r1); HS_XM_GLOBAL_STORE_L(1, r2); HS_XM_GLOBAL_STORE_L(2, r3); HS_XM_GLOBAL_STORE_L(3, r4); HS_XM_GLOBAL_STORE_L(4, r5); HS_XM_GLOBAL_STORE_L(5, r6); HS_XM_GLOBAL_STORE_L(6, r7); HS_XM_GLOBAL_STORE_L(7, r8); HS_FM_GLOBAL_STORE_R(0, r9); HS_FM_GLOBAL_STORE_R(1, r10); }
/*
 * HS_FM_KERNEL_PROTO(0, 2)
 *
 * Same shape as the (0, 1) variant with four right keys r9..r12 loaded
 * via HS_FM_GLOBAL_LOAD_R(0..3): four cross steps ((r8, r9), (r7, r10),
 * (r6, r11), (r5, r12)), the twelve-step network on r1..r8, a four-step
 * network on r9..r12, then stores through HS_XM_GLOBAL_STORE_L(0..7) and
 * HS_FM_GLOBAL_STORE_R(0..3).
 */
HS_FM_KERNEL_PROTO(0, 2) { HS_FM_PREAMBLE(8); HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); HS_KEY_TYPE r10 = 
HS_FM_GLOBAL_LOAD_R(1); HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2); HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3); HS_CMP_XCHG(r8, r9); HS_CMP_XCHG(r7, r10); HS_CMP_XCHG(r6, r11); HS_CMP_XCHG(r5, r12); HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_CMP_XCHG(r9, r11); HS_CMP_XCHG(r10, r12); HS_CMP_XCHG(r9, r10); HS_CMP_XCHG(r11, r12); HS_XM_GLOBAL_STORE_L(0, r1); HS_XM_GLOBAL_STORE_L(1, r2); HS_XM_GLOBAL_STORE_L(2, r3); HS_XM_GLOBAL_STORE_L(3, r4); HS_XM_GLOBAL_STORE_L(4, r5); HS_XM_GLOBAL_STORE_L(5, r6); HS_XM_GLOBAL_STORE_L(6, r7); HS_XM_GLOBAL_STORE_L(7, r8); HS_FM_GLOBAL_STORE_R(0, r9); HS_FM_GLOBAL_STORE_R(1, r10); HS_FM_GLOBAL_STORE_R(2, r11); HS_FM_GLOBAL_STORE_R(3, r12); }
/*
 * HS_FM_KERNEL_PROTO(0, 3)
 *
 * After HS_FM_PREAMBLE(8), loads eight left keys r1..r8 with
 * HS_XM_GLOBAL_LOAD_L(0..7) and eight right keys r9..r16 with
 * HS_FM_GLOBAL_LOAD_R(0..7). Eight cross steps pair the two halves from
 * the middle outward ((r8, r9) ... (r1, r16)); the same twelve-step
 * HS_CMP_XCHG network is then applied to r1..r8 and, with indices shifted
 * by eight, to r9..r16. Results are written back with
 * HS_XM_GLOBAL_STORE_L(0..7) and HS_FM_GLOBAL_STORE_R(0..7).
 */
HS_FM_KERNEL_PROTO(0, 3) { HS_FM_PREAMBLE(8); HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1); HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2); HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3); HS_KEY_TYPE r13 = HS_FM_GLOBAL_LOAD_R(4); HS_KEY_TYPE r14 = HS_FM_GLOBAL_LOAD_R(5); HS_KEY_TYPE r15 = HS_FM_GLOBAL_LOAD_R(6); HS_KEY_TYPE r16 = HS_FM_GLOBAL_LOAD_R(7); HS_CMP_XCHG(r8, r9); HS_CMP_XCHG(r7, r10); HS_CMP_XCHG(r6, r11); HS_CMP_XCHG(r5, r12); HS_CMP_XCHG(r4, r13); HS_CMP_XCHG(r3, r14); HS_CMP_XCHG(r2, r15); HS_CMP_XCHG(r1, r16); HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, 
r6); HS_CMP_XCHG(r7, r8); HS_CMP_XCHG(r9, r13); HS_CMP_XCHG(r11, r15); HS_CMP_XCHG(r9, r11); HS_CMP_XCHG(r13, r15); HS_CMP_XCHG(r10, r14); HS_CMP_XCHG(r12, r16); HS_CMP_XCHG(r10, r12); HS_CMP_XCHG(r14, r16); HS_CMP_XCHG(r9, r10); HS_CMP_XCHG(r11, r12); HS_CMP_XCHG(r13, r14); HS_CMP_XCHG(r15, r16); HS_XM_GLOBAL_STORE_L(0, r1); HS_XM_GLOBAL_STORE_L(1, r2); HS_XM_GLOBAL_STORE_L(2, r3); HS_XM_GLOBAL_STORE_L(3, r4); HS_XM_GLOBAL_STORE_L(4, r5); HS_XM_GLOBAL_STORE_L(5, r6); HS_XM_GLOBAL_STORE_L(6, r7); HS_XM_GLOBAL_STORE_L(7, r8); HS_FM_GLOBAL_STORE_R(0, r9); HS_FM_GLOBAL_STORE_R(1, r10); HS_FM_GLOBAL_STORE_R(2, r11); HS_FM_GLOBAL_STORE_R(3, r12); HS_FM_GLOBAL_STORE_R(4, r13); HS_FM_GLOBAL_STORE_R(5, r14); HS_FM_GLOBAL_STORE_R(6, r15); HS_FM_GLOBAL_STORE_R(7, r16); }
/*
 * HS_HM_KERNEL_PROTO(0)
 *
 * After HS_HM_PREAMBLE(8), loads sixteen keys r1..r16 with
 * HS_XM_GLOBAL_LOAD_L(0..15), runs a 32-step HS_CMP_XCHG network over
 * them and writes them back with HS_XM_GLOBAL_STORE_L(0..15). Unlike the
 * HS_FM_* kernels there is no separate right-hand load or store and no
 * initial cross step.
 * NOTE(review): the addressing behind HS_XM_GLOBAL_LOAD_L is established
 * by HS_HM_PREAMBLE, which is not visible here — confirm in
 * hs_cl_macros.h.
 */
HS_HM_KERNEL_PROTO(0) { HS_HM_PREAMBLE(8); HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); HS_CMP_XCHG(r1, r9); HS_CMP_XCHG(r5, r13); HS_CMP_XCHG(r1, r5); HS_CMP_XCHG(r9, r13); HS_CMP_XCHG(r3, r11); HS_CMP_XCHG(r7, r15); HS_CMP_XCHG(r3, r7); HS_CMP_XCHG(r11, r15); HS_CMP_XCHG(r1, r3); HS_CMP_XCHG(r5, r7); HS_CMP_XCHG(r9, r11); HS_CMP_XCHG(r13, r15); HS_CMP_XCHG(r2, r10); HS_CMP_XCHG(r6, r14); HS_CMP_XCHG(r2, r6); HS_CMP_XCHG(r10, r14); HS_CMP_XCHG(r4, r12); HS_CMP_XCHG(r8, r16); HS_CMP_XCHG(r4, r8); HS_CMP_XCHG(r12, r16); HS_CMP_XCHG(r2, r4); HS_CMP_XCHG(r6, r8); HS_CMP_XCHG(r10, r12); 
HS_CMP_XCHG(r14, r16); HS_CMP_XCHG(r1, r2); HS_CMP_XCHG(r3, r4); HS_CMP_XCHG(r5, r6); HS_CMP_XCHG(r7, r8); HS_CMP_XCHG(r9, r10); HS_CMP_XCHG(r11, r12); HS_CMP_XCHG(r13, r14); HS_CMP_XCHG(r15, r16); HS_XM_GLOBAL_STORE_L(0, r1); HS_XM_GLOBAL_STORE_L(1, r2); HS_XM_GLOBAL_STORE_L(2, r3); HS_XM_GLOBAL_STORE_L(3, r4); HS_XM_GLOBAL_STORE_L(4, r5); HS_XM_GLOBAL_STORE_L(5, r6); HS_XM_GLOBAL_STORE_L(6, r7); HS_XM_GLOBAL_STORE_L(7, r8); HS_XM_GLOBAL_STORE_L(8, r9); HS_XM_GLOBAL_STORE_L(9, r10); HS_XM_GLOBAL_STORE_L(10, r11); HS_XM_GLOBAL_STORE_L(11, r12); HS_XM_GLOBAL_STORE_L(12, r13); HS_XM_GLOBAL_STORE_L(13, r14); HS_XM_GLOBAL_STORE_L(14, r15); HS_XM_GLOBAL_STORE_L(15, r16); }
/*
 * HS_TRANSPOSE_KERNEL_PROTO()
 *
 * Loads eight keys r1..r8 from `vout` via HS_SLAB_GLOBAL_LOAD(vout, 0..7)
 * and then invokes HS_TRANSPOSE_SLAB(). The kernel body contains no
 * explicit store; the macro is invoked without a trailing semicolon, so
 * it presumably expands to complete statements that consume r1..r8 and
 * write the result — confirm in hs_cl_macros.h.
 */
HS_TRANSPOSE_KERNEL_PROTO() { HS_SLAB_GLOBAL_PREAMBLE(); HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0); HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1); HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2); HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3); HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4); HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5); HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6); HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7); HS_TRANSPOSE_SLAB() } // // //