//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//

// target-specific config
#include "hs_config.h"

// arch/target-specific macros
#include "hs_cl_macros.h"

//
// File-local shorthands for the sixteen-register sequences that the first
// two kernels below repeat verbatim.  Each shorthand expands to the same
// statements, in the same order, as the spelled-out form.  All of them are
// #undef'd again before the third kernel.
//
// NOTE(review): the behaviour of every HS_* macro invoked here is defined in
// hs_cl_macros.h / hs_config.h, which are not visible from this file.  The
// words "sort", "merge", "flip" and "half" below are read off the macro names
// only -- confirm against those headers.
//

// Declares r1..r16 as HS_KEY_TYPE and initialises them from
// HS_SLAB_GLOBAL_LOAD(vin, 0) .. HS_SLAB_GLOBAL_LOAD(vin, 15).
// Intentionally unbraced: the declarations must land in the caller's scope.
#define HS_R16_GLOBAL_LOAD()                                                   \
  HS_KEY_TYPE r1  = HS_SLAB_GLOBAL_LOAD(vin, 0);                               \
  HS_KEY_TYPE r2  = HS_SLAB_GLOBAL_LOAD(vin, 1);                               \
  HS_KEY_TYPE r3  = HS_SLAB_GLOBAL_LOAD(vin, 2);                               \
  HS_KEY_TYPE r4  = HS_SLAB_GLOBAL_LOAD(vin, 3);                               \
  HS_KEY_TYPE r5  = HS_SLAB_GLOBAL_LOAD(vin, 4);                               \
  HS_KEY_TYPE r6  = HS_SLAB_GLOBAL_LOAD(vin, 5);                               \
  HS_KEY_TYPE r7  = HS_SLAB_GLOBAL_LOAD(vin, 6);                               \
  HS_KEY_TYPE r8  = HS_SLAB_GLOBAL_LOAD(vin, 7);                               \
  HS_KEY_TYPE r9  = HS_SLAB_GLOBAL_LOAD(vin, 8);                               \
  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);                               \
  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);                              \
  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);                              \
  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);                              \
  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);                              \
  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);                              \
  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15)

// Hands r1..r16 to HS_SLAB_GLOBAL_STORE with indices 0..15.
#define HS_R16_GLOBAL_STORE()                                                  \
  HS_SLAB_GLOBAL_STORE(0, r1);                                                 \
  HS_SLAB_GLOBAL_STORE(1, r2);                                                 \
  HS_SLAB_GLOBAL_STORE(2, r3);                                                 \
  HS_SLAB_GLOBAL_STORE(3, r4);                                                 \
  HS_SLAB_GLOBAL_STORE(4, r5);                                                 \
  HS_SLAB_GLOBAL_STORE(5, r6);                                                 \
  HS_SLAB_GLOBAL_STORE(6, r7);                                                 \
  HS_SLAB_GLOBAL_STORE(7, r8);                                                 \
  HS_SLAB_GLOBAL_STORE(8, r9);                                                 \
  HS_SLAB_GLOBAL_STORE(9, r10);                                                \
  HS_SLAB_GLOBAL_STORE(10, r11);                                               \
  HS_SLAB_GLOBAL_STORE(11, r12);                                               \
  HS_SLAB_GLOBAL_STORE(12, r13);                                               \
  HS_SLAB_GLOBAL_STORE(13, r14);                                               \
  HS_SLAB_GLOBAL_STORE(14, r15);                                               \
  HS_SLAB_GLOBAL_STORE(15, r16)

// The fixed sequence of 60 HS_CMP_XCHG invocations over r1..r16 that follows
// the global load in each kernel (presumably a 16-input sorting network --
// confirm).  The order of the invocations is significant; do not reorder.
#define HS_R16_SORT_NETWORK()                                                  \
  HS_CMP_XCHG(r1, r2);   HS_CMP_XCHG(r3, r4);                                  \
  HS_CMP_XCHG(r5, r6);   HS_CMP_XCHG(r7, r8);                                  \
  HS_CMP_XCHG(r9, r10);  HS_CMP_XCHG(r11, r12);                                \
  HS_CMP_XCHG(r13, r14); HS_CMP_XCHG(r15, r16);                                \
  HS_CMP_XCHG(r1, r3);   HS_CMP_XCHG(r5, r7);                                  \
  HS_CMP_XCHG(r9, r11);  HS_CMP_XCHG(r13, r15);                                \
  HS_CMP_XCHG(r2, r4);   HS_CMP_XCHG(r6, r8);                                  \
  HS_CMP_XCHG(r10, r12); HS_CMP_XCHG(r14, r16);                                \
  HS_CMP_XCHG(r1, r5);   HS_CMP_XCHG(r9, r13);                                 \
  HS_CMP_XCHG(r2, r6);   HS_CMP_XCHG(r10, r14);                                \
  HS_CMP_XCHG(r3, r7);   HS_CMP_XCHG(r11, r15);                                \
  HS_CMP_XCHG(r4, r8);   HS_CMP_XCHG(r12, r16);                                \
  HS_CMP_XCHG(r1, r9);   HS_CMP_XCHG(r2, r10);                                 \
  HS_CMP_XCHG(r3, r11);  HS_CMP_XCHG(r4, r12);                                 \
  HS_CMP_XCHG(r5, r13);  HS_CMP_XCHG(r6, r14);                                 \
  HS_CMP_XCHG(r7, r15);  HS_CMP_XCHG(r8, r16);                                 \
  HS_CMP_XCHG(r6, r11);  HS_CMP_XCHG(r7, r10);                                 \
  HS_CMP_XCHG(r4, r13);  HS_CMP_XCHG(r14, r15);                                \
  HS_CMP_XCHG(r8, r12);  HS_CMP_XCHG(r2, r3);                                  \
  HS_CMP_XCHG(r5, r9);                                                         \
  HS_CMP_XCHG(r2, r5);   HS_CMP_XCHG(r8, r14);                                 \
  HS_CMP_XCHG(r3, r9);   HS_CMP_XCHG(r12, r15);                                \
  HS_CMP_XCHG(r3, r5);   HS_CMP_XCHG(r6, r7);                                  \
  HS_CMP_XCHG(r10, r11); HS_CMP_XCHG(r12, r14);                                \
  HS_CMP_XCHG(r4, r9);   HS_CMP_XCHG(r8, r13);                                 \
  HS_CMP_XCHG(r7, r9);   HS_CMP_XCHG(r11, r13);                                \
  HS_CMP_XCHG(r4, r6);   HS_CMP_XCHG(r8, r10);                                 \
  HS_CMP_XCHG(r4, r5);   HS_CMP_XCHG(r6, r7);                                  \
  HS_CMP_XCHG(r8, r9);   HS_CMP_XCHG(r10, r11);                                \
  HS_CMP_XCHG(r12, r13);                                                       \
  HS_CMP_XCHG(r7, r8);   HS_CMP_XCHG(r9, r10)

// The fixed sequence of 32 HS_CMP_XCHG invocations that closes every stage:
// pairs among the odd-numbered registers at distances 8, 4 and 2, the same
// among the even-numbered registers, then the eight adjacent (odd, even)
// pairs.  The order of the invocations is significant; do not reorder.
#define HS_R16_MERGE_NETWORK()                                                 \
  HS_CMP_XCHG(r1, r9);   HS_CMP_XCHG(r5, r13);                                 \
  HS_CMP_XCHG(r1, r5);   HS_CMP_XCHG(r9, r13);                                 \
  HS_CMP_XCHG(r3, r11);  HS_CMP_XCHG(r7, r15);                                 \
  HS_CMP_XCHG(r3, r7);   HS_CMP_XCHG(r11, r15);                                \
  HS_CMP_XCHG(r1, r3);   HS_CMP_XCHG(r5, r7);                                  \
  HS_CMP_XCHG(r9, r11);  HS_CMP_XCHG(r13, r15);                                \
  HS_CMP_XCHG(r2, r10);  HS_CMP_XCHG(r6, r14);                                 \
  HS_CMP_XCHG(r2, r6);   HS_CMP_XCHG(r10, r14);                                \
  HS_CMP_XCHG(r4, r12);  HS_CMP_XCHG(r8, r16);                                 \
  HS_CMP_XCHG(r4, r8);   HS_CMP_XCHG(r12, r16);                                \
  HS_CMP_XCHG(r2, r4);   HS_CMP_XCHG(r6, r8);                                  \
  HS_CMP_XCHG(r10, r12); HS_CMP_XCHG(r14, r16);                                \
  HS_CMP_XCHG(r1, r2);   HS_CMP_XCHG(r3, r4);                                  \
  HS_CMP_XCHG(r5, r6);   HS_CMP_XCHG(r7, r8);                                  \
  HS_CMP_XCHG(r9, r10);  HS_CMP_XCHG(r11, r12);                                \
  HS_CMP_XCHG(r13, r14); HS_CMP_XCHG(r15, r16)

// One braced "flip" step: HS_SLAB_FLIP_PREAMBLE(mask) followed by
// HS_CMP_FLIP(i, r<i+1>, r<16-i>) for i = 0..7.  Expands to a complete
// { ... } block, so call sites take no trailing semicolon.
#define HS_R16_FLIP_STEP(mask)                                                 \
  {                                                                            \
    HS_SLAB_FLIP_PREAMBLE(mask);                                               \
    HS_CMP_FLIP(0, r1, r16);                                                   \
    HS_CMP_FLIP(1, r2, r15);                                                   \
    HS_CMP_FLIP(2, r3, r14);                                                   \
    HS_CMP_FLIP(3, r4, r13);                                                   \
    HS_CMP_FLIP(4, r5, r12);                                                   \
    HS_CMP_FLIP(5, r6, r11);                                                   \
    HS_CMP_FLIP(6, r7, r10);                                                   \
    HS_CMP_FLIP(7, r8, r9);                                                    \
  }

// One braced "half" step: HS_SLAB_HALF_PREAMBLE(mask) followed by
// HS_CMP_HALF(i, r<i+1>) for i = 0..15.  Expands to a complete { ... }
// block, so call sites take no trailing semicolon.
#define HS_R16_HALF_STEP(mask)                                                 \
  {                                                                            \
    HS_SLAB_HALF_PREAMBLE(mask);                                               \
    HS_CMP_HALF(0, r1);                                                        \
    HS_CMP_HALF(1, r2);                                                        \
    HS_CMP_HALF(2, r3);                                                        \
    HS_CMP_HALF(3, r4);                                                        \
    HS_CMP_HALF(4, r5);                                                        \
    HS_CMP_HALF(5, r6);                                                        \
    HS_CMP_HALF(6, r7);                                                        \
    HS_CMP_HALF(7, r8);                                                        \
    HS_CMP_HALF(8, r9);                                                        \
    HS_CMP_HALF(9, r10);                                                       \
    HS_CMP_HALF(10, r11);                                                      \
    HS_CMP_HALF(11, r12);                                                      \
    HS_CMP_HALF(12, r13);                                                      \
    HS_CMP_HALF(13, r14);                                                      \
    HS_CMP_HALF(14, r15);                                                      \
    HS_CMP_HALF(15, r16);                                                      \
  }

// Writes the registers to HS_BX_LOCAL_V(slabs * HS_SLAB_THREADS * k) for
// k = 0..15, in the interleaved register order r1, r16, r2, r15, ..., r8, r9.
#define HS_R16_LOCAL_V_STORE(slabs)                                            \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 0)  = r1;                          \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 1)  = r16;                         \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 2)  = r2;                          \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 3)  = r15;                         \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 4)  = r3;                          \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 5)  = r14;                         \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 6)  = r4;                          \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 7)  = r13;                         \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 8)  = r5;                          \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 9)  = r12;                         \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 10) = r6;                          \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 11) = r11;                         \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 12) = r7;                          \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 13) = r10;                         \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 14) = r8;                          \
  HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 15) = r9

// Reads the registers back from the same sixteen HS_BX_LOCAL_V locations,
// using the same interleaved register order as HS_R16_LOCAL_V_STORE.
#define HS_R16_LOCAL_V_LOAD(slabs)                                             \
  r1  = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 0);                          \
  r16 = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 1);                          \
  r2  = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 2);                          \
  r15 = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 3);                          \
  r3  = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 4);                          \
  r14 = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 5);                          \
  r4  = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 6);                          \
  r13 = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 7);                          \
  r5  = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 8);                          \
  r12 = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 9);                          \
  r6  = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 10);                         \
  r11 = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 11);                         \
  r7  = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 12);                         \
  r10 = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 13);                         \
  r8  = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 14);                         \
  r9  = HS_BX_LOCAL_V((slabs) * HS_SLAB_THREADS * 15)

// One braced exchange through HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R: copy the two
// entries into temporaries, HS_CMP_XCHG the temporaries, write both back.
// Expands to a complete { ... } block, so call sites take no trailing
// semicolon.
#define HS_R16_LOCAL_LR_XCHG(l_off, r_off)                                     \
  {                                                                            \
    HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(l_off);                                 \
    HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(r_off);                                 \
    HS_CMP_XCHG(r0_1, r0_2);                                                   \
    HS_SLAB_LOCAL_L(l_off) = r0_1;                                             \
    HS_SLAB_LOCAL_R(r_off) = r0_2;                                             \
  }

//
// Kernel variant (1, 0).
//
// Loads sixteen keys into r1..r16, runs the 60-step compare-exchange
// sequence, then three stages of flip / half / 32-step merge with flip
// masks 1, 3 and 7, and finally stores r1..r16.  No HS_BX_LOCAL_V or
// HS_SLAB_LOCAL_* access is made in this variant.
//
// NOTE(review): the meaning of the two HS_BS_KERNEL_PROTO arguments is
// defined in hs_cl_macros.h -- not visible here.
//
HS_BS_KERNEL_PROTO(1, 0)
{
  HS_SLAB_GLOBAL_PREAMBLE();
  HS_R16_GLOBAL_LOAD();

  HS_R16_SORT_NETWORK();

  // stage with flip mask 1
  HS_R16_FLIP_STEP(1)
  HS_R16_MERGE_NETWORK();

  // stage with flip mask 3
  HS_R16_FLIP_STEP(3)
  HS_R16_HALF_STEP(1)
  HS_R16_MERGE_NETWORK();

  // stage with flip mask 7
  HS_R16_FLIP_STEP(7)
  HS_R16_HALF_STEP(2)
  HS_R16_HALF_STEP(1)
  HS_R16_MERGE_NETWORK();

  HS_R16_GLOBAL_STORE();
}

//
// Kernel variant (2, 1).
//
// Performs the same register-only stages as variant (1, 0), then one extra
// round that goes through the storage declared by
// HS_BLOCK_LOCAL_MEM_DECL(16, 16): registers are written out with
// HS_BX_LOCAL_V, eight L/R pairs are compare-exchanged in place between two
// HS_BLOCK_BARRIER() calls, the registers are read back, and half steps
// 4, 2, 1 plus the 32-step merge finish the round before the global store.
//
// NOTE(review): HS_BLOCK_BARRIER is presumably a work-group barrier and the
// L/R offsets (0/8, 32/40, ...) presumably depend on the configured slab
// width -- confirm against hs_config.h / hs_cl_macros.h.
//
HS_BS_KERNEL_PROTO(2, 1)
{
  HS_BLOCK_LOCAL_MEM_DECL(16, 16);

  HS_SLAB_GLOBAL_PREAMBLE();
  HS_R16_GLOBAL_LOAD();

  HS_R16_SORT_NETWORK();

  // stage with flip mask 1
  HS_R16_FLIP_STEP(1)
  HS_R16_MERGE_NETWORK();

  // stage with flip mask 3
  HS_R16_FLIP_STEP(3)
  HS_R16_HALF_STEP(1)
  HS_R16_MERGE_NETWORK();

  // stage with flip mask 7
  HS_R16_FLIP_STEP(7)
  HS_R16_HALF_STEP(2)
  HS_R16_HALF_STEP(1)
  HS_R16_MERGE_NETWORK();

  // round through HS_BX_LOCAL_V / HS_SLAB_LOCAL_{L,R}
  HS_BS_MERGE_H_PREAMBLE(2);
  HS_R16_LOCAL_V_STORE(2);
  HS_BLOCK_BARRIER();
  {
    HS_R16_LOCAL_LR_XCHG(0, 8)
    HS_R16_LOCAL_LR_XCHG(32, 40)
    HS_R16_LOCAL_LR_XCHG(64, 72)
    HS_R16_LOCAL_LR_XCHG(96, 104)
    HS_R16_LOCAL_LR_XCHG(128, 136)
    HS_R16_LOCAL_LR_XCHG(160, 168)
    HS_R16_LOCAL_LR_XCHG(192, 200)
    HS_R16_LOCAL_LR_XCHG(224, 232)
  }
  HS_BLOCK_BARRIER();
  HS_R16_LOCAL_V_LOAD(2);
  {
    HS_R16_HALF_STEP(4)
    HS_R16_HALF_STEP(2)
    HS_R16_HALF_STEP(1)
    HS_R16_MERGE_NETWORK();
  }

  HS_R16_GLOBAL_STORE();
}

// The shorthands above are private to the two kernels that precede this line.
#undef HS_R16_GLOBAL_LOAD
#undef HS_R16_GLOBAL_STORE
#undef HS_R16_SORT_NETWORK
#undef HS_R16_MERGE_NETWORK
#undef HS_R16_FLIP_STEP
#undef HS_R16_HALF_STEP
#undef HS_R16_LOCAL_V_STORE
#undef HS_R16_LOCAL_V_LOAD
#undef HS_R16_LOCAL_LR_XCHG

HS_BS_KERNEL_PROTO(4, 2)
{
  HS_BLOCK_LOCAL_MEM_DECL(32, 16);

  HS_SLAB_GLOBAL_PREAMBLE();
  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
  HS_CMP_XCHG(r1, r2);
  HS_CMP_XCHG(r3, r4);
  HS_CMP_XCHG(r5, r6);
  HS_CMP_XCHG(r7, r8);
  HS_CMP_XCHG(r9, r10);
  HS_CMP_XCHG(r11, r12);
  HS_CMP_XCHG(r13, r14);
798 HS_CMP_XCHG(r15, r16); 799 HS_CMP_XCHG(r1, r3); 800 HS_CMP_XCHG(r5, r7); 801 HS_CMP_XCHG(r9, r11); 802 HS_CMP_XCHG(r13, r15); 803 HS_CMP_XCHG(r2, r4); 804 HS_CMP_XCHG(r6, r8); 805 HS_CMP_XCHG(r10, r12); 806 HS_CMP_XCHG(r14, r16); 807 HS_CMP_XCHG(r1, r5); 808 HS_CMP_XCHG(r9, r13); 809 HS_CMP_XCHG(r2, r6); 810 HS_CMP_XCHG(r10, r14); 811 HS_CMP_XCHG(r3, r7); 812 HS_CMP_XCHG(r11, r15); 813 HS_CMP_XCHG(r4, r8); 814 HS_CMP_XCHG(r12, r16); 815 HS_CMP_XCHG(r1, r9); 816 HS_CMP_XCHG(r2, r10); 817 HS_CMP_XCHG(r3, r11); 818 HS_CMP_XCHG(r4, r12); 819 HS_CMP_XCHG(r5, r13); 820 HS_CMP_XCHG(r6, r14); 821 HS_CMP_XCHG(r7, r15); 822 HS_CMP_XCHG(r8, r16); 823 HS_CMP_XCHG(r6, r11); 824 HS_CMP_XCHG(r7, r10); 825 HS_CMP_XCHG(r4, r13); 826 HS_CMP_XCHG(r14, r15); 827 HS_CMP_XCHG(r8, r12); 828 HS_CMP_XCHG(r2, r3); 829 HS_CMP_XCHG(r5, r9); 830 HS_CMP_XCHG(r2, r5); 831 HS_CMP_XCHG(r8, r14); 832 HS_CMP_XCHG(r3, r9); 833 HS_CMP_XCHG(r12, r15); 834 HS_CMP_XCHG(r3, r5); 835 HS_CMP_XCHG(r6, r7); 836 HS_CMP_XCHG(r10, r11); 837 HS_CMP_XCHG(r12, r14); 838 HS_CMP_XCHG(r4, r9); 839 HS_CMP_XCHG(r8, r13); 840 HS_CMP_XCHG(r7, r9); 841 HS_CMP_XCHG(r11, r13); 842 HS_CMP_XCHG(r4, r6); 843 HS_CMP_XCHG(r8, r10); 844 HS_CMP_XCHG(r4, r5); 845 HS_CMP_XCHG(r6, r7); 846 HS_CMP_XCHG(r8, r9); 847 HS_CMP_XCHG(r10, r11); 848 HS_CMP_XCHG(r12, r13); 849 HS_CMP_XCHG(r7, r8); 850 HS_CMP_XCHG(r9, r10); 851 { 852 HS_SLAB_FLIP_PREAMBLE(1); 853 HS_CMP_FLIP(0, r1, r16); 854 HS_CMP_FLIP(1, r2, r15); 855 HS_CMP_FLIP(2, r3, r14); 856 HS_CMP_FLIP(3, r4, r13); 857 HS_CMP_FLIP(4, r5, r12); 858 HS_CMP_FLIP(5, r6, r11); 859 HS_CMP_FLIP(6, r7, r10); 860 HS_CMP_FLIP(7, r8, r9); 861 } 862 HS_CMP_XCHG(r1, r9); 863 HS_CMP_XCHG(r5, r13); 864 HS_CMP_XCHG(r1, r5); 865 HS_CMP_XCHG(r9, r13); 866 HS_CMP_XCHG(r3, r11); 867 HS_CMP_XCHG(r7, r15); 868 HS_CMP_XCHG(r3, r7); 869 HS_CMP_XCHG(r11, r15); 870 HS_CMP_XCHG(r1, r3); 871 HS_CMP_XCHG(r5, r7); 872 HS_CMP_XCHG(r9, r11); 873 HS_CMP_XCHG(r13, r15); 874 HS_CMP_XCHG(r2, r10); 875 HS_CMP_XCHG(r6, 
r14); 876 HS_CMP_XCHG(r2, r6); 877 HS_CMP_XCHG(r10, r14); 878 HS_CMP_XCHG(r4, r12); 879 HS_CMP_XCHG(r8, r16); 880 HS_CMP_XCHG(r4, r8); 881 HS_CMP_XCHG(r12, r16); 882 HS_CMP_XCHG(r2, r4); 883 HS_CMP_XCHG(r6, r8); 884 HS_CMP_XCHG(r10, r12); 885 HS_CMP_XCHG(r14, r16); 886 HS_CMP_XCHG(r1, r2); 887 HS_CMP_XCHG(r3, r4); 888 HS_CMP_XCHG(r5, r6); 889 HS_CMP_XCHG(r7, r8); 890 HS_CMP_XCHG(r9, r10); 891 HS_CMP_XCHG(r11, r12); 892 HS_CMP_XCHG(r13, r14); 893 HS_CMP_XCHG(r15, r16); 894 { 895 HS_SLAB_FLIP_PREAMBLE(3); 896 HS_CMP_FLIP(0, r1, r16); 897 HS_CMP_FLIP(1, r2, r15); 898 HS_CMP_FLIP(2, r3, r14); 899 HS_CMP_FLIP(3, r4, r13); 900 HS_CMP_FLIP(4, r5, r12); 901 HS_CMP_FLIP(5, r6, r11); 902 HS_CMP_FLIP(6, r7, r10); 903 HS_CMP_FLIP(7, r8, r9); 904 } 905 { 906 HS_SLAB_HALF_PREAMBLE(1); 907 HS_CMP_HALF(0, r1); 908 HS_CMP_HALF(1, r2); 909 HS_CMP_HALF(2, r3); 910 HS_CMP_HALF(3, r4); 911 HS_CMP_HALF(4, r5); 912 HS_CMP_HALF(5, r6); 913 HS_CMP_HALF(6, r7); 914 HS_CMP_HALF(7, r8); 915 HS_CMP_HALF(8, r9); 916 HS_CMP_HALF(9, r10); 917 HS_CMP_HALF(10, r11); 918 HS_CMP_HALF(11, r12); 919 HS_CMP_HALF(12, r13); 920 HS_CMP_HALF(13, r14); 921 HS_CMP_HALF(14, r15); 922 HS_CMP_HALF(15, r16); 923 } 924 HS_CMP_XCHG(r1, r9); 925 HS_CMP_XCHG(r5, r13); 926 HS_CMP_XCHG(r1, r5); 927 HS_CMP_XCHG(r9, r13); 928 HS_CMP_XCHG(r3, r11); 929 HS_CMP_XCHG(r7, r15); 930 HS_CMP_XCHG(r3, r7); 931 HS_CMP_XCHG(r11, r15); 932 HS_CMP_XCHG(r1, r3); 933 HS_CMP_XCHG(r5, r7); 934 HS_CMP_XCHG(r9, r11); 935 HS_CMP_XCHG(r13, r15); 936 HS_CMP_XCHG(r2, r10); 937 HS_CMP_XCHG(r6, r14); 938 HS_CMP_XCHG(r2, r6); 939 HS_CMP_XCHG(r10, r14); 940 HS_CMP_XCHG(r4, r12); 941 HS_CMP_XCHG(r8, r16); 942 HS_CMP_XCHG(r4, r8); 943 HS_CMP_XCHG(r12, r16); 944 HS_CMP_XCHG(r2, r4); 945 HS_CMP_XCHG(r6, r8); 946 HS_CMP_XCHG(r10, r12); 947 HS_CMP_XCHG(r14, r16); 948 HS_CMP_XCHG(r1, r2); 949 HS_CMP_XCHG(r3, r4); 950 HS_CMP_XCHG(r5, r6); 951 HS_CMP_XCHG(r7, r8); 952 HS_CMP_XCHG(r9, r10); 953 HS_CMP_XCHG(r11, r12); 954 HS_CMP_XCHG(r13, r14); 955 
HS_CMP_XCHG(r15, r16); 956 { 957 HS_SLAB_FLIP_PREAMBLE(7); 958 HS_CMP_FLIP(0, r1, r16); 959 HS_CMP_FLIP(1, r2, r15); 960 HS_CMP_FLIP(2, r3, r14); 961 HS_CMP_FLIP(3, r4, r13); 962 HS_CMP_FLIP(4, r5, r12); 963 HS_CMP_FLIP(5, r6, r11); 964 HS_CMP_FLIP(6, r7, r10); 965 HS_CMP_FLIP(7, r8, r9); 966 } 967 { 968 HS_SLAB_HALF_PREAMBLE(2); 969 HS_CMP_HALF(0, r1); 970 HS_CMP_HALF(1, r2); 971 HS_CMP_HALF(2, r3); 972 HS_CMP_HALF(3, r4); 973 HS_CMP_HALF(4, r5); 974 HS_CMP_HALF(5, r6); 975 HS_CMP_HALF(6, r7); 976 HS_CMP_HALF(7, r8); 977 HS_CMP_HALF(8, r9); 978 HS_CMP_HALF(9, r10); 979 HS_CMP_HALF(10, r11); 980 HS_CMP_HALF(11, r12); 981 HS_CMP_HALF(12, r13); 982 HS_CMP_HALF(13, r14); 983 HS_CMP_HALF(14, r15); 984 HS_CMP_HALF(15, r16); 985 } 986 { 987 HS_SLAB_HALF_PREAMBLE(1); 988 HS_CMP_HALF(0, r1); 989 HS_CMP_HALF(1, r2); 990 HS_CMP_HALF(2, r3); 991 HS_CMP_HALF(3, r4); 992 HS_CMP_HALF(4, r5); 993 HS_CMP_HALF(5, r6); 994 HS_CMP_HALF(6, r7); 995 HS_CMP_HALF(7, r8); 996 HS_CMP_HALF(8, r9); 997 HS_CMP_HALF(9, r10); 998 HS_CMP_HALF(10, r11); 999 HS_CMP_HALF(11, r12); 1000 HS_CMP_HALF(12, r13); 1001 HS_CMP_HALF(13, r14); 1002 HS_CMP_HALF(14, r15); 1003 HS_CMP_HALF(15, r16); 1004 } 1005 HS_CMP_XCHG(r1, r9); 1006 HS_CMP_XCHG(r5, r13); 1007 HS_CMP_XCHG(r1, r5); 1008 HS_CMP_XCHG(r9, r13); 1009 HS_CMP_XCHG(r3, r11); 1010 HS_CMP_XCHG(r7, r15); 1011 HS_CMP_XCHG(r3, r7); 1012 HS_CMP_XCHG(r11, r15); 1013 HS_CMP_XCHG(r1, r3); 1014 HS_CMP_XCHG(r5, r7); 1015 HS_CMP_XCHG(r9, r11); 1016 HS_CMP_XCHG(r13, r15); 1017 HS_CMP_XCHG(r2, r10); 1018 HS_CMP_XCHG(r6, r14); 1019 HS_CMP_XCHG(r2, r6); 1020 HS_CMP_XCHG(r10, r14); 1021 HS_CMP_XCHG(r4, r12); 1022 HS_CMP_XCHG(r8, r16); 1023 HS_CMP_XCHG(r4, r8); 1024 HS_CMP_XCHG(r12, r16); 1025 HS_CMP_XCHG(r2, r4); 1026 HS_CMP_XCHG(r6, r8); 1027 HS_CMP_XCHG(r10, r12); 1028 HS_CMP_XCHG(r14, r16); 1029 HS_CMP_XCHG(r1, r2); 1030 HS_CMP_XCHG(r3, r4); 1031 HS_CMP_XCHG(r5, r6); 1032 HS_CMP_XCHG(r7, r8); 1033 HS_CMP_XCHG(r9, r10); 1034 HS_CMP_XCHG(r11, r12); 1035 
HS_CMP_XCHG(r13, r14); 1036 HS_CMP_XCHG(r15, r16); 1037 HS_BS_MERGE_H_PREAMBLE(4); 1038 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1; 1039 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r16; 1040 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2; 1041 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r15; 1042 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3; 1043 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r14; 1044 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4; 1045 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r13; 1046 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8) = r5; 1047 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9) = r12; 1048 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10) = r6; 1049 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11) = r11; 1050 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12) = r7; 1051 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13) = r10; 1052 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14) = r8; 1053 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15) = r9; 1054 HS_BLOCK_BARRIER(); 1055 { 1056 { 1057 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1058 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8); 1059 HS_CMP_XCHG(r0_1, r0_2); 1060 HS_SLAB_LOCAL_L(0) = r0_1; 1061 HS_SLAB_LOCAL_R(8) = r0_2; 1062 } 1063 { 1064 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16); 1065 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24); 1066 HS_CMP_XCHG(r1_1, r1_2); 1067 HS_SLAB_LOCAL_L(16) = r1_1; 1068 HS_SLAB_LOCAL_R(24) = r1_2; 1069 } 1070 { 1071 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128); 1072 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(136); 1073 HS_CMP_XCHG(r0_1, r0_2); 1074 HS_SLAB_LOCAL_L(128) = r0_1; 1075 HS_SLAB_LOCAL_R(136) = r0_2; 1076 } 1077 { 1078 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(144); 1079 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(152); 1080 HS_CMP_XCHG(r1_1, r1_2); 1081 HS_SLAB_LOCAL_L(144) = r1_1; 1082 HS_SLAB_LOCAL_R(152) = r1_2; 1083 } 1084 { 1085 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256); 1086 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(264); 1087 HS_CMP_XCHG(r0_1, r0_2); 1088 HS_SLAB_LOCAL_L(256) = r0_1; 1089 HS_SLAB_LOCAL_R(264) = r0_2; 1090 } 1091 { 1092 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(272); 1093 
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(280); 1094 HS_CMP_XCHG(r1_1, r1_2); 1095 HS_SLAB_LOCAL_L(272) = r1_1; 1096 HS_SLAB_LOCAL_R(280) = r1_2; 1097 } 1098 { 1099 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384); 1100 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(392); 1101 HS_CMP_XCHG(r0_1, r0_2); 1102 HS_SLAB_LOCAL_L(384) = r0_1; 1103 HS_SLAB_LOCAL_R(392) = r0_2; 1104 } 1105 { 1106 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(400); 1107 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(408); 1108 HS_CMP_XCHG(r1_1, r1_2); 1109 HS_SLAB_LOCAL_L(400) = r1_1; 1110 HS_SLAB_LOCAL_R(408) = r1_2; 1111 } 1112 } 1113 HS_BLOCK_BARRIER(); 1114 r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); 1115 r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); 1116 r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); 1117 r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); 1118 r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); 1119 r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); 1120 r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); 1121 r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); 1122 r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8); 1123 r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9); 1124 r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10); 1125 r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11); 1126 r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12); 1127 r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13); 1128 r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14); 1129 r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15); 1130 { 1131 { 1132 HS_SLAB_HALF_PREAMBLE(4); 1133 HS_CMP_HALF(0, r1); 1134 HS_CMP_HALF(1, r2); 1135 HS_CMP_HALF(2, r3); 1136 HS_CMP_HALF(3, r4); 1137 HS_CMP_HALF(4, r5); 1138 HS_CMP_HALF(5, r6); 1139 HS_CMP_HALF(6, r7); 1140 HS_CMP_HALF(7, r8); 1141 HS_CMP_HALF(8, r9); 1142 HS_CMP_HALF(9, r10); 1143 HS_CMP_HALF(10, r11); 1144 HS_CMP_HALF(11, r12); 1145 HS_CMP_HALF(12, r13); 1146 HS_CMP_HALF(13, r14); 1147 HS_CMP_HALF(14, r15); 1148 HS_CMP_HALF(15, r16); 1149 } 1150 { 1151 HS_SLAB_HALF_PREAMBLE(2); 1152 HS_CMP_HALF(0, r1); 1153 HS_CMP_HALF(1, r2); 1154 HS_CMP_HALF(2, r3); 1155 HS_CMP_HALF(3, 
r4); 1156 HS_CMP_HALF(4, r5); 1157 HS_CMP_HALF(5, r6); 1158 HS_CMP_HALF(6, r7); 1159 HS_CMP_HALF(7, r8); 1160 HS_CMP_HALF(8, r9); 1161 HS_CMP_HALF(9, r10); 1162 HS_CMP_HALF(10, r11); 1163 HS_CMP_HALF(11, r12); 1164 HS_CMP_HALF(12, r13); 1165 HS_CMP_HALF(13, r14); 1166 HS_CMP_HALF(14, r15); 1167 HS_CMP_HALF(15, r16); 1168 } 1169 { 1170 HS_SLAB_HALF_PREAMBLE(1); 1171 HS_CMP_HALF(0, r1); 1172 HS_CMP_HALF(1, r2); 1173 HS_CMP_HALF(2, r3); 1174 HS_CMP_HALF(3, r4); 1175 HS_CMP_HALF(4, r5); 1176 HS_CMP_HALF(5, r6); 1177 HS_CMP_HALF(6, r7); 1178 HS_CMP_HALF(7, r8); 1179 HS_CMP_HALF(8, r9); 1180 HS_CMP_HALF(9, r10); 1181 HS_CMP_HALF(10, r11); 1182 HS_CMP_HALF(11, r12); 1183 HS_CMP_HALF(12, r13); 1184 HS_CMP_HALF(13, r14); 1185 HS_CMP_HALF(14, r15); 1186 HS_CMP_HALF(15, r16); 1187 } 1188 HS_CMP_XCHG(r1, r9); 1189 HS_CMP_XCHG(r5, r13); 1190 HS_CMP_XCHG(r1, r5); 1191 HS_CMP_XCHG(r9, r13); 1192 HS_CMP_XCHG(r3, r11); 1193 HS_CMP_XCHG(r7, r15); 1194 HS_CMP_XCHG(r3, r7); 1195 HS_CMP_XCHG(r11, r15); 1196 HS_CMP_XCHG(r1, r3); 1197 HS_CMP_XCHG(r5, r7); 1198 HS_CMP_XCHG(r9, r11); 1199 HS_CMP_XCHG(r13, r15); 1200 HS_CMP_XCHG(r2, r10); 1201 HS_CMP_XCHG(r6, r14); 1202 HS_CMP_XCHG(r2, r6); 1203 HS_CMP_XCHG(r10, r14); 1204 HS_CMP_XCHG(r4, r12); 1205 HS_CMP_XCHG(r8, r16); 1206 HS_CMP_XCHG(r4, r8); 1207 HS_CMP_XCHG(r12, r16); 1208 HS_CMP_XCHG(r2, r4); 1209 HS_CMP_XCHG(r6, r8); 1210 HS_CMP_XCHG(r10, r12); 1211 HS_CMP_XCHG(r14, r16); 1212 HS_CMP_XCHG(r1, r2); 1213 HS_CMP_XCHG(r3, r4); 1214 HS_CMP_XCHG(r5, r6); 1215 HS_CMP_XCHG(r7, r8); 1216 HS_CMP_XCHG(r9, r10); 1217 HS_CMP_XCHG(r11, r12); 1218 HS_CMP_XCHG(r13, r14); 1219 HS_CMP_XCHG(r15, r16); 1220 } 1221 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1; 1222 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r16; 1223 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2; 1224 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r15; 1225 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3; 1226 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r14; 1227 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = 
r4; 1228 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r13; 1229 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8) = r5; 1230 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9) = r12; 1231 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10) = r6; 1232 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11) = r11; 1233 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12) = r7; 1234 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13) = r10; 1235 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14) = r8; 1236 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15) = r9; 1237 HS_BLOCK_BARRIER(); 1238 { 1239 { 1240 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1241 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); 1242 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16); 1243 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24); 1244 HS_CMP_XCHG(r0_2, r0_3); 1245 HS_CMP_XCHG(r0_1, r0_4); 1246 HS_CMP_XCHG(r0_3, r0_4); 1247 HS_CMP_XCHG(r0_1, r0_2); 1248 HS_SLAB_LOCAL_L(0) = r0_1; 1249 HS_SLAB_LOCAL_L(8) = r0_2; 1250 HS_SLAB_LOCAL_R(16) = r0_3; 1251 HS_SLAB_LOCAL_R(24) = r0_4; 1252 } 1253 { 1254 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128); 1255 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(136); 1256 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(144); 1257 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(152); 1258 HS_CMP_XCHG(r0_2, r0_3); 1259 HS_CMP_XCHG(r0_1, r0_4); 1260 HS_CMP_XCHG(r0_3, r0_4); 1261 HS_CMP_XCHG(r0_1, r0_2); 1262 HS_SLAB_LOCAL_L(128) = r0_1; 1263 HS_SLAB_LOCAL_L(136) = r0_2; 1264 HS_SLAB_LOCAL_R(144) = r0_3; 1265 HS_SLAB_LOCAL_R(152) = r0_4; 1266 } 1267 { 1268 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256); 1269 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(264); 1270 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(272); 1271 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(280); 1272 HS_CMP_XCHG(r0_2, r0_3); 1273 HS_CMP_XCHG(r0_1, r0_4); 1274 HS_CMP_XCHG(r0_3, r0_4); 1275 HS_CMP_XCHG(r0_1, r0_2); 1276 HS_SLAB_LOCAL_L(256) = r0_1; 1277 HS_SLAB_LOCAL_L(264) = r0_2; 1278 HS_SLAB_LOCAL_R(272) = r0_3; 1279 HS_SLAB_LOCAL_R(280) = r0_4; 1280 } 1281 { 1282 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384); 1283 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(392); 1284 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(400); 1285 
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(408); 1286 HS_CMP_XCHG(r0_2, r0_3); 1287 HS_CMP_XCHG(r0_1, r0_4); 1288 HS_CMP_XCHG(r0_3, r0_4); 1289 HS_CMP_XCHG(r0_1, r0_2); 1290 HS_SLAB_LOCAL_L(384) = r0_1; 1291 HS_SLAB_LOCAL_L(392) = r0_2; 1292 HS_SLAB_LOCAL_R(400) = r0_3; 1293 HS_SLAB_LOCAL_R(408) = r0_4; 1294 } 1295 } 1296 HS_BLOCK_BARRIER(); 1297 r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); 1298 r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); 1299 r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); 1300 r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); 1301 r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); 1302 r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); 1303 r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); 1304 r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); 1305 r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8); 1306 r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9); 1307 r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10); 1308 r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11); 1309 r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12); 1310 r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13); 1311 r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14); 1312 r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15); 1313 { 1314 { 1315 HS_SLAB_HALF_PREAMBLE(4); 1316 HS_CMP_HALF(0, r1); 1317 HS_CMP_HALF(1, r2); 1318 HS_CMP_HALF(2, r3); 1319 HS_CMP_HALF(3, r4); 1320 HS_CMP_HALF(4, r5); 1321 HS_CMP_HALF(5, r6); 1322 HS_CMP_HALF(6, r7); 1323 HS_CMP_HALF(7, r8); 1324 HS_CMP_HALF(8, r9); 1325 HS_CMP_HALF(9, r10); 1326 HS_CMP_HALF(10, r11); 1327 HS_CMP_HALF(11, r12); 1328 HS_CMP_HALF(12, r13); 1329 HS_CMP_HALF(13, r14); 1330 HS_CMP_HALF(14, r15); 1331 HS_CMP_HALF(15, r16); 1332 } 1333 { 1334 HS_SLAB_HALF_PREAMBLE(2); 1335 HS_CMP_HALF(0, r1); 1336 HS_CMP_HALF(1, r2); 1337 HS_CMP_HALF(2, r3); 1338 HS_CMP_HALF(3, r4); 1339 HS_CMP_HALF(4, r5); 1340 HS_CMP_HALF(5, r6); 1341 HS_CMP_HALF(6, r7); 1342 HS_CMP_HALF(7, r8); 1343 HS_CMP_HALF(8, r9); 1344 HS_CMP_HALF(9, r10); 1345 HS_CMP_HALF(10, r11); 1346 HS_CMP_HALF(11, r12); 1347 HS_CMP_HALF(12, r13); 1348 
HS_CMP_HALF(13, r14); 1349 HS_CMP_HALF(14, r15); 1350 HS_CMP_HALF(15, r16); 1351 } 1352 { 1353 HS_SLAB_HALF_PREAMBLE(1); 1354 HS_CMP_HALF(0, r1); 1355 HS_CMP_HALF(1, r2); 1356 HS_CMP_HALF(2, r3); 1357 HS_CMP_HALF(3, r4); 1358 HS_CMP_HALF(4, r5); 1359 HS_CMP_HALF(5, r6); 1360 HS_CMP_HALF(6, r7); 1361 HS_CMP_HALF(7, r8); 1362 HS_CMP_HALF(8, r9); 1363 HS_CMP_HALF(9, r10); 1364 HS_CMP_HALF(10, r11); 1365 HS_CMP_HALF(11, r12); 1366 HS_CMP_HALF(12, r13); 1367 HS_CMP_HALF(13, r14); 1368 HS_CMP_HALF(14, r15); 1369 HS_CMP_HALF(15, r16); 1370 } 1371 HS_CMP_XCHG(r1, r9); 1372 HS_CMP_XCHG(r5, r13); 1373 HS_CMP_XCHG(r1, r5); 1374 HS_CMP_XCHG(r9, r13); 1375 HS_CMP_XCHG(r3, r11); 1376 HS_CMP_XCHG(r7, r15); 1377 HS_CMP_XCHG(r3, r7); 1378 HS_CMP_XCHG(r11, r15); 1379 HS_CMP_XCHG(r1, r3); 1380 HS_CMP_XCHG(r5, r7); 1381 HS_CMP_XCHG(r9, r11); 1382 HS_CMP_XCHG(r13, r15); 1383 HS_CMP_XCHG(r2, r10); 1384 HS_CMP_XCHG(r6, r14); 1385 HS_CMP_XCHG(r2, r6); 1386 HS_CMP_XCHG(r10, r14); 1387 HS_CMP_XCHG(r4, r12); 1388 HS_CMP_XCHG(r8, r16); 1389 HS_CMP_XCHG(r4, r8); 1390 HS_CMP_XCHG(r12, r16); 1391 HS_CMP_XCHG(r2, r4); 1392 HS_CMP_XCHG(r6, r8); 1393 HS_CMP_XCHG(r10, r12); 1394 HS_CMP_XCHG(r14, r16); 1395 HS_CMP_XCHG(r1, r2); 1396 HS_CMP_XCHG(r3, r4); 1397 HS_CMP_XCHG(r5, r6); 1398 HS_CMP_XCHG(r7, r8); 1399 HS_CMP_XCHG(r9, r10); 1400 HS_CMP_XCHG(r11, r12); 1401 HS_CMP_XCHG(r13, r14); 1402 HS_CMP_XCHG(r15, r16); 1403 } 1404 HS_SLAB_GLOBAL_STORE(0, r1); 1405 HS_SLAB_GLOBAL_STORE(1, r2); 1406 HS_SLAB_GLOBAL_STORE(2, r3); 1407 HS_SLAB_GLOBAL_STORE(3, r4); 1408 HS_SLAB_GLOBAL_STORE(4, r5); 1409 HS_SLAB_GLOBAL_STORE(5, r6); 1410 HS_SLAB_GLOBAL_STORE(6, r7); 1411 HS_SLAB_GLOBAL_STORE(7, r8); 1412 HS_SLAB_GLOBAL_STORE(8, r9); 1413 HS_SLAB_GLOBAL_STORE(9, r10); 1414 HS_SLAB_GLOBAL_STORE(10, r11); 1415 HS_SLAB_GLOBAL_STORE(11, r12); 1416 HS_SLAB_GLOBAL_STORE(12, r13); 1417 HS_SLAB_GLOBAL_STORE(13, r14); 1418 HS_SLAB_GLOBAL_STORE(14, r15); 1419 HS_SLAB_GLOBAL_STORE(15, r16); 1420} 1421 
//
// Kernel declared through HS_BS_KERNEL_PROTO(8, 3). The meaning of the two
// macro arguments is fixed by the macro definition (hs_cl_macros.h is
// included at the top of the file) and is not visible here.
//
// Steps, in the order they appear in the body:
//
//   1. HS_BLOCK_LOCAL_MEM_DECL(64, 16) and HS_SLAB_GLOBAL_PREAMBLE() run,
//      then sixteen keys r1..r16 are read with
//      HS_SLAB_GLOBAL_LOAD(vin, 0) .. HS_SLAB_GLOBAL_LOAD(vin, 15).
//
//   2. A fixed list of 60 HS_CMP_XCHG pairs is applied to r1..r16
//      (presumably a 16-input sorting network -- TODO confirm against the
//      generator that emitted this file).
//
//   3. Three rounds follow. Each round starts with an HS_CMP_FLIP block
//      pairing (r1,r16), (r2,r15), ... (r8,r9) under
//      HS_SLAB_FLIP_PREAMBLE(1), (3) and (7) respectively. The second round
//      adds an HS_CMP_HALF block with preamble 1; the third adds HS_CMP_HALF
//      blocks with preambles 2 and 1. Every round ends with the same list of
//      32 HS_CMP_XCHG pairs (r1,r9) ... (r15,r16).
//
//   4. After HS_BS_MERGE_H_PREAMBLE(8), three rounds go through
//      HS_BX_LOCAL_V (local memory, judging by the macro names). Each round:
//        - writes r1..r16 to HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * k),
//          k = 0..15, in the register order r1,r16,r2,r15,...,r8,r9;
//        - calls HS_BLOCK_BARRIER();
//        - reads groups of values with HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R,
//          compare-exchanges them and writes them back. The group size is
//          2 values in the first round (eight groups), 4 in the second
//          (four groups) and 8 in the third (two groups); offsets start at
//          0 and at 512;
//        - calls HS_BLOCK_BARRIER() again and re-reads r1..r16 in the same
//          interleaved order;
//        - applies HS_CMP_HALF blocks with preambles 4, 2, 1 and the same
//          32 HS_CMP_XCHG pairs.
//
//   5. r1..r16 are written out with HS_SLAB_GLOBAL_STORE(0, r1) ..
//      HS_SLAB_GLOBAL_STORE(15, r16).
//
// NOTE(review): consecutive steps reuse the same registers and the same
// HS_BX_LOCAL_V slots, so the statement order and both barriers around each
// local-memory step look load-bearing -- do not reorder without checking the
// macro definitions.
//
1422HS_BS_KERNEL_PROTO(8, 3) 1423{ 1424 HS_BLOCK_LOCAL_MEM_DECL(64, 16); 1425 1426 HS_SLAB_GLOBAL_PREAMBLE(); 1427 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); 1428 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); 1429 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); 1430 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); 1431 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); 1432 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); 1433 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); 1434 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); 1435 HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8); 1436 HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9); 1437 HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10); 1438 HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11); 1439 HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12); 1440 HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13); 1441 HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14); 1442 HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15); 1443 HS_CMP_XCHG(r1, r2); 1444 HS_CMP_XCHG(r3, r4); 1445 HS_CMP_XCHG(r5, r6); 1446 HS_CMP_XCHG(r7, r8); 1447 HS_CMP_XCHG(r9, r10); 1448 HS_CMP_XCHG(r11, r12); 1449 HS_CMP_XCHG(r13, r14); 1450 HS_CMP_XCHG(r15, r16); 1451 HS_CMP_XCHG(r1, r3); 1452 HS_CMP_XCHG(r5, r7); 1453 HS_CMP_XCHG(r9, r11); 1454 HS_CMP_XCHG(r13, r15); 1455 HS_CMP_XCHG(r2, r4); 1456 HS_CMP_XCHG(r6, r8); 1457 HS_CMP_XCHG(r10, r12); 1458 HS_CMP_XCHG(r14, r16); 1459 HS_CMP_XCHG(r1, r5); 1460 HS_CMP_XCHG(r9, r13); 1461 HS_CMP_XCHG(r2, r6); 1462 HS_CMP_XCHG(r10, r14); 1463 HS_CMP_XCHG(r3, r7); 1464 HS_CMP_XCHG(r11, r15); 1465 HS_CMP_XCHG(r4, r8); 1466 HS_CMP_XCHG(r12, r16); 1467 HS_CMP_XCHG(r1, r9); 1468 HS_CMP_XCHG(r2, r10); 1469 HS_CMP_XCHG(r3, r11); 1470 HS_CMP_XCHG(r4, r12); 1471 HS_CMP_XCHG(r5, r13); 1472 HS_CMP_XCHG(r6, r14); 1473 HS_CMP_XCHG(r7, r15); 1474 HS_CMP_XCHG(r8, r16); 1475 HS_CMP_XCHG(r6, r11); 1476 HS_CMP_XCHG(r7, r10); 1477 HS_CMP_XCHG(r4, r13); 1478 HS_CMP_XCHG(r14, r15); 1479 HS_CMP_XCHG(r8, r12); 1480 HS_CMP_XCHG(r2, r3); 1481 HS_CMP_XCHG(r5, r9); 1482 
// Rest of the opening HS_CMP_XCHG list; then HS_CMP_FLIP under preamble 1, the 32-pair HS_CMP_XCHG list, and HS_CMP_FLIP under preamble 3.
HS_CMP_XCHG(r2, r5); 1483 HS_CMP_XCHG(r8, r14); 1484 HS_CMP_XCHG(r3, r9); 1485 HS_CMP_XCHG(r12, r15); 1486 HS_CMP_XCHG(r3, r5); 1487 HS_CMP_XCHG(r6, r7); 1488 HS_CMP_XCHG(r10, r11); 1489 HS_CMP_XCHG(r12, r14); 1490 HS_CMP_XCHG(r4, r9); 1491 HS_CMP_XCHG(r8, r13); 1492 HS_CMP_XCHG(r7, r9); 1493 HS_CMP_XCHG(r11, r13); 1494 HS_CMP_XCHG(r4, r6); 1495 HS_CMP_XCHG(r8, r10); 1496 HS_CMP_XCHG(r4, r5); 1497 HS_CMP_XCHG(r6, r7); 1498 HS_CMP_XCHG(r8, r9); 1499 HS_CMP_XCHG(r10, r11); 1500 HS_CMP_XCHG(r12, r13); 1501 HS_CMP_XCHG(r7, r8); 1502 HS_CMP_XCHG(r9, r10); 1503 { 1504 HS_SLAB_FLIP_PREAMBLE(1); 1505 HS_CMP_FLIP(0, r1, r16); 1506 HS_CMP_FLIP(1, r2, r15); 1507 HS_CMP_FLIP(2, r3, r14); 1508 HS_CMP_FLIP(3, r4, r13); 1509 HS_CMP_FLIP(4, r5, r12); 1510 HS_CMP_FLIP(5, r6, r11); 1511 HS_CMP_FLIP(6, r7, r10); 1512 HS_CMP_FLIP(7, r8, r9); 1513 } 1514 HS_CMP_XCHG(r1, r9); 1515 HS_CMP_XCHG(r5, r13); 1516 HS_CMP_XCHG(r1, r5); 1517 HS_CMP_XCHG(r9, r13); 1518 HS_CMP_XCHG(r3, r11); 1519 HS_CMP_XCHG(r7, r15); 1520 HS_CMP_XCHG(r3, r7); 1521 HS_CMP_XCHG(r11, r15); 1522 HS_CMP_XCHG(r1, r3); 1523 HS_CMP_XCHG(r5, r7); 1524 HS_CMP_XCHG(r9, r11); 1525 HS_CMP_XCHG(r13, r15); 1526 HS_CMP_XCHG(r2, r10); 1527 HS_CMP_XCHG(r6, r14); 1528 HS_CMP_XCHG(r2, r6); 1529 HS_CMP_XCHG(r10, r14); 1530 HS_CMP_XCHG(r4, r12); 1531 HS_CMP_XCHG(r8, r16); 1532 HS_CMP_XCHG(r4, r8); 1533 HS_CMP_XCHG(r12, r16); 1534 HS_CMP_XCHG(r2, r4); 1535 HS_CMP_XCHG(r6, r8); 1536 HS_CMP_XCHG(r10, r12); 1537 HS_CMP_XCHG(r14, r16); 1538 HS_CMP_XCHG(r1, r2); 1539 HS_CMP_XCHG(r3, r4); 1540 HS_CMP_XCHG(r5, r6); 1541 HS_CMP_XCHG(r7, r8); 1542 HS_CMP_XCHG(r9, r10); 1543 HS_CMP_XCHG(r11, r12); 1544 HS_CMP_XCHG(r13, r14); 1545 HS_CMP_XCHG(r15, r16); 1546 { 1547 HS_SLAB_FLIP_PREAMBLE(3); 1548 HS_CMP_FLIP(0, r1, r16); 1549 HS_CMP_FLIP(1, r2, r15); 1550 HS_CMP_FLIP(2, r3, r14); 1551 HS_CMP_FLIP(3, r4, r13); 1552 HS_CMP_FLIP(4, r5, r12); 1553 HS_CMP_FLIP(5, r6, r11); 1554 HS_CMP_FLIP(6, r7, r10); 1555 HS_CMP_FLIP(7, r8, r9); 1556 } 1557 { 1558 
// HS_CMP_HALF under preamble 1 over r1..r16, the 32-pair HS_CMP_XCHG list, HS_CMP_FLIP under preamble 7, then the HS_CMP_HALF block under preamble 2 begins.
HS_SLAB_HALF_PREAMBLE(1); 1559 HS_CMP_HALF(0, r1); 1560 HS_CMP_HALF(1, r2); 1561 HS_CMP_HALF(2, r3); 1562 HS_CMP_HALF(3, r4); 1563 HS_CMP_HALF(4, r5); 1564 HS_CMP_HALF(5, r6); 1565 HS_CMP_HALF(6, r7); 1566 HS_CMP_HALF(7, r8); 1567 HS_CMP_HALF(8, r9); 1568 HS_CMP_HALF(9, r10); 1569 HS_CMP_HALF(10, r11); 1570 HS_CMP_HALF(11, r12); 1571 HS_CMP_HALF(12, r13); 1572 HS_CMP_HALF(13, r14); 1573 HS_CMP_HALF(14, r15); 1574 HS_CMP_HALF(15, r16); 1575 } 1576 HS_CMP_XCHG(r1, r9); 1577 HS_CMP_XCHG(r5, r13); 1578 HS_CMP_XCHG(r1, r5); 1579 HS_CMP_XCHG(r9, r13); 1580 HS_CMP_XCHG(r3, r11); 1581 HS_CMP_XCHG(r7, r15); 1582 HS_CMP_XCHG(r3, r7); 1583 HS_CMP_XCHG(r11, r15); 1584 HS_CMP_XCHG(r1, r3); 1585 HS_CMP_XCHG(r5, r7); 1586 HS_CMP_XCHG(r9, r11); 1587 HS_CMP_XCHG(r13, r15); 1588 HS_CMP_XCHG(r2, r10); 1589 HS_CMP_XCHG(r6, r14); 1590 HS_CMP_XCHG(r2, r6); 1591 HS_CMP_XCHG(r10, r14); 1592 HS_CMP_XCHG(r4, r12); 1593 HS_CMP_XCHG(r8, r16); 1594 HS_CMP_XCHG(r4, r8); 1595 HS_CMP_XCHG(r12, r16); 1596 HS_CMP_XCHG(r2, r4); 1597 HS_CMP_XCHG(r6, r8); 1598 HS_CMP_XCHG(r10, r12); 1599 HS_CMP_XCHG(r14, r16); 1600 HS_CMP_XCHG(r1, r2); 1601 HS_CMP_XCHG(r3, r4); 1602 HS_CMP_XCHG(r5, r6); 1603 HS_CMP_XCHG(r7, r8); 1604 HS_CMP_XCHG(r9, r10); 1605 HS_CMP_XCHG(r11, r12); 1606 HS_CMP_XCHG(r13, r14); 1607 HS_CMP_XCHG(r15, r16); 1608 { 1609 HS_SLAB_FLIP_PREAMBLE(7); 1610 HS_CMP_FLIP(0, r1, r16); 1611 HS_CMP_FLIP(1, r2, r15); 1612 HS_CMP_FLIP(2, r3, r14); 1613 HS_CMP_FLIP(3, r4, r13); 1614 HS_CMP_FLIP(4, r5, r12); 1615 HS_CMP_FLIP(5, r6, r11); 1616 HS_CMP_FLIP(6, r7, r10); 1617 HS_CMP_FLIP(7, r8, r9); 1618 } 1619 { 1620 HS_SLAB_HALF_PREAMBLE(2); 1621 HS_CMP_HALF(0, r1); 1622 HS_CMP_HALF(1, r2); 1623 HS_CMP_HALF(2, r3); 1624 HS_CMP_HALF(3, r4); 1625 HS_CMP_HALF(4, r5); 1626 HS_CMP_HALF(5, r6); 1627 HS_CMP_HALF(6, r7); 1628 HS_CMP_HALF(7, r8); 1629 HS_CMP_HALF(8, r9); 1630 HS_CMP_HALF(9, r10); 1631 HS_CMP_HALF(10, r11); 1632 HS_CMP_HALF(11, r12); 1633 HS_CMP_HALF(12, r13); 1634 HS_CMP_HALF(13, r14); 1635 
// End of the HS_CMP_HALF block under preamble 2, HS_CMP_HALF under preamble 1, the 32-pair list, then HS_BS_MERGE_H_PREAMBLE(8) and the first write of r1..r16 through HS_BX_LOCAL_V.
HS_CMP_HALF(14, r15); 1636 HS_CMP_HALF(15, r16); 1637 } 1638 { 1639 HS_SLAB_HALF_PREAMBLE(1); 1640 HS_CMP_HALF(0, r1); 1641 HS_CMP_HALF(1, r2); 1642 HS_CMP_HALF(2, r3); 1643 HS_CMP_HALF(3, r4); 1644 HS_CMP_HALF(4, r5); 1645 HS_CMP_HALF(5, r6); 1646 HS_CMP_HALF(6, r7); 1647 HS_CMP_HALF(7, r8); 1648 HS_CMP_HALF(8, r9); 1649 HS_CMP_HALF(9, r10); 1650 HS_CMP_HALF(10, r11); 1651 HS_CMP_HALF(11, r12); 1652 HS_CMP_HALF(12, r13); 1653 HS_CMP_HALF(13, r14); 1654 HS_CMP_HALF(14, r15); 1655 HS_CMP_HALF(15, r16); 1656 } 1657 HS_CMP_XCHG(r1, r9); 1658 HS_CMP_XCHG(r5, r13); 1659 HS_CMP_XCHG(r1, r5); 1660 HS_CMP_XCHG(r9, r13); 1661 HS_CMP_XCHG(r3, r11); 1662 HS_CMP_XCHG(r7, r15); 1663 HS_CMP_XCHG(r3, r7); 1664 HS_CMP_XCHG(r11, r15); 1665 HS_CMP_XCHG(r1, r3); 1666 HS_CMP_XCHG(r5, r7); 1667 HS_CMP_XCHG(r9, r11); 1668 HS_CMP_XCHG(r13, r15); 1669 HS_CMP_XCHG(r2, r10); 1670 HS_CMP_XCHG(r6, r14); 1671 HS_CMP_XCHG(r2, r6); 1672 HS_CMP_XCHG(r10, r14); 1673 HS_CMP_XCHG(r4, r12); 1674 HS_CMP_XCHG(r8, r16); 1675 HS_CMP_XCHG(r4, r8); 1676 HS_CMP_XCHG(r12, r16); 1677 HS_CMP_XCHG(r2, r4); 1678 HS_CMP_XCHG(r6, r8); 1679 HS_CMP_XCHG(r10, r12); 1680 HS_CMP_XCHG(r14, r16); 1681 HS_CMP_XCHG(r1, r2); 1682 HS_CMP_XCHG(r3, r4); 1683 HS_CMP_XCHG(r5, r6); 1684 HS_CMP_XCHG(r7, r8); 1685 HS_CMP_XCHG(r9, r10); 1686 HS_CMP_XCHG(r11, r12); 1687 HS_CMP_XCHG(r13, r14); 1688 HS_CMP_XCHG(r15, r16); 1689 HS_BS_MERGE_H_PREAMBLE(8); 1690 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; 1691 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16; 1692 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; 1693 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15; 1694 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; 1695 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14; 1696 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; 1697 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13; 1698 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5; 1699 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12; 1700 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6; 1701 HS_BX_LOCAL_V(8 * 
HS_SLAB_THREADS * 11) = r11; 1702 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7; 1703 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10; 1704 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8; 1705 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9; 1706 HS_BLOCK_BARRIER(); 1707 { 1708 { 1709 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1710 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8); 1711 HS_CMP_XCHG(r0_1, r0_2); 1712 HS_SLAB_LOCAL_L(0) = r0_1; 1713 HS_SLAB_LOCAL_R(8) = r0_2; 1714 } 1715 { 1716 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16); 1717 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24); 1718 HS_CMP_XCHG(r1_1, r1_2); 1719 HS_SLAB_LOCAL_L(16) = r1_1; 1720 HS_SLAB_LOCAL_R(24) = r1_2; 1721 } 1722 { 1723 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(32); 1724 HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(40); 1725 HS_CMP_XCHG(r2_1, r2_2); 1726 HS_SLAB_LOCAL_L(32) = r2_1; 1727 HS_SLAB_LOCAL_R(40) = r2_2; 1728 } 1729 { 1730 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(48); 1731 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(56); 1732 HS_CMP_XCHG(r3_1, r3_2); 1733 HS_SLAB_LOCAL_L(48) = r3_1; 1734 HS_SLAB_LOCAL_R(56) = r3_2; 1735 } 1736 { 1737 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512); 1738 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(520); 1739 HS_CMP_XCHG(r0_1, r0_2); 1740 HS_SLAB_LOCAL_L(512) = r0_1; 1741 HS_SLAB_LOCAL_R(520) = r0_2; 1742 } 1743 { 1744 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(528); 1745 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(536); 1746 HS_CMP_XCHG(r1_1, r1_2); 1747 HS_SLAB_LOCAL_L(528) = r1_1; 1748 HS_SLAB_LOCAL_R(536) = r1_2; 1749 } 1750 { 1751 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(544); 1752 HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(552); 1753 HS_CMP_XCHG(r2_1, r2_2); 1754 HS_SLAB_LOCAL_L(544) = r2_1; 1755 HS_SLAB_LOCAL_R(552) = r2_2; 1756 } 1757 { 1758 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(560); 1759 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(568); 1760 HS_CMP_XCHG(r3_1, r3_2); 1761 HS_SLAB_LOCAL_L(560) = r3_1; 1762 HS_SLAB_LOCAL_R(568) = r3_2; 1763 } 1764 } 1765 HS_BLOCK_BARRIER(); 1766 r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); 1767 r16 = HS_BX_LOCAL_V(8 * 
HS_SLAB_THREADS * 1); 1768 r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); 1769 r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); 1770 r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); 1771 r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); 1772 r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); 1773 r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); 1774 r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8); 1775 r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9); 1776 r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10); 1777 r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11); 1778 r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12); 1779 r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13); 1780 r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14); 1781 r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15); 1782 { 1783 { 1784 HS_SLAB_HALF_PREAMBLE(4); 1785 HS_CMP_HALF(0, r1); 1786 HS_CMP_HALF(1, r2); 1787 HS_CMP_HALF(2, r3); 1788 HS_CMP_HALF(3, r4); 1789 HS_CMP_HALF(4, r5); 1790 HS_CMP_HALF(5, r6); 1791 HS_CMP_HALF(6, r7); 1792 HS_CMP_HALF(7, r8); 1793 HS_CMP_HALF(8, r9); 1794 HS_CMP_HALF(9, r10); 1795 HS_CMP_HALF(10, r11); 1796 HS_CMP_HALF(11, r12); 1797 HS_CMP_HALF(12, r13); 1798 HS_CMP_HALF(13, r14); 1799 HS_CMP_HALF(14, r15); 1800 HS_CMP_HALF(15, r16); 1801 } 1802 { 1803 HS_SLAB_HALF_PREAMBLE(2); 1804 HS_CMP_HALF(0, r1); 1805 HS_CMP_HALF(1, r2); 1806 HS_CMP_HALF(2, r3); 1807 HS_CMP_HALF(3, r4); 1808 HS_CMP_HALF(4, r5); 1809 HS_CMP_HALF(5, r6); 1810 HS_CMP_HALF(6, r7); 1811 HS_CMP_HALF(7, r8); 1812 HS_CMP_HALF(8, r9); 1813 HS_CMP_HALF(9, r10); 1814 HS_CMP_HALF(10, r11); 1815 HS_CMP_HALF(11, r12); 1816 HS_CMP_HALF(12, r13); 1817 HS_CMP_HALF(13, r14); 1818 HS_CMP_HALF(14, r15); 1819 HS_CMP_HALF(15, r16); 1820 } 1821 { 1822 HS_SLAB_HALF_PREAMBLE(1); 1823 HS_CMP_HALF(0, r1); 1824 HS_CMP_HALF(1, r2); 1825 HS_CMP_HALF(2, r3); 1826 HS_CMP_HALF(3, r4); 1827 HS_CMP_HALF(4, r5); 1828 HS_CMP_HALF(5, r6); 1829 HS_CMP_HALF(6, r7); 1830 HS_CMP_HALF(7, r8); 1831 HS_CMP_HALF(8, r9); 1832 HS_CMP_HALF(9, r10); 1833 HS_CMP_HALF(10, r11); 1834 HS_CMP_HALF(11, r12); 
// Tail of the HS_CMP_HALF block under preamble 1 and the 32-pair list; r1..r16 are written through HS_BX_LOCAL_V again and, after HS_BLOCK_BARRIER(), the 4-value groups start.
1835 HS_CMP_HALF(12, r13); 1836 HS_CMP_HALF(13, r14); 1837 HS_CMP_HALF(14, r15); 1838 HS_CMP_HALF(15, r16); 1839 } 1840 HS_CMP_XCHG(r1, r9); 1841 HS_CMP_XCHG(r5, r13); 1842 HS_CMP_XCHG(r1, r5); 1843 HS_CMP_XCHG(r9, r13); 1844 HS_CMP_XCHG(r3, r11); 1845 HS_CMP_XCHG(r7, r15); 1846 HS_CMP_XCHG(r3, r7); 1847 HS_CMP_XCHG(r11, r15); 1848 HS_CMP_XCHG(r1, r3); 1849 HS_CMP_XCHG(r5, r7); 1850 HS_CMP_XCHG(r9, r11); 1851 HS_CMP_XCHG(r13, r15); 1852 HS_CMP_XCHG(r2, r10); 1853 HS_CMP_XCHG(r6, r14); 1854 HS_CMP_XCHG(r2, r6); 1855 HS_CMP_XCHG(r10, r14); 1856 HS_CMP_XCHG(r4, r12); 1857 HS_CMP_XCHG(r8, r16); 1858 HS_CMP_XCHG(r4, r8); 1859 HS_CMP_XCHG(r12, r16); 1860 HS_CMP_XCHG(r2, r4); 1861 HS_CMP_XCHG(r6, r8); 1862 HS_CMP_XCHG(r10, r12); 1863 HS_CMP_XCHG(r14, r16); 1864 HS_CMP_XCHG(r1, r2); 1865 HS_CMP_XCHG(r3, r4); 1866 HS_CMP_XCHG(r5, r6); 1867 HS_CMP_XCHG(r7, r8); 1868 HS_CMP_XCHG(r9, r10); 1869 HS_CMP_XCHG(r11, r12); 1870 HS_CMP_XCHG(r13, r14); 1871 HS_CMP_XCHG(r15, r16); 1872 } 1873 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; 1874 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16; 1875 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; 1876 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15; 1877 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; 1878 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14; 1879 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; 1880 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13; 1881 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5; 1882 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12; 1883 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6; 1884 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11; 1885 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7; 1886 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10; 1887 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8; 1888 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9; 1889 HS_BLOCK_BARRIER(); 1890 { 1891 { 1892 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1893 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); 1894 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16); 1895 HS_KEY_TYPE r0_4 = 
HS_SLAB_LOCAL_R(24); 1896 HS_CMP_XCHG(r0_2, r0_3); 1897 HS_CMP_XCHG(r0_1, r0_4); 1898 HS_CMP_XCHG(r0_3, r0_4); 1899 HS_CMP_XCHG(r0_1, r0_2); 1900 HS_SLAB_LOCAL_L(0) = r0_1; 1901 HS_SLAB_LOCAL_L(8) = r0_2; 1902 HS_SLAB_LOCAL_R(16) = r0_3; 1903 HS_SLAB_LOCAL_R(24) = r0_4; 1904 } 1905 { 1906 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); 1907 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(40); 1908 HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(48); 1909 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(56); 1910 HS_CMP_XCHG(r1_2, r1_3); 1911 HS_CMP_XCHG(r1_1, r1_4); 1912 HS_CMP_XCHG(r1_3, r1_4); 1913 HS_CMP_XCHG(r1_1, r1_2); 1914 HS_SLAB_LOCAL_L(32) = r1_1; 1915 HS_SLAB_LOCAL_L(40) = r1_2; 1916 HS_SLAB_LOCAL_R(48) = r1_3; 1917 HS_SLAB_LOCAL_R(56) = r1_4; 1918 } 1919 { 1920 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512); 1921 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(520); 1922 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(528); 1923 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(536); 1924 HS_CMP_XCHG(r0_2, r0_3); 1925 HS_CMP_XCHG(r0_1, r0_4); 1926 HS_CMP_XCHG(r0_3, r0_4); 1927 HS_CMP_XCHG(r0_1, r0_2); 1928 HS_SLAB_LOCAL_L(512) = r0_1; 1929 HS_SLAB_LOCAL_L(520) = r0_2; 1930 HS_SLAB_LOCAL_R(528) = r0_3; 1931 HS_SLAB_LOCAL_R(536) = r0_4; 1932 } 1933 { 1934 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(544); 1935 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(552); 1936 HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(560); 1937 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(568); 1938 HS_CMP_XCHG(r1_2, r1_3); 1939 HS_CMP_XCHG(r1_1, r1_4); 1940 HS_CMP_XCHG(r1_3, r1_4); 1941 HS_CMP_XCHG(r1_1, r1_2); 1942 HS_SLAB_LOCAL_L(544) = r1_1; 1943 HS_SLAB_LOCAL_L(552) = r1_2; 1944 HS_SLAB_LOCAL_R(560) = r1_3; 1945 HS_SLAB_LOCAL_R(568) = r1_4; 1946 } 1947 } 1948 HS_BLOCK_BARRIER(); 1949 r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); 1950 r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); 1951 r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); 1952 r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); 1953 r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); 1954 r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); 1955 r4 = HS_BX_LOCAL_V(8 * 
HS_SLAB_THREADS * 6); 1956 r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); 1957 r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8); 1958 r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9); 1959 r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10); 1960 r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11); 1961 r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12); 1962 r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13); 1963 r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14); 1964 r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15); 1965 { 1966 { 1967 HS_SLAB_HALF_PREAMBLE(4); 1968 HS_CMP_HALF(0, r1); 1969 HS_CMP_HALF(1, r2); 1970 HS_CMP_HALF(2, r3); 1971 HS_CMP_HALF(3, r4); 1972 HS_CMP_HALF(4, r5); 1973 HS_CMP_HALF(5, r6); 1974 HS_CMP_HALF(6, r7); 1975 HS_CMP_HALF(7, r8); 1976 HS_CMP_HALF(8, r9); 1977 HS_CMP_HALF(9, r10); 1978 HS_CMP_HALF(10, r11); 1979 HS_CMP_HALF(11, r12); 1980 HS_CMP_HALF(12, r13); 1981 HS_CMP_HALF(13, r14); 1982 HS_CMP_HALF(14, r15); 1983 HS_CMP_HALF(15, r16); 1984 } 1985 { 1986 HS_SLAB_HALF_PREAMBLE(2); 1987 HS_CMP_HALF(0, r1); 1988 HS_CMP_HALF(1, r2); 1989 HS_CMP_HALF(2, r3); 1990 HS_CMP_HALF(3, r4); 1991 HS_CMP_HALF(4, r5); 1992 HS_CMP_HALF(5, r6); 1993 HS_CMP_HALF(6, r7); 1994 HS_CMP_HALF(7, r8); 1995 HS_CMP_HALF(8, r9); 1996 HS_CMP_HALF(9, r10); 1997 HS_CMP_HALF(10, r11); 1998 HS_CMP_HALF(11, r12); 1999 HS_CMP_HALF(12, r13); 2000 HS_CMP_HALF(13, r14); 2001 HS_CMP_HALF(14, r15); 2002 HS_CMP_HALF(15, r16); 2003 } 2004 { 2005 HS_SLAB_HALF_PREAMBLE(1); 2006 HS_CMP_HALF(0, r1); 2007 HS_CMP_HALF(1, r2); 2008 HS_CMP_HALF(2, r3); 2009 HS_CMP_HALF(3, r4); 2010 HS_CMP_HALF(4, r5); 2011 HS_CMP_HALF(5, r6); 2012 HS_CMP_HALF(6, r7); 2013 HS_CMP_HALF(7, r8); 2014 HS_CMP_HALF(8, r9); 2015 HS_CMP_HALF(9, r10); 2016 HS_CMP_HALF(10, r11); 2017 HS_CMP_HALF(11, r12); 2018 HS_CMP_HALF(12, r13); 2019 HS_CMP_HALF(13, r14); 2020 HS_CMP_HALF(14, r15); 2021 HS_CMP_HALF(15, r16); 2022 } 2023 HS_CMP_XCHG(r1, r9); 2024 HS_CMP_XCHG(r5, r13); 2025 HS_CMP_XCHG(r1, r5); 2026 HS_CMP_XCHG(r9, r13); 2027 HS_CMP_XCHG(r3, r11); 
// Remainder of the 32-pair list; third write of r1..r16 through HS_BX_LOCAL_V, HS_BLOCK_BARRIER(), then the first 8-value group is read via HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R.
2028 HS_CMP_XCHG(r7, r15); 2029 HS_CMP_XCHG(r3, r7); 2030 HS_CMP_XCHG(r11, r15); 2031 HS_CMP_XCHG(r1, r3); 2032 HS_CMP_XCHG(r5, r7); 2033 HS_CMP_XCHG(r9, r11); 2034 HS_CMP_XCHG(r13, r15); 2035 HS_CMP_XCHG(r2, r10); 2036 HS_CMP_XCHG(r6, r14); 2037 HS_CMP_XCHG(r2, r6); 2038 HS_CMP_XCHG(r10, r14); 2039 HS_CMP_XCHG(r4, r12); 2040 HS_CMP_XCHG(r8, r16); 2041 HS_CMP_XCHG(r4, r8); 2042 HS_CMP_XCHG(r12, r16); 2043 HS_CMP_XCHG(r2, r4); 2044 HS_CMP_XCHG(r6, r8); 2045 HS_CMP_XCHG(r10, r12); 2046 HS_CMP_XCHG(r14, r16); 2047 HS_CMP_XCHG(r1, r2); 2048 HS_CMP_XCHG(r3, r4); 2049 HS_CMP_XCHG(r5, r6); 2050 HS_CMP_XCHG(r7, r8); 2051 HS_CMP_XCHG(r9, r10); 2052 HS_CMP_XCHG(r11, r12); 2053 HS_CMP_XCHG(r13, r14); 2054 HS_CMP_XCHG(r15, r16); 2055 } 2056 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; 2057 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16; 2058 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; 2059 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15; 2060 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; 2061 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14; 2062 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; 2063 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13; 2064 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5; 2065 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12; 2066 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6; 2067 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11; 2068 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7; 2069 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10; 2070 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8; 2071 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9; 2072 HS_BLOCK_BARRIER(); 2073 { 2074 { 2075 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 2076 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); 2077 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16); 2078 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24); 2079 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(32); 2080 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(40); 2081 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(48); 2082 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(56); 2083 HS_CMP_XCHG(r0_4, r0_5); 2084 HS_CMP_XCHG(r0_3, 
r0_6); 2085 HS_CMP_XCHG(r0_2, r0_7); 2086 HS_CMP_XCHG(r0_1, r0_8); 2087 HS_CMP_XCHG(r0_5, r0_7); 2088 HS_CMP_XCHG(r0_6, r0_8); 2089 HS_CMP_XCHG(r0_5, r0_6); 2090 HS_CMP_XCHG(r0_7, r0_8); 2091 HS_CMP_XCHG(r0_1, r0_3); 2092 HS_CMP_XCHG(r0_2, r0_4); 2093 HS_CMP_XCHG(r0_1, r0_2); 2094 HS_CMP_XCHG(r0_3, r0_4); 2095 HS_SLAB_LOCAL_L(0) = r0_1; 2096 HS_SLAB_LOCAL_L(8) = r0_2; 2097 HS_SLAB_LOCAL_L(16) = r0_3; 2098 HS_SLAB_LOCAL_L(24) = r0_4; 2099 HS_SLAB_LOCAL_R(32) = r0_5; 2100 HS_SLAB_LOCAL_R(40) = r0_6; 2101 HS_SLAB_LOCAL_R(48) = r0_7; 2102 HS_SLAB_LOCAL_R(56) = r0_8; 2103 } 2104 { 2105 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512); 2106 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(520); 2107 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(528); 2108 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(536); 2109 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(544); 2110 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(552); 2111 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(560); 2112 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(568); 2113 HS_CMP_XCHG(r0_4, r0_5); 2114 HS_CMP_XCHG(r0_3, r0_6); 2115 HS_CMP_XCHG(r0_2, r0_7); 2116 HS_CMP_XCHG(r0_1, r0_8); 2117 HS_CMP_XCHG(r0_5, r0_7); 2118 HS_CMP_XCHG(r0_6, r0_8); 2119 HS_CMP_XCHG(r0_5, r0_6); 2120 HS_CMP_XCHG(r0_7, r0_8); 2121 HS_CMP_XCHG(r0_1, r0_3); 2122 HS_CMP_XCHG(r0_2, r0_4); 2123 HS_CMP_XCHG(r0_1, r0_2); 2124 HS_CMP_XCHG(r0_3, r0_4); 2125 HS_SLAB_LOCAL_L(512) = r0_1; 2126 HS_SLAB_LOCAL_L(520) = r0_2; 2127 HS_SLAB_LOCAL_L(528) = r0_3; 2128 HS_SLAB_LOCAL_L(536) = r0_4; 2129 HS_SLAB_LOCAL_R(544) = r0_5; 2130 HS_SLAB_LOCAL_R(552) = r0_6; 2131 HS_SLAB_LOCAL_R(560) = r0_7; 2132 HS_SLAB_LOCAL_R(568) = r0_8; 2133 } 2134 } 2135 HS_BLOCK_BARRIER(); 2136 r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); 2137 r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); 2138 r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); 2139 r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); 2140 r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); 2141 r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); 2142 r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); 2143 r13 = 
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); 2144 r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8); 2145 r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9); 2146 r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10); 2147 r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11); 2148 r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12); 2149 r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13); 2150 r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14); 2151 r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15); 2152 { 2153 { 2154 HS_SLAB_HALF_PREAMBLE(4); 2155 HS_CMP_HALF(0, r1); 2156 HS_CMP_HALF(1, r2); 2157 HS_CMP_HALF(2, r3); 2158 HS_CMP_HALF(3, r4); 2159 HS_CMP_HALF(4, r5); 2160 HS_CMP_HALF(5, r6); 2161 HS_CMP_HALF(6, r7); 2162 HS_CMP_HALF(7, r8); 2163 HS_CMP_HALF(8, r9); 2164 HS_CMP_HALF(9, r10); 2165 HS_CMP_HALF(10, r11); 2166 HS_CMP_HALF(11, r12); 2167 HS_CMP_HALF(12, r13); 2168 HS_CMP_HALF(13, r14); 2169 HS_CMP_HALF(14, r15); 2170 HS_CMP_HALF(15, r16); 2171 } 2172 { 2173 HS_SLAB_HALF_PREAMBLE(2); 2174 HS_CMP_HALF(0, r1); 2175 HS_CMP_HALF(1, r2); 2176 HS_CMP_HALF(2, r3); 2177 HS_CMP_HALF(3, r4); 2178 HS_CMP_HALF(4, r5); 2179 HS_CMP_HALF(5, r6); 2180 HS_CMP_HALF(6, r7); 2181 HS_CMP_HALF(7, r8); 2182 HS_CMP_HALF(8, r9); 2183 HS_CMP_HALF(9, r10); 2184 HS_CMP_HALF(10, r11); 2185 HS_CMP_HALF(11, r12); 2186 HS_CMP_HALF(12, r13); 2187 HS_CMP_HALF(13, r14); 2188 HS_CMP_HALF(14, r15); 2189 HS_CMP_HALF(15, r16); 2190 } 2191 { 2192 HS_SLAB_HALF_PREAMBLE(1); 2193 HS_CMP_HALF(0, r1); 2194 HS_CMP_HALF(1, r2); 2195 HS_CMP_HALF(2, r3); 2196 HS_CMP_HALF(3, r4); 2197 HS_CMP_HALF(4, r5); 2198 HS_CMP_HALF(5, r6); 2199 HS_CMP_HALF(6, r7); 2200 HS_CMP_HALF(7, r8); 2201 HS_CMP_HALF(8, r9); 2202 HS_CMP_HALF(9, r10); 2203 HS_CMP_HALF(10, r11); 2204 HS_CMP_HALF(11, r12); 2205 HS_CMP_HALF(12, r13); 2206 HS_CMP_HALF(13, r14); 2207 HS_CMP_HALF(14, r15); 2208 HS_CMP_HALF(15, r16); 2209 } 2210 HS_CMP_XCHG(r1, r9); 2211 HS_CMP_XCHG(r5, r13); 2212 HS_CMP_XCHG(r1, r5); 2213 HS_CMP_XCHG(r9, r13); 2214 HS_CMP_XCHG(r3, r11); 2215 HS_CMP_XCHG(r7, r15); 2216 
// Remainder of the final 32-pair list, then r1..r16 are written with HS_SLAB_GLOBAL_STORE(0..15) and this kernel closes. The opening of HS_BS_KERNEL_PROTO(16, 4) shares this physical line and is left untouched.
HS_CMP_XCHG(r3, r7); 2217 HS_CMP_XCHG(r11, r15); 2218 HS_CMP_XCHG(r1, r3); 2219 HS_CMP_XCHG(r5, r7); 2220 HS_CMP_XCHG(r9, r11); 2221 HS_CMP_XCHG(r13, r15); 2222 HS_CMP_XCHG(r2, r10); 2223 HS_CMP_XCHG(r6, r14); 2224 HS_CMP_XCHG(r2, r6); 2225 HS_CMP_XCHG(r10, r14); 2226 HS_CMP_XCHG(r4, r12); 2227 HS_CMP_XCHG(r8, r16); 2228 HS_CMP_XCHG(r4, r8); 2229 HS_CMP_XCHG(r12, r16); 2230 HS_CMP_XCHG(r2, r4); 2231 HS_CMP_XCHG(r6, r8); 2232 HS_CMP_XCHG(r10, r12); 2233 HS_CMP_XCHG(r14, r16); 2234 HS_CMP_XCHG(r1, r2); 2235 HS_CMP_XCHG(r3, r4); 2236 HS_CMP_XCHG(r5, r6); 2237 HS_CMP_XCHG(r7, r8); 2238 HS_CMP_XCHG(r9, r10); 2239 HS_CMP_XCHG(r11, r12); 2240 HS_CMP_XCHG(r13, r14); 2241 HS_CMP_XCHG(r15, r16); 2242 } 2243 HS_SLAB_GLOBAL_STORE(0, r1); 2244 HS_SLAB_GLOBAL_STORE(1, r2); 2245 HS_SLAB_GLOBAL_STORE(2, r3); 2246 HS_SLAB_GLOBAL_STORE(3, r4); 2247 HS_SLAB_GLOBAL_STORE(4, r5); 2248 HS_SLAB_GLOBAL_STORE(5, r6); 2249 HS_SLAB_GLOBAL_STORE(6, r7); 2250 HS_SLAB_GLOBAL_STORE(7, r8); 2251 HS_SLAB_GLOBAL_STORE(8, r9); 2252 HS_SLAB_GLOBAL_STORE(9, r10); 2253 HS_SLAB_GLOBAL_STORE(10, r11); 2254 HS_SLAB_GLOBAL_STORE(11, r12); 2255 HS_SLAB_GLOBAL_STORE(12, r13); 2256 HS_SLAB_GLOBAL_STORE(13, r14); 2257 HS_SLAB_GLOBAL_STORE(14, r15); 2258 HS_SLAB_GLOBAL_STORE(15, r16); 2259} 2260 2261HS_BS_KERNEL_PROTO(16, 4) 2262{ 2263 HS_BLOCK_LOCAL_MEM_DECL(128, 16); 2264 2265 HS_SLAB_GLOBAL_PREAMBLE(); 2266 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); 2267 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); 2268 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); 2269 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); 2270 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); 2271 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); 2272 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); 2273 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); 2274 HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8); 2275 HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9); 2276 HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10); 2277 HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11); 
2278 HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12); 2279 HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13); 2280 HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14); 2281 HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15); 2282 HS_CMP_XCHG(r1, r2); 2283 HS_CMP_XCHG(r3, r4); 2284 HS_CMP_XCHG(r5, r6); 2285 HS_CMP_XCHG(r7, r8); 2286 HS_CMP_XCHG(r9, r10); 2287 HS_CMP_XCHG(r11, r12); 2288 HS_CMP_XCHG(r13, r14); 2289 HS_CMP_XCHG(r15, r16); 2290 HS_CMP_XCHG(r1, r3); 2291 HS_CMP_XCHG(r5, r7); 2292 HS_CMP_XCHG(r9, r11); 2293 HS_CMP_XCHG(r13, r15); 2294 HS_CMP_XCHG(r2, r4); 2295 HS_CMP_XCHG(r6, r8); 2296 HS_CMP_XCHG(r10, r12); 2297 HS_CMP_XCHG(r14, r16); 2298 HS_CMP_XCHG(r1, r5); 2299 HS_CMP_XCHG(r9, r13); 2300 HS_CMP_XCHG(r2, r6); 2301 HS_CMP_XCHG(r10, r14); 2302 HS_CMP_XCHG(r3, r7); 2303 HS_CMP_XCHG(r11, r15); 2304 HS_CMP_XCHG(r4, r8); 2305 HS_CMP_XCHG(r12, r16); 2306 HS_CMP_XCHG(r1, r9); 2307 HS_CMP_XCHG(r2, r10); 2308 HS_CMP_XCHG(r3, r11); 2309 HS_CMP_XCHG(r4, r12); 2310 HS_CMP_XCHG(r5, r13); 2311 HS_CMP_XCHG(r6, r14); 2312 HS_CMP_XCHG(r7, r15); 2313 HS_CMP_XCHG(r8, r16); 2314 HS_CMP_XCHG(r6, r11); 2315 HS_CMP_XCHG(r7, r10); 2316 HS_CMP_XCHG(r4, r13); 2317 HS_CMP_XCHG(r14, r15); 2318 HS_CMP_XCHG(r8, r12); 2319 HS_CMP_XCHG(r2, r3); 2320 HS_CMP_XCHG(r5, r9); 2321 HS_CMP_XCHG(r2, r5); 2322 HS_CMP_XCHG(r8, r14); 2323 HS_CMP_XCHG(r3, r9); 2324 HS_CMP_XCHG(r12, r15); 2325 HS_CMP_XCHG(r3, r5); 2326 HS_CMP_XCHG(r6, r7); 2327 HS_CMP_XCHG(r10, r11); 2328 HS_CMP_XCHG(r12, r14); 2329 HS_CMP_XCHG(r4, r9); 2330 HS_CMP_XCHG(r8, r13); 2331 HS_CMP_XCHG(r7, r9); 2332 HS_CMP_XCHG(r11, r13); 2333 HS_CMP_XCHG(r4, r6); 2334 HS_CMP_XCHG(r8, r10); 2335 HS_CMP_XCHG(r4, r5); 2336 HS_CMP_XCHG(r6, r7); 2337 HS_CMP_XCHG(r8, r9); 2338 HS_CMP_XCHG(r10, r11); 2339 HS_CMP_XCHG(r12, r13); 2340 HS_CMP_XCHG(r7, r8); 2341 HS_CMP_XCHG(r9, r10); 2342 { 2343 HS_SLAB_FLIP_PREAMBLE(1); 2344 HS_CMP_FLIP(0, r1, r16); 2345 HS_CMP_FLIP(1, r2, r15); 2346 HS_CMP_FLIP(2, r3, r14); 2347 HS_CMP_FLIP(3, r4, r13); 2348 HS_CMP_FLIP(4, 
r5, r12); 2349 HS_CMP_FLIP(5, r6, r11); 2350 HS_CMP_FLIP(6, r7, r10); 2351 HS_CMP_FLIP(7, r8, r9); 2352 } 2353 HS_CMP_XCHG(r1, r9); 2354 HS_CMP_XCHG(r5, r13); 2355 HS_CMP_XCHG(r1, r5); 2356 HS_CMP_XCHG(r9, r13); 2357 HS_CMP_XCHG(r3, r11); 2358 HS_CMP_XCHG(r7, r15); 2359 HS_CMP_XCHG(r3, r7); 2360 HS_CMP_XCHG(r11, r15); 2361 HS_CMP_XCHG(r1, r3); 2362 HS_CMP_XCHG(r5, r7); 2363 HS_CMP_XCHG(r9, r11); 2364 HS_CMP_XCHG(r13, r15); 2365 HS_CMP_XCHG(r2, r10); 2366 HS_CMP_XCHG(r6, r14); 2367 HS_CMP_XCHG(r2, r6); 2368 HS_CMP_XCHG(r10, r14); 2369 HS_CMP_XCHG(r4, r12); 2370 HS_CMP_XCHG(r8, r16); 2371 HS_CMP_XCHG(r4, r8); 2372 HS_CMP_XCHG(r12, r16); 2373 HS_CMP_XCHG(r2, r4); 2374 HS_CMP_XCHG(r6, r8); 2375 HS_CMP_XCHG(r10, r12); 2376 HS_CMP_XCHG(r14, r16); 2377 HS_CMP_XCHG(r1, r2); 2378 HS_CMP_XCHG(r3, r4); 2379 HS_CMP_XCHG(r5, r6); 2380 HS_CMP_XCHG(r7, r8); 2381 HS_CMP_XCHG(r9, r10); 2382 HS_CMP_XCHG(r11, r12); 2383 HS_CMP_XCHG(r13, r14); 2384 HS_CMP_XCHG(r15, r16); 2385 { 2386 HS_SLAB_FLIP_PREAMBLE(3); 2387 HS_CMP_FLIP(0, r1, r16); 2388 HS_CMP_FLIP(1, r2, r15); 2389 HS_CMP_FLIP(2, r3, r14); 2390 HS_CMP_FLIP(3, r4, r13); 2391 HS_CMP_FLIP(4, r5, r12); 2392 HS_CMP_FLIP(5, r6, r11); 2393 HS_CMP_FLIP(6, r7, r10); 2394 HS_CMP_FLIP(7, r8, r9); 2395 } 2396 { 2397 HS_SLAB_HALF_PREAMBLE(1); 2398 HS_CMP_HALF(0, r1); 2399 HS_CMP_HALF(1, r2); 2400 HS_CMP_HALF(2, r3); 2401 HS_CMP_HALF(3, r4); 2402 HS_CMP_HALF(4, r5); 2403 HS_CMP_HALF(5, r6); 2404 HS_CMP_HALF(6, r7); 2405 HS_CMP_HALF(7, r8); 2406 HS_CMP_HALF(8, r9); 2407 HS_CMP_HALF(9, r10); 2408 HS_CMP_HALF(10, r11); 2409 HS_CMP_HALF(11, r12); 2410 HS_CMP_HALF(12, r13); 2411 HS_CMP_HALF(13, r14); 2412 HS_CMP_HALF(14, r15); 2413 HS_CMP_HALF(15, r16); 2414 } 2415 HS_CMP_XCHG(r1, r9); 2416 HS_CMP_XCHG(r5, r13); 2417 HS_CMP_XCHG(r1, r5); 2418 HS_CMP_XCHG(r9, r13); 2419 HS_CMP_XCHG(r3, r11); 2420 HS_CMP_XCHG(r7, r15); 2421 HS_CMP_XCHG(r3, r7); 2422 HS_CMP_XCHG(r11, r15); 2423 HS_CMP_XCHG(r1, r3); 2424 HS_CMP_XCHG(r5, r7); 2425 HS_CMP_XCHG(r9, 
r11); 2426 HS_CMP_XCHG(r13, r15); 2427 HS_CMP_XCHG(r2, r10); 2428 HS_CMP_XCHG(r6, r14); 2429 HS_CMP_XCHG(r2, r6); 2430 HS_CMP_XCHG(r10, r14); 2431 HS_CMP_XCHG(r4, r12); 2432 HS_CMP_XCHG(r8, r16); 2433 HS_CMP_XCHG(r4, r8); 2434 HS_CMP_XCHG(r12, r16); 2435 HS_CMP_XCHG(r2, r4); 2436 HS_CMP_XCHG(r6, r8); 2437 HS_CMP_XCHG(r10, r12); 2438 HS_CMP_XCHG(r14, r16); 2439 HS_CMP_XCHG(r1, r2); 2440 HS_CMP_XCHG(r3, r4); 2441 HS_CMP_XCHG(r5, r6); 2442 HS_CMP_XCHG(r7, r8); 2443 HS_CMP_XCHG(r9, r10); 2444 HS_CMP_XCHG(r11, r12); 2445 HS_CMP_XCHG(r13, r14); 2446 HS_CMP_XCHG(r15, r16); 2447 { 2448 HS_SLAB_FLIP_PREAMBLE(7); 2449 HS_CMP_FLIP(0, r1, r16); 2450 HS_CMP_FLIP(1, r2, r15); 2451 HS_CMP_FLIP(2, r3, r14); 2452 HS_CMP_FLIP(3, r4, r13); 2453 HS_CMP_FLIP(4, r5, r12); 2454 HS_CMP_FLIP(5, r6, r11); 2455 HS_CMP_FLIP(6, r7, r10); 2456 HS_CMP_FLIP(7, r8, r9); 2457 } 2458 { 2459 HS_SLAB_HALF_PREAMBLE(2); 2460 HS_CMP_HALF(0, r1); 2461 HS_CMP_HALF(1, r2); 2462 HS_CMP_HALF(2, r3); 2463 HS_CMP_HALF(3, r4); 2464 HS_CMP_HALF(4, r5); 2465 HS_CMP_HALF(5, r6); 2466 HS_CMP_HALF(6, r7); 2467 HS_CMP_HALF(7, r8); 2468 HS_CMP_HALF(8, r9); 2469 HS_CMP_HALF(9, r10); 2470 HS_CMP_HALF(10, r11); 2471 HS_CMP_HALF(11, r12); 2472 HS_CMP_HALF(12, r13); 2473 HS_CMP_HALF(13, r14); 2474 HS_CMP_HALF(14, r15); 2475 HS_CMP_HALF(15, r16); 2476 } 2477 { 2478 HS_SLAB_HALF_PREAMBLE(1); 2479 HS_CMP_HALF(0, r1); 2480 HS_CMP_HALF(1, r2); 2481 HS_CMP_HALF(2, r3); 2482 HS_CMP_HALF(3, r4); 2483 HS_CMP_HALF(4, r5); 2484 HS_CMP_HALF(5, r6); 2485 HS_CMP_HALF(6, r7); 2486 HS_CMP_HALF(7, r8); 2487 HS_CMP_HALF(8, r9); 2488 HS_CMP_HALF(9, r10); 2489 HS_CMP_HALF(10, r11); 2490 HS_CMP_HALF(11, r12); 2491 HS_CMP_HALF(12, r13); 2492 HS_CMP_HALF(13, r14); 2493 HS_CMP_HALF(14, r15); 2494 HS_CMP_HALF(15, r16); 2495 } 2496 HS_CMP_XCHG(r1, r9); 2497 HS_CMP_XCHG(r5, r13); 2498 HS_CMP_XCHG(r1, r5); 2499 HS_CMP_XCHG(r9, r13); 2500 HS_CMP_XCHG(r3, r11); 2501 HS_CMP_XCHG(r7, r15); 2502 HS_CMP_XCHG(r3, r7); 2503 HS_CMP_XCHG(r11, r15); 2504 
HS_CMP_XCHG(r1, r3); 2505 HS_CMP_XCHG(r5, r7); 2506 HS_CMP_XCHG(r9, r11); 2507 HS_CMP_XCHG(r13, r15); 2508 HS_CMP_XCHG(r2, r10); 2509 HS_CMP_XCHG(r6, r14); 2510 HS_CMP_XCHG(r2, r6); 2511 HS_CMP_XCHG(r10, r14); 2512 HS_CMP_XCHG(r4, r12); 2513 HS_CMP_XCHG(r8, r16); 2514 HS_CMP_XCHG(r4, r8); 2515 HS_CMP_XCHG(r12, r16); 2516 HS_CMP_XCHG(r2, r4); 2517 HS_CMP_XCHG(r6, r8); 2518 HS_CMP_XCHG(r10, r12); 2519 HS_CMP_XCHG(r14, r16); 2520 HS_CMP_XCHG(r1, r2); 2521 HS_CMP_XCHG(r3, r4); 2522 HS_CMP_XCHG(r5, r6); 2523 HS_CMP_XCHG(r7, r8); 2524 HS_CMP_XCHG(r9, r10); 2525 HS_CMP_XCHG(r11, r12); 2526 HS_CMP_XCHG(r13, r14); 2527 HS_CMP_XCHG(r15, r16); 2528 HS_BS_MERGE_H_PREAMBLE(16); 2529 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; 2530 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16; 2531 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; 2532 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15; 2533 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; 2534 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14; 2535 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; 2536 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13; 2537 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5; 2538 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12; 2539 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6; 2540 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11; 2541 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7; 2542 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10; 2543 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8; 2544 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9; 2545 HS_BLOCK_BARRIER(); 2546 { 2547 { 2548 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 2549 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8); 2550 HS_CMP_XCHG(r0_1, r0_2); 2551 HS_SLAB_LOCAL_L(0) = r0_1; 2552 HS_SLAB_LOCAL_R(8) = r0_2; 2553 } 2554 { 2555 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16); 2556 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24); 2557 HS_CMP_XCHG(r1_1, r1_2); 2558 HS_SLAB_LOCAL_L(16) = r1_1; 2559 HS_SLAB_LOCAL_R(24) = r1_2; 2560 } 2561 { 2562 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(32); 2563 HS_KEY_TYPE 
r2_2 = HS_SLAB_LOCAL_R(40); 2564 HS_CMP_XCHG(r2_1, r2_2); 2565 HS_SLAB_LOCAL_L(32) = r2_1; 2566 HS_SLAB_LOCAL_R(40) = r2_2; 2567 } 2568 { 2569 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(48); 2570 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(56); 2571 HS_CMP_XCHG(r3_1, r3_2); 2572 HS_SLAB_LOCAL_L(48) = r3_1; 2573 HS_SLAB_LOCAL_R(56) = r3_2; 2574 } 2575 { 2576 HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(64); 2577 HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(72); 2578 HS_CMP_XCHG(r4_1, r4_2); 2579 HS_SLAB_LOCAL_L(64) = r4_1; 2580 HS_SLAB_LOCAL_R(72) = r4_2; 2581 } 2582 { 2583 HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(80); 2584 HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(88); 2585 HS_CMP_XCHG(r5_1, r5_2); 2586 HS_SLAB_LOCAL_L(80) = r5_1; 2587 HS_SLAB_LOCAL_R(88) = r5_2; 2588 } 2589 { 2590 HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(96); 2591 HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(104); 2592 HS_CMP_XCHG(r6_1, r6_2); 2593 HS_SLAB_LOCAL_L(96) = r6_1; 2594 HS_SLAB_LOCAL_R(104) = r6_2; 2595 } 2596 { 2597 HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(112); 2598 HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(120); 2599 HS_CMP_XCHG(r7_1, r7_2); 2600 HS_SLAB_LOCAL_L(112) = r7_1; 2601 HS_SLAB_LOCAL_R(120) = r7_2; 2602 } 2603 } 2604 HS_BLOCK_BARRIER(); 2605 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 2606 r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 2607 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 2608 r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); 2609 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 2610 r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 2611 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 2612 r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 2613 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); 2614 r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); 2615 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); 2616 r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); 2617 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); 2618 r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); 2619 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); 2620 r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); 
2621 { 2622 { 2623 HS_SLAB_HALF_PREAMBLE(4); 2624 HS_CMP_HALF(0, r1); 2625 HS_CMP_HALF(1, r2); 2626 HS_CMP_HALF(2, r3); 2627 HS_CMP_HALF(3, r4); 2628 HS_CMP_HALF(4, r5); 2629 HS_CMP_HALF(5, r6); 2630 HS_CMP_HALF(6, r7); 2631 HS_CMP_HALF(7, r8); 2632 HS_CMP_HALF(8, r9); 2633 HS_CMP_HALF(9, r10); 2634 HS_CMP_HALF(10, r11); 2635 HS_CMP_HALF(11, r12); 2636 HS_CMP_HALF(12, r13); 2637 HS_CMP_HALF(13, r14); 2638 HS_CMP_HALF(14, r15); 2639 HS_CMP_HALF(15, r16); 2640 } 2641 { 2642 HS_SLAB_HALF_PREAMBLE(2); 2643 HS_CMP_HALF(0, r1); 2644 HS_CMP_HALF(1, r2); 2645 HS_CMP_HALF(2, r3); 2646 HS_CMP_HALF(3, r4); 2647 HS_CMP_HALF(4, r5); 2648 HS_CMP_HALF(5, r6); 2649 HS_CMP_HALF(6, r7); 2650 HS_CMP_HALF(7, r8); 2651 HS_CMP_HALF(8, r9); 2652 HS_CMP_HALF(9, r10); 2653 HS_CMP_HALF(10, r11); 2654 HS_CMP_HALF(11, r12); 2655 HS_CMP_HALF(12, r13); 2656 HS_CMP_HALF(13, r14); 2657 HS_CMP_HALF(14, r15); 2658 HS_CMP_HALF(15, r16); 2659 } 2660 { 2661 HS_SLAB_HALF_PREAMBLE(1); 2662 HS_CMP_HALF(0, r1); 2663 HS_CMP_HALF(1, r2); 2664 HS_CMP_HALF(2, r3); 2665 HS_CMP_HALF(3, r4); 2666 HS_CMP_HALF(4, r5); 2667 HS_CMP_HALF(5, r6); 2668 HS_CMP_HALF(6, r7); 2669 HS_CMP_HALF(7, r8); 2670 HS_CMP_HALF(8, r9); 2671 HS_CMP_HALF(9, r10); 2672 HS_CMP_HALF(10, r11); 2673 HS_CMP_HALF(11, r12); 2674 HS_CMP_HALF(12, r13); 2675 HS_CMP_HALF(13, r14); 2676 HS_CMP_HALF(14, r15); 2677 HS_CMP_HALF(15, r16); 2678 } 2679 HS_CMP_XCHG(r1, r9); 2680 HS_CMP_XCHG(r5, r13); 2681 HS_CMP_XCHG(r1, r5); 2682 HS_CMP_XCHG(r9, r13); 2683 HS_CMP_XCHG(r3, r11); 2684 HS_CMP_XCHG(r7, r15); 2685 HS_CMP_XCHG(r3, r7); 2686 HS_CMP_XCHG(r11, r15); 2687 HS_CMP_XCHG(r1, r3); 2688 HS_CMP_XCHG(r5, r7); 2689 HS_CMP_XCHG(r9, r11); 2690 HS_CMP_XCHG(r13, r15); 2691 HS_CMP_XCHG(r2, r10); 2692 HS_CMP_XCHG(r6, r14); 2693 HS_CMP_XCHG(r2, r6); 2694 HS_CMP_XCHG(r10, r14); 2695 HS_CMP_XCHG(r4, r12); 2696 HS_CMP_XCHG(r8, r16); 2697 HS_CMP_XCHG(r4, r8); 2698 HS_CMP_XCHG(r12, r16); 2699 HS_CMP_XCHG(r2, r4); 2700 HS_CMP_XCHG(r6, r8); 2701 HS_CMP_XCHG(r10, r12); 
2702 HS_CMP_XCHG(r14, r16); 2703 HS_CMP_XCHG(r1, r2); 2704 HS_CMP_XCHG(r3, r4); 2705 HS_CMP_XCHG(r5, r6); 2706 HS_CMP_XCHG(r7, r8); 2707 HS_CMP_XCHG(r9, r10); 2708 HS_CMP_XCHG(r11, r12); 2709 HS_CMP_XCHG(r13, r14); 2710 HS_CMP_XCHG(r15, r16); 2711 } 2712 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; 2713 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16; 2714 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; 2715 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15; 2716 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; 2717 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14; 2718 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; 2719 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13; 2720 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5; 2721 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12; 2722 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6; 2723 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11; 2724 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7; 2725 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10; 2726 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8; 2727 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9; 2728 HS_BLOCK_BARRIER(); 2729 { 2730 { 2731 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 2732 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); 2733 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16); 2734 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24); 2735 HS_CMP_XCHG(r0_2, r0_3); 2736 HS_CMP_XCHG(r0_1, r0_4); 2737 HS_CMP_XCHG(r0_3, r0_4); 2738 HS_CMP_XCHG(r0_1, r0_2); 2739 HS_SLAB_LOCAL_L(0) = r0_1; 2740 HS_SLAB_LOCAL_L(8) = r0_2; 2741 HS_SLAB_LOCAL_R(16) = r0_3; 2742 HS_SLAB_LOCAL_R(24) = r0_4; 2743 } 2744 { 2745 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); 2746 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(40); 2747 HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(48); 2748 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(56); 2749 HS_CMP_XCHG(r1_2, r1_3); 2750 HS_CMP_XCHG(r1_1, r1_4); 2751 HS_CMP_XCHG(r1_3, r1_4); 2752 HS_CMP_XCHG(r1_1, r1_2); 2753 HS_SLAB_LOCAL_L(32) = r1_1; 2754 HS_SLAB_LOCAL_L(40) = r1_2; 2755 HS_SLAB_LOCAL_R(48) = r1_3; 2756 HS_SLAB_LOCAL_R(56) = r1_4; 2757 } 
2758 { 2759 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64); 2760 HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(72); 2761 HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(80); 2762 HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(88); 2763 HS_CMP_XCHG(r2_2, r2_3); 2764 HS_CMP_XCHG(r2_1, r2_4); 2765 HS_CMP_XCHG(r2_3, r2_4); 2766 HS_CMP_XCHG(r2_1, r2_2); 2767 HS_SLAB_LOCAL_L(64) = r2_1; 2768 HS_SLAB_LOCAL_L(72) = r2_2; 2769 HS_SLAB_LOCAL_R(80) = r2_3; 2770 HS_SLAB_LOCAL_R(88) = r2_4; 2771 } 2772 { 2773 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96); 2774 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(104); 2775 HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(112); 2776 HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(120); 2777 HS_CMP_XCHG(r3_2, r3_3); 2778 HS_CMP_XCHG(r3_1, r3_4); 2779 HS_CMP_XCHG(r3_3, r3_4); 2780 HS_CMP_XCHG(r3_1, r3_2); 2781 HS_SLAB_LOCAL_L(96) = r3_1; 2782 HS_SLAB_LOCAL_L(104) = r3_2; 2783 HS_SLAB_LOCAL_R(112) = r3_3; 2784 HS_SLAB_LOCAL_R(120) = r3_4; 2785 } 2786 } 2787 HS_BLOCK_BARRIER(); 2788 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 2789 r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 2790 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 2791 r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); 2792 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 2793 r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 2794 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 2795 r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 2796 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); 2797 r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); 2798 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); 2799 r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); 2800 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); 2801 r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); 2802 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); 2803 r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); 2804 { 2805 { 2806 HS_SLAB_HALF_PREAMBLE(4); 2807 HS_CMP_HALF(0, r1); 2808 HS_CMP_HALF(1, r2); 2809 HS_CMP_HALF(2, r3); 2810 HS_CMP_HALF(3, r4); 2811 HS_CMP_HALF(4, r5); 2812 HS_CMP_HALF(5, r6); 2813 HS_CMP_HALF(6, r7); 2814 HS_CMP_HALF(7, 
r8); 2815 HS_CMP_HALF(8, r9); 2816 HS_CMP_HALF(9, r10); 2817 HS_CMP_HALF(10, r11); 2818 HS_CMP_HALF(11, r12); 2819 HS_CMP_HALF(12, r13); 2820 HS_CMP_HALF(13, r14); 2821 HS_CMP_HALF(14, r15); 2822 HS_CMP_HALF(15, r16); 2823 } 2824 { 2825 HS_SLAB_HALF_PREAMBLE(2); 2826 HS_CMP_HALF(0, r1); 2827 HS_CMP_HALF(1, r2); 2828 HS_CMP_HALF(2, r3); 2829 HS_CMP_HALF(3, r4); 2830 HS_CMP_HALF(4, r5); 2831 HS_CMP_HALF(5, r6); 2832 HS_CMP_HALF(6, r7); 2833 HS_CMP_HALF(7, r8); 2834 HS_CMP_HALF(8, r9); 2835 HS_CMP_HALF(9, r10); 2836 HS_CMP_HALF(10, r11); 2837 HS_CMP_HALF(11, r12); 2838 HS_CMP_HALF(12, r13); 2839 HS_CMP_HALF(13, r14); 2840 HS_CMP_HALF(14, r15); 2841 HS_CMP_HALF(15, r16); 2842 } 2843 { 2844 HS_SLAB_HALF_PREAMBLE(1); 2845 HS_CMP_HALF(0, r1); 2846 HS_CMP_HALF(1, r2); 2847 HS_CMP_HALF(2, r3); 2848 HS_CMP_HALF(3, r4); 2849 HS_CMP_HALF(4, r5); 2850 HS_CMP_HALF(5, r6); 2851 HS_CMP_HALF(6, r7); 2852 HS_CMP_HALF(7, r8); 2853 HS_CMP_HALF(8, r9); 2854 HS_CMP_HALF(9, r10); 2855 HS_CMP_HALF(10, r11); 2856 HS_CMP_HALF(11, r12); 2857 HS_CMP_HALF(12, r13); 2858 HS_CMP_HALF(13, r14); 2859 HS_CMP_HALF(14, r15); 2860 HS_CMP_HALF(15, r16); 2861 } 2862 HS_CMP_XCHG(r1, r9); 2863 HS_CMP_XCHG(r5, r13); 2864 HS_CMP_XCHG(r1, r5); 2865 HS_CMP_XCHG(r9, r13); 2866 HS_CMP_XCHG(r3, r11); 2867 HS_CMP_XCHG(r7, r15); 2868 HS_CMP_XCHG(r3, r7); 2869 HS_CMP_XCHG(r11, r15); 2870 HS_CMP_XCHG(r1, r3); 2871 HS_CMP_XCHG(r5, r7); 2872 HS_CMP_XCHG(r9, r11); 2873 HS_CMP_XCHG(r13, r15); 2874 HS_CMP_XCHG(r2, r10); 2875 HS_CMP_XCHG(r6, r14); 2876 HS_CMP_XCHG(r2, r6); 2877 HS_CMP_XCHG(r10, r14); 2878 HS_CMP_XCHG(r4, r12); 2879 HS_CMP_XCHG(r8, r16); 2880 HS_CMP_XCHG(r4, r8); 2881 HS_CMP_XCHG(r12, r16); 2882 HS_CMP_XCHG(r2, r4); 2883 HS_CMP_XCHG(r6, r8); 2884 HS_CMP_XCHG(r10, r12); 2885 HS_CMP_XCHG(r14, r16); 2886 HS_CMP_XCHG(r1, r2); 2887 HS_CMP_XCHG(r3, r4); 2888 HS_CMP_XCHG(r5, r6); 2889 HS_CMP_XCHG(r7, r8); 2890 HS_CMP_XCHG(r9, r10); 2891 HS_CMP_XCHG(r11, r12); 2892 HS_CMP_XCHG(r13, r14); 2893 HS_CMP_XCHG(r15, 
r16); 2894 } 2895 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; 2896 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16; 2897 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; 2898 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15; 2899 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; 2900 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14; 2901 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; 2902 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13; 2903 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5; 2904 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12; 2905 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6; 2906 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11; 2907 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7; 2908 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10; 2909 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8; 2910 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9; 2911 HS_BLOCK_BARRIER(); 2912 { 2913 { 2914 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 2915 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); 2916 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16); 2917 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24); 2918 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(32); 2919 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(40); 2920 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(48); 2921 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(56); 2922 HS_CMP_XCHG(r0_4, r0_5); 2923 HS_CMP_XCHG(r0_3, r0_6); 2924 HS_CMP_XCHG(r0_2, r0_7); 2925 HS_CMP_XCHG(r0_1, r0_8); 2926 HS_CMP_XCHG(r0_5, r0_7); 2927 HS_CMP_XCHG(r0_6, r0_8); 2928 HS_CMP_XCHG(r0_5, r0_6); 2929 HS_CMP_XCHG(r0_7, r0_8); 2930 HS_CMP_XCHG(r0_1, r0_3); 2931 HS_CMP_XCHG(r0_2, r0_4); 2932 HS_CMP_XCHG(r0_1, r0_2); 2933 HS_CMP_XCHG(r0_3, r0_4); 2934 HS_SLAB_LOCAL_L(0) = r0_1; 2935 HS_SLAB_LOCAL_L(8) = r0_2; 2936 HS_SLAB_LOCAL_L(16) = r0_3; 2937 HS_SLAB_LOCAL_L(24) = r0_4; 2938 HS_SLAB_LOCAL_R(32) = r0_5; 2939 HS_SLAB_LOCAL_R(40) = r0_6; 2940 HS_SLAB_LOCAL_R(48) = r0_7; 2941 HS_SLAB_LOCAL_R(56) = r0_8; 2942 } 2943 { 2944 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64); 2945 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(72); 2946 HS_KEY_TYPE r1_3 = 
HS_SLAB_LOCAL_L(80); 2947 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(88); 2948 HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(96); 2949 HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(104); 2950 HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(112); 2951 HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(120); 2952 HS_CMP_XCHG(r1_4, r1_5); 2953 HS_CMP_XCHG(r1_3, r1_6); 2954 HS_CMP_XCHG(r1_2, r1_7); 2955 HS_CMP_XCHG(r1_1, r1_8); 2956 HS_CMP_XCHG(r1_5, r1_7); 2957 HS_CMP_XCHG(r1_6, r1_8); 2958 HS_CMP_XCHG(r1_5, r1_6); 2959 HS_CMP_XCHG(r1_7, r1_8); 2960 HS_CMP_XCHG(r1_1, r1_3); 2961 HS_CMP_XCHG(r1_2, r1_4); 2962 HS_CMP_XCHG(r1_1, r1_2); 2963 HS_CMP_XCHG(r1_3, r1_4); 2964 HS_SLAB_LOCAL_L(64) = r1_1; 2965 HS_SLAB_LOCAL_L(72) = r1_2; 2966 HS_SLAB_LOCAL_L(80) = r1_3; 2967 HS_SLAB_LOCAL_L(88) = r1_4; 2968 HS_SLAB_LOCAL_R(96) = r1_5; 2969 HS_SLAB_LOCAL_R(104) = r1_6; 2970 HS_SLAB_LOCAL_R(112) = r1_7; 2971 HS_SLAB_LOCAL_R(120) = r1_8; 2972 } 2973 } 2974 HS_BLOCK_BARRIER(); 2975 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 2976 r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 2977 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 2978 r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); 2979 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 2980 r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 2981 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 2982 r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 2983 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); 2984 r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); 2985 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); 2986 r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); 2987 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); 2988 r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); 2989 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); 2990 r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); 2991 { 2992 { 2993 HS_SLAB_HALF_PREAMBLE(4); 2994 HS_CMP_HALF(0, r1); 2995 HS_CMP_HALF(1, r2); 2996 HS_CMP_HALF(2, r3); 2997 HS_CMP_HALF(3, r4); 2998 HS_CMP_HALF(4, r5); 2999 HS_CMP_HALF(5, r6); 3000 HS_CMP_HALF(6, r7); 3001 HS_CMP_HALF(7, r8); 3002 
HS_CMP_HALF(8, r9); 3003 HS_CMP_HALF(9, r10); 3004 HS_CMP_HALF(10, r11); 3005 HS_CMP_HALF(11, r12); 3006 HS_CMP_HALF(12, r13); 3007 HS_CMP_HALF(13, r14); 3008 HS_CMP_HALF(14, r15); 3009 HS_CMP_HALF(15, r16); 3010 } 3011 { 3012 HS_SLAB_HALF_PREAMBLE(2); 3013 HS_CMP_HALF(0, r1); 3014 HS_CMP_HALF(1, r2); 3015 HS_CMP_HALF(2, r3); 3016 HS_CMP_HALF(3, r4); 3017 HS_CMP_HALF(4, r5); 3018 HS_CMP_HALF(5, r6); 3019 HS_CMP_HALF(6, r7); 3020 HS_CMP_HALF(7, r8); 3021 HS_CMP_HALF(8, r9); 3022 HS_CMP_HALF(9, r10); 3023 HS_CMP_HALF(10, r11); 3024 HS_CMP_HALF(11, r12); 3025 HS_CMP_HALF(12, r13); 3026 HS_CMP_HALF(13, r14); 3027 HS_CMP_HALF(14, r15); 3028 HS_CMP_HALF(15, r16); 3029 } 3030 { 3031 HS_SLAB_HALF_PREAMBLE(1); 3032 HS_CMP_HALF(0, r1); 3033 HS_CMP_HALF(1, r2); 3034 HS_CMP_HALF(2, r3); 3035 HS_CMP_HALF(3, r4); 3036 HS_CMP_HALF(4, r5); 3037 HS_CMP_HALF(5, r6); 3038 HS_CMP_HALF(6, r7); 3039 HS_CMP_HALF(7, r8); 3040 HS_CMP_HALF(8, r9); 3041 HS_CMP_HALF(9, r10); 3042 HS_CMP_HALF(10, r11); 3043 HS_CMP_HALF(11, r12); 3044 HS_CMP_HALF(12, r13); 3045 HS_CMP_HALF(13, r14); 3046 HS_CMP_HALF(14, r15); 3047 HS_CMP_HALF(15, r16); 3048 } 3049 HS_CMP_XCHG(r1, r9); 3050 HS_CMP_XCHG(r5, r13); 3051 HS_CMP_XCHG(r1, r5); 3052 HS_CMP_XCHG(r9, r13); 3053 HS_CMP_XCHG(r3, r11); 3054 HS_CMP_XCHG(r7, r15); 3055 HS_CMP_XCHG(r3, r7); 3056 HS_CMP_XCHG(r11, r15); 3057 HS_CMP_XCHG(r1, r3); 3058 HS_CMP_XCHG(r5, r7); 3059 HS_CMP_XCHG(r9, r11); 3060 HS_CMP_XCHG(r13, r15); 3061 HS_CMP_XCHG(r2, r10); 3062 HS_CMP_XCHG(r6, r14); 3063 HS_CMP_XCHG(r2, r6); 3064 HS_CMP_XCHG(r10, r14); 3065 HS_CMP_XCHG(r4, r12); 3066 HS_CMP_XCHG(r8, r16); 3067 HS_CMP_XCHG(r4, r8); 3068 HS_CMP_XCHG(r12, r16); 3069 HS_CMP_XCHG(r2, r4); 3070 HS_CMP_XCHG(r6, r8); 3071 HS_CMP_XCHG(r10, r12); 3072 HS_CMP_XCHG(r14, r16); 3073 HS_CMP_XCHG(r1, r2); 3074 HS_CMP_XCHG(r3, r4); 3075 HS_CMP_XCHG(r5, r6); 3076 HS_CMP_XCHG(r7, r8); 3077 HS_CMP_XCHG(r9, r10); 3078 HS_CMP_XCHG(r11, r12); 3079 HS_CMP_XCHG(r13, r14); 3080 HS_CMP_XCHG(r15, r16); 3081 } 
3082 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; 3083 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16; 3084 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; 3085 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15; 3086 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; 3087 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14; 3088 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; 3089 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13; 3090 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5; 3091 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12; 3092 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6; 3093 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11; 3094 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7; 3095 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10; 3096 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8; 3097 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9; 3098 HS_BLOCK_BARRIER(); 3099 { 3100 { 3101 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 3102 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); 3103 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16); 3104 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24); 3105 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(32); 3106 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(40); 3107 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(48); 3108 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(56); 3109 HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(64); 3110 HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(72); 3111 HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(80); 3112 HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(88); 3113 HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(96); 3114 HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(104); 3115 HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(112); 3116 HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(120); 3117 HS_CMP_XCHG(r0_8, r0_9); 3118 HS_CMP_XCHG(r0_7, r0_10); 3119 HS_CMP_XCHG(r0_6, r0_11); 3120 HS_CMP_XCHG(r0_5, r0_12); 3121 HS_CMP_XCHG(r0_4, r0_13); 3122 HS_CMP_XCHG(r0_3, r0_14); 3123 HS_CMP_XCHG(r0_2, r0_15); 3124 HS_CMP_XCHG(r0_1, r0_16); 3125 HS_CMP_XCHG(r0_9, r0_13); 3126 HS_CMP_XCHG(r0_11, r0_15); 3127 HS_CMP_XCHG(r0_9, r0_11); 3128 HS_CMP_XCHG(r0_13, r0_15); 3129 HS_CMP_XCHG(r0_10, 
r0_14); 3130 HS_CMP_XCHG(r0_12, r0_16); 3131 HS_CMP_XCHG(r0_10, r0_12); 3132 HS_CMP_XCHG(r0_14, r0_16); 3133 HS_CMP_XCHG(r0_9, r0_10); 3134 HS_CMP_XCHG(r0_11, r0_12); 3135 HS_CMP_XCHG(r0_13, r0_14); 3136 HS_CMP_XCHG(r0_15, r0_16); 3137 HS_CMP_XCHG(r0_1, r0_5); 3138 HS_CMP_XCHG(r0_3, r0_7); 3139 HS_CMP_XCHG(r0_1, r0_3); 3140 HS_CMP_XCHG(r0_5, r0_7); 3141 HS_CMP_XCHG(r0_2, r0_6); 3142 HS_CMP_XCHG(r0_4, r0_8); 3143 HS_CMP_XCHG(r0_2, r0_4); 3144 HS_CMP_XCHG(r0_6, r0_8); 3145 HS_CMP_XCHG(r0_1, r0_2); 3146 HS_CMP_XCHG(r0_3, r0_4); 3147 HS_CMP_XCHG(r0_5, r0_6); 3148 HS_CMP_XCHG(r0_7, r0_8); 3149 HS_SLAB_LOCAL_L(0) = r0_1; 3150 HS_SLAB_LOCAL_L(8) = r0_2; 3151 HS_SLAB_LOCAL_L(16) = r0_3; 3152 HS_SLAB_LOCAL_L(24) = r0_4; 3153 HS_SLAB_LOCAL_L(32) = r0_5; 3154 HS_SLAB_LOCAL_L(40) = r0_6; 3155 HS_SLAB_LOCAL_L(48) = r0_7; 3156 HS_SLAB_LOCAL_L(56) = r0_8; 3157 HS_SLAB_LOCAL_R(64) = r0_9; 3158 HS_SLAB_LOCAL_R(72) = r0_10; 3159 HS_SLAB_LOCAL_R(80) = r0_11; 3160 HS_SLAB_LOCAL_R(88) = r0_12; 3161 HS_SLAB_LOCAL_R(96) = r0_13; 3162 HS_SLAB_LOCAL_R(104) = r0_14; 3163 HS_SLAB_LOCAL_R(112) = r0_15; 3164 HS_SLAB_LOCAL_R(120) = r0_16; 3165 } 3166 } 3167 HS_BLOCK_BARRIER(); 3168 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 3169 r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 3170 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 3171 r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); 3172 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 3173 r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 3174 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 3175 r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 3176 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); 3177 r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); 3178 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); 3179 r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); 3180 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); 3181 r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); 3182 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); 3183 r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); 3184 
{ 3185 { 3186 HS_SLAB_HALF_PREAMBLE(4); 3187 HS_CMP_HALF(0, r1); 3188 HS_CMP_HALF(1, r2); 3189 HS_CMP_HALF(2, r3); 3190 HS_CMP_HALF(3, r4); 3191 HS_CMP_HALF(4, r5); 3192 HS_CMP_HALF(5, r6); 3193 HS_CMP_HALF(6, r7); 3194 HS_CMP_HALF(7, r8); 3195 HS_CMP_HALF(8, r9); 3196 HS_CMP_HALF(9, r10); 3197 HS_CMP_HALF(10, r11); 3198 HS_CMP_HALF(11, r12); 3199 HS_CMP_HALF(12, r13); 3200 HS_CMP_HALF(13, r14); 3201 HS_CMP_HALF(14, r15); 3202 HS_CMP_HALF(15, r16); 3203 } 3204 { 3205 HS_SLAB_HALF_PREAMBLE(2); 3206 HS_CMP_HALF(0, r1); 3207 HS_CMP_HALF(1, r2); 3208 HS_CMP_HALF(2, r3); 3209 HS_CMP_HALF(3, r4); 3210 HS_CMP_HALF(4, r5); 3211 HS_CMP_HALF(5, r6); 3212 HS_CMP_HALF(6, r7); 3213 HS_CMP_HALF(7, r8); 3214 HS_CMP_HALF(8, r9); 3215 HS_CMP_HALF(9, r10); 3216 HS_CMP_HALF(10, r11); 3217 HS_CMP_HALF(11, r12); 3218 HS_CMP_HALF(12, r13); 3219 HS_CMP_HALF(13, r14); 3220 HS_CMP_HALF(14, r15); 3221 HS_CMP_HALF(15, r16); 3222 } 3223 { 3224 HS_SLAB_HALF_PREAMBLE(1); 3225 HS_CMP_HALF(0, r1); 3226 HS_CMP_HALF(1, r2); 3227 HS_CMP_HALF(2, r3); 3228 HS_CMP_HALF(3, r4); 3229 HS_CMP_HALF(4, r5); 3230 HS_CMP_HALF(5, r6); 3231 HS_CMP_HALF(6, r7); 3232 HS_CMP_HALF(7, r8); 3233 HS_CMP_HALF(8, r9); 3234 HS_CMP_HALF(9, r10); 3235 HS_CMP_HALF(10, r11); 3236 HS_CMP_HALF(11, r12); 3237 HS_CMP_HALF(12, r13); 3238 HS_CMP_HALF(13, r14); 3239 HS_CMP_HALF(14, r15); 3240 HS_CMP_HALF(15, r16); 3241 } 3242 HS_CMP_XCHG(r1, r9); 3243 HS_CMP_XCHG(r5, r13); 3244 HS_CMP_XCHG(r1, r5); 3245 HS_CMP_XCHG(r9, r13); 3246 HS_CMP_XCHG(r3, r11); 3247 HS_CMP_XCHG(r7, r15); 3248 HS_CMP_XCHG(r3, r7); 3249 HS_CMP_XCHG(r11, r15); 3250 HS_CMP_XCHG(r1, r3); 3251 HS_CMP_XCHG(r5, r7); 3252 HS_CMP_XCHG(r9, r11); 3253 HS_CMP_XCHG(r13, r15); 3254 HS_CMP_XCHG(r2, r10); 3255 HS_CMP_XCHG(r6, r14); 3256 HS_CMP_XCHG(r2, r6); 3257 HS_CMP_XCHG(r10, r14); 3258 HS_CMP_XCHG(r4, r12); 3259 HS_CMP_XCHG(r8, r16); 3260 HS_CMP_XCHG(r4, r8); 3261 HS_CMP_XCHG(r12, r16); 3262 HS_CMP_XCHG(r2, r4); 3263 HS_CMP_XCHG(r6, r8); 3264 HS_CMP_XCHG(r10, r12); 3265 
HS_CMP_XCHG(r14, r16); 3266 HS_CMP_XCHG(r1, r2); 3267 HS_CMP_XCHG(r3, r4); 3268 HS_CMP_XCHG(r5, r6); 3269 HS_CMP_XCHG(r7, r8); 3270 HS_CMP_XCHG(r9, r10); 3271 HS_CMP_XCHG(r11, r12); 3272 HS_CMP_XCHG(r13, r14); 3273 HS_CMP_XCHG(r15, r16); 3274 } 3275 HS_SLAB_GLOBAL_STORE(0, r1); 3276 HS_SLAB_GLOBAL_STORE(1, r2); 3277 HS_SLAB_GLOBAL_STORE(2, r3); 3278 HS_SLAB_GLOBAL_STORE(3, r4); 3279 HS_SLAB_GLOBAL_STORE(4, r5); 3280 HS_SLAB_GLOBAL_STORE(5, r6); 3281 HS_SLAB_GLOBAL_STORE(6, r7); 3282 HS_SLAB_GLOBAL_STORE(7, r8); 3283 HS_SLAB_GLOBAL_STORE(8, r9); 3284 HS_SLAB_GLOBAL_STORE(9, r10); 3285 HS_SLAB_GLOBAL_STORE(10, r11); 3286 HS_SLAB_GLOBAL_STORE(11, r12); 3287 HS_SLAB_GLOBAL_STORE(12, r13); 3288 HS_SLAB_GLOBAL_STORE(13, r14); 3289 HS_SLAB_GLOBAL_STORE(14, r15); 3290 HS_SLAB_GLOBAL_STORE(15, r16); 3291} 3292 /* HS_BC_KERNEL_PROTO(1, 0): kernel that loads 16 keys (r1..r16) from vout, runs three HS_CMP_HALF passes (preamble arguments 4, 2, 1) followed by a fixed 32-step HS_CMP_XCHG network over the registers, and writes all 16 keys back with HS_SLAB_GLOBAL_STORE. It declares no local memory and issues no HS_BLOCK_BARRIER. NOTE(review): "BC" presumably means "bitonic clean" and the store presumably targets vout (in-place pass); the macro bodies live in hs_cl_macros.h and are not visible here - confirm. */ 3293HS_BC_KERNEL_PROTO(1, 0) 3294{ 3295 HS_SLAB_GLOBAL_PREAMBLE(); /* Load slots 0..15 of vout into r1..r16 (slot i goes to register r(i+1)). */ 3296 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0); 3297 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1); 3298 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2); 3299 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3); 3300 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4); 3301 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5); 3302 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6); 3303 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7); 3304 HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8); 3305 HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 9); 3306 HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 10); 3307 HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 11); 3308 HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 12); 3309 HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 13); 3310 HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 14); 3311 HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 15); 3312 { 3313 { /* Pass 1 of 3: HS_SLAB_HALF_PREAMBLE(4), then HS_CMP_HALF(i, r(i+1)) on every register. Each pass sits in its own braces, so whatever the preamble declares is scoped to that pass. NOTE(review): presumably a cross-lane compare at lane distance 4 - confirm in hs_cl_macros.h. */ 3314 HS_SLAB_HALF_PREAMBLE(4); 3315 HS_CMP_HALF(0, r1); 3316 HS_CMP_HALF(1, r2); 3317 HS_CMP_HALF(2, r3); 3318 HS_CMP_HALF(3, r4); 3319 HS_CMP_HALF(4, r5); 3320 HS_CMP_HALF(5, r6); 3321 HS_CMP_HALF(6, r7); 3322 HS_CMP_HALF(7, r8); 3323 HS_CMP_HALF(8, r9); 3324 
HS_CMP_HALF(9, r10); 3325 HS_CMP_HALF(10, r11); 3326 HS_CMP_HALF(11, r12); 3327 HS_CMP_HALF(12, r13); 3328 HS_CMP_HALF(13, r14); 3329 HS_CMP_HALF(14, r15); 3330 HS_CMP_HALF(15, r16); 3331 } 3332 { /* Pass 2 of 3: same 16 HS_CMP_HALF calls with preamble argument 2. */ 3333 HS_SLAB_HALF_PREAMBLE(2); 3334 HS_CMP_HALF(0, r1); 3335 HS_CMP_HALF(1, r2); 3336 HS_CMP_HALF(2, r3); 3337 HS_CMP_HALF(3, r4); 3338 HS_CMP_HALF(4, r5); 3339 HS_CMP_HALF(5, r6); 3340 HS_CMP_HALF(6, r7); 3341 HS_CMP_HALF(7, r8); 3342 HS_CMP_HALF(8, r9); 3343 HS_CMP_HALF(9, r10); 3344 HS_CMP_HALF(10, r11); 3345 HS_CMP_HALF(11, r12); 3346 HS_CMP_HALF(12, r13); 3347 HS_CMP_HALF(13, r14); 3348 HS_CMP_HALF(14, r15); 3349 HS_CMP_HALF(15, r16); 3350 } 3351 { /* Pass 3 of 3: same 16 HS_CMP_HALF calls with preamble argument 1. */ 3352 HS_SLAB_HALF_PREAMBLE(1); 3353 HS_CMP_HALF(0, r1); 3354 HS_CMP_HALF(1, r2); 3355 HS_CMP_HALF(2, r3); 3356 HS_CMP_HALF(3, r4); 3357 HS_CMP_HALF(4, r5); 3358 HS_CMP_HALF(5, r6); 3359 HS_CMP_HALF(6, r7); 3360 HS_CMP_HALF(7, r8); 3361 HS_CMP_HALF(8, r9); 3362 HS_CMP_HALF(9, r10); 3363 HS_CMP_HALF(10, r11); 3364 HS_CMP_HALF(11, r12); 3365 HS_CMP_HALF(12, r13); 3366 HS_CMP_HALF(13, r14); 3367 HS_CMP_HALF(14, r15); 3368 HS_CMP_HALF(15, r16); 3369 } /* Register-only network of 32 HS_CMP_XCHG steps: first the odd-numbered registers (pairs at index distance 8, 4 and 2), then the even-numbered registers with the same pattern, then the eight adjacent pairs (r1,r2) .. (r15,r16). Do not reorder: successive steps reuse the same registers (e.g. r1 appears in the 1st, 3rd and 9th step). NOTE(review): HS_CMP_XCHG presumably conditionally swaps its two arguments - confirm in hs_cl_macros.h. */ 3370 HS_CMP_XCHG(r1, r9); 3371 HS_CMP_XCHG(r5, r13); 3372 HS_CMP_XCHG(r1, r5); 3373 HS_CMP_XCHG(r9, r13); 3374 HS_CMP_XCHG(r3, r11); 3375 HS_CMP_XCHG(r7, r15); 3376 HS_CMP_XCHG(r3, r7); 3377 HS_CMP_XCHG(r11, r15); 3378 HS_CMP_XCHG(r1, r3); 3379 HS_CMP_XCHG(r5, r7); 3380 HS_CMP_XCHG(r9, r11); 3381 HS_CMP_XCHG(r13, r15); 3382 HS_CMP_XCHG(r2, r10); 3383 HS_CMP_XCHG(r6, r14); 3384 HS_CMP_XCHG(r2, r6); 3385 HS_CMP_XCHG(r10, r14); 3386 HS_CMP_XCHG(r4, r12); 3387 HS_CMP_XCHG(r8, r16); 3388 HS_CMP_XCHG(r4, r8); 3389 HS_CMP_XCHG(r12, r16); 3390 HS_CMP_XCHG(r2, r4); 3391 HS_CMP_XCHG(r6, r8); 3392 HS_CMP_XCHG(r10, r12); 3393 HS_CMP_XCHG(r14, r16); 3394 HS_CMP_XCHG(r1, r2); 3395 HS_CMP_XCHG(r3, r4); 3396 HS_CMP_XCHG(r5, r6); 3397 HS_CMP_XCHG(r7, r8); 3398 HS_CMP_XCHG(r9, r10); 3399 HS_CMP_XCHG(r11, r12); 3400 HS_CMP_XCHG(r13, r14); 3401 HS_CMP_XCHG(r15, r16); 3402 } 3403 
/* Write r1..r16 back to slots 0..15, the same slot-to-register mapping used by the loads. */ HS_SLAB_GLOBAL_STORE(0, r1); 3404 HS_SLAB_GLOBAL_STORE(1, r2); 3405 HS_SLAB_GLOBAL_STORE(2, r3); 3406 HS_SLAB_GLOBAL_STORE(3, r4); 3407 HS_SLAB_GLOBAL_STORE(4, r5); 3408 HS_SLAB_GLOBAL_STORE(5, r6); 3409 HS_SLAB_GLOBAL_STORE(6, r7); 3410 HS_SLAB_GLOBAL_STORE(7, r8); 3411 HS_SLAB_GLOBAL_STORE(8, r9); 3412 HS_SLAB_GLOBAL_STORE(9, r10); 3413 HS_SLAB_GLOBAL_STORE(10, r11); 3414 HS_SLAB_GLOBAL_STORE(11, r12); 3415 HS_SLAB_GLOBAL_STORE(12, r13); 3416 HS_SLAB_GLOBAL_STORE(13, r14); 3417 HS_SLAB_GLOBAL_STORE(14, r15); 3418 HS_SLAB_GLOBAL_STORE(15, r16); 3419} 3420 3421HS_BC_KERNEL_PROTO(2, 1) 3422{ 3423 HS_BLOCK_LOCAL_MEM_DECL(16, 16); 3424 3425 HS_SLAB_GLOBAL_PREAMBLE(); 3426 HS_BC_MERGE_H_PREAMBLE(2); 3427 { 3428 { 3429 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); 3430 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16); 3431 HS_CMP_XCHG(r0_1, r0_2); 3432 HS_SLAB_LOCAL_L(0) = r0_1; 3433 HS_SLAB_LOCAL_L(8) = r0_2; 3434 } 3435 { 3436 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(2); 3437 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(18); 3438 HS_CMP_XCHG(r0_1, r0_2); 3439 HS_SLAB_LOCAL_L(32) = r0_1; 3440 HS_SLAB_LOCAL_L(40) = r0_2; 3441 } 3442 { 3443 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4); 3444 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(20); 3445 HS_CMP_XCHG(r0_1, r0_2); 3446 HS_SLAB_LOCAL_L(64) = r0_1; 3447 HS_SLAB_LOCAL_L(72) = r0_2; 3448 } 3449 { 3450 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(6); 3451 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(22); 3452 HS_CMP_XCHG(r0_1, r0_2); 3453 HS_SLAB_LOCAL_L(96) = r0_1; 3454 HS_SLAB_LOCAL_L(104) = r0_2; 3455 } 3456 { 3457 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8); 3458 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24); 3459 HS_CMP_XCHG(r0_1, r0_2); 3460 HS_SLAB_LOCAL_L(128) = r0_1; 3461 HS_SLAB_LOCAL_L(136) = r0_2; 3462 } 3463 { 3464 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(10); 3465 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(26); 3466 HS_CMP_XCHG(r0_1, r0_2); 3467 HS_SLAB_LOCAL_L(160) = r0_1; 3468 HS_SLAB_LOCAL_L(168) = r0_2; 3469 } 3470 { 3471 HS_KEY_TYPE r0_1 = 
HS_BC_GLOBAL_LOAD_L(12); 3472 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(28); 3473 HS_CMP_XCHG(r0_1, r0_2); 3474 HS_SLAB_LOCAL_L(192) = r0_1; 3475 HS_SLAB_LOCAL_L(200) = r0_2; 3476 } 3477 { 3478 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(14); 3479 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(30); 3480 HS_CMP_XCHG(r0_1, r0_2); 3481 HS_SLAB_LOCAL_L(224) = r0_1; 3482 HS_SLAB_LOCAL_L(232) = r0_2; 3483 } 3484 } 3485 HS_BLOCK_BARRIER(); 3486 HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0); 3487 HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1); 3488 HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2); 3489 HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3); 3490 HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4); 3491 HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5); 3492 HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6); 3493 HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7); 3494 HS_KEY_TYPE r9 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8); 3495 HS_KEY_TYPE r10 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9); 3496 HS_KEY_TYPE r11 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10); 3497 HS_KEY_TYPE r12 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11); 3498 HS_KEY_TYPE r13 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12); 3499 HS_KEY_TYPE r14 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13); 3500 HS_KEY_TYPE r15 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14); 3501 HS_KEY_TYPE r16 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15); 3502 { 3503 { 3504 HS_SLAB_HALF_PREAMBLE(4); 3505 HS_CMP_HALF(0, r1); 3506 HS_CMP_HALF(1, r2); 3507 HS_CMP_HALF(2, r3); 3508 HS_CMP_HALF(3, r4); 3509 HS_CMP_HALF(4, r5); 3510 HS_CMP_HALF(5, r6); 3511 HS_CMP_HALF(6, r7); 3512 HS_CMP_HALF(7, r8); 3513 HS_CMP_HALF(8, r9); 3514 HS_CMP_HALF(9, r10); 3515 HS_CMP_HALF(10, r11); 3516 HS_CMP_HALF(11, r12); 3517 HS_CMP_HALF(12, r13); 3518 HS_CMP_HALF(13, r14); 3519 HS_CMP_HALF(14, r15); 3520 HS_CMP_HALF(15, r16); 3521 } 3522 { 3523 HS_SLAB_HALF_PREAMBLE(2); 3524 HS_CMP_HALF(0, r1); 3525 HS_CMP_HALF(1, r2); 3526 HS_CMP_HALF(2, 
r3); 3527 HS_CMP_HALF(3, r4); 3528 HS_CMP_HALF(4, r5); 3529 HS_CMP_HALF(5, r6); 3530 HS_CMP_HALF(6, r7); 3531 HS_CMP_HALF(7, r8); 3532 HS_CMP_HALF(8, r9); 3533 HS_CMP_HALF(9, r10); 3534 HS_CMP_HALF(10, r11); 3535 HS_CMP_HALF(11, r12); 3536 HS_CMP_HALF(12, r13); 3537 HS_CMP_HALF(13, r14); 3538 HS_CMP_HALF(14, r15); 3539 HS_CMP_HALF(15, r16); 3540 } 3541 { 3542 HS_SLAB_HALF_PREAMBLE(1); 3543 HS_CMP_HALF(0, r1); 3544 HS_CMP_HALF(1, r2); 3545 HS_CMP_HALF(2, r3); 3546 HS_CMP_HALF(3, r4); 3547 HS_CMP_HALF(4, r5); 3548 HS_CMP_HALF(5, r6); 3549 HS_CMP_HALF(6, r7); 3550 HS_CMP_HALF(7, r8); 3551 HS_CMP_HALF(8, r9); 3552 HS_CMP_HALF(9, r10); 3553 HS_CMP_HALF(10, r11); 3554 HS_CMP_HALF(11, r12); 3555 HS_CMP_HALF(12, r13); 3556 HS_CMP_HALF(13, r14); 3557 HS_CMP_HALF(14, r15); 3558 HS_CMP_HALF(15, r16); 3559 } 3560 HS_CMP_XCHG(r1, r9); 3561 HS_CMP_XCHG(r5, r13); 3562 HS_CMP_XCHG(r1, r5); 3563 HS_CMP_XCHG(r9, r13); 3564 HS_CMP_XCHG(r3, r11); 3565 HS_CMP_XCHG(r7, r15); 3566 HS_CMP_XCHG(r3, r7); 3567 HS_CMP_XCHG(r11, r15); 3568 HS_CMP_XCHG(r1, r3); 3569 HS_CMP_XCHG(r5, r7); 3570 HS_CMP_XCHG(r9, r11); 3571 HS_CMP_XCHG(r13, r15); 3572 HS_CMP_XCHG(r2, r10); 3573 HS_CMP_XCHG(r6, r14); 3574 HS_CMP_XCHG(r2, r6); 3575 HS_CMP_XCHG(r10, r14); 3576 HS_CMP_XCHG(r4, r12); 3577 HS_CMP_XCHG(r8, r16); 3578 HS_CMP_XCHG(r4, r8); 3579 HS_CMP_XCHG(r12, r16); 3580 HS_CMP_XCHG(r2, r4); 3581 HS_CMP_XCHG(r6, r8); 3582 HS_CMP_XCHG(r10, r12); 3583 HS_CMP_XCHG(r14, r16); 3584 HS_CMP_XCHG(r1, r2); 3585 HS_CMP_XCHG(r3, r4); 3586 HS_CMP_XCHG(r5, r6); 3587 HS_CMP_XCHG(r7, r8); 3588 HS_CMP_XCHG(r9, r10); 3589 HS_CMP_XCHG(r11, r12); 3590 HS_CMP_XCHG(r13, r14); 3591 HS_CMP_XCHG(r15, r16); 3592 } 3593 HS_SLAB_GLOBAL_STORE(0, r1); 3594 HS_SLAB_GLOBAL_STORE(1, r2); 3595 HS_SLAB_GLOBAL_STORE(2, r3); 3596 HS_SLAB_GLOBAL_STORE(3, r4); 3597 HS_SLAB_GLOBAL_STORE(4, r5); 3598 HS_SLAB_GLOBAL_STORE(5, r6); 3599 HS_SLAB_GLOBAL_STORE(6, r7); 3600 HS_SLAB_GLOBAL_STORE(7, r8); 3601 HS_SLAB_GLOBAL_STORE(8, r9); 3602 
HS_SLAB_GLOBAL_STORE(9, r10); 3603 HS_SLAB_GLOBAL_STORE(10, r11); 3604 HS_SLAB_GLOBAL_STORE(11, r12); 3605 HS_SLAB_GLOBAL_STORE(12, r13); 3606 HS_SLAB_GLOBAL_STORE(13, r14); 3607 HS_SLAB_GLOBAL_STORE(14, r15); 3608 HS_SLAB_GLOBAL_STORE(15, r16); 3609} 3610 /* HS_BC_KERNEL_PROTO(4, 2): after HS_BLOCK_LOCAL_MEM_DECL(32, 16) and the two preambles, four brace-scoped groups each fetch four keys with HS_BC_GLOBAL_LOAD_L(k + 16 * j), j = 0..3, for k = 0, 4, 8, 12, run four HS_CMP_XCHG steps on them, and assign them to HS_SLAB_LOCAL_L(32 * k + 8 * j). Following HS_BLOCK_BARRIER(), r1..r16 are read with HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * i) for i = 0..15, passed through three HS_CMP_HALF rounds (preambles 4, 2, 1) and 32 HS_CMP_XCHG steps, then written with HS_SLAB_GLOBAL_STORE(i, r<i+1>). NOTE(review): every HS_* macro is defined outside this chunk; presumably a generated block-level merge stage of the sort - confirm against the generator. */3611HS_BC_KERNEL_PROTO(4, 2) 3612{ 3613 HS_BLOCK_LOCAL_MEM_DECL(32, 16); 3614 3615 HS_SLAB_GLOBAL_PREAMBLE(); 3616 HS_BC_MERGE_H_PREAMBLE(4); 3617 { 3618 { 3619 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); 3620 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16); 3621 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32); 3622 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48); 3623 HS_CMP_XCHG(r0_1, r0_3); 3624 HS_CMP_XCHG(r0_2, r0_4); 3625 HS_CMP_XCHG(r0_1, r0_2); 3626 HS_CMP_XCHG(r0_3, r0_4); 3627 HS_SLAB_LOCAL_L(0) = r0_1; 3628 HS_SLAB_LOCAL_L(8) = r0_2; 3629 HS_SLAB_LOCAL_L(16) = r0_3; 3630 HS_SLAB_LOCAL_L(24) = r0_4; 3631 } 3632 { 3633 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4); 3634 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(20); 3635 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(36); 3636 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(52); 3637 HS_CMP_XCHG(r0_1, r0_3); 3638 HS_CMP_XCHG(r0_2, r0_4); 3639 HS_CMP_XCHG(r0_1, r0_2); 3640 HS_CMP_XCHG(r0_3, r0_4); 3641 HS_SLAB_LOCAL_L(128) = r0_1; 3642 HS_SLAB_LOCAL_L(136) = r0_2; 3643 HS_SLAB_LOCAL_L(144) = r0_3; 3644 HS_SLAB_LOCAL_L(152) = r0_4; 3645 } 3646 { 3647 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8); 3648 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24); 3649 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(40); 3650 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(56); 3651 HS_CMP_XCHG(r0_1, r0_3); 3652 HS_CMP_XCHG(r0_2, r0_4); 3653 HS_CMP_XCHG(r0_1, r0_2); 3654 HS_CMP_XCHG(r0_3, r0_4); 3655 HS_SLAB_LOCAL_L(256) = r0_1; 3656 HS_SLAB_LOCAL_L(264) = r0_2; 3657 HS_SLAB_LOCAL_L(272) = r0_3; 3658 HS_SLAB_LOCAL_L(280) = r0_4; 3659 } 3660 { 3661 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(12); 3662 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(28); 3663 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(44); 3664 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(60); 
3665 HS_CMP_XCHG(r0_1, r0_3); 3666 HS_CMP_XCHG(r0_2, r0_4); 3667 HS_CMP_XCHG(r0_1, r0_2); 3668 HS_CMP_XCHG(r0_3, r0_4); 3669 HS_SLAB_LOCAL_L(384) = r0_1; 3670 HS_SLAB_LOCAL_L(392) = r0_2; 3671 HS_SLAB_LOCAL_L(400) = r0_3; 3672 HS_SLAB_LOCAL_L(408) = r0_4; 3673 } 3674 } /* Every HS_SLAB_LOCAL_L assignment above precedes this barrier; every HS_BX_LOCAL_V read below follows it. */3675 HS_BLOCK_BARRIER(); 3676 HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); 3677 HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); 3678 HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); 3679 HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); 3680 HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); 3681 HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); 3682 HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); 3683 HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); 3684 HS_KEY_TYPE r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8); 3685 HS_KEY_TYPE r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9); 3686 HS_KEY_TYPE r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10); 3687 HS_KEY_TYPE r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11); 3688 HS_KEY_TYPE r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12); 3689 HS_KEY_TYPE r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13); 3690 HS_KEY_TYPE r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14); 3691 HS_KEY_TYPE r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15); 3692 { 3693 { 3694 HS_SLAB_HALF_PREAMBLE(4); 3695 HS_CMP_HALF(0, r1); 3696 HS_CMP_HALF(1, r2); 3697 HS_CMP_HALF(2, r3); 3698 HS_CMP_HALF(3, r4); 3699 HS_CMP_HALF(4, r5); 3700 HS_CMP_HALF(5, r6); 3701 HS_CMP_HALF(6, r7); 3702 HS_CMP_HALF(7, r8); 3703 HS_CMP_HALF(8, r9); 3704 HS_CMP_HALF(9, r10); 3705 HS_CMP_HALF(10, r11); 3706 HS_CMP_HALF(11, r12); 3707 HS_CMP_HALF(12, r13); 3708 HS_CMP_HALF(13, r14); 3709 HS_CMP_HALF(14, r15); 3710 HS_CMP_HALF(15, r16); 3711 } 3712 { 3713 HS_SLAB_HALF_PREAMBLE(2); 3714 HS_CMP_HALF(0, r1); 3715 HS_CMP_HALF(1, r2); 3716 HS_CMP_HALF(2, r3); 3717 HS_CMP_HALF(3, r4); 3718 HS_CMP_HALF(4, r5); 3719 HS_CMP_HALF(5, r6); 3720 HS_CMP_HALF(6, r7); 3721 HS_CMP_HALF(7, 
r8); 3722 HS_CMP_HALF(8, r9); 3723 HS_CMP_HALF(9, r10); 3724 HS_CMP_HALF(10, r11); 3725 HS_CMP_HALF(11, r12); 3726 HS_CMP_HALF(12, r13); 3727 HS_CMP_HALF(13, r14); 3728 HS_CMP_HALF(14, r15); 3729 HS_CMP_HALF(15, r16); 3730 } 3731 { 3732 HS_SLAB_HALF_PREAMBLE(1); 3733 HS_CMP_HALF(0, r1); 3734 HS_CMP_HALF(1, r2); 3735 HS_CMP_HALF(2, r3); 3736 HS_CMP_HALF(3, r4); 3737 HS_CMP_HALF(4, r5); 3738 HS_CMP_HALF(5, r6); 3739 HS_CMP_HALF(6, r7); 3740 HS_CMP_HALF(7, r8); 3741 HS_CMP_HALF(8, r9); 3742 HS_CMP_HALF(9, r10); 3743 HS_CMP_HALF(10, r11); 3744 HS_CMP_HALF(11, r12); 3745 HS_CMP_HALF(12, r13); 3746 HS_CMP_HALF(13, r14); 3747 HS_CMP_HALF(14, r15); 3748 HS_CMP_HALF(15, r16); 3749 } 3750 HS_CMP_XCHG(r1, r9); 3751 HS_CMP_XCHG(r5, r13); 3752 HS_CMP_XCHG(r1, r5); 3753 HS_CMP_XCHG(r9, r13); 3754 HS_CMP_XCHG(r3, r11); 3755 HS_CMP_XCHG(r7, r15); 3756 HS_CMP_XCHG(r3, r7); 3757 HS_CMP_XCHG(r11, r15); 3758 HS_CMP_XCHG(r1, r3); 3759 HS_CMP_XCHG(r5, r7); 3760 HS_CMP_XCHG(r9, r11); 3761 HS_CMP_XCHG(r13, r15); 3762 HS_CMP_XCHG(r2, r10); 3763 HS_CMP_XCHG(r6, r14); 3764 HS_CMP_XCHG(r2, r6); 3765 HS_CMP_XCHG(r10, r14); 3766 HS_CMP_XCHG(r4, r12); 3767 HS_CMP_XCHG(r8, r16); 3768 HS_CMP_XCHG(r4, r8); 3769 HS_CMP_XCHG(r12, r16); 3770 HS_CMP_XCHG(r2, r4); 3771 HS_CMP_XCHG(r6, r8); 3772 HS_CMP_XCHG(r10, r12); 3773 HS_CMP_XCHG(r14, r16); 3774 HS_CMP_XCHG(r1, r2); 3775 HS_CMP_XCHG(r3, r4); 3776 HS_CMP_XCHG(r5, r6); 3777 HS_CMP_XCHG(r7, r8); 3778 HS_CMP_XCHG(r9, r10); 3779 HS_CMP_XCHG(r11, r12); 3780 HS_CMP_XCHG(r13, r14); 3781 HS_CMP_XCHG(r15, r16); 3782 } /* Write the sixteen keys out in order: first argument i pairs with register r<i+1>. */3783 HS_SLAB_GLOBAL_STORE(0, r1); 3784 HS_SLAB_GLOBAL_STORE(1, r2); 3785 HS_SLAB_GLOBAL_STORE(2, r3); 3786 HS_SLAB_GLOBAL_STORE(3, r4); 3787 HS_SLAB_GLOBAL_STORE(4, r5); 3788 HS_SLAB_GLOBAL_STORE(5, r6); 3789 HS_SLAB_GLOBAL_STORE(6, r7); 3790 HS_SLAB_GLOBAL_STORE(7, r8); 3791 HS_SLAB_GLOBAL_STORE(8, r9); 3792 HS_SLAB_GLOBAL_STORE(9, r10); 3793 HS_SLAB_GLOBAL_STORE(10, r11); 3794 HS_SLAB_GLOBAL_STORE(11, r12); 3795 HS_SLAB_GLOBAL_STORE(12, r13);
3796 HS_SLAB_GLOBAL_STORE(13, r14); 3797 HS_SLAB_GLOBAL_STORE(14, r15); 3798 HS_SLAB_GLOBAL_STORE(15, r16); 3799} 3800 /* HS_BC_KERNEL_PROTO(8, 3): after HS_BLOCK_LOCAL_MEM_DECL(64, 16) and the two preambles, two brace-scoped groups each fetch eight keys with HS_BC_GLOBAL_LOAD_L(k + 16 * j), j = 0..7, for k = 0 and k = 8, run twelve HS_CMP_XCHG steps on them, and assign them to HS_SLAB_LOCAL_L(64 * k + 8 * j). Following HS_BLOCK_BARRIER(), r1..r16 are read with HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * i) for i = 0..15, passed through three HS_CMP_HALF rounds (preambles 4, 2, 1) and 32 HS_CMP_XCHG steps, then written with HS_SLAB_GLOBAL_STORE(i, r<i+1>). NOTE(review): every HS_* macro is defined outside this chunk; presumably a generated block-level merge stage of the sort - confirm against the generator. */3801HS_BC_KERNEL_PROTO(8, 3) 3802{ 3803 HS_BLOCK_LOCAL_MEM_DECL(64, 16); 3804 3805 HS_SLAB_GLOBAL_PREAMBLE(); 3806 HS_BC_MERGE_H_PREAMBLE(8); 3807 { 3808 { 3809 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); 3810 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16); 3811 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32); 3812 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48); 3813 HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(64); 3814 HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(80); 3815 HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(96); 3816 HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(112); 3817 HS_CMP_XCHG(r0_1, r0_5); 3818 HS_CMP_XCHG(r0_3, r0_7); 3819 HS_CMP_XCHG(r0_1, r0_3); 3820 HS_CMP_XCHG(r0_5, r0_7); 3821 HS_CMP_XCHG(r0_2, r0_6); 3822 HS_CMP_XCHG(r0_4, r0_8); 3823 HS_CMP_XCHG(r0_2, r0_4); 3824 HS_CMP_XCHG(r0_6, r0_8); 3825 HS_CMP_XCHG(r0_1, r0_2); 3826 HS_CMP_XCHG(r0_3, r0_4); 3827 HS_CMP_XCHG(r0_5, r0_6); 3828 HS_CMP_XCHG(r0_7, r0_8); 3829 HS_SLAB_LOCAL_L(0) = r0_1; 3830 HS_SLAB_LOCAL_L(8) = r0_2; 3831 HS_SLAB_LOCAL_L(16) = r0_3; 3832 HS_SLAB_LOCAL_L(24) = r0_4; 3833 HS_SLAB_LOCAL_L(32) = r0_5; 3834 HS_SLAB_LOCAL_L(40) = r0_6; 3835 HS_SLAB_LOCAL_L(48) = r0_7; 3836 HS_SLAB_LOCAL_L(56) = r0_8; 3837 } 3838 { 3839 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8); 3840 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24); 3841 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(40); 3842 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(56); 3843 HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(72); 3844 HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(88); 3845 HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(104); 3846 HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(120); 3847 HS_CMP_XCHG(r0_1, r0_5); 3848 HS_CMP_XCHG(r0_3, r0_7); 3849 HS_CMP_XCHG(r0_1, r0_3); 3850 HS_CMP_XCHG(r0_5, r0_7); 3851 HS_CMP_XCHG(r0_2, r0_6); 3852 HS_CMP_XCHG(r0_4, r0_8); 3853 HS_CMP_XCHG(r0_2, r0_4); 3854 HS_CMP_XCHG(r0_6, r0_8); 3855 HS_CMP_XCHG(r0_1, r0_2); 3856 HS_CMP_XCHG(r0_3, r0_4); 
3857 HS_CMP_XCHG(r0_5, r0_6); 3858 HS_CMP_XCHG(r0_7, r0_8); 3859 HS_SLAB_LOCAL_L(512) = r0_1; 3860 HS_SLAB_LOCAL_L(520) = r0_2; 3861 HS_SLAB_LOCAL_L(528) = r0_3; 3862 HS_SLAB_LOCAL_L(536) = r0_4; 3863 HS_SLAB_LOCAL_L(544) = r0_5; 3864 HS_SLAB_LOCAL_L(552) = r0_6; 3865 HS_SLAB_LOCAL_L(560) = r0_7; 3866 HS_SLAB_LOCAL_L(568) = r0_8; 3867 } 3868 } /* Every HS_SLAB_LOCAL_L assignment above precedes this barrier; every HS_BX_LOCAL_V read below follows it. */3869 HS_BLOCK_BARRIER(); 3870 HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); 3871 HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); 3872 HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); 3873 HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); 3874 HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); 3875 HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); 3876 HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); 3877 HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); 3878 HS_KEY_TYPE r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8); 3879 HS_KEY_TYPE r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9); 3880 HS_KEY_TYPE r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10); 3881 HS_KEY_TYPE r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11); 3882 HS_KEY_TYPE r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12); 3883 HS_KEY_TYPE r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13); 3884 HS_KEY_TYPE r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14); 3885 HS_KEY_TYPE r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15); 3886 { 3887 { 3888 HS_SLAB_HALF_PREAMBLE(4); 3889 HS_CMP_HALF(0, r1); 3890 HS_CMP_HALF(1, r2); 3891 HS_CMP_HALF(2, r3); 3892 HS_CMP_HALF(3, r4); 3893 HS_CMP_HALF(4, r5); 3894 HS_CMP_HALF(5, r6); 3895 HS_CMP_HALF(6, r7); 3896 HS_CMP_HALF(7, r8); 3897 HS_CMP_HALF(8, r9); 3898 HS_CMP_HALF(9, r10); 3899 HS_CMP_HALF(10, r11); 3900 HS_CMP_HALF(11, r12); 3901 HS_CMP_HALF(12, r13); 3902 HS_CMP_HALF(13, r14); 3903 HS_CMP_HALF(14, r15); 3904 HS_CMP_HALF(15, r16); 3905 } 3906 { 3907 HS_SLAB_HALF_PREAMBLE(2); 3908 HS_CMP_HALF(0, r1); 3909 HS_CMP_HALF(1, r2); 3910 HS_CMP_HALF(2, r3); 3911 HS_CMP_HALF(3, r4); 3912 HS_CMP_HALF(4, 
r5); 3913 HS_CMP_HALF(5, r6); 3914 HS_CMP_HALF(6, r7); 3915 HS_CMP_HALF(7, r8); 3916 HS_CMP_HALF(8, r9); 3917 HS_CMP_HALF(9, r10); 3918 HS_CMP_HALF(10, r11); 3919 HS_CMP_HALF(11, r12); 3920 HS_CMP_HALF(12, r13); 3921 HS_CMP_HALF(13, r14); 3922 HS_CMP_HALF(14, r15); 3923 HS_CMP_HALF(15, r16); 3924 } 3925 { 3926 HS_SLAB_HALF_PREAMBLE(1); 3927 HS_CMP_HALF(0, r1); 3928 HS_CMP_HALF(1, r2); 3929 HS_CMP_HALF(2, r3); 3930 HS_CMP_HALF(3, r4); 3931 HS_CMP_HALF(4, r5); 3932 HS_CMP_HALF(5, r6); 3933 HS_CMP_HALF(6, r7); 3934 HS_CMP_HALF(7, r8); 3935 HS_CMP_HALF(8, r9); 3936 HS_CMP_HALF(9, r10); 3937 HS_CMP_HALF(10, r11); 3938 HS_CMP_HALF(11, r12); 3939 HS_CMP_HALF(12, r13); 3940 HS_CMP_HALF(13, r14); 3941 HS_CMP_HALF(14, r15); 3942 HS_CMP_HALF(15, r16); 3943 } 3944 HS_CMP_XCHG(r1, r9); 3945 HS_CMP_XCHG(r5, r13); 3946 HS_CMP_XCHG(r1, r5); 3947 HS_CMP_XCHG(r9, r13); 3948 HS_CMP_XCHG(r3, r11); 3949 HS_CMP_XCHG(r7, r15); 3950 HS_CMP_XCHG(r3, r7); 3951 HS_CMP_XCHG(r11, r15); 3952 HS_CMP_XCHG(r1, r3); 3953 HS_CMP_XCHG(r5, r7); 3954 HS_CMP_XCHG(r9, r11); 3955 HS_CMP_XCHG(r13, r15); 3956 HS_CMP_XCHG(r2, r10); 3957 HS_CMP_XCHG(r6, r14); 3958 HS_CMP_XCHG(r2, r6); 3959 HS_CMP_XCHG(r10, r14); 3960 HS_CMP_XCHG(r4, r12); 3961 HS_CMP_XCHG(r8, r16); 3962 HS_CMP_XCHG(r4, r8); 3963 HS_CMP_XCHG(r12, r16); 3964 HS_CMP_XCHG(r2, r4); 3965 HS_CMP_XCHG(r6, r8); 3966 HS_CMP_XCHG(r10, r12); 3967 HS_CMP_XCHG(r14, r16); 3968 HS_CMP_XCHG(r1, r2); 3969 HS_CMP_XCHG(r3, r4); 3970 HS_CMP_XCHG(r5, r6); 3971 HS_CMP_XCHG(r7, r8); 3972 HS_CMP_XCHG(r9, r10); 3973 HS_CMP_XCHG(r11, r12); 3974 HS_CMP_XCHG(r13, r14); 3975 HS_CMP_XCHG(r15, r16); 3976 } /* Write the sixteen keys out in order: first argument i pairs with register r<i+1>. */3977 HS_SLAB_GLOBAL_STORE(0, r1); 3978 HS_SLAB_GLOBAL_STORE(1, r2); 3979 HS_SLAB_GLOBAL_STORE(2, r3); 3980 HS_SLAB_GLOBAL_STORE(3, r4); 3981 HS_SLAB_GLOBAL_STORE(4, r5); 3982 HS_SLAB_GLOBAL_STORE(5, r6); 3983 HS_SLAB_GLOBAL_STORE(6, r7); 3984 HS_SLAB_GLOBAL_STORE(7, r8); 3985 HS_SLAB_GLOBAL_STORE(8, r9); 3986 HS_SLAB_GLOBAL_STORE(9, r10); 3987 HS_SLAB_GLOBAL_STORE(10, 
r11); 3988 HS_SLAB_GLOBAL_STORE(11, r12); 3989 HS_SLAB_GLOBAL_STORE(12, r13); 3990 HS_SLAB_GLOBAL_STORE(13, r14); 3991 HS_SLAB_GLOBAL_STORE(14, r15); 3992 HS_SLAB_GLOBAL_STORE(15, r16); 3993} 3994 /* HS_BC_KERNEL_PROTO(16, 4): after HS_BLOCK_LOCAL_MEM_DECL(128, 16) and the two preambles, a single brace-scoped group fetches sixteen keys with HS_BC_GLOBAL_LOAD_L(16 * j), j = 0..15, runs 32 HS_CMP_XCHG steps on them, and assigns them to HS_SLAB_LOCAL_L(8 * j). Following HS_BLOCK_BARRIER(), r1..r16 are read with HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * i) for i = 0..15, passed through three HS_CMP_HALF rounds (preambles 4, 2, 1) and 32 more HS_CMP_XCHG steps, then written with HS_SLAB_GLOBAL_STORE(i, r<i+1>). NOTE(review): every HS_* macro is defined outside this chunk; presumably a generated block-level merge stage of the sort - confirm against the generator. */3995HS_BC_KERNEL_PROTO(16, 4) 3996{ 3997 HS_BLOCK_LOCAL_MEM_DECL(128, 16); 3998 3999 HS_SLAB_GLOBAL_PREAMBLE(); 4000 HS_BC_MERGE_H_PREAMBLE(16); 4001 { 4002 { 4003 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); 4004 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16); 4005 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32); 4006 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48); 4007 HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(64); 4008 HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(80); 4009 HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(96); 4010 HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(112); 4011 HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(128); 4012 HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(144); 4013 HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(160); 4014 HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(176); 4015 HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(192); 4016 HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(208); 4017 HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(224); 4018 HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(240); 4019 HS_CMP_XCHG(r0_1, r0_9); 4020 HS_CMP_XCHG(r0_5, r0_13); 4021 HS_CMP_XCHG(r0_1, r0_5); 4022 HS_CMP_XCHG(r0_9, r0_13); 4023 HS_CMP_XCHG(r0_3, r0_11); 4024 HS_CMP_XCHG(r0_7, r0_15); 4025 HS_CMP_XCHG(r0_3, r0_7); 4026 HS_CMP_XCHG(r0_11, r0_15); 4027 HS_CMP_XCHG(r0_1, r0_3); 4028 HS_CMP_XCHG(r0_5, r0_7); 4029 HS_CMP_XCHG(r0_9, r0_11); 4030 HS_CMP_XCHG(r0_13, r0_15); 4031 HS_CMP_XCHG(r0_2, r0_10); 4032 HS_CMP_XCHG(r0_6, r0_14); 4033 HS_CMP_XCHG(r0_2, r0_6); 4034 HS_CMP_XCHG(r0_10, r0_14); 4035 HS_CMP_XCHG(r0_4, r0_12); 4036 HS_CMP_XCHG(r0_8, r0_16); 4037 HS_CMP_XCHG(r0_4, r0_8); 4038 HS_CMP_XCHG(r0_12, r0_16); 4039 HS_CMP_XCHG(r0_2, r0_4); 4040 HS_CMP_XCHG(r0_6, r0_8); 4041 HS_CMP_XCHG(r0_10, r0_12); 4042 HS_CMP_XCHG(r0_14, r0_16); 4043 HS_CMP_XCHG(r0_1, r0_2); 4044 HS_CMP_XCHG(r0_3, r0_4); 4045 HS_CMP_XCHG(r0_5, r0_6); 4046 
HS_CMP_XCHG(r0_7, r0_8); 4047 HS_CMP_XCHG(r0_9, r0_10); 4048 HS_CMP_XCHG(r0_11, r0_12); 4049 HS_CMP_XCHG(r0_13, r0_14); 4050 HS_CMP_XCHG(r0_15, r0_16); 4051 HS_SLAB_LOCAL_L(0) = r0_1; 4052 HS_SLAB_LOCAL_L(8) = r0_2; 4053 HS_SLAB_LOCAL_L(16) = r0_3; 4054 HS_SLAB_LOCAL_L(24) = r0_4; 4055 HS_SLAB_LOCAL_L(32) = r0_5; 4056 HS_SLAB_LOCAL_L(40) = r0_6; 4057 HS_SLAB_LOCAL_L(48) = r0_7; 4058 HS_SLAB_LOCAL_L(56) = r0_8; 4059 HS_SLAB_LOCAL_L(64) = r0_9; 4060 HS_SLAB_LOCAL_L(72) = r0_10; 4061 HS_SLAB_LOCAL_L(80) = r0_11; 4062 HS_SLAB_LOCAL_L(88) = r0_12; 4063 HS_SLAB_LOCAL_L(96) = r0_13; 4064 HS_SLAB_LOCAL_L(104) = r0_14; 4065 HS_SLAB_LOCAL_L(112) = r0_15; 4066 HS_SLAB_LOCAL_L(120) = r0_16; 4067 } 4068 } /* Every HS_SLAB_LOCAL_L assignment above precedes this barrier; every HS_BX_LOCAL_V read below follows it. */4069 HS_BLOCK_BARRIER(); 4070 HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 4071 HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 4072 HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 4073 HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); 4074 HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 4075 HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 4076 HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 4077 HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 4078 HS_KEY_TYPE r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); 4079 HS_KEY_TYPE r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); 4080 HS_KEY_TYPE r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); 4081 HS_KEY_TYPE r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); 4082 HS_KEY_TYPE r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); 4083 HS_KEY_TYPE r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); 4084 HS_KEY_TYPE r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); 4085 HS_KEY_TYPE r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); 4086 { 4087 { 4088 HS_SLAB_HALF_PREAMBLE(4); 4089 HS_CMP_HALF(0, r1); 4090 HS_CMP_HALF(1, r2); 4091 HS_CMP_HALF(2, r3); 4092 HS_CMP_HALF(3, r4); 4093 HS_CMP_HALF(4, r5); 4094 HS_CMP_HALF(5, r6); 4095 HS_CMP_HALF(6, r7); 4096 HS_CMP_HALF(7, r8); 4097 
HS_CMP_HALF(8, r9); 4098 HS_CMP_HALF(9, r10); 4099 HS_CMP_HALF(10, r11); 4100 HS_CMP_HALF(11, r12); 4101 HS_CMP_HALF(12, r13); 4102 HS_CMP_HALF(13, r14); 4103 HS_CMP_HALF(14, r15); 4104 HS_CMP_HALF(15, r16); 4105 } 4106 { 4107 HS_SLAB_HALF_PREAMBLE(2); 4108 HS_CMP_HALF(0, r1); 4109 HS_CMP_HALF(1, r2); 4110 HS_CMP_HALF(2, r3); 4111 HS_CMP_HALF(3, r4); 4112 HS_CMP_HALF(4, r5); 4113 HS_CMP_HALF(5, r6); 4114 HS_CMP_HALF(6, r7); 4115 HS_CMP_HALF(7, r8); 4116 HS_CMP_HALF(8, r9); 4117 HS_CMP_HALF(9, r10); 4118 HS_CMP_HALF(10, r11); 4119 HS_CMP_HALF(11, r12); 4120 HS_CMP_HALF(12, r13); 4121 HS_CMP_HALF(13, r14); 4122 HS_CMP_HALF(14, r15); 4123 HS_CMP_HALF(15, r16); 4124 } 4125 { 4126 HS_SLAB_HALF_PREAMBLE(1); 4127 HS_CMP_HALF(0, r1); 4128 HS_CMP_HALF(1, r2); 4129 HS_CMP_HALF(2, r3); 4130 HS_CMP_HALF(3, r4); 4131 HS_CMP_HALF(4, r5); 4132 HS_CMP_HALF(5, r6); 4133 HS_CMP_HALF(6, r7); 4134 HS_CMP_HALF(7, r8); 4135 HS_CMP_HALF(8, r9); 4136 HS_CMP_HALF(9, r10); 4137 HS_CMP_HALF(10, r11); 4138 HS_CMP_HALF(11, r12); 4139 HS_CMP_HALF(12, r13); 4140 HS_CMP_HALF(13, r14); 4141 HS_CMP_HALF(14, r15); 4142 HS_CMP_HALF(15, r16); 4143 } /* After the three HS_CMP_HALF rounds, 32 HS_CMP_XCHG steps run over r1..r16 in this fixed order. */4144 HS_CMP_XCHG(r1, r9); 4145 HS_CMP_XCHG(r5, r13); 4146 HS_CMP_XCHG(r1, r5); 4147 HS_CMP_XCHG(r9, r13); 4148 HS_CMP_XCHG(r3, r11); 4149 HS_CMP_XCHG(r7, r15); 4150 HS_CMP_XCHG(r3, r7); 4151 HS_CMP_XCHG(r11, r15); 4152 HS_CMP_XCHG(r1, r3); 4153 HS_CMP_XCHG(r5, r7); 4154 HS_CMP_XCHG(r9, r11); 4155 HS_CMP_XCHG(r13, r15); 4156 HS_CMP_XCHG(r2, r10); 4157 HS_CMP_XCHG(r6, r14); 4158 HS_CMP_XCHG(r2, r6); 4159 HS_CMP_XCHG(r10, r14); 4160 HS_CMP_XCHG(r4, r12); 4161 HS_CMP_XCHG(r8, r16); 4162 HS_CMP_XCHG(r4, r8); 4163 HS_CMP_XCHG(r12, r16); 4164 HS_CMP_XCHG(r2, r4); 4165 HS_CMP_XCHG(r6, r8); 4166 HS_CMP_XCHG(r10, r12); 4167 HS_CMP_XCHG(r14, r16); 4168 HS_CMP_XCHG(r1, r2); 4169 HS_CMP_XCHG(r3, r4); 4170 HS_CMP_XCHG(r5, r6); 4171 HS_CMP_XCHG(r7, r8); 4172 HS_CMP_XCHG(r9, r10); 4173 HS_CMP_XCHG(r11, r12); 4174 HS_CMP_XCHG(r13, r14); 4175 HS_CMP_XCHG(r15, r16); 4176 } 
4177 HS_SLAB_GLOBAL_STORE(0, r1); 4178 HS_SLAB_GLOBAL_STORE(1, r2); 4179 HS_SLAB_GLOBAL_STORE(2, r3); 4180 HS_SLAB_GLOBAL_STORE(3, r4); 4181 HS_SLAB_GLOBAL_STORE(4, r5); 4182 HS_SLAB_GLOBAL_STORE(5, r6); 4183 HS_SLAB_GLOBAL_STORE(6, r7); 4184 HS_SLAB_GLOBAL_STORE(7, r8); 4185 HS_SLAB_GLOBAL_STORE(8, r9); 4186 HS_SLAB_GLOBAL_STORE(9, r10); 4187 HS_SLAB_GLOBAL_STORE(10, r11); 4188 HS_SLAB_GLOBAL_STORE(11, r12); 4189 HS_SLAB_GLOBAL_STORE(12, r13); 4190 HS_SLAB_GLOBAL_STORE(13, r14); 4191 HS_SLAB_GLOBAL_STORE(14, r15); 4192 HS_SLAB_GLOBAL_STORE(15, r16); 4193} 4194 /* HS_FM_KERNEL_PROTO(1, 0): after HS_FM_PREAMBLE(16), reads r1..r16 with HS_XM_GLOBAL_LOAD_L(0..15) and r17 with HS_FM_GLOBAL_LOAD_R(0). It first compare-exchanges r16 with r17, then runs 32 HS_CMP_XCHG steps over r1..r16, and finally writes r1..r16 with HS_XM_GLOBAL_STORE_L(0..15) and r17 with HS_FM_GLOBAL_STORE_R(0). NOTE(review): the HS_* macros are defined outside this chunk; presumably L and R name the two sides of a merge and this is a generated flip-merge step - confirm against the generator. */4195HS_FM_KERNEL_PROTO(1, 0) 4196{ 4197 HS_FM_PREAMBLE(16); 4198 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 4199 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 4200 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 4201 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 4202 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 4203 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 4204 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 4205 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 4206 HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); 4207 HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); 4208 HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); 4209 HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); 4210 HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); 4211 HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); 4212 HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); 4213 HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); 4214 HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); 4215 HS_CMP_XCHG(r16, r17); 4216 HS_CMP_XCHG(r1, r9); 4217 HS_CMP_XCHG(r5, r13); 4218 HS_CMP_XCHG(r1, r5); 4219 HS_CMP_XCHG(r9, r13); 4220 HS_CMP_XCHG(r3, r11); 4221 HS_CMP_XCHG(r7, r15); 4222 HS_CMP_XCHG(r3, r7); 4223 HS_CMP_XCHG(r11, r15); 4224 HS_CMP_XCHG(r1, r3); 4225 HS_CMP_XCHG(r5, r7); 4226 HS_CMP_XCHG(r9, r11); 4227 HS_CMP_XCHG(r13, r15); 4228 HS_CMP_XCHG(r2, r10); 4229 HS_CMP_XCHG(r6, r14); 4230 HS_CMP_XCHG(r2, r6); 4231 HS_CMP_XCHG(r10, r14); 4232 HS_CMP_XCHG(r4, r12); 4233 HS_CMP_XCHG(r8, r16); 4234 HS_CMP_XCHG(r4, r8); 4235 HS_CMP_XCHG(r12, r16); 4236 
HS_CMP_XCHG(r2, r4); 4237 HS_CMP_XCHG(r6, r8); 4238 HS_CMP_XCHG(r10, r12); 4239 HS_CMP_XCHG(r14, r16); 4240 HS_CMP_XCHG(r1, r2); 4241 HS_CMP_XCHG(r3, r4); 4242 HS_CMP_XCHG(r5, r6); 4243 HS_CMP_XCHG(r7, r8); 4244 HS_CMP_XCHG(r9, r10); 4245 HS_CMP_XCHG(r11, r12); 4246 HS_CMP_XCHG(r13, r14); 4247 HS_CMP_XCHG(r15, r16); 4248 HS_XM_GLOBAL_STORE_L(0, r1); 4249 HS_XM_GLOBAL_STORE_L(1, r2); 4250 HS_XM_GLOBAL_STORE_L(2, r3); 4251 HS_XM_GLOBAL_STORE_L(3, r4); 4252 HS_XM_GLOBAL_STORE_L(4, r5); 4253 HS_XM_GLOBAL_STORE_L(5, r6); 4254 HS_XM_GLOBAL_STORE_L(6, r7); 4255 HS_XM_GLOBAL_STORE_L(7, r8); 4256 HS_XM_GLOBAL_STORE_L(8, r9); 4257 HS_XM_GLOBAL_STORE_L(9, r10); 4258 HS_XM_GLOBAL_STORE_L(10, r11); 4259 HS_XM_GLOBAL_STORE_L(11, r12); 4260 HS_XM_GLOBAL_STORE_L(12, r13); 4261 HS_XM_GLOBAL_STORE_L(13, r14); 4262 HS_XM_GLOBAL_STORE_L(14, r15); 4263 HS_XM_GLOBAL_STORE_L(15, r16); 4264 HS_FM_GLOBAL_STORE_R(0, r17); 4265} 4266 /* HS_FM_KERNEL_PROTO(1, 1): after HS_FM_PREAMBLE(16), reads r1..r16 with HS_XM_GLOBAL_LOAD_L(0..15) and r17, r18 with HS_FM_GLOBAL_LOAD_R(0..1). It first compare-exchanges the crossing pairs (r16, r17) and (r15, r18), then runs 32 HS_CMP_XCHG steps over r1..r16 and one over (r17, r18), and finally writes r1..r16 with HS_XM_GLOBAL_STORE_L(0..15) and r17, r18 with HS_FM_GLOBAL_STORE_R(0..1). NOTE(review): the HS_* macros are defined outside this chunk; presumably a generated flip-merge step - confirm against the generator. */4267HS_FM_KERNEL_PROTO(1, 1) 4268{ 4269 HS_FM_PREAMBLE(16); 4270 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 4271 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 4272 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 4273 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 4274 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 4275 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 4276 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 4277 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 4278 HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); 4279 HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); 4280 HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); 4281 HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); 4282 HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); 4283 HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); 4284 HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); 4285 HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); 4286 HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); 4287 HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1); 4288 HS_CMP_XCHG(r16, r17); 4289 HS_CMP_XCHG(r15, r18); 4290 HS_CMP_XCHG(r1, r9); 4291 HS_CMP_XCHG(r5, r13); 4292 HS_CMP_XCHG(r1, r5); 4293 HS_CMP_XCHG(r9, r13); 4294 
HS_CMP_XCHG(r3, r11); 4295 HS_CMP_XCHG(r7, r15); 4296 HS_CMP_XCHG(r3, r7); 4297 HS_CMP_XCHG(r11, r15); 4298 HS_CMP_XCHG(r1, r3); 4299 HS_CMP_XCHG(r5, r7); 4300 HS_CMP_XCHG(r9, r11); 4301 HS_CMP_XCHG(r13, r15); 4302 HS_CMP_XCHG(r2, r10); 4303 HS_CMP_XCHG(r6, r14); 4304 HS_CMP_XCHG(r2, r6); 4305 HS_CMP_XCHG(r10, r14); 4306 HS_CMP_XCHG(r4, r12); 4307 HS_CMP_XCHG(r8, r16); 4308 HS_CMP_XCHG(r4, r8); 4309 HS_CMP_XCHG(r12, r16); 4310 HS_CMP_XCHG(r2, r4); 4311 HS_CMP_XCHG(r6, r8); 4312 HS_CMP_XCHG(r10, r12); 4313 HS_CMP_XCHG(r14, r16); 4314 HS_CMP_XCHG(r1, r2); 4315 HS_CMP_XCHG(r3, r4); 4316 HS_CMP_XCHG(r5, r6); 4317 HS_CMP_XCHG(r7, r8); 4318 HS_CMP_XCHG(r9, r10); 4319 HS_CMP_XCHG(r11, r12); 4320 HS_CMP_XCHG(r13, r14); 4321 HS_CMP_XCHG(r15, r16); 4322 HS_CMP_XCHG(r17, r18); 4323 HS_XM_GLOBAL_STORE_L(0, r1); 4324 HS_XM_GLOBAL_STORE_L(1, r2); 4325 HS_XM_GLOBAL_STORE_L(2, r3); 4326 HS_XM_GLOBAL_STORE_L(3, r4); 4327 HS_XM_GLOBAL_STORE_L(4, r5); 4328 HS_XM_GLOBAL_STORE_L(5, r6); 4329 HS_XM_GLOBAL_STORE_L(6, r7); 4330 HS_XM_GLOBAL_STORE_L(7, r8); 4331 HS_XM_GLOBAL_STORE_L(8, r9); 4332 HS_XM_GLOBAL_STORE_L(9, r10); 4333 HS_XM_GLOBAL_STORE_L(10, r11); 4334 HS_XM_GLOBAL_STORE_L(11, r12); 4335 HS_XM_GLOBAL_STORE_L(12, r13); 4336 HS_XM_GLOBAL_STORE_L(13, r14); 4337 HS_XM_GLOBAL_STORE_L(14, r15); 4338 HS_XM_GLOBAL_STORE_L(15, r16); 4339 HS_FM_GLOBAL_STORE_R(0, r17); 4340 HS_FM_GLOBAL_STORE_R(1, r18); 4341} 4342 /* HS_FM_KERNEL_PROTO(1, 2): after HS_FM_PREAMBLE(16), reads r1..r16 with HS_XM_GLOBAL_LOAD_L(0..15) and r17..r20 with HS_FM_GLOBAL_LOAD_R(0..3). It first compare-exchanges the four crossing pairs (r16, r17), (r15, r18), (r14, r19), (r13, r20), then runs 32 HS_CMP_XCHG steps over r1..r16 and four over r17..r20, and finally writes r1..r16 with HS_XM_GLOBAL_STORE_L(0..15) and r17..r20 with HS_FM_GLOBAL_STORE_R(0..3). NOTE(review): the HS_* macros are defined outside this chunk; presumably a generated flip-merge step - confirm against the generator. */4343HS_FM_KERNEL_PROTO(1, 2) 4344{ 4345 HS_FM_PREAMBLE(16); 4346 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 4347 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 4348 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 4349 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 4350 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 4351 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 4352 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 4353 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 4354 HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); 4355 HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); 4356 HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); 4357 
HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); 4358 HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); 4359 HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); 4360 HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); 4361 HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); 4362 HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); 4363 HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1); 4364 HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2); 4365 HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3); /* Crossing compare-exchanges first: the last L registers are paired with the R registers in mirrored order. */4366 HS_CMP_XCHG(r16, r17); 4367 HS_CMP_XCHG(r15, r18); 4368 HS_CMP_XCHG(r14, r19); 4369 HS_CMP_XCHG(r13, r20); 4370 HS_CMP_XCHG(r1, r9); 4371 HS_CMP_XCHG(r5, r13); 4372 HS_CMP_XCHG(r1, r5); 4373 HS_CMP_XCHG(r9, r13); 4374 HS_CMP_XCHG(r3, r11); 4375 HS_CMP_XCHG(r7, r15); 4376 HS_CMP_XCHG(r3, r7); 4377 HS_CMP_XCHG(r11, r15); 4378 HS_CMP_XCHG(r1, r3); 4379 HS_CMP_XCHG(r5, r7); 4380 HS_CMP_XCHG(r9, r11); 4381 HS_CMP_XCHG(r13, r15); 4382 HS_CMP_XCHG(r2, r10); 4383 HS_CMP_XCHG(r6, r14); 4384 HS_CMP_XCHG(r2, r6); 4385 HS_CMP_XCHG(r10, r14); 4386 HS_CMP_XCHG(r4, r12); 4387 HS_CMP_XCHG(r8, r16); 4388 HS_CMP_XCHG(r4, r8); 4389 HS_CMP_XCHG(r12, r16); 4390 HS_CMP_XCHG(r2, r4); 4391 HS_CMP_XCHG(r6, r8); 4392 HS_CMP_XCHG(r10, r12); 4393 HS_CMP_XCHG(r14, r16); 4394 HS_CMP_XCHG(r1, r2); 4395 HS_CMP_XCHG(r3, r4); 4396 HS_CMP_XCHG(r5, r6); 4397 HS_CMP_XCHG(r7, r8); 4398 HS_CMP_XCHG(r9, r10); 4399 HS_CMP_XCHG(r11, r12); 4400 HS_CMP_XCHG(r13, r14); 4401 HS_CMP_XCHG(r15, r16); 4402 HS_CMP_XCHG(r17, r19); 4403 HS_CMP_XCHG(r18, r20); 4404 HS_CMP_XCHG(r17, r18); 4405 HS_CMP_XCHG(r19, r20); 4406 HS_XM_GLOBAL_STORE_L(0, r1); 4407 HS_XM_GLOBAL_STORE_L(1, r2); 4408 HS_XM_GLOBAL_STORE_L(2, r3); 4409 HS_XM_GLOBAL_STORE_L(3, r4); 4410 HS_XM_GLOBAL_STORE_L(4, r5); 4411 HS_XM_GLOBAL_STORE_L(5, r6); 4412 HS_XM_GLOBAL_STORE_L(6, r7); 4413 HS_XM_GLOBAL_STORE_L(7, r8); 4414 HS_XM_GLOBAL_STORE_L(8, r9); 4415 HS_XM_GLOBAL_STORE_L(9, r10); 4416 HS_XM_GLOBAL_STORE_L(10, r11); 4417 HS_XM_GLOBAL_STORE_L(11, r12); 4418 HS_XM_GLOBAL_STORE_L(12, r13); 4419 HS_XM_GLOBAL_STORE_L(13, r14); 4420 
HS_XM_GLOBAL_STORE_L(14, r15); 4421 HS_XM_GLOBAL_STORE_L(15, r16); 4422 HS_FM_GLOBAL_STORE_R(0, r17); 4423 HS_FM_GLOBAL_STORE_R(1, r18); 4424 HS_FM_GLOBAL_STORE_R(2, r19); 4425 HS_FM_GLOBAL_STORE_R(3, r20); 4426} 4427 /* HS_FM_KERNEL_PROTO(1, 3): after HS_FM_PREAMBLE(16), reads r1..r16 with HS_XM_GLOBAL_LOAD_L(0..15) and r17..r24 with HS_FM_GLOBAL_LOAD_R(0..7). It first compare-exchanges the eight crossing pairs (r16, r17), (r15, r18), ..., (r9, r24), then runs 32 HS_CMP_XCHG steps over r1..r16 and twelve over r17..r24, and finally writes r1..r16 with HS_XM_GLOBAL_STORE_L(0..15) and r17..r24 with HS_FM_GLOBAL_STORE_R(0..7). NOTE(review): the HS_* macros are defined outside this chunk; presumably a generated flip-merge step - confirm against the generator. */4428HS_FM_KERNEL_PROTO(1, 3) 4429{ 4430 HS_FM_PREAMBLE(16); 4431 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 4432 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 4433 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 4434 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 4435 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 4436 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 4437 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 4438 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 4439 HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); 4440 HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); 4441 HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); 4442 HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); 4443 HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); 4444 HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); 4445 HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); 4446 HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); 4447 HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); 4448 HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1); 4449 HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2); 4450 HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3); 4451 HS_KEY_TYPE r21 = HS_FM_GLOBAL_LOAD_R(4); 4452 HS_KEY_TYPE r22 = HS_FM_GLOBAL_LOAD_R(5); 4453 HS_KEY_TYPE r23 = HS_FM_GLOBAL_LOAD_R(6); 4454 HS_KEY_TYPE r24 = HS_FM_GLOBAL_LOAD_R(7); 4455 HS_CMP_XCHG(r16, r17); 4456 HS_CMP_XCHG(r15, r18); 4457 HS_CMP_XCHG(r14, r19); 4458 HS_CMP_XCHG(r13, r20); 4459 HS_CMP_XCHG(r12, r21); 4460 HS_CMP_XCHG(r11, r22); 4461 HS_CMP_XCHG(r10, r23); 4462 HS_CMP_XCHG(r9, r24); 4463 HS_CMP_XCHG(r1, r9); 4464 HS_CMP_XCHG(r5, r13); 4465 HS_CMP_XCHG(r1, r5); 4466 HS_CMP_XCHG(r9, r13); 4467 HS_CMP_XCHG(r3, r11); 4468 HS_CMP_XCHG(r7, r15); 4469 HS_CMP_XCHG(r3, r7); 4470 HS_CMP_XCHG(r11, r15); 4471 HS_CMP_XCHG(r1, r3); 4472 HS_CMP_XCHG(r5, r7); 4473 HS_CMP_XCHG(r9, r11); 4474 HS_CMP_XCHG(r13, r15); 4475 HS_CMP_XCHG(r2, r10); 4476 HS_CMP_XCHG(r6, 
r14); 4477 HS_CMP_XCHG(r2, r6); 4478 HS_CMP_XCHG(r10, r14); 4479 HS_CMP_XCHG(r4, r12); 4480 HS_CMP_XCHG(r8, r16); 4481 HS_CMP_XCHG(r4, r8); 4482 HS_CMP_XCHG(r12, r16); 4483 HS_CMP_XCHG(r2, r4); 4484 HS_CMP_XCHG(r6, r8); 4485 HS_CMP_XCHG(r10, r12); 4486 HS_CMP_XCHG(r14, r16); 4487 HS_CMP_XCHG(r1, r2); 4488 HS_CMP_XCHG(r3, r4); 4489 HS_CMP_XCHG(r5, r6); 4490 HS_CMP_XCHG(r7, r8); 4491 HS_CMP_XCHG(r9, r10); 4492 HS_CMP_XCHG(r11, r12); 4493 HS_CMP_XCHG(r13, r14); 4494 HS_CMP_XCHG(r15, r16); 4495 HS_CMP_XCHG(r17, r21); 4496 HS_CMP_XCHG(r19, r23); 4497 HS_CMP_XCHG(r17, r19); 4498 HS_CMP_XCHG(r21, r23); 4499 HS_CMP_XCHG(r18, r22); 4500 HS_CMP_XCHG(r20, r24); 4501 HS_CMP_XCHG(r18, r20); 4502 HS_CMP_XCHG(r22, r24); 4503 HS_CMP_XCHG(r17, r18); 4504 HS_CMP_XCHG(r19, r20); 4505 HS_CMP_XCHG(r21, r22); 4506 HS_CMP_XCHG(r23, r24); /* Write-back: r1..r16 through the L store macro, then r17..r24 through the R store macro. */4507 HS_XM_GLOBAL_STORE_L(0, r1); 4508 HS_XM_GLOBAL_STORE_L(1, r2); 4509 HS_XM_GLOBAL_STORE_L(2, r3); 4510 HS_XM_GLOBAL_STORE_L(3, r4); 4511 HS_XM_GLOBAL_STORE_L(4, r5); 4512 HS_XM_GLOBAL_STORE_L(5, r6); 4513 HS_XM_GLOBAL_STORE_L(6, r7); 4514 HS_XM_GLOBAL_STORE_L(7, r8); 4515 HS_XM_GLOBAL_STORE_L(8, r9); 4516 HS_XM_GLOBAL_STORE_L(9, r10); 4517 HS_XM_GLOBAL_STORE_L(10, r11); 4518 HS_XM_GLOBAL_STORE_L(11, r12); 4519 HS_XM_GLOBAL_STORE_L(12, r13); 4520 HS_XM_GLOBAL_STORE_L(13, r14); 4521 HS_XM_GLOBAL_STORE_L(14, r15); 4522 HS_XM_GLOBAL_STORE_L(15, r16); 4523 HS_FM_GLOBAL_STORE_R(0, r17); 4524 HS_FM_GLOBAL_STORE_R(1, r18); 4525 HS_FM_GLOBAL_STORE_R(2, r19); 4526 HS_FM_GLOBAL_STORE_R(3, r20); 4527 HS_FM_GLOBAL_STORE_R(4, r21); 4528 HS_FM_GLOBAL_STORE_R(5, r22); 4529 HS_FM_GLOBAL_STORE_R(6, r23); 4530 HS_FM_GLOBAL_STORE_R(7, r24); 4531} 4532 4533HS_FM_KERNEL_PROTO(1, 4) 4534{ 4535 HS_FM_PREAMBLE(16); 4536 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 4537 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 4538 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 4539 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 4540 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 4541 HS_KEY_TYPE r6 = 
HS_XM_GLOBAL_LOAD_L(5); 4542 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 4543 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 4544 HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); 4545 HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); 4546 HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); 4547 HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); 4548 HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); 4549 HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); 4550 HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); 4551 HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); 4552 HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); 4553 HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1); 4554 HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2); 4555 HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3); 4556 HS_KEY_TYPE r21 = HS_FM_GLOBAL_LOAD_R(4); 4557 HS_KEY_TYPE r22 = HS_FM_GLOBAL_LOAD_R(5); 4558 HS_KEY_TYPE r23 = HS_FM_GLOBAL_LOAD_R(6); 4559 HS_KEY_TYPE r24 = HS_FM_GLOBAL_LOAD_R(7); 4560 HS_KEY_TYPE r25 = HS_FM_GLOBAL_LOAD_R(8); 4561 HS_KEY_TYPE r26 = HS_FM_GLOBAL_LOAD_R(9); 4562 HS_KEY_TYPE r27 = HS_FM_GLOBAL_LOAD_R(10); 4563 HS_KEY_TYPE r28 = HS_FM_GLOBAL_LOAD_R(11); 4564 HS_KEY_TYPE r29 = HS_FM_GLOBAL_LOAD_R(12); 4565 HS_KEY_TYPE r30 = HS_FM_GLOBAL_LOAD_R(13); 4566 HS_KEY_TYPE r31 = HS_FM_GLOBAL_LOAD_R(14); 4567 HS_KEY_TYPE r32 = HS_FM_GLOBAL_LOAD_R(15); 4568 HS_CMP_XCHG(r16, r17); 4569 HS_CMP_XCHG(r15, r18); 4570 HS_CMP_XCHG(r14, r19); 4571 HS_CMP_XCHG(r13, r20); 4572 HS_CMP_XCHG(r12, r21); 4573 HS_CMP_XCHG(r11, r22); 4574 HS_CMP_XCHG(r10, r23); 4575 HS_CMP_XCHG(r9, r24); 4576 HS_CMP_XCHG(r8, r25); 4577 HS_CMP_XCHG(r7, r26); 4578 HS_CMP_XCHG(r6, r27); 4579 HS_CMP_XCHG(r5, r28); 4580 HS_CMP_XCHG(r4, r29); 4581 HS_CMP_XCHG(r3, r30); 4582 HS_CMP_XCHG(r2, r31); 4583 HS_CMP_XCHG(r1, r32); 4584 HS_CMP_XCHG(r1, r9); 4585 HS_CMP_XCHG(r5, r13); 4586 HS_CMP_XCHG(r1, r5); 4587 HS_CMP_XCHG(r9, r13); 4588 HS_CMP_XCHG(r3, r11); 4589 HS_CMP_XCHG(r7, r15); 4590 HS_CMP_XCHG(r3, r7); 4591 HS_CMP_XCHG(r11, r15); 4592 HS_CMP_XCHG(r1, r3); 4593 HS_CMP_XCHG(r5, r7); 4594 HS_CMP_XCHG(r9, r11); 4595 
HS_CMP_XCHG(r13, r15); 4596 HS_CMP_XCHG(r2, r10); 4597 HS_CMP_XCHG(r6, r14); 4598 HS_CMP_XCHG(r2, r6); 4599 HS_CMP_XCHG(r10, r14); 4600 HS_CMP_XCHG(r4, r12); 4601 HS_CMP_XCHG(r8, r16); 4602 HS_CMP_XCHG(r4, r8); 4603 HS_CMP_XCHG(r12, r16); 4604 HS_CMP_XCHG(r2, r4); 4605 HS_CMP_XCHG(r6, r8); 4606 HS_CMP_XCHG(r10, r12); 4607 HS_CMP_XCHG(r14, r16); 4608 HS_CMP_XCHG(r1, r2); 4609 HS_CMP_XCHG(r3, r4); 4610 HS_CMP_XCHG(r5, r6); 4611 HS_CMP_XCHG(r7, r8); 4612 HS_CMP_XCHG(r9, r10); 4613 HS_CMP_XCHG(r11, r12); 4614 HS_CMP_XCHG(r13, r14); 4615 HS_CMP_XCHG(r15, r16); 4616 HS_CMP_XCHG(r17, r25); 4617 HS_CMP_XCHG(r21, r29); 4618 HS_CMP_XCHG(r17, r21); 4619 HS_CMP_XCHG(r25, r29); 4620 HS_CMP_XCHG(r19, r27); 4621 HS_CMP_XCHG(r23, r31); 4622 HS_CMP_XCHG(r19, r23); 4623 HS_CMP_XCHG(r27, r31); 4624 HS_CMP_XCHG(r17, r19); 4625 HS_CMP_XCHG(r21, r23); 4626 HS_CMP_XCHG(r25, r27); 4627 HS_CMP_XCHG(r29, r31); 4628 HS_CMP_XCHG(r18, r26); 4629 HS_CMP_XCHG(r22, r30); 4630 HS_CMP_XCHG(r18, r22); 4631 HS_CMP_XCHG(r26, r30); 4632 HS_CMP_XCHG(r20, r28); 4633 HS_CMP_XCHG(r24, r32); 4634 HS_CMP_XCHG(r20, r24); 4635 HS_CMP_XCHG(r28, r32); 4636 HS_CMP_XCHG(r18, r20); 4637 HS_CMP_XCHG(r22, r24); 4638 HS_CMP_XCHG(r26, r28); 4639 HS_CMP_XCHG(r30, r32); 4640 HS_CMP_XCHG(r17, r18); 4641 HS_CMP_XCHG(r19, r20); 4642 HS_CMP_XCHG(r21, r22); 4643 HS_CMP_XCHG(r23, r24); 4644 HS_CMP_XCHG(r25, r26); 4645 HS_CMP_XCHG(r27, r28); 4646 HS_CMP_XCHG(r29, r30); 4647 HS_CMP_XCHG(r31, r32); 4648 HS_XM_GLOBAL_STORE_L(0, r1); 4649 HS_XM_GLOBAL_STORE_L(1, r2); 4650 HS_XM_GLOBAL_STORE_L(2, r3); 4651 HS_XM_GLOBAL_STORE_L(3, r4); 4652 HS_XM_GLOBAL_STORE_L(4, r5); 4653 HS_XM_GLOBAL_STORE_L(5, r6); 4654 HS_XM_GLOBAL_STORE_L(6, r7); 4655 HS_XM_GLOBAL_STORE_L(7, r8); 4656 HS_XM_GLOBAL_STORE_L(8, r9); 4657 HS_XM_GLOBAL_STORE_L(9, r10); 4658 HS_XM_GLOBAL_STORE_L(10, r11); 4659 HS_XM_GLOBAL_STORE_L(11, r12); 4660 HS_XM_GLOBAL_STORE_L(12, r13); 4661 HS_XM_GLOBAL_STORE_L(13, r14); 4662 HS_XM_GLOBAL_STORE_L(14, r15); 4663 
HS_XM_GLOBAL_STORE_L(15, r16); 4664 HS_FM_GLOBAL_STORE_R(0, r17); 4665 HS_FM_GLOBAL_STORE_R(1, r18); 4666 HS_FM_GLOBAL_STORE_R(2, r19); 4667 HS_FM_GLOBAL_STORE_R(3, r20); 4668 HS_FM_GLOBAL_STORE_R(4, r21); 4669 HS_FM_GLOBAL_STORE_R(5, r22); 4670 HS_FM_GLOBAL_STORE_R(6, r23); 4671 HS_FM_GLOBAL_STORE_R(7, r24); 4672 HS_FM_GLOBAL_STORE_R(8, r25); 4673 HS_FM_GLOBAL_STORE_R(9, r26); 4674 HS_FM_GLOBAL_STORE_R(10, r27); 4675 HS_FM_GLOBAL_STORE_R(11, r28); 4676 HS_FM_GLOBAL_STORE_R(12, r29); 4677 HS_FM_GLOBAL_STORE_R(13, r30); 4678 HS_FM_GLOBAL_STORE_R(14, r31); 4679 HS_FM_GLOBAL_STORE_R(15, r32); 4680} 4681 /* Kernel introduced by HS_HM_KERNEL_PROTO(1). It loads 32 keys at indices 0..31, all via HS_XM_GLOBAL_LOAD_L, applies a fixed sequence of HS_CMP_XCHG steps, and stores them back at the same indices with HS_XM_GLOBAL_STORE_L. It uses no _R loads and has no mirrored cross step. NOTE(review): every HS_* macro is defined outside this chunk (presumably in the hs_config.h and hs_cl_macros.h includes); "HM" presumably denotes a half-merge stage and HS_CMP_XCHG a compare-exchange - confirm there. Later steps reuse registers touched by earlier ones, so keep the statement order as is. */ 4682HS_HM_KERNEL_PROTO(1) 4683{ 4684 HS_HM_PREAMBLE(16); /* r1..r32: indices 0..31 read with HS_XM_GLOBAL_LOAD_L. */ 4685 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 4686 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 4687 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 4688 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 4689 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 4690 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 4691 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 4692 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 4693 HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); 4694 HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); 4695 HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); 4696 HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); 4697 HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); 4698 HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); 4699 HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); 4700 HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); 4701 HS_KEY_TYPE r17 = HS_XM_GLOBAL_LOAD_L(16); 4702 HS_KEY_TYPE r18 = HS_XM_GLOBAL_LOAD_L(17); 4703 HS_KEY_TYPE r19 = HS_XM_GLOBAL_LOAD_L(18); 4704 HS_KEY_TYPE r20 = HS_XM_GLOBAL_LOAD_L(19); 4705 HS_KEY_TYPE r21 = HS_XM_GLOBAL_LOAD_L(20); 4706 HS_KEY_TYPE r22 = HS_XM_GLOBAL_LOAD_L(21); 4707 HS_KEY_TYPE r23 = HS_XM_GLOBAL_LOAD_L(22); 4708 HS_KEY_TYPE r24 = HS_XM_GLOBAL_LOAD_L(23); 4709 HS_KEY_TYPE r25 = HS_XM_GLOBAL_LOAD_L(24); 4710 HS_KEY_TYPE r26 = HS_XM_GLOBAL_LOAD_L(25); 4711 HS_KEY_TYPE r27 = HS_XM_GLOBAL_LOAD_L(26); 4712 HS_KEY_TYPE r28 = HS_XM_GLOBAL_LOAD_L(27); 4713 
HS_KEY_TYPE r29 = HS_XM_GLOBAL_LOAD_L(28); 4714 HS_KEY_TYPE r30 = HS_XM_GLOBAL_LOAD_L(29); 4715 HS_KEY_TYPE r31 = HS_XM_GLOBAL_LOAD_L(30); 4716 HS_KEY_TYPE r32 = HS_XM_GLOBAL_LOAD_L(31); /* Odd-numbered registers: strides 16, 8 and 4 over r1, r5, ... r29, the same over r3, r7, ... r31, then stride 2 across all odd registers. */ 4717 HS_CMP_XCHG(r1, r17); 4718 HS_CMP_XCHG(r9, r25); 4719 HS_CMP_XCHG(r1, r9); 4720 HS_CMP_XCHG(r17, r25); 4721 HS_CMP_XCHG(r5, r21); 4722 HS_CMP_XCHG(r13, r29); 4723 HS_CMP_XCHG(r5, r13); 4724 HS_CMP_XCHG(r21, r29); 4725 HS_CMP_XCHG(r1, r5); 4726 HS_CMP_XCHG(r9, r13); 4727 HS_CMP_XCHG(r17, r21); 4728 HS_CMP_XCHG(r25, r29); 4729 HS_CMP_XCHG(r3, r19); 4730 HS_CMP_XCHG(r11, r27); 4731 HS_CMP_XCHG(r3, r11); 4732 HS_CMP_XCHG(r19, r27); 4733 HS_CMP_XCHG(r7, r23); 4734 HS_CMP_XCHG(r15, r31); 4735 HS_CMP_XCHG(r7, r15); 4736 HS_CMP_XCHG(r23, r31); 4737 HS_CMP_XCHG(r3, r7); 4738 HS_CMP_XCHG(r11, r15); 4739 HS_CMP_XCHG(r19, r23); 4740 HS_CMP_XCHG(r27, r31); 4741 HS_CMP_XCHG(r1, r3); 4742 HS_CMP_XCHG(r5, r7); 4743 HS_CMP_XCHG(r9, r11); 4744 HS_CMP_XCHG(r13, r15); 4745 HS_CMP_XCHG(r17, r19); 4746 HS_CMP_XCHG(r21, r23); 4747 HS_CMP_XCHG(r25, r27); 4748 HS_CMP_XCHG(r29, r31); /* Even-numbered registers, same pattern: r2, r6, ... r30, then r4, r8, ... r32, then stride 2 across all even registers. */ 4749 HS_CMP_XCHG(r2, r18); 4750 HS_CMP_XCHG(r10, r26); 4751 HS_CMP_XCHG(r2, r10); 4752 HS_CMP_XCHG(r18, r26); 4753 HS_CMP_XCHG(r6, r22); 4754 HS_CMP_XCHG(r14, r30); 4755 HS_CMP_XCHG(r6, r14); 4756 HS_CMP_XCHG(r22, r30); 4757 HS_CMP_XCHG(r2, r6); 4758 HS_CMP_XCHG(r10, r14); 4759 HS_CMP_XCHG(r18, r22); 4760 HS_CMP_XCHG(r26, r30); 4761 HS_CMP_XCHG(r4, r20); 4762 HS_CMP_XCHG(r12, r28); 4763 HS_CMP_XCHG(r4, r12); 4764 HS_CMP_XCHG(r20, r28); 4765 HS_CMP_XCHG(r8, r24); 4766 HS_CMP_XCHG(r16, r32); 4767 HS_CMP_XCHG(r8, r16); 4768 HS_CMP_XCHG(r24, r32); 4769 HS_CMP_XCHG(r4, r8); 4770 HS_CMP_XCHG(r12, r16); 4771 HS_CMP_XCHG(r20, r24); 4772 HS_CMP_XCHG(r28, r32); 4773 HS_CMP_XCHG(r2, r4); 4774 HS_CMP_XCHG(r6, r8); 4775 HS_CMP_XCHG(r10, r12); 4776 HS_CMP_XCHG(r14, r16); 4777 HS_CMP_XCHG(r18, r20); 4778 HS_CMP_XCHG(r22, r24); 4779 HS_CMP_XCHG(r26, r28); 4780 HS_CMP_XCHG(r30, r32); /* Adjacent pairs (r1,r2) ... (r31,r32). */ 4781 HS_CMP_XCHG(r1, r2); 4782 HS_CMP_XCHG(r3, r4); 4783 
HS_CMP_XCHG(r5, r6); 4784 HS_CMP_XCHG(r7, r8); 4785 HS_CMP_XCHG(r9, r10); 4786 HS_CMP_XCHG(r11, r12); 4787 HS_CMP_XCHG(r13, r14); 4788 HS_CMP_XCHG(r15, r16); 4789 HS_CMP_XCHG(r17, r18); 4790 HS_CMP_XCHG(r19, r20); 4791 HS_CMP_XCHG(r21, r22); 4792 HS_CMP_XCHG(r23, r24); 4793 HS_CMP_XCHG(r25, r26); 4794 HS_CMP_XCHG(r27, r28); 4795 HS_CMP_XCHG(r29, r30); 4796 HS_CMP_XCHG(r31, r32); /* Store r1..r32 at indices 0..31 with HS_XM_GLOBAL_STORE_L. */ 4797 HS_XM_GLOBAL_STORE_L(0, r1); 4798 HS_XM_GLOBAL_STORE_L(1, r2); 4799 HS_XM_GLOBAL_STORE_L(2, r3); 4800 HS_XM_GLOBAL_STORE_L(3, r4); 4801 HS_XM_GLOBAL_STORE_L(4, r5); 4802 HS_XM_GLOBAL_STORE_L(5, r6); 4803 HS_XM_GLOBAL_STORE_L(6, r7); 4804 HS_XM_GLOBAL_STORE_L(7, r8); 4805 HS_XM_GLOBAL_STORE_L(8, r9); 4806 HS_XM_GLOBAL_STORE_L(9, r10); 4807 HS_XM_GLOBAL_STORE_L(10, r11); 4808 HS_XM_GLOBAL_STORE_L(11, r12); 4809 HS_XM_GLOBAL_STORE_L(12, r13); 4810 HS_XM_GLOBAL_STORE_L(13, r14); 4811 HS_XM_GLOBAL_STORE_L(14, r15); 4812 HS_XM_GLOBAL_STORE_L(15, r16); 4813 HS_XM_GLOBAL_STORE_L(16, r17); 4814 HS_XM_GLOBAL_STORE_L(17, r18); 4815 HS_XM_GLOBAL_STORE_L(18, r19); 4816 HS_XM_GLOBAL_STORE_L(19, r20); 4817 HS_XM_GLOBAL_STORE_L(20, r21); 4818 HS_XM_GLOBAL_STORE_L(21, r22); 4819 HS_XM_GLOBAL_STORE_L(22, r23); 4820 HS_XM_GLOBAL_STORE_L(23, r24); 4821 HS_XM_GLOBAL_STORE_L(24, r25); 4822 HS_XM_GLOBAL_STORE_L(25, r26); 4823 HS_XM_GLOBAL_STORE_L(26, r27); 4824 HS_XM_GLOBAL_STORE_L(27, r28); 4825 HS_XM_GLOBAL_STORE_L(28, r29); 4826 HS_XM_GLOBAL_STORE_L(29, r30); 4827 HS_XM_GLOBAL_STORE_L(30, r31); 4828 HS_XM_GLOBAL_STORE_L(31, r32); 4829} 4830 /* Kernel introduced by HS_TRANSPOSE_KERNEL_PROTO(). It reads 16 keys into r1..r16 with HS_SLAB_GLOBAL_LOAD(vout, 0..15) and then invokes HS_TRANSPOSE_SLAB(); no explicit store appears in this body. NOTE(review): every HS_* macro is defined outside this chunk (presumably in the hs_config.h and hs_cl_macros.h includes) - confirm there what the preamble declares and what vout names. */ 4831HS_TRANSPOSE_KERNEL_PROTO() 4832{ 4833 HS_SLAB_GLOBAL_PREAMBLE(); /* r1..r16: indices 0..15 read with HS_SLAB_GLOBAL_LOAD, passing vout as the first argument. */ 4834 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0); 4835 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1); 4836 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2); 4837 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3); 4838 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4); 4839 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5); 4840 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6); 4841 HS_KEY_TYPE r8 = 
HS_SLAB_GLOBAL_LOAD(vout, 7); 4842 HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8); 4843 HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 9); 4844 HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 10); 4845 HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 11); 4846 HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 12); 4847 HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 13); 4848 HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 14); 4849 HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 15); /* HS_TRANSPOSE_SLAB() receives no arguments although r1..r16 are not used anywhere else in this body, so it presumably refers to those registers by name and performs the write-back itself - TODO confirm in its definition before renaming any register. The invocation is written without a trailing semicolon. */ 4850 HS_TRANSPOSE_SLAB() 4851} 4852 4853// 4854// 4855// 4856