1 //
2 // Copyright 2016 Google Inc.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file.
6 //
7
8 #ifdef __cplusplus
9 extern "C"
10 {
11 #endif
12
13 #include "hs_cuda.h"
14
15 #ifdef __cplusplus
16 }
17 #endif
18
19 #include "hs_cuda_config.h"
20
21 #include "../hs_cuda_macros.h"
22
23 //
24 //
25 //
26
//
// Block-sort kernel body for the (1, 0) instantiation of
// HS_OFFSET_BS_KERNEL_PROTO.
//
// All HS_* macros are defined outside this file (hs_cuda_config.h /
// hs_cuda_macros.h); notes below describe only what is visible here.
// NOTE(review): HS_CMP_XCHG(a, b) is presumably a compare-exchange of two
// registers, and HS_CMP_FLIP / HS_CMP_HALF presumably exchange keys between
// threads of a slab -- confirm against hs_cuda_macros.h.
//
// Visible structure:
//   1. load eight keys r1..r8 through HS_SLAB_GLOBAL_LOAD(vin, 0..7)
//   2. run a fixed 19-step HS_CMP_XCHG sequence on the registers
//   3. run five stages with HS_SLAB_FLIP_PREAMBLE arguments 1, 3, 7, 15, 31;
//      each stage is a FLIP step, then HALF steps with halving arguments
//      down to 1, then the same 12-step HS_CMP_XCHG sequence
//   4. write r1..r8 back through HS_SLAB_GLOBAL_STORE(0..7)
//
// This variant uses no HS_BLOCK_LOCAL_MEM_DECL and no HS_BLOCK_BARRIER.
// The statement order is significant -- do not reorder.
//
27 HS_OFFSET_BS_KERNEL_PROTO(1, 0)
28 {
29 HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
// Load eight keys into registers r1..r8 (indices 0..7 of vin).
30 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
31 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
32 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
33 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
34 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
35 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
36 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
37 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
// Initial 19-step comparator sequence over r1..r8.
// NOTE(review): looks like an 8-input sorting network -- confirm.
38 HS_CMP_XCHG(r1, r5);
39 HS_CMP_XCHG(r2, r6);
40 HS_CMP_XCHG(r3, r7);
41 HS_CMP_XCHG(r4, r8);
42 HS_CMP_XCHG(r1, r3);
43 HS_CMP_XCHG(r2, r4);
44 HS_CMP_XCHG(r5, r7);
45 HS_CMP_XCHG(r6, r8);
46 HS_CMP_XCHG(r3, r5);
47 HS_CMP_XCHG(r4, r6);
48 HS_CMP_XCHG(r1, r2);
49 HS_CMP_XCHG(r3, r4);
50 HS_CMP_XCHG(r5, r6);
51 HS_CMP_XCHG(r7, r8);
52 HS_CMP_XCHG(r2, r5);
53 HS_CMP_XCHG(r4, r7);
54 HS_CMP_XCHG(r2, r3);
55 HS_CMP_XCHG(r4, r5);
56 HS_CMP_XCHG(r6, r7);
// Stage 1: FLIP(1). Registers are paired in mirrored order
// (r1/r8, r2/r7, r3/r6, r4/r5).
57 {
58 HS_SLAB_FLIP_PREAMBLE(1);
59 HS_CMP_FLIP(0, r1, r8);
60 HS_CMP_FLIP(1, r2, r7);
61 HS_CMP_FLIP(2, r3, r6);
62 HS_CMP_FLIP(3, r4, r5);
63 }
// 12-step register sequence (distances 4, 2, 1 within odd and even
// registers, then adjacent pairs); repeated verbatim after every stage.
64 HS_CMP_XCHG(r1, r5);
65 HS_CMP_XCHG(r3, r7);
66 HS_CMP_XCHG(r1, r3);
67 HS_CMP_XCHG(r5, r7);
68 HS_CMP_XCHG(r2, r6);
69 HS_CMP_XCHG(r4, r8);
70 HS_CMP_XCHG(r2, r4);
71 HS_CMP_XCHG(r6, r8);
72 HS_CMP_XCHG(r1, r2);
73 HS_CMP_XCHG(r3, r4);
74 HS_CMP_XCHG(r5, r6);
75 HS_CMP_XCHG(r7, r8);
// Stage 2: FLIP(3), HALF(1), then the 12-step register sequence.
76 {
77 HS_SLAB_FLIP_PREAMBLE(3);
78 HS_CMP_FLIP(0, r1, r8);
79 HS_CMP_FLIP(1, r2, r7);
80 HS_CMP_FLIP(2, r3, r6);
81 HS_CMP_FLIP(3, r4, r5);
82 }
83 {
84 HS_SLAB_HALF_PREAMBLE(1);
85 HS_CMP_HALF(0, r1);
86 HS_CMP_HALF(1, r2);
87 HS_CMP_HALF(2, r3);
88 HS_CMP_HALF(3, r4);
89 HS_CMP_HALF(4, r5);
90 HS_CMP_HALF(5, r6);
91 HS_CMP_HALF(6, r7);
92 HS_CMP_HALF(7, r8);
93 }
94 HS_CMP_XCHG(r1, r5);
95 HS_CMP_XCHG(r3, r7);
96 HS_CMP_XCHG(r1, r3);
97 HS_CMP_XCHG(r5, r7);
98 HS_CMP_XCHG(r2, r6);
99 HS_CMP_XCHG(r4, r8);
100 HS_CMP_XCHG(r2, r4);
101 HS_CMP_XCHG(r6, r8);
102 HS_CMP_XCHG(r1, r2);
103 HS_CMP_XCHG(r3, r4);
104 HS_CMP_XCHG(r5, r6);
105 HS_CMP_XCHG(r7, r8);
// Stage 3: FLIP(7), HALF(2), HALF(1), then the 12-step register sequence.
106 {
107 HS_SLAB_FLIP_PREAMBLE(7);
108 HS_CMP_FLIP(0, r1, r8);
109 HS_CMP_FLIP(1, r2, r7);
110 HS_CMP_FLIP(2, r3, r6);
111 HS_CMP_FLIP(3, r4, r5);
112 }
113 {
114 HS_SLAB_HALF_PREAMBLE(2);
115 HS_CMP_HALF(0, r1);
116 HS_CMP_HALF(1, r2);
117 HS_CMP_HALF(2, r3);
118 HS_CMP_HALF(3, r4);
119 HS_CMP_HALF(4, r5);
120 HS_CMP_HALF(5, r6);
121 HS_CMP_HALF(6, r7);
122 HS_CMP_HALF(7, r8);
123 }
124 {
125 HS_SLAB_HALF_PREAMBLE(1);
126 HS_CMP_HALF(0, r1);
127 HS_CMP_HALF(1, r2);
128 HS_CMP_HALF(2, r3);
129 HS_CMP_HALF(3, r4);
130 HS_CMP_HALF(4, r5);
131 HS_CMP_HALF(5, r6);
132 HS_CMP_HALF(6, r7);
133 HS_CMP_HALF(7, r8);
134 }
135 HS_CMP_XCHG(r1, r5);
136 HS_CMP_XCHG(r3, r7);
137 HS_CMP_XCHG(r1, r3);
138 HS_CMP_XCHG(r5, r7);
139 HS_CMP_XCHG(r2, r6);
140 HS_CMP_XCHG(r4, r8);
141 HS_CMP_XCHG(r2, r4);
142 HS_CMP_XCHG(r6, r8);
143 HS_CMP_XCHG(r1, r2);
144 HS_CMP_XCHG(r3, r4);
145 HS_CMP_XCHG(r5, r6);
146 HS_CMP_XCHG(r7, r8);
// Stage 4: FLIP(15), HALF(4), HALF(2), HALF(1), then the 12-step sequence.
147 {
148 HS_SLAB_FLIP_PREAMBLE(15);
149 HS_CMP_FLIP(0, r1, r8);
150 HS_CMP_FLIP(1, r2, r7);
151 HS_CMP_FLIP(2, r3, r6);
152 HS_CMP_FLIP(3, r4, r5);
153 }
154 {
155 HS_SLAB_HALF_PREAMBLE(4);
156 HS_CMP_HALF(0, r1);
157 HS_CMP_HALF(1, r2);
158 HS_CMP_HALF(2, r3);
159 HS_CMP_HALF(3, r4);
160 HS_CMP_HALF(4, r5);
161 HS_CMP_HALF(5, r6);
162 HS_CMP_HALF(6, r7);
163 HS_CMP_HALF(7, r8);
164 }
165 {
166 HS_SLAB_HALF_PREAMBLE(2);
167 HS_CMP_HALF(0, r1);
168 HS_CMP_HALF(1, r2);
169 HS_CMP_HALF(2, r3);
170 HS_CMP_HALF(3, r4);
171 HS_CMP_HALF(4, r5);
172 HS_CMP_HALF(5, r6);
173 HS_CMP_HALF(6, r7);
174 HS_CMP_HALF(7, r8);
175 }
176 {
177 HS_SLAB_HALF_PREAMBLE(1);
178 HS_CMP_HALF(0, r1);
179 HS_CMP_HALF(1, r2);
180 HS_CMP_HALF(2, r3);
181 HS_CMP_HALF(3, r4);
182 HS_CMP_HALF(4, r5);
183 HS_CMP_HALF(5, r6);
184 HS_CMP_HALF(6, r7);
185 HS_CMP_HALF(7, r8);
186 }
187 HS_CMP_XCHG(r1, r5);
188 HS_CMP_XCHG(r3, r7);
189 HS_CMP_XCHG(r1, r3);
190 HS_CMP_XCHG(r5, r7);
191 HS_CMP_XCHG(r2, r6);
192 HS_CMP_XCHG(r4, r8);
193 HS_CMP_XCHG(r2, r4);
194 HS_CMP_XCHG(r6, r8);
195 HS_CMP_XCHG(r1, r2);
196 HS_CMP_XCHG(r3, r4);
197 HS_CMP_XCHG(r5, r6);
198 HS_CMP_XCHG(r7, r8);
// Stage 5: FLIP(31), HALF(8), HALF(4), HALF(2), HALF(1), then the 12-step
// sequence.
199 {
200 HS_SLAB_FLIP_PREAMBLE(31);
201 HS_CMP_FLIP(0, r1, r8);
202 HS_CMP_FLIP(1, r2, r7);
203 HS_CMP_FLIP(2, r3, r6);
204 HS_CMP_FLIP(3, r4, r5);
205 }
206 {
207 HS_SLAB_HALF_PREAMBLE(8);
208 HS_CMP_HALF(0, r1);
209 HS_CMP_HALF(1, r2);
210 HS_CMP_HALF(2, r3);
211 HS_CMP_HALF(3, r4);
212 HS_CMP_HALF(4, r5);
213 HS_CMP_HALF(5, r6);
214 HS_CMP_HALF(6, r7);
215 HS_CMP_HALF(7, r8);
216 }
217 {
218 HS_SLAB_HALF_PREAMBLE(4);
219 HS_CMP_HALF(0, r1);
220 HS_CMP_HALF(1, r2);
221 HS_CMP_HALF(2, r3);
222 HS_CMP_HALF(3, r4);
223 HS_CMP_HALF(4, r5);
224 HS_CMP_HALF(5, r6);
225 HS_CMP_HALF(6, r7);
226 HS_CMP_HALF(7, r8);
227 }
228 {
229 HS_SLAB_HALF_PREAMBLE(2);
230 HS_CMP_HALF(0, r1);
231 HS_CMP_HALF(1, r2);
232 HS_CMP_HALF(2, r3);
233 HS_CMP_HALF(3, r4);
234 HS_CMP_HALF(4, r5);
235 HS_CMP_HALF(5, r6);
236 HS_CMP_HALF(6, r7);
237 HS_CMP_HALF(7, r8);
238 }
239 {
240 HS_SLAB_HALF_PREAMBLE(1);
241 HS_CMP_HALF(0, r1);
242 HS_CMP_HALF(1, r2);
243 HS_CMP_HALF(2, r3);
244 HS_CMP_HALF(3, r4);
245 HS_CMP_HALF(4, r5);
246 HS_CMP_HALF(5, r6);
247 HS_CMP_HALF(6, r7);
248 HS_CMP_HALF(7, r8);
249 }
250 HS_CMP_XCHG(r1, r5);
251 HS_CMP_XCHG(r3, r7);
252 HS_CMP_XCHG(r1, r3);
253 HS_CMP_XCHG(r5, r7);
254 HS_CMP_XCHG(r2, r6);
255 HS_CMP_XCHG(r4, r8);
256 HS_CMP_XCHG(r2, r4);
257 HS_CMP_XCHG(r6, r8);
258 HS_CMP_XCHG(r1, r2);
259 HS_CMP_XCHG(r3, r4);
260 HS_CMP_XCHG(r5, r6);
261 HS_CMP_XCHG(r7, r8);
// Write the eight registers back, r1..r8 to indices 0..7.
262 HS_SLAB_GLOBAL_STORE(0, r1);
263 HS_SLAB_GLOBAL_STORE(1, r2);
264 HS_SLAB_GLOBAL_STORE(2, r3);
265 HS_SLAB_GLOBAL_STORE(3, r4);
266 HS_SLAB_GLOBAL_STORE(4, r5);
267 HS_SLAB_GLOBAL_STORE(5, r6);
268 HS_SLAB_GLOBAL_STORE(6, r7);
269 HS_SLAB_GLOBAL_STORE(7, r8);
270 }
271
//
// Block-sort kernel body for the (2, 1) instantiation of
// HS_OFFSET_BS_KERNEL_PROTO.
//
// All HS_* macros are defined outside this file (hs_cuda_config.h /
// hs_cuda_macros.h); notes below describe only what is visible here.
// NOTE(review): the first PROTO argument is presumably the number of slabs
// per block, and HS_BLOCK_BARRIER() presumably a block-wide barrier --
// confirm against hs_cuda_macros.h.
//
// Visible structure:
//   1. load r1..r8, run the 19-step HS_CMP_XCHG sequence, then the five
//      FLIP/HALF stages (FLIP arguments 1, 3, 7, 15, 31)
//   2. one round through block-local memory declared with
//      HS_BLOCK_LOCAL_MEM_DECL(64, 8): store registers, barrier, four
//      two-key compare-exchanges via HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R,
//      barrier, reload registers
//   3. HALF steps 16, 8, 4, 2, 1 and the 12-step register sequence
//   4. write r1..r8 back through HS_SLAB_GLOBAL_STORE(0..7)
//
// The statement order, including the barriers, is significant -- do not
// reorder.
//
272 HS_OFFSET_BS_KERNEL_PROTO(2, 1)
273 {
274 HS_BLOCK_LOCAL_MEM_DECL(64, 8);
275
276 HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
// Load eight keys into registers r1..r8 (indices 0..7 of vin).
277 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
278 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
279 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
280 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
281 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
282 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
283 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
284 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
// Initial 19-step comparator sequence over r1..r8.
// NOTE(review): looks like an 8-input sorting network -- confirm.
285 HS_CMP_XCHG(r1, r5);
286 HS_CMP_XCHG(r2, r6);
287 HS_CMP_XCHG(r3, r7);
288 HS_CMP_XCHG(r4, r8);
289 HS_CMP_XCHG(r1, r3);
290 HS_CMP_XCHG(r2, r4);
291 HS_CMP_XCHG(r5, r7);
292 HS_CMP_XCHG(r6, r8);
293 HS_CMP_XCHG(r3, r5);
294 HS_CMP_XCHG(r4, r6);
295 HS_CMP_XCHG(r1, r2);
296 HS_CMP_XCHG(r3, r4);
297 HS_CMP_XCHG(r5, r6);
298 HS_CMP_XCHG(r7, r8);
299 HS_CMP_XCHG(r2, r5);
300 HS_CMP_XCHG(r4, r7);
301 HS_CMP_XCHG(r2, r3);
302 HS_CMP_XCHG(r4, r5);
303 HS_CMP_XCHG(r6, r7);
// Stage 1: FLIP(1), then the 12-step register sequence that is repeated
// verbatim after every later stage.
304 {
305 HS_SLAB_FLIP_PREAMBLE(1);
306 HS_CMP_FLIP(0, r1, r8);
307 HS_CMP_FLIP(1, r2, r7);
308 HS_CMP_FLIP(2, r3, r6);
309 HS_CMP_FLIP(3, r4, r5);
310 }
311 HS_CMP_XCHG(r1, r5);
312 HS_CMP_XCHG(r3, r7);
313 HS_CMP_XCHG(r1, r3);
314 HS_CMP_XCHG(r5, r7);
315 HS_CMP_XCHG(r2, r6);
316 HS_CMP_XCHG(r4, r8);
317 HS_CMP_XCHG(r2, r4);
318 HS_CMP_XCHG(r6, r8);
319 HS_CMP_XCHG(r1, r2);
320 HS_CMP_XCHG(r3, r4);
321 HS_CMP_XCHG(r5, r6);
322 HS_CMP_XCHG(r7, r8);
// Stage 2: FLIP(3), HALF(1), then the 12-step register sequence.
323 {
324 HS_SLAB_FLIP_PREAMBLE(3);
325 HS_CMP_FLIP(0, r1, r8);
326 HS_CMP_FLIP(1, r2, r7);
327 HS_CMP_FLIP(2, r3, r6);
328 HS_CMP_FLIP(3, r4, r5);
329 }
330 {
331 HS_SLAB_HALF_PREAMBLE(1);
332 HS_CMP_HALF(0, r1);
333 HS_CMP_HALF(1, r2);
334 HS_CMP_HALF(2, r3);
335 HS_CMP_HALF(3, r4);
336 HS_CMP_HALF(4, r5);
337 HS_CMP_HALF(5, r6);
338 HS_CMP_HALF(6, r7);
339 HS_CMP_HALF(7, r8);
340 }
341 HS_CMP_XCHG(r1, r5);
342 HS_CMP_XCHG(r3, r7);
343 HS_CMP_XCHG(r1, r3);
344 HS_CMP_XCHG(r5, r7);
345 HS_CMP_XCHG(r2, r6);
346 HS_CMP_XCHG(r4, r8);
347 HS_CMP_XCHG(r2, r4);
348 HS_CMP_XCHG(r6, r8);
349 HS_CMP_XCHG(r1, r2);
350 HS_CMP_XCHG(r3, r4);
351 HS_CMP_XCHG(r5, r6);
352 HS_CMP_XCHG(r7, r8);
// Stage 3: FLIP(7), HALF(2), HALF(1), then the 12-step register sequence.
353 {
354 HS_SLAB_FLIP_PREAMBLE(7);
355 HS_CMP_FLIP(0, r1, r8);
356 HS_CMP_FLIP(1, r2, r7);
357 HS_CMP_FLIP(2, r3, r6);
358 HS_CMP_FLIP(3, r4, r5);
359 }
360 {
361 HS_SLAB_HALF_PREAMBLE(2);
362 HS_CMP_HALF(0, r1);
363 HS_CMP_HALF(1, r2);
364 HS_CMP_HALF(2, r3);
365 HS_CMP_HALF(3, r4);
366 HS_CMP_HALF(4, r5);
367 HS_CMP_HALF(5, r6);
368 HS_CMP_HALF(6, r7);
369 HS_CMP_HALF(7, r8);
370 }
371 {
372 HS_SLAB_HALF_PREAMBLE(1);
373 HS_CMP_HALF(0, r1);
374 HS_CMP_HALF(1, r2);
375 HS_CMP_HALF(2, r3);
376 HS_CMP_HALF(3, r4);
377 HS_CMP_HALF(4, r5);
378 HS_CMP_HALF(5, r6);
379 HS_CMP_HALF(6, r7);
380 HS_CMP_HALF(7, r8);
381 }
382 HS_CMP_XCHG(r1, r5);
383 HS_CMP_XCHG(r3, r7);
384 HS_CMP_XCHG(r1, r3);
385 HS_CMP_XCHG(r5, r7);
386 HS_CMP_XCHG(r2, r6);
387 HS_CMP_XCHG(r4, r8);
388 HS_CMP_XCHG(r2, r4);
389 HS_CMP_XCHG(r6, r8);
390 HS_CMP_XCHG(r1, r2);
391 HS_CMP_XCHG(r3, r4);
392 HS_CMP_XCHG(r5, r6);
393 HS_CMP_XCHG(r7, r8);
// Stage 4: FLIP(15), HALF(4), HALF(2), HALF(1), then the 12-step sequence.
394 {
395 HS_SLAB_FLIP_PREAMBLE(15);
396 HS_CMP_FLIP(0, r1, r8);
397 HS_CMP_FLIP(1, r2, r7);
398 HS_CMP_FLIP(2, r3, r6);
399 HS_CMP_FLIP(3, r4, r5);
400 }
401 {
402 HS_SLAB_HALF_PREAMBLE(4);
403 HS_CMP_HALF(0, r1);
404 HS_CMP_HALF(1, r2);
405 HS_CMP_HALF(2, r3);
406 HS_CMP_HALF(3, r4);
407 HS_CMP_HALF(4, r5);
408 HS_CMP_HALF(5, r6);
409 HS_CMP_HALF(6, r7);
410 HS_CMP_HALF(7, r8);
411 }
412 {
413 HS_SLAB_HALF_PREAMBLE(2);
414 HS_CMP_HALF(0, r1);
415 HS_CMP_HALF(1, r2);
416 HS_CMP_HALF(2, r3);
417 HS_CMP_HALF(3, r4);
418 HS_CMP_HALF(4, r5);
419 HS_CMP_HALF(5, r6);
420 HS_CMP_HALF(6, r7);
421 HS_CMP_HALF(7, r8);
422 }
423 {
424 HS_SLAB_HALF_PREAMBLE(1);
425 HS_CMP_HALF(0, r1);
426 HS_CMP_HALF(1, r2);
427 HS_CMP_HALF(2, r3);
428 HS_CMP_HALF(3, r4);
429 HS_CMP_HALF(4, r5);
430 HS_CMP_HALF(5, r6);
431 HS_CMP_HALF(6, r7);
432 HS_CMP_HALF(7, r8);
433 }
434 HS_CMP_XCHG(r1, r5);
435 HS_CMP_XCHG(r3, r7);
436 HS_CMP_XCHG(r1, r3);
437 HS_CMP_XCHG(r5, r7);
438 HS_CMP_XCHG(r2, r6);
439 HS_CMP_XCHG(r4, r8);
440 HS_CMP_XCHG(r2, r4);
441 HS_CMP_XCHG(r6, r8);
442 HS_CMP_XCHG(r1, r2);
443 HS_CMP_XCHG(r3, r4);
444 HS_CMP_XCHG(r5, r6);
445 HS_CMP_XCHG(r7, r8);
// Stage 5: FLIP(31), HALF(8), HALF(4), HALF(2), HALF(1), then the 12-step
// sequence.
446 {
447 HS_SLAB_FLIP_PREAMBLE(31);
448 HS_CMP_FLIP(0, r1, r8);
449 HS_CMP_FLIP(1, r2, r7);
450 HS_CMP_FLIP(2, r3, r6);
451 HS_CMP_FLIP(3, r4, r5);
452 }
453 {
454 HS_SLAB_HALF_PREAMBLE(8);
455 HS_CMP_HALF(0, r1);
456 HS_CMP_HALF(1, r2);
457 HS_CMP_HALF(2, r3);
458 HS_CMP_HALF(3, r4);
459 HS_CMP_HALF(4, r5);
460 HS_CMP_HALF(5, r6);
461 HS_CMP_HALF(6, r7);
462 HS_CMP_HALF(7, r8);
463 }
464 {
465 HS_SLAB_HALF_PREAMBLE(4);
466 HS_CMP_HALF(0, r1);
467 HS_CMP_HALF(1, r2);
468 HS_CMP_HALF(2, r3);
469 HS_CMP_HALF(3, r4);
470 HS_CMP_HALF(4, r5);
471 HS_CMP_HALF(5, r6);
472 HS_CMP_HALF(6, r7);
473 HS_CMP_HALF(7, r8);
474 }
475 {
476 HS_SLAB_HALF_PREAMBLE(2);
477 HS_CMP_HALF(0, r1);
478 HS_CMP_HALF(1, r2);
479 HS_CMP_HALF(2, r3);
480 HS_CMP_HALF(3, r4);
481 HS_CMP_HALF(4, r5);
482 HS_CMP_HALF(5, r6);
483 HS_CMP_HALF(6, r7);
484 HS_CMP_HALF(7, r8);
485 }
486 {
487 HS_SLAB_HALF_PREAMBLE(1);
488 HS_CMP_HALF(0, r1);
489 HS_CMP_HALF(1, r2);
490 HS_CMP_HALF(2, r3);
491 HS_CMP_HALF(3, r4);
492 HS_CMP_HALF(4, r5);
493 HS_CMP_HALF(5, r6);
494 HS_CMP_HALF(6, r7);
495 HS_CMP_HALF(7, r8);
496 }
497 HS_CMP_XCHG(r1, r5);
498 HS_CMP_XCHG(r3, r7);
499 HS_CMP_XCHG(r1, r3);
500 HS_CMP_XCHG(r5, r7);
501 HS_CMP_XCHG(r2, r6);
502 HS_CMP_XCHG(r4, r8);
503 HS_CMP_XCHG(r2, r4);
504 HS_CMP_XCHG(r6, r8);
505 HS_CMP_XCHG(r1, r2);
506 HS_CMP_XCHG(r3, r4);
507 HS_CMP_XCHG(r5, r6);
508 HS_CMP_XCHG(r7, r8);
// Local-memory round. Registers are stored at multiples of
// 2 * HS_SLAB_THREADS in the interleaved order r1, r8, r2, r7, r3, r6, r4, r5;
// the reload after the exchange uses the same order.
509 HS_BS_MERGE_H_PREAMBLE(2);
510 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0) = r1;
511 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1) = r8;
512 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2) = r2;
513 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3) = r7;
514 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4) = r3;
515 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5) = r6;
516 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6) = r4;
517 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7) = r5;
// Barrier between the register stores above and the L/R reads below.
518 HS_BLOCK_BARRIER();
// Four two-key compare-exchanges. Each pairs an HS_SLAB_LOCAL_L offset with
// the HS_SLAB_LOCAL_R offset 32 higher: (0, 32), (128, 160), (256, 288),
// (384, 416). Results are written back to the same slots.
519 {
520 {
521 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
522 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
523 HS_CMP_XCHG(r0_1, r0_2);
524 HS_SLAB_LOCAL_L(0) = r0_1;
525 HS_SLAB_LOCAL_R(32) = r0_2;
526 }
527 {
528 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
529 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(160);
530 HS_CMP_XCHG(r0_1, r0_2);
531 HS_SLAB_LOCAL_L(128) = r0_1;
532 HS_SLAB_LOCAL_R(160) = r0_2;
533 }
534 {
535 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256);
536 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(288);
537 HS_CMP_XCHG(r0_1, r0_2);
538 HS_SLAB_LOCAL_L(256) = r0_1;
539 HS_SLAB_LOCAL_R(288) = r0_2;
540 }
541 {
542 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384);
543 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(416);
544 HS_CMP_XCHG(r0_1, r0_2);
545 HS_SLAB_LOCAL_L(384) = r0_1;
546 HS_SLAB_LOCAL_R(416) = r0_2;
547 }
548 }
// Barrier between the L/R writes above and the register reloads below.
549 HS_BLOCK_BARRIER();
550 r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
551 r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
552 r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
553 r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
554 r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
555 r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
556 r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
557 r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
// After the local-memory round: HALF steps 16, 8, 4, 2, 1, then the
// 12-step register sequence.
558 {
559 {
560 HS_SLAB_HALF_PREAMBLE(16);
561 HS_CMP_HALF(0, r1);
562 HS_CMP_HALF(1, r2);
563 HS_CMP_HALF(2, r3);
564 HS_CMP_HALF(3, r4);
565 HS_CMP_HALF(4, r5);
566 HS_CMP_HALF(5, r6);
567 HS_CMP_HALF(6, r7);
568 HS_CMP_HALF(7, r8);
569 }
570 {
571 HS_SLAB_HALF_PREAMBLE(8);
572 HS_CMP_HALF(0, r1);
573 HS_CMP_HALF(1, r2);
574 HS_CMP_HALF(2, r3);
575 HS_CMP_HALF(3, r4);
576 HS_CMP_HALF(4, r5);
577 HS_CMP_HALF(5, r6);
578 HS_CMP_HALF(6, r7);
579 HS_CMP_HALF(7, r8);
580 }
581 {
582 HS_SLAB_HALF_PREAMBLE(4);
583 HS_CMP_HALF(0, r1);
584 HS_CMP_HALF(1, r2);
585 HS_CMP_HALF(2, r3);
586 HS_CMP_HALF(3, r4);
587 HS_CMP_HALF(4, r5);
588 HS_CMP_HALF(5, r6);
589 HS_CMP_HALF(6, r7);
590 HS_CMP_HALF(7, r8);
591 }
592 {
593 HS_SLAB_HALF_PREAMBLE(2);
594 HS_CMP_HALF(0, r1);
595 HS_CMP_HALF(1, r2);
596 HS_CMP_HALF(2, r3);
597 HS_CMP_HALF(3, r4);
598 HS_CMP_HALF(4, r5);
599 HS_CMP_HALF(5, r6);
600 HS_CMP_HALF(6, r7);
601 HS_CMP_HALF(7, r8);
602 }
603 {
604 HS_SLAB_HALF_PREAMBLE(1);
605 HS_CMP_HALF(0, r1);
606 HS_CMP_HALF(1, r2);
607 HS_CMP_HALF(2, r3);
608 HS_CMP_HALF(3, r4);
609 HS_CMP_HALF(4, r5);
610 HS_CMP_HALF(5, r6);
611 HS_CMP_HALF(6, r7);
612 HS_CMP_HALF(7, r8);
613 }
614 HS_CMP_XCHG(r1, r5);
615 HS_CMP_XCHG(r3, r7);
616 HS_CMP_XCHG(r1, r3);
617 HS_CMP_XCHG(r5, r7);
618 HS_CMP_XCHG(r2, r6);
619 HS_CMP_XCHG(r4, r8);
620 HS_CMP_XCHG(r2, r4);
621 HS_CMP_XCHG(r6, r8);
622 HS_CMP_XCHG(r1, r2);
623 HS_CMP_XCHG(r3, r4);
624 HS_CMP_XCHG(r5, r6);
625 HS_CMP_XCHG(r7, r8);
626 }
// Write the eight registers back, r1..r8 to indices 0..7.
627 HS_SLAB_GLOBAL_STORE(0, r1);
628 HS_SLAB_GLOBAL_STORE(1, r2);
629 HS_SLAB_GLOBAL_STORE(2, r3);
630 HS_SLAB_GLOBAL_STORE(3, r4);
631 HS_SLAB_GLOBAL_STORE(4, r5);
632 HS_SLAB_GLOBAL_STORE(5, r6);
633 HS_SLAB_GLOBAL_STORE(6, r7);
634 HS_SLAB_GLOBAL_STORE(7, r8);
635 }
636
//
// Block-sort kernel body for the (4, 2) instantiation of
// HS_OFFSET_BS_KERNEL_PROTO.
//
// All HS_* macros are defined outside this file (hs_cuda_config.h /
// hs_cuda_macros.h); notes below describe only what is visible here.
// NOTE(review): the first PROTO argument is presumably the number of slabs
// per block, and HS_BLOCK_BARRIER() presumably a block-wide barrier --
// confirm against hs_cuda_macros.h.
//
// Visible structure:
//   1. load r1..r8, run the 19-step HS_CMP_XCHG sequence, then the five
//      FLIP/HALF stages (FLIP arguments 1, 3, 7, 15, 31)
//   2. first round through block-local memory declared with
//      HS_BLOCK_LOCAL_MEM_DECL(128, 8): four two-key compare-exchanges,
//      then HALF steps 16..1 and the 12-step register sequence
//   3. second local-memory round: two four-key groups, then HALF steps
//      16..1 and the 12-step register sequence again
//   4. write r1..r8 back through HS_SLAB_GLOBAL_STORE(0..7)
//
// The statement order, including the barriers, is significant -- do not
// reorder.
//
637 HS_OFFSET_BS_KERNEL_PROTO(4, 2)
638 {
639 HS_BLOCK_LOCAL_MEM_DECL(128, 8);
640
641 HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
// Load eight keys into registers r1..r8 (indices 0..7 of vin).
642 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
643 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
644 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
645 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
646 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
647 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
648 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
649 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
// Initial 19-step comparator sequence over r1..r8.
// NOTE(review): looks like an 8-input sorting network -- confirm.
650 HS_CMP_XCHG(r1, r5);
651 HS_CMP_XCHG(r2, r6);
652 HS_CMP_XCHG(r3, r7);
653 HS_CMP_XCHG(r4, r8);
654 HS_CMP_XCHG(r1, r3);
655 HS_CMP_XCHG(r2, r4);
656 HS_CMP_XCHG(r5, r7);
657 HS_CMP_XCHG(r6, r8);
658 HS_CMP_XCHG(r3, r5);
659 HS_CMP_XCHG(r4, r6);
660 HS_CMP_XCHG(r1, r2);
661 HS_CMP_XCHG(r3, r4);
662 HS_CMP_XCHG(r5, r6);
663 HS_CMP_XCHG(r7, r8);
664 HS_CMP_XCHG(r2, r5);
665 HS_CMP_XCHG(r4, r7);
666 HS_CMP_XCHG(r2, r3);
667 HS_CMP_XCHG(r4, r5);
668 HS_CMP_XCHG(r6, r7);
// Stage 1: FLIP(1), then the 12-step register sequence that is repeated
// verbatim after every later stage.
669 {
670 HS_SLAB_FLIP_PREAMBLE(1);
671 HS_CMP_FLIP(0, r1, r8);
672 HS_CMP_FLIP(1, r2, r7);
673 HS_CMP_FLIP(2, r3, r6);
674 HS_CMP_FLIP(3, r4, r5);
675 }
676 HS_CMP_XCHG(r1, r5);
677 HS_CMP_XCHG(r3, r7);
678 HS_CMP_XCHG(r1, r3);
679 HS_CMP_XCHG(r5, r7);
680 HS_CMP_XCHG(r2, r6);
681 HS_CMP_XCHG(r4, r8);
682 HS_CMP_XCHG(r2, r4);
683 HS_CMP_XCHG(r6, r8);
684 HS_CMP_XCHG(r1, r2);
685 HS_CMP_XCHG(r3, r4);
686 HS_CMP_XCHG(r5, r6);
687 HS_CMP_XCHG(r7, r8);
// Stage 2: FLIP(3), HALF(1), then the 12-step register sequence.
688 {
689 HS_SLAB_FLIP_PREAMBLE(3);
690 HS_CMP_FLIP(0, r1, r8);
691 HS_CMP_FLIP(1, r2, r7);
692 HS_CMP_FLIP(2, r3, r6);
693 HS_CMP_FLIP(3, r4, r5);
694 }
695 {
696 HS_SLAB_HALF_PREAMBLE(1);
697 HS_CMP_HALF(0, r1);
698 HS_CMP_HALF(1, r2);
699 HS_CMP_HALF(2, r3);
700 HS_CMP_HALF(3, r4);
701 HS_CMP_HALF(4, r5);
702 HS_CMP_HALF(5, r6);
703 HS_CMP_HALF(6, r7);
704 HS_CMP_HALF(7, r8);
705 }
706 HS_CMP_XCHG(r1, r5);
707 HS_CMP_XCHG(r3, r7);
708 HS_CMP_XCHG(r1, r3);
709 HS_CMP_XCHG(r5, r7);
710 HS_CMP_XCHG(r2, r6);
711 HS_CMP_XCHG(r4, r8);
712 HS_CMP_XCHG(r2, r4);
713 HS_CMP_XCHG(r6, r8);
714 HS_CMP_XCHG(r1, r2);
715 HS_CMP_XCHG(r3, r4);
716 HS_CMP_XCHG(r5, r6);
717 HS_CMP_XCHG(r7, r8);
// Stage 3: FLIP(7), HALF(2), HALF(1), then the 12-step register sequence.
718 {
719 HS_SLAB_FLIP_PREAMBLE(7);
720 HS_CMP_FLIP(0, r1, r8);
721 HS_CMP_FLIP(1, r2, r7);
722 HS_CMP_FLIP(2, r3, r6);
723 HS_CMP_FLIP(3, r4, r5);
724 }
725 {
726 HS_SLAB_HALF_PREAMBLE(2);
727 HS_CMP_HALF(0, r1);
728 HS_CMP_HALF(1, r2);
729 HS_CMP_HALF(2, r3);
730 HS_CMP_HALF(3, r4);
731 HS_CMP_HALF(4, r5);
732 HS_CMP_HALF(5, r6);
733 HS_CMP_HALF(6, r7);
734 HS_CMP_HALF(7, r8);
735 }
736 {
737 HS_SLAB_HALF_PREAMBLE(1);
738 HS_CMP_HALF(0, r1);
739 HS_CMP_HALF(1, r2);
740 HS_CMP_HALF(2, r3);
741 HS_CMP_HALF(3, r4);
742 HS_CMP_HALF(4, r5);
743 HS_CMP_HALF(5, r6);
744 HS_CMP_HALF(6, r7);
745 HS_CMP_HALF(7, r8);
746 }
747 HS_CMP_XCHG(r1, r5);
748 HS_CMP_XCHG(r3, r7);
749 HS_CMP_XCHG(r1, r3);
750 HS_CMP_XCHG(r5, r7);
751 HS_CMP_XCHG(r2, r6);
752 HS_CMP_XCHG(r4, r8);
753 HS_CMP_XCHG(r2, r4);
754 HS_CMP_XCHG(r6, r8);
755 HS_CMP_XCHG(r1, r2);
756 HS_CMP_XCHG(r3, r4);
757 HS_CMP_XCHG(r5, r6);
758 HS_CMP_XCHG(r7, r8);
// Stage 4: FLIP(15), HALF(4), HALF(2), HALF(1), then the 12-step sequence.
759 {
760 HS_SLAB_FLIP_PREAMBLE(15);
761 HS_CMP_FLIP(0, r1, r8);
762 HS_CMP_FLIP(1, r2, r7);
763 HS_CMP_FLIP(2, r3, r6);
764 HS_CMP_FLIP(3, r4, r5);
765 }
766 {
767 HS_SLAB_HALF_PREAMBLE(4);
768 HS_CMP_HALF(0, r1);
769 HS_CMP_HALF(1, r2);
770 HS_CMP_HALF(2, r3);
771 HS_CMP_HALF(3, r4);
772 HS_CMP_HALF(4, r5);
773 HS_CMP_HALF(5, r6);
774 HS_CMP_HALF(6, r7);
775 HS_CMP_HALF(7, r8);
776 }
777 {
778 HS_SLAB_HALF_PREAMBLE(2);
779 HS_CMP_HALF(0, r1);
780 HS_CMP_HALF(1, r2);
781 HS_CMP_HALF(2, r3);
782 HS_CMP_HALF(3, r4);
783 HS_CMP_HALF(4, r5);
784 HS_CMP_HALF(5, r6);
785 HS_CMP_HALF(6, r7);
786 HS_CMP_HALF(7, r8);
787 }
788 {
789 HS_SLAB_HALF_PREAMBLE(1);
790 HS_CMP_HALF(0, r1);
791 HS_CMP_HALF(1, r2);
792 HS_CMP_HALF(2, r3);
793 HS_CMP_HALF(3, r4);
794 HS_CMP_HALF(4, r5);
795 HS_CMP_HALF(5, r6);
796 HS_CMP_HALF(6, r7);
797 HS_CMP_HALF(7, r8);
798 }
799 HS_CMP_XCHG(r1, r5);
800 HS_CMP_XCHG(r3, r7);
801 HS_CMP_XCHG(r1, r3);
802 HS_CMP_XCHG(r5, r7);
803 HS_CMP_XCHG(r2, r6);
804 HS_CMP_XCHG(r4, r8);
805 HS_CMP_XCHG(r2, r4);
806 HS_CMP_XCHG(r6, r8);
807 HS_CMP_XCHG(r1, r2);
808 HS_CMP_XCHG(r3, r4);
809 HS_CMP_XCHG(r5, r6);
810 HS_CMP_XCHG(r7, r8);
// Stage 5: FLIP(31), HALF(8), HALF(4), HALF(2), HALF(1), then the 12-step
// sequence.
811 {
812 HS_SLAB_FLIP_PREAMBLE(31);
813 HS_CMP_FLIP(0, r1, r8);
814 HS_CMP_FLIP(1, r2, r7);
815 HS_CMP_FLIP(2, r3, r6);
816 HS_CMP_FLIP(3, r4, r5);
817 }
818 {
819 HS_SLAB_HALF_PREAMBLE(8);
820 HS_CMP_HALF(0, r1);
821 HS_CMP_HALF(1, r2);
822 HS_CMP_HALF(2, r3);
823 HS_CMP_HALF(3, r4);
824 HS_CMP_HALF(4, r5);
825 HS_CMP_HALF(5, r6);
826 HS_CMP_HALF(6, r7);
827 HS_CMP_HALF(7, r8);
828 }
829 {
830 HS_SLAB_HALF_PREAMBLE(4);
831 HS_CMP_HALF(0, r1);
832 HS_CMP_HALF(1, r2);
833 HS_CMP_HALF(2, r3);
834 HS_CMP_HALF(3, r4);
835 HS_CMP_HALF(4, r5);
836 HS_CMP_HALF(5, r6);
837 HS_CMP_HALF(6, r7);
838 HS_CMP_HALF(7, r8);
839 }
840 {
841 HS_SLAB_HALF_PREAMBLE(2);
842 HS_CMP_HALF(0, r1);
843 HS_CMP_HALF(1, r2);
844 HS_CMP_HALF(2, r3);
845 HS_CMP_HALF(3, r4);
846 HS_CMP_HALF(4, r5);
847 HS_CMP_HALF(5, r6);
848 HS_CMP_HALF(6, r7);
849 HS_CMP_HALF(7, r8);
850 }
851 {
852 HS_SLAB_HALF_PREAMBLE(1);
853 HS_CMP_HALF(0, r1);
854 HS_CMP_HALF(1, r2);
855 HS_CMP_HALF(2, r3);
856 HS_CMP_HALF(3, r4);
857 HS_CMP_HALF(4, r5);
858 HS_CMP_HALF(5, r6);
859 HS_CMP_HALF(6, r7);
860 HS_CMP_HALF(7, r8);
861 }
862 HS_CMP_XCHG(r1, r5);
863 HS_CMP_XCHG(r3, r7);
864 HS_CMP_XCHG(r1, r3);
865 HS_CMP_XCHG(r5, r7);
866 HS_CMP_XCHG(r2, r6);
867 HS_CMP_XCHG(r4, r8);
868 HS_CMP_XCHG(r2, r4);
869 HS_CMP_XCHG(r6, r8);
870 HS_CMP_XCHG(r1, r2);
871 HS_CMP_XCHG(r3, r4);
872 HS_CMP_XCHG(r5, r6);
873 HS_CMP_XCHG(r7, r8);
// First local-memory round. Registers are stored at multiples of
// 4 * HS_SLAB_THREADS in the interleaved order r1, r8, r2, r7, r3, r6, r4, r5;
// every later store and reload uses the same order.
874 HS_BS_MERGE_H_PREAMBLE(4);
875 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
876 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8;
877 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
878 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7;
879 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
880 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6;
881 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
882 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5;
// Barrier between the register stores above and the L/R reads below.
883 HS_BLOCK_BARRIER();
// Four two-key compare-exchanges on L/R offset pairs (0, 32), (64, 96),
// (512, 544), (576, 608). Results are written back to the same slots.
884 {
885 {
886 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
887 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
888 HS_CMP_XCHG(r0_1, r0_2);
889 HS_SLAB_LOCAL_L(0) = r0_1;
890 HS_SLAB_LOCAL_R(32) = r0_2;
891 }
892 {
893 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
894 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96);
895 HS_CMP_XCHG(r1_1, r1_2);
896 HS_SLAB_LOCAL_L(64) = r1_1;
897 HS_SLAB_LOCAL_R(96) = r1_2;
898 }
899 {
900 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
901 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(544);
902 HS_CMP_XCHG(r0_1, r0_2);
903 HS_SLAB_LOCAL_L(512) = r0_1;
904 HS_SLAB_LOCAL_R(544) = r0_2;
905 }
906 {
907 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(576);
908 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(608);
909 HS_CMP_XCHG(r1_1, r1_2);
910 HS_SLAB_LOCAL_L(576) = r1_1;
911 HS_SLAB_LOCAL_R(608) = r1_2;
912 }
913 }
// Barrier between the L/R writes above and the register reloads below.
914 HS_BLOCK_BARRIER();
915 r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
916 r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
917 r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
918 r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
919 r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
920 r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
921 r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
922 r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
// After the first round: HALF steps 16, 8, 4, 2, 1, then the 12-step
// register sequence.
923 {
924 {
925 HS_SLAB_HALF_PREAMBLE(16);
926 HS_CMP_HALF(0, r1);
927 HS_CMP_HALF(1, r2);
928 HS_CMP_HALF(2, r3);
929 HS_CMP_HALF(3, r4);
930 HS_CMP_HALF(4, r5);
931 HS_CMP_HALF(5, r6);
932 HS_CMP_HALF(6, r7);
933 HS_CMP_HALF(7, r8);
934 }
935 {
936 HS_SLAB_HALF_PREAMBLE(8);
937 HS_CMP_HALF(0, r1);
938 HS_CMP_HALF(1, r2);
939 HS_CMP_HALF(2, r3);
940 HS_CMP_HALF(3, r4);
941 HS_CMP_HALF(4, r5);
942 HS_CMP_HALF(5, r6);
943 HS_CMP_HALF(6, r7);
944 HS_CMP_HALF(7, r8);
945 }
946 {
947 HS_SLAB_HALF_PREAMBLE(4);
948 HS_CMP_HALF(0, r1);
949 HS_CMP_HALF(1, r2);
950 HS_CMP_HALF(2, r3);
951 HS_CMP_HALF(3, r4);
952 HS_CMP_HALF(4, r5);
953 HS_CMP_HALF(5, r6);
954 HS_CMP_HALF(6, r7);
955 HS_CMP_HALF(7, r8);
956 }
957 {
958 HS_SLAB_HALF_PREAMBLE(2);
959 HS_CMP_HALF(0, r1);
960 HS_CMP_HALF(1, r2);
961 HS_CMP_HALF(2, r3);
962 HS_CMP_HALF(3, r4);
963 HS_CMP_HALF(4, r5);
964 HS_CMP_HALF(5, r6);
965 HS_CMP_HALF(6, r7);
966 HS_CMP_HALF(7, r8);
967 }
968 {
969 HS_SLAB_HALF_PREAMBLE(1);
970 HS_CMP_HALF(0, r1);
971 HS_CMP_HALF(1, r2);
972 HS_CMP_HALF(2, r3);
973 HS_CMP_HALF(3, r4);
974 HS_CMP_HALF(4, r5);
975 HS_CMP_HALF(5, r6);
976 HS_CMP_HALF(6, r7);
977 HS_CMP_HALF(7, r8);
978 }
979 HS_CMP_XCHG(r1, r5);
980 HS_CMP_XCHG(r3, r7);
981 HS_CMP_XCHG(r1, r3);
982 HS_CMP_XCHG(r5, r7);
983 HS_CMP_XCHG(r2, r6);
984 HS_CMP_XCHG(r4, r8);
985 HS_CMP_XCHG(r2, r4);
986 HS_CMP_XCHG(r6, r8);
987 HS_CMP_XCHG(r1, r2);
988 HS_CMP_XCHG(r3, r4);
989 HS_CMP_XCHG(r5, r6);
990 HS_CMP_XCHG(r7, r8);
991 }
// Second local-memory round. No barrier precedes these stores; each thread
// rewrites the same HS_BX_LOCAL_V slots it reloaded after the barrier above.
992 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
993 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8;
994 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
995 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7;
996 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
997 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6;
998 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
999 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5;
// Barrier between the register stores above and the L/R reads below.
1000 HS_BLOCK_BARRIER();
// Two four-key groups: L offsets b and b + 32, R offsets b + 64 and b + 96,
// for b = 0 and b = 512. Comparator order within a group is
// (2, 3), (1, 4), (3, 4), (1, 2).
1001 {
1002 {
1003 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1004 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
1005 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64);
1006 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96);
1007 HS_CMP_XCHG(r0_2, r0_3);
1008 HS_CMP_XCHG(r0_1, r0_4);
1009 HS_CMP_XCHG(r0_3, r0_4);
1010 HS_CMP_XCHG(r0_1, r0_2);
1011 HS_SLAB_LOCAL_L(0) = r0_1;
1012 HS_SLAB_LOCAL_L(32) = r0_2;
1013 HS_SLAB_LOCAL_R(64) = r0_3;
1014 HS_SLAB_LOCAL_R(96) = r0_4;
1015 }
1016 {
1017 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
1018 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(544);
1019 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(576);
1020 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(608);
1021 HS_CMP_XCHG(r0_2, r0_3);
1022 HS_CMP_XCHG(r0_1, r0_4);
1023 HS_CMP_XCHG(r0_3, r0_4);
1024 HS_CMP_XCHG(r0_1, r0_2);
1025 HS_SLAB_LOCAL_L(512) = r0_1;
1026 HS_SLAB_LOCAL_L(544) = r0_2;
1027 HS_SLAB_LOCAL_R(576) = r0_3;
1028 HS_SLAB_LOCAL_R(608) = r0_4;
1029 }
1030 }
// Barrier between the L/R writes above and the register reloads below.
1031 HS_BLOCK_BARRIER();
1032 r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
1033 r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
1034 r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
1035 r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
1036 r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
1037 r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
1038 r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
1039 r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
// After the second round: HALF steps 16, 8, 4, 2, 1, then the 12-step
// register sequence.
1040 {
1041 {
1042 HS_SLAB_HALF_PREAMBLE(16);
1043 HS_CMP_HALF(0, r1);
1044 HS_CMP_HALF(1, r2);
1045 HS_CMP_HALF(2, r3);
1046 HS_CMP_HALF(3, r4);
1047 HS_CMP_HALF(4, r5);
1048 HS_CMP_HALF(5, r6);
1049 HS_CMP_HALF(6, r7);
1050 HS_CMP_HALF(7, r8);
1051 }
1052 {
1053 HS_SLAB_HALF_PREAMBLE(8);
1054 HS_CMP_HALF(0, r1);
1055 HS_CMP_HALF(1, r2);
1056 HS_CMP_HALF(2, r3);
1057 HS_CMP_HALF(3, r4);
1058 HS_CMP_HALF(4, r5);
1059 HS_CMP_HALF(5, r6);
1060 HS_CMP_HALF(6, r7);
1061 HS_CMP_HALF(7, r8);
1062 }
1063 {
1064 HS_SLAB_HALF_PREAMBLE(4);
1065 HS_CMP_HALF(0, r1);
1066 HS_CMP_HALF(1, r2);
1067 HS_CMP_HALF(2, r3);
1068 HS_CMP_HALF(3, r4);
1069 HS_CMP_HALF(4, r5);
1070 HS_CMP_HALF(5, r6);
1071 HS_CMP_HALF(6, r7);
1072 HS_CMP_HALF(7, r8);
1073 }
1074 {
1075 HS_SLAB_HALF_PREAMBLE(2);
1076 HS_CMP_HALF(0, r1);
1077 HS_CMP_HALF(1, r2);
1078 HS_CMP_HALF(2, r3);
1079 HS_CMP_HALF(3, r4);
1080 HS_CMP_HALF(4, r5);
1081 HS_CMP_HALF(5, r6);
1082 HS_CMP_HALF(6, r7);
1083 HS_CMP_HALF(7, r8);
1084 }
1085 {
1086 HS_SLAB_HALF_PREAMBLE(1);
1087 HS_CMP_HALF(0, r1);
1088 HS_CMP_HALF(1, r2);
1089 HS_CMP_HALF(2, r3);
1090 HS_CMP_HALF(3, r4);
1091 HS_CMP_HALF(4, r5);
1092 HS_CMP_HALF(5, r6);
1093 HS_CMP_HALF(6, r7);
1094 HS_CMP_HALF(7, r8);
1095 }
1096 HS_CMP_XCHG(r1, r5);
1097 HS_CMP_XCHG(r3, r7);
1098 HS_CMP_XCHG(r1, r3);
1099 HS_CMP_XCHG(r5, r7);
1100 HS_CMP_XCHG(r2, r6);
1101 HS_CMP_XCHG(r4, r8);
1102 HS_CMP_XCHG(r2, r4);
1103 HS_CMP_XCHG(r6, r8);
1104 HS_CMP_XCHG(r1, r2);
1105 HS_CMP_XCHG(r3, r4);
1106 HS_CMP_XCHG(r5, r6);
1107 HS_CMP_XCHG(r7, r8);
1108 }
// Write the eight registers back, r1..r8 to indices 0..7.
1109 HS_SLAB_GLOBAL_STORE(0, r1);
1110 HS_SLAB_GLOBAL_STORE(1, r2);
1111 HS_SLAB_GLOBAL_STORE(2, r3);
1112 HS_SLAB_GLOBAL_STORE(3, r4);
1113 HS_SLAB_GLOBAL_STORE(4, r5);
1114 HS_SLAB_GLOBAL_STORE(5, r6);
1115 HS_SLAB_GLOBAL_STORE(6, r7);
1116 HS_SLAB_GLOBAL_STORE(7, r8);
1117 }
1118
1119 HS_OFFSET_BS_KERNEL_PROTO(8, 3)
1120 {
1121 HS_BLOCK_LOCAL_MEM_DECL(256, 8);
1122
1123 HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
1124 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
1125 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
1126 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
1127 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
1128 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
1129 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
1130 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
1131 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
1132 HS_CMP_XCHG(r1, r5);
1133 HS_CMP_XCHG(r2, r6);
1134 HS_CMP_XCHG(r3, r7);
1135 HS_CMP_XCHG(r4, r8);
1136 HS_CMP_XCHG(r1, r3);
1137 HS_CMP_XCHG(r2, r4);
1138 HS_CMP_XCHG(r5, r7);
1139 HS_CMP_XCHG(r6, r8);
1140 HS_CMP_XCHG(r3, r5);
1141 HS_CMP_XCHG(r4, r6);
1142 HS_CMP_XCHG(r1, r2);
1143 HS_CMP_XCHG(r3, r4);
1144 HS_CMP_XCHG(r5, r6);
1145 HS_CMP_XCHG(r7, r8);
1146 HS_CMP_XCHG(r2, r5);
1147 HS_CMP_XCHG(r4, r7);
1148 HS_CMP_XCHG(r2, r3);
1149 HS_CMP_XCHG(r4, r5);
1150 HS_CMP_XCHG(r6, r7);
1151 {
1152 HS_SLAB_FLIP_PREAMBLE(1);
1153 HS_CMP_FLIP(0, r1, r8);
1154 HS_CMP_FLIP(1, r2, r7);
1155 HS_CMP_FLIP(2, r3, r6);
1156 HS_CMP_FLIP(3, r4, r5);
1157 }
1158 HS_CMP_XCHG(r1, r5);
1159 HS_CMP_XCHG(r3, r7);
1160 HS_CMP_XCHG(r1, r3);
1161 HS_CMP_XCHG(r5, r7);
1162 HS_CMP_XCHG(r2, r6);
1163 HS_CMP_XCHG(r4, r8);
1164 HS_CMP_XCHG(r2, r4);
1165 HS_CMP_XCHG(r6, r8);
1166 HS_CMP_XCHG(r1, r2);
1167 HS_CMP_XCHG(r3, r4);
1168 HS_CMP_XCHG(r5, r6);
1169 HS_CMP_XCHG(r7, r8);
1170 {
1171 HS_SLAB_FLIP_PREAMBLE(3);
1172 HS_CMP_FLIP(0, r1, r8);
1173 HS_CMP_FLIP(1, r2, r7);
1174 HS_CMP_FLIP(2, r3, r6);
1175 HS_CMP_FLIP(3, r4, r5);
1176 }
1177 {
1178 HS_SLAB_HALF_PREAMBLE(1);
1179 HS_CMP_HALF(0, r1);
1180 HS_CMP_HALF(1, r2);
1181 HS_CMP_HALF(2, r3);
1182 HS_CMP_HALF(3, r4);
1183 HS_CMP_HALF(4, r5);
1184 HS_CMP_HALF(5, r6);
1185 HS_CMP_HALF(6, r7);
1186 HS_CMP_HALF(7, r8);
1187 }
1188 HS_CMP_XCHG(r1, r5);
1189 HS_CMP_XCHG(r3, r7);
1190 HS_CMP_XCHG(r1, r3);
1191 HS_CMP_XCHG(r5, r7);
1192 HS_CMP_XCHG(r2, r6);
1193 HS_CMP_XCHG(r4, r8);
1194 HS_CMP_XCHG(r2, r4);
1195 HS_CMP_XCHG(r6, r8);
1196 HS_CMP_XCHG(r1, r2);
1197 HS_CMP_XCHG(r3, r4);
1198 HS_CMP_XCHG(r5, r6);
1199 HS_CMP_XCHG(r7, r8);
1200 {
1201 HS_SLAB_FLIP_PREAMBLE(7);
1202 HS_CMP_FLIP(0, r1, r8);
1203 HS_CMP_FLIP(1, r2, r7);
1204 HS_CMP_FLIP(2, r3, r6);
1205 HS_CMP_FLIP(3, r4, r5);
1206 }
1207 {
1208 HS_SLAB_HALF_PREAMBLE(2);
1209 HS_CMP_HALF(0, r1);
1210 HS_CMP_HALF(1, r2);
1211 HS_CMP_HALF(2, r3);
1212 HS_CMP_HALF(3, r4);
1213 HS_CMP_HALF(4, r5);
1214 HS_CMP_HALF(5, r6);
1215 HS_CMP_HALF(6, r7);
1216 HS_CMP_HALF(7, r8);
1217 }
1218 {
1219 HS_SLAB_HALF_PREAMBLE(1);
1220 HS_CMP_HALF(0, r1);
1221 HS_CMP_HALF(1, r2);
1222 HS_CMP_HALF(2, r3);
1223 HS_CMP_HALF(3, r4);
1224 HS_CMP_HALF(4, r5);
1225 HS_CMP_HALF(5, r6);
1226 HS_CMP_HALF(6, r7);
1227 HS_CMP_HALF(7, r8);
1228 }
1229 HS_CMP_XCHG(r1, r5);
1230 HS_CMP_XCHG(r3, r7);
1231 HS_CMP_XCHG(r1, r3);
1232 HS_CMP_XCHG(r5, r7);
1233 HS_CMP_XCHG(r2, r6);
1234 HS_CMP_XCHG(r4, r8);
1235 HS_CMP_XCHG(r2, r4);
1236 HS_CMP_XCHG(r6, r8);
1237 HS_CMP_XCHG(r1, r2);
1238 HS_CMP_XCHG(r3, r4);
1239 HS_CMP_XCHG(r5, r6);
1240 HS_CMP_XCHG(r7, r8);
1241 {
1242 HS_SLAB_FLIP_PREAMBLE(15);
1243 HS_CMP_FLIP(0, r1, r8);
1244 HS_CMP_FLIP(1, r2, r7);
1245 HS_CMP_FLIP(2, r3, r6);
1246 HS_CMP_FLIP(3, r4, r5);
1247 }
1248 {
1249 HS_SLAB_HALF_PREAMBLE(4);
1250 HS_CMP_HALF(0, r1);
1251 HS_CMP_HALF(1, r2);
1252 HS_CMP_HALF(2, r3);
1253 HS_CMP_HALF(3, r4);
1254 HS_CMP_HALF(4, r5);
1255 HS_CMP_HALF(5, r6);
1256 HS_CMP_HALF(6, r7);
1257 HS_CMP_HALF(7, r8);
1258 }
1259 {
1260 HS_SLAB_HALF_PREAMBLE(2);
1261 HS_CMP_HALF(0, r1);
1262 HS_CMP_HALF(1, r2);
1263 HS_CMP_HALF(2, r3);
1264 HS_CMP_HALF(3, r4);
1265 HS_CMP_HALF(4, r5);
1266 HS_CMP_HALF(5, r6);
1267 HS_CMP_HALF(6, r7);
1268 HS_CMP_HALF(7, r8);
1269 }
1270 {
1271 HS_SLAB_HALF_PREAMBLE(1);
1272 HS_CMP_HALF(0, r1);
1273 HS_CMP_HALF(1, r2);
1274 HS_CMP_HALF(2, r3);
1275 HS_CMP_HALF(3, r4);
1276 HS_CMP_HALF(4, r5);
1277 HS_CMP_HALF(5, r6);
1278 HS_CMP_HALF(6, r7);
1279 HS_CMP_HALF(7, r8);
1280 }
1281 HS_CMP_XCHG(r1, r5);
1282 HS_CMP_XCHG(r3, r7);
1283 HS_CMP_XCHG(r1, r3);
1284 HS_CMP_XCHG(r5, r7);
1285 HS_CMP_XCHG(r2, r6);
1286 HS_CMP_XCHG(r4, r8);
1287 HS_CMP_XCHG(r2, r4);
1288 HS_CMP_XCHG(r6, r8);
1289 HS_CMP_XCHG(r1, r2);
1290 HS_CMP_XCHG(r3, r4);
1291 HS_CMP_XCHG(r5, r6);
1292 HS_CMP_XCHG(r7, r8);
1293 {
1294 HS_SLAB_FLIP_PREAMBLE(31);
1295 HS_CMP_FLIP(0, r1, r8);
1296 HS_CMP_FLIP(1, r2, r7);
1297 HS_CMP_FLIP(2, r3, r6);
1298 HS_CMP_FLIP(3, r4, r5);
1299 }
1300 {
1301 HS_SLAB_HALF_PREAMBLE(8);
1302 HS_CMP_HALF(0, r1);
1303 HS_CMP_HALF(1, r2);
1304 HS_CMP_HALF(2, r3);
1305 HS_CMP_HALF(3, r4);
1306 HS_CMP_HALF(4, r5);
1307 HS_CMP_HALF(5, r6);
1308 HS_CMP_HALF(6, r7);
1309 HS_CMP_HALF(7, r8);
1310 }
1311 {
1312 HS_SLAB_HALF_PREAMBLE(4);
1313 HS_CMP_HALF(0, r1);
1314 HS_CMP_HALF(1, r2);
1315 HS_CMP_HALF(2, r3);
1316 HS_CMP_HALF(3, r4);
1317 HS_CMP_HALF(4, r5);
1318 HS_CMP_HALF(5, r6);
1319 HS_CMP_HALF(6, r7);
1320 HS_CMP_HALF(7, r8);
1321 }
1322 {
1323 HS_SLAB_HALF_PREAMBLE(2);
1324 HS_CMP_HALF(0, r1);
1325 HS_CMP_HALF(1, r2);
1326 HS_CMP_HALF(2, r3);
1327 HS_CMP_HALF(3, r4);
1328 HS_CMP_HALF(4, r5);
1329 HS_CMP_HALF(5, r6);
1330 HS_CMP_HALF(6, r7);
1331 HS_CMP_HALF(7, r8);
1332 }
1333 {
1334 HS_SLAB_HALF_PREAMBLE(1);
1335 HS_CMP_HALF(0, r1);
1336 HS_CMP_HALF(1, r2);
1337 HS_CMP_HALF(2, r3);
1338 HS_CMP_HALF(3, r4);
1339 HS_CMP_HALF(4, r5);
1340 HS_CMP_HALF(5, r6);
1341 HS_CMP_HALF(6, r7);
1342 HS_CMP_HALF(7, r8);
1343 }
1344 HS_CMP_XCHG(r1, r5);
1345 HS_CMP_XCHG(r3, r7);
1346 HS_CMP_XCHG(r1, r3);
1347 HS_CMP_XCHG(r5, r7);
1348 HS_CMP_XCHG(r2, r6);
1349 HS_CMP_XCHG(r4, r8);
1350 HS_CMP_XCHG(r2, r4);
1351 HS_CMP_XCHG(r6, r8);
1352 HS_CMP_XCHG(r1, r2);
1353 HS_CMP_XCHG(r3, r4);
1354 HS_CMP_XCHG(r5, r6);
1355 HS_CMP_XCHG(r7, r8);
1356 HS_BS_MERGE_H_PREAMBLE(8);
1357 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1358 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8;
1359 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1360 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7;
1361 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1362 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6;
1363 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1364 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5;
1365 HS_BLOCK_BARRIER();
1366 {
1367 {
1368 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1369 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
1370 HS_CMP_XCHG(r0_1, r0_2);
1371 HS_SLAB_LOCAL_L(0) = r0_1;
1372 HS_SLAB_LOCAL_R(32) = r0_2;
1373 }
1374 {
1375 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
1376 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96);
1377 HS_CMP_XCHG(r1_1, r1_2);
1378 HS_SLAB_LOCAL_L(64) = r1_1;
1379 HS_SLAB_LOCAL_R(96) = r1_2;
1380 }
1381 {
1382 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128);
1383 HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(160);
1384 HS_CMP_XCHG(r2_1, r2_2);
1385 HS_SLAB_LOCAL_L(128) = r2_1;
1386 HS_SLAB_LOCAL_R(160) = r2_2;
1387 }
1388 {
1389 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192);
1390 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(224);
1391 HS_CMP_XCHG(r3_1, r3_2);
1392 HS_SLAB_LOCAL_L(192) = r3_1;
1393 HS_SLAB_LOCAL_R(224) = r3_2;
1394 }
1395 }
1396 HS_BLOCK_BARRIER();
1397 r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1398 r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1399 r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1400 r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1401 r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1402 r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1403 r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1404 r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
1405 {
1406 {
1407 HS_SLAB_HALF_PREAMBLE(16);
1408 HS_CMP_HALF(0, r1);
1409 HS_CMP_HALF(1, r2);
1410 HS_CMP_HALF(2, r3);
1411 HS_CMP_HALF(3, r4);
1412 HS_CMP_HALF(4, r5);
1413 HS_CMP_HALF(5, r6);
1414 HS_CMP_HALF(6, r7);
1415 HS_CMP_HALF(7, r8);
1416 }
1417 {
1418 HS_SLAB_HALF_PREAMBLE(8);
1419 HS_CMP_HALF(0, r1);
1420 HS_CMP_HALF(1, r2);
1421 HS_CMP_HALF(2, r3);
1422 HS_CMP_HALF(3, r4);
1423 HS_CMP_HALF(4, r5);
1424 HS_CMP_HALF(5, r6);
1425 HS_CMP_HALF(6, r7);
1426 HS_CMP_HALF(7, r8);
1427 }
1428 {
1429 HS_SLAB_HALF_PREAMBLE(4);
1430 HS_CMP_HALF(0, r1);
1431 HS_CMP_HALF(1, r2);
1432 HS_CMP_HALF(2, r3);
1433 HS_CMP_HALF(3, r4);
1434 HS_CMP_HALF(4, r5);
1435 HS_CMP_HALF(5, r6);
1436 HS_CMP_HALF(6, r7);
1437 HS_CMP_HALF(7, r8);
1438 }
1439 {
1440 HS_SLAB_HALF_PREAMBLE(2);
1441 HS_CMP_HALF(0, r1);
1442 HS_CMP_HALF(1, r2);
1443 HS_CMP_HALF(2, r3);
1444 HS_CMP_HALF(3, r4);
1445 HS_CMP_HALF(4, r5);
1446 HS_CMP_HALF(5, r6);
1447 HS_CMP_HALF(6, r7);
1448 HS_CMP_HALF(7, r8);
1449 }
1450 {
1451 HS_SLAB_HALF_PREAMBLE(1);
1452 HS_CMP_HALF(0, r1);
1453 HS_CMP_HALF(1, r2);
1454 HS_CMP_HALF(2, r3);
1455 HS_CMP_HALF(3, r4);
1456 HS_CMP_HALF(4, r5);
1457 HS_CMP_HALF(5, r6);
1458 HS_CMP_HALF(6, r7);
1459 HS_CMP_HALF(7, r8);
1460 }
1461 HS_CMP_XCHG(r1, r5);
1462 HS_CMP_XCHG(r3, r7);
1463 HS_CMP_XCHG(r1, r3);
1464 HS_CMP_XCHG(r5, r7);
1465 HS_CMP_XCHG(r2, r6);
1466 HS_CMP_XCHG(r4, r8);
1467 HS_CMP_XCHG(r2, r4);
1468 HS_CMP_XCHG(r6, r8);
1469 HS_CMP_XCHG(r1, r2);
1470 HS_CMP_XCHG(r3, r4);
1471 HS_CMP_XCHG(r5, r6);
1472 HS_CMP_XCHG(r7, r8);
1473 }
1474 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1475 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8;
1476 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1477 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7;
1478 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1479 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6;
1480 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1481 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5;
1482 HS_BLOCK_BARRIER();
1483 {
1484 {
1485 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1486 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
1487 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64);
1488 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96);
1489 HS_CMP_XCHG(r0_2, r0_3);
1490 HS_CMP_XCHG(r0_1, r0_4);
1491 HS_CMP_XCHG(r0_3, r0_4);
1492 HS_CMP_XCHG(r0_1, r0_2);
1493 HS_SLAB_LOCAL_L(0) = r0_1;
1494 HS_SLAB_LOCAL_L(32) = r0_2;
1495 HS_SLAB_LOCAL_R(64) = r0_3;
1496 HS_SLAB_LOCAL_R(96) = r0_4;
1497 }
1498 {
1499 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128);
1500 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(160);
1501 HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(192);
1502 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(224);
1503 HS_CMP_XCHG(r1_2, r1_3);
1504 HS_CMP_XCHG(r1_1, r1_4);
1505 HS_CMP_XCHG(r1_3, r1_4);
1506 HS_CMP_XCHG(r1_1, r1_2);
1507 HS_SLAB_LOCAL_L(128) = r1_1;
1508 HS_SLAB_LOCAL_L(160) = r1_2;
1509 HS_SLAB_LOCAL_R(192) = r1_3;
1510 HS_SLAB_LOCAL_R(224) = r1_4;
1511 }
1512 }
1513 HS_BLOCK_BARRIER();
1514 r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1515 r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1516 r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1517 r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1518 r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1519 r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1520 r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1521 r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
1522 {
1523 {
1524 HS_SLAB_HALF_PREAMBLE(16);
1525 HS_CMP_HALF(0, r1);
1526 HS_CMP_HALF(1, r2);
1527 HS_CMP_HALF(2, r3);
1528 HS_CMP_HALF(3, r4);
1529 HS_CMP_HALF(4, r5);
1530 HS_CMP_HALF(5, r6);
1531 HS_CMP_HALF(6, r7);
1532 HS_CMP_HALF(7, r8);
1533 }
1534 {
1535 HS_SLAB_HALF_PREAMBLE(8);
1536 HS_CMP_HALF(0, r1);
1537 HS_CMP_HALF(1, r2);
1538 HS_CMP_HALF(2, r3);
1539 HS_CMP_HALF(3, r4);
1540 HS_CMP_HALF(4, r5);
1541 HS_CMP_HALF(5, r6);
1542 HS_CMP_HALF(6, r7);
1543 HS_CMP_HALF(7, r8);
1544 }
1545 {
1546 HS_SLAB_HALF_PREAMBLE(4);
1547 HS_CMP_HALF(0, r1);
1548 HS_CMP_HALF(1, r2);
1549 HS_CMP_HALF(2, r3);
1550 HS_CMP_HALF(3, r4);
1551 HS_CMP_HALF(4, r5);
1552 HS_CMP_HALF(5, r6);
1553 HS_CMP_HALF(6, r7);
1554 HS_CMP_HALF(7, r8);
1555 }
1556 {
1557 HS_SLAB_HALF_PREAMBLE(2);
1558 HS_CMP_HALF(0, r1);
1559 HS_CMP_HALF(1, r2);
1560 HS_CMP_HALF(2, r3);
1561 HS_CMP_HALF(3, r4);
1562 HS_CMP_HALF(4, r5);
1563 HS_CMP_HALF(5, r6);
1564 HS_CMP_HALF(6, r7);
1565 HS_CMP_HALF(7, r8);
1566 }
1567 {
1568 HS_SLAB_HALF_PREAMBLE(1);
1569 HS_CMP_HALF(0, r1);
1570 HS_CMP_HALF(1, r2);
1571 HS_CMP_HALF(2, r3);
1572 HS_CMP_HALF(3, r4);
1573 HS_CMP_HALF(4, r5);
1574 HS_CMP_HALF(5, r6);
1575 HS_CMP_HALF(6, r7);
1576 HS_CMP_HALF(7, r8);
1577 }
1578 HS_CMP_XCHG(r1, r5);
1579 HS_CMP_XCHG(r3, r7);
1580 HS_CMP_XCHG(r1, r3);
1581 HS_CMP_XCHG(r5, r7);
1582 HS_CMP_XCHG(r2, r6);
1583 HS_CMP_XCHG(r4, r8);
1584 HS_CMP_XCHG(r2, r4);
1585 HS_CMP_XCHG(r6, r8);
1586 HS_CMP_XCHG(r1, r2);
1587 HS_CMP_XCHG(r3, r4);
1588 HS_CMP_XCHG(r5, r6);
1589 HS_CMP_XCHG(r7, r8);
1590 }
1591 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1592 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8;
1593 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1594 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7;
1595 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1596 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6;
1597 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1598 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5;
1599 HS_BLOCK_BARRIER();
1600 {
1601 {
1602 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1603 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
1604 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64);
1605 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96);
1606 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(128);
1607 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(160);
1608 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(192);
1609 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(224);
1610 HS_CMP_XCHG(r0_4, r0_5);
1611 HS_CMP_XCHG(r0_3, r0_6);
1612 HS_CMP_XCHG(r0_2, r0_7);
1613 HS_CMP_XCHG(r0_1, r0_8);
1614 HS_CMP_XCHG(r0_5, r0_7);
1615 HS_CMP_XCHG(r0_6, r0_8);
1616 HS_CMP_XCHG(r0_5, r0_6);
1617 HS_CMP_XCHG(r0_7, r0_8);
1618 HS_CMP_XCHG(r0_1, r0_3);
1619 HS_CMP_XCHG(r0_2, r0_4);
1620 HS_CMP_XCHG(r0_1, r0_2);
1621 HS_CMP_XCHG(r0_3, r0_4);
1622 HS_SLAB_LOCAL_L(0) = r0_1;
1623 HS_SLAB_LOCAL_L(32) = r0_2;
1624 HS_SLAB_LOCAL_L(64) = r0_3;
1625 HS_SLAB_LOCAL_L(96) = r0_4;
1626 HS_SLAB_LOCAL_R(128) = r0_5;
1627 HS_SLAB_LOCAL_R(160) = r0_6;
1628 HS_SLAB_LOCAL_R(192) = r0_7;
1629 HS_SLAB_LOCAL_R(224) = r0_8;
1630 }
1631 }
1632 HS_BLOCK_BARRIER();
1633 r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1634 r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1635 r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1636 r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1637 r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1638 r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1639 r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1640 r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
1641 {
1642 {
1643 HS_SLAB_HALF_PREAMBLE(16);
1644 HS_CMP_HALF(0, r1);
1645 HS_CMP_HALF(1, r2);
1646 HS_CMP_HALF(2, r3);
1647 HS_CMP_HALF(3, r4);
1648 HS_CMP_HALF(4, r5);
1649 HS_CMP_HALF(5, r6);
1650 HS_CMP_HALF(6, r7);
1651 HS_CMP_HALF(7, r8);
1652 }
1653 {
1654 HS_SLAB_HALF_PREAMBLE(8);
1655 HS_CMP_HALF(0, r1);
1656 HS_CMP_HALF(1, r2);
1657 HS_CMP_HALF(2, r3);
1658 HS_CMP_HALF(3, r4);
1659 HS_CMP_HALF(4, r5);
1660 HS_CMP_HALF(5, r6);
1661 HS_CMP_HALF(6, r7);
1662 HS_CMP_HALF(7, r8);
1663 }
1664 {
1665 HS_SLAB_HALF_PREAMBLE(4);
1666 HS_CMP_HALF(0, r1);
1667 HS_CMP_HALF(1, r2);
1668 HS_CMP_HALF(2, r3);
1669 HS_CMP_HALF(3, r4);
1670 HS_CMP_HALF(4, r5);
1671 HS_CMP_HALF(5, r6);
1672 HS_CMP_HALF(6, r7);
1673 HS_CMP_HALF(7, r8);
1674 }
1675 {
1676 HS_SLAB_HALF_PREAMBLE(2);
1677 HS_CMP_HALF(0, r1);
1678 HS_CMP_HALF(1, r2);
1679 HS_CMP_HALF(2, r3);
1680 HS_CMP_HALF(3, r4);
1681 HS_CMP_HALF(4, r5);
1682 HS_CMP_HALF(5, r6);
1683 HS_CMP_HALF(6, r7);
1684 HS_CMP_HALF(7, r8);
1685 }
1686 {
1687 HS_SLAB_HALF_PREAMBLE(1);
1688 HS_CMP_HALF(0, r1);
1689 HS_CMP_HALF(1, r2);
1690 HS_CMP_HALF(2, r3);
1691 HS_CMP_HALF(3, r4);
1692 HS_CMP_HALF(4, r5);
1693 HS_CMP_HALF(5, r6);
1694 HS_CMP_HALF(6, r7);
1695 HS_CMP_HALF(7, r8);
1696 }
1697 HS_CMP_XCHG(r1, r5);
1698 HS_CMP_XCHG(r3, r7);
1699 HS_CMP_XCHG(r1, r3);
1700 HS_CMP_XCHG(r5, r7);
1701 HS_CMP_XCHG(r2, r6);
1702 HS_CMP_XCHG(r4, r8);
1703 HS_CMP_XCHG(r2, r4);
1704 HS_CMP_XCHG(r6, r8);
1705 HS_CMP_XCHG(r1, r2);
1706 HS_CMP_XCHG(r3, r4);
1707 HS_CMP_XCHG(r5, r6);
1708 HS_CMP_XCHG(r7, r8);
1709 }
1710 HS_SLAB_GLOBAL_STORE(0, r1);
1711 HS_SLAB_GLOBAL_STORE(1, r2);
1712 HS_SLAB_GLOBAL_STORE(2, r3);
1713 HS_SLAB_GLOBAL_STORE(3, r4);
1714 HS_SLAB_GLOBAL_STORE(4, r5);
1715 HS_SLAB_GLOBAL_STORE(5, r6);
1716 HS_SLAB_GLOBAL_STORE(6, r7);
1717 HS_SLAB_GLOBAL_STORE(7, r8);
1718 }
1719
1720 HS_BS_KERNEL_PROTO(16, 4)
1721 {
1722 HS_BLOCK_LOCAL_MEM_DECL(512, 8);
1723
1724 HS_SLAB_GLOBAL_PREAMBLE();
1725 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
1726 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
1727 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
1728 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
1729 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
1730 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
1731 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
1732 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
1733 HS_CMP_XCHG(r1, r5);
1734 HS_CMP_XCHG(r2, r6);
1735 HS_CMP_XCHG(r3, r7);
1736 HS_CMP_XCHG(r4, r8);
1737 HS_CMP_XCHG(r1, r3);
1738 HS_CMP_XCHG(r2, r4);
1739 HS_CMP_XCHG(r5, r7);
1740 HS_CMP_XCHG(r6, r8);
1741 HS_CMP_XCHG(r3, r5);
1742 HS_CMP_XCHG(r4, r6);
1743 HS_CMP_XCHG(r1, r2);
1744 HS_CMP_XCHG(r3, r4);
1745 HS_CMP_XCHG(r5, r6);
1746 HS_CMP_XCHG(r7, r8);
1747 HS_CMP_XCHG(r2, r5);
1748 HS_CMP_XCHG(r4, r7);
1749 HS_CMP_XCHG(r2, r3);
1750 HS_CMP_XCHG(r4, r5);
1751 HS_CMP_XCHG(r6, r7);
1752 {
1753 HS_SLAB_FLIP_PREAMBLE(1);
1754 HS_CMP_FLIP(0, r1, r8);
1755 HS_CMP_FLIP(1, r2, r7);
1756 HS_CMP_FLIP(2, r3, r6);
1757 HS_CMP_FLIP(3, r4, r5);
1758 }
1759 HS_CMP_XCHG(r1, r5);
1760 HS_CMP_XCHG(r3, r7);
1761 HS_CMP_XCHG(r1, r3);
1762 HS_CMP_XCHG(r5, r7);
1763 HS_CMP_XCHG(r2, r6);
1764 HS_CMP_XCHG(r4, r8);
1765 HS_CMP_XCHG(r2, r4);
1766 HS_CMP_XCHG(r6, r8);
1767 HS_CMP_XCHG(r1, r2);
1768 HS_CMP_XCHG(r3, r4);
1769 HS_CMP_XCHG(r5, r6);
1770 HS_CMP_XCHG(r7, r8);
1771 {
1772 HS_SLAB_FLIP_PREAMBLE(3);
1773 HS_CMP_FLIP(0, r1, r8);
1774 HS_CMP_FLIP(1, r2, r7);
1775 HS_CMP_FLIP(2, r3, r6);
1776 HS_CMP_FLIP(3, r4, r5);
1777 }
1778 {
1779 HS_SLAB_HALF_PREAMBLE(1);
1780 HS_CMP_HALF(0, r1);
1781 HS_CMP_HALF(1, r2);
1782 HS_CMP_HALF(2, r3);
1783 HS_CMP_HALF(3, r4);
1784 HS_CMP_HALF(4, r5);
1785 HS_CMP_HALF(5, r6);
1786 HS_CMP_HALF(6, r7);
1787 HS_CMP_HALF(7, r8);
1788 }
1789 HS_CMP_XCHG(r1, r5);
1790 HS_CMP_XCHG(r3, r7);
1791 HS_CMP_XCHG(r1, r3);
1792 HS_CMP_XCHG(r5, r7);
1793 HS_CMP_XCHG(r2, r6);
1794 HS_CMP_XCHG(r4, r8);
1795 HS_CMP_XCHG(r2, r4);
1796 HS_CMP_XCHG(r6, r8);
1797 HS_CMP_XCHG(r1, r2);
1798 HS_CMP_XCHG(r3, r4);
1799 HS_CMP_XCHG(r5, r6);
1800 HS_CMP_XCHG(r7, r8);
1801 {
1802 HS_SLAB_FLIP_PREAMBLE(7);
1803 HS_CMP_FLIP(0, r1, r8);
1804 HS_CMP_FLIP(1, r2, r7);
1805 HS_CMP_FLIP(2, r3, r6);
1806 HS_CMP_FLIP(3, r4, r5);
1807 }
1808 {
1809 HS_SLAB_HALF_PREAMBLE(2);
1810 HS_CMP_HALF(0, r1);
1811 HS_CMP_HALF(1, r2);
1812 HS_CMP_HALF(2, r3);
1813 HS_CMP_HALF(3, r4);
1814 HS_CMP_HALF(4, r5);
1815 HS_CMP_HALF(5, r6);
1816 HS_CMP_HALF(6, r7);
1817 HS_CMP_HALF(7, r8);
1818 }
1819 {
1820 HS_SLAB_HALF_PREAMBLE(1);
1821 HS_CMP_HALF(0, r1);
1822 HS_CMP_HALF(1, r2);
1823 HS_CMP_HALF(2, r3);
1824 HS_CMP_HALF(3, r4);
1825 HS_CMP_HALF(4, r5);
1826 HS_CMP_HALF(5, r6);
1827 HS_CMP_HALF(6, r7);
1828 HS_CMP_HALF(7, r8);
1829 }
1830 HS_CMP_XCHG(r1, r5);
1831 HS_CMP_XCHG(r3, r7);
1832 HS_CMP_XCHG(r1, r3);
1833 HS_CMP_XCHG(r5, r7);
1834 HS_CMP_XCHG(r2, r6);
1835 HS_CMP_XCHG(r4, r8);
1836 HS_CMP_XCHG(r2, r4);
1837 HS_CMP_XCHG(r6, r8);
1838 HS_CMP_XCHG(r1, r2);
1839 HS_CMP_XCHG(r3, r4);
1840 HS_CMP_XCHG(r5, r6);
1841 HS_CMP_XCHG(r7, r8);
1842 {
1843 HS_SLAB_FLIP_PREAMBLE(15);
1844 HS_CMP_FLIP(0, r1, r8);
1845 HS_CMP_FLIP(1, r2, r7);
1846 HS_CMP_FLIP(2, r3, r6);
1847 HS_CMP_FLIP(3, r4, r5);
1848 }
1849 {
1850 HS_SLAB_HALF_PREAMBLE(4);
1851 HS_CMP_HALF(0, r1);
1852 HS_CMP_HALF(1, r2);
1853 HS_CMP_HALF(2, r3);
1854 HS_CMP_HALF(3, r4);
1855 HS_CMP_HALF(4, r5);
1856 HS_CMP_HALF(5, r6);
1857 HS_CMP_HALF(6, r7);
1858 HS_CMP_HALF(7, r8);
1859 }
1860 {
1861 HS_SLAB_HALF_PREAMBLE(2);
1862 HS_CMP_HALF(0, r1);
1863 HS_CMP_HALF(1, r2);
1864 HS_CMP_HALF(2, r3);
1865 HS_CMP_HALF(3, r4);
1866 HS_CMP_HALF(4, r5);
1867 HS_CMP_HALF(5, r6);
1868 HS_CMP_HALF(6, r7);
1869 HS_CMP_HALF(7, r8);
1870 }
1871 {
1872 HS_SLAB_HALF_PREAMBLE(1);
1873 HS_CMP_HALF(0, r1);
1874 HS_CMP_HALF(1, r2);
1875 HS_CMP_HALF(2, r3);
1876 HS_CMP_HALF(3, r4);
1877 HS_CMP_HALF(4, r5);
1878 HS_CMP_HALF(5, r6);
1879 HS_CMP_HALF(6, r7);
1880 HS_CMP_HALF(7, r8);
1881 }
1882 HS_CMP_XCHG(r1, r5);
1883 HS_CMP_XCHG(r3, r7);
1884 HS_CMP_XCHG(r1, r3);
1885 HS_CMP_XCHG(r5, r7);
1886 HS_CMP_XCHG(r2, r6);
1887 HS_CMP_XCHG(r4, r8);
1888 HS_CMP_XCHG(r2, r4);
1889 HS_CMP_XCHG(r6, r8);
1890 HS_CMP_XCHG(r1, r2);
1891 HS_CMP_XCHG(r3, r4);
1892 HS_CMP_XCHG(r5, r6);
1893 HS_CMP_XCHG(r7, r8);
1894 {
1895 HS_SLAB_FLIP_PREAMBLE(31);
1896 HS_CMP_FLIP(0, r1, r8);
1897 HS_CMP_FLIP(1, r2, r7);
1898 HS_CMP_FLIP(2, r3, r6);
1899 HS_CMP_FLIP(3, r4, r5);
1900 }
1901 {
1902 HS_SLAB_HALF_PREAMBLE(8);
1903 HS_CMP_HALF(0, r1);
1904 HS_CMP_HALF(1, r2);
1905 HS_CMP_HALF(2, r3);
1906 HS_CMP_HALF(3, r4);
1907 HS_CMP_HALF(4, r5);
1908 HS_CMP_HALF(5, r6);
1909 HS_CMP_HALF(6, r7);
1910 HS_CMP_HALF(7, r8);
1911 }
1912 {
1913 HS_SLAB_HALF_PREAMBLE(4);
1914 HS_CMP_HALF(0, r1);
1915 HS_CMP_HALF(1, r2);
1916 HS_CMP_HALF(2, r3);
1917 HS_CMP_HALF(3, r4);
1918 HS_CMP_HALF(4, r5);
1919 HS_CMP_HALF(5, r6);
1920 HS_CMP_HALF(6, r7);
1921 HS_CMP_HALF(7, r8);
1922 }
1923 {
1924 HS_SLAB_HALF_PREAMBLE(2);
1925 HS_CMP_HALF(0, r1);
1926 HS_CMP_HALF(1, r2);
1927 HS_CMP_HALF(2, r3);
1928 HS_CMP_HALF(3, r4);
1929 HS_CMP_HALF(4, r5);
1930 HS_CMP_HALF(5, r6);
1931 HS_CMP_HALF(6, r7);
1932 HS_CMP_HALF(7, r8);
1933 }
1934 {
1935 HS_SLAB_HALF_PREAMBLE(1);
1936 HS_CMP_HALF(0, r1);
1937 HS_CMP_HALF(1, r2);
1938 HS_CMP_HALF(2, r3);
1939 HS_CMP_HALF(3, r4);
1940 HS_CMP_HALF(4, r5);
1941 HS_CMP_HALF(5, r6);
1942 HS_CMP_HALF(6, r7);
1943 HS_CMP_HALF(7, r8);
1944 }
1945 HS_CMP_XCHG(r1, r5);
1946 HS_CMP_XCHG(r3, r7);
1947 HS_CMP_XCHG(r1, r3);
1948 HS_CMP_XCHG(r5, r7);
1949 HS_CMP_XCHG(r2, r6);
1950 HS_CMP_XCHG(r4, r8);
1951 HS_CMP_XCHG(r2, r4);
1952 HS_CMP_XCHG(r6, r8);
1953 HS_CMP_XCHG(r1, r2);
1954 HS_CMP_XCHG(r3, r4);
1955 HS_CMP_XCHG(r5, r6);
1956 HS_CMP_XCHG(r7, r8);
1957 HS_BS_MERGE_H_PREAMBLE(16);
1958 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
1959 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
1960 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
1961 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
1962 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
1963 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
1964 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
1965 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
1966 HS_BLOCK_BARRIER();
1967 if (HS_WARP_ID_X() < 8) {
1968 {
1969 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1970 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
1971 HS_CMP_XCHG(r0_1, r0_2);
1972 HS_SLAB_LOCAL_L(0) = r0_1;
1973 HS_SLAB_LOCAL_R(32) = r0_2;
1974 }
1975 {
1976 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
1977 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96);
1978 HS_CMP_XCHG(r1_1, r1_2);
1979 HS_SLAB_LOCAL_L(64) = r1_1;
1980 HS_SLAB_LOCAL_R(96) = r1_2;
1981 }
1982 {
1983 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128);
1984 HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(160);
1985 HS_CMP_XCHG(r2_1, r2_2);
1986 HS_SLAB_LOCAL_L(128) = r2_1;
1987 HS_SLAB_LOCAL_R(160) = r2_2;
1988 }
1989 {
1990 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192);
1991 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(224);
1992 HS_CMP_XCHG(r3_1, r3_2);
1993 HS_SLAB_LOCAL_L(192) = r3_1;
1994 HS_SLAB_LOCAL_R(224) = r3_2;
1995 }
1996 {
1997 HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(256);
1998 HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(288);
1999 HS_CMP_XCHG(r4_1, r4_2);
2000 HS_SLAB_LOCAL_L(256) = r4_1;
2001 HS_SLAB_LOCAL_R(288) = r4_2;
2002 }
2003 {
2004 HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(320);
2005 HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(352);
2006 HS_CMP_XCHG(r5_1, r5_2);
2007 HS_SLAB_LOCAL_L(320) = r5_1;
2008 HS_SLAB_LOCAL_R(352) = r5_2;
2009 }
2010 {
2011 HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(384);
2012 HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(416);
2013 HS_CMP_XCHG(r6_1, r6_2);
2014 HS_SLAB_LOCAL_L(384) = r6_1;
2015 HS_SLAB_LOCAL_R(416) = r6_2;
2016 }
2017 {
2018 HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(448);
2019 HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(480);
2020 HS_CMP_XCHG(r7_1, r7_2);
2021 HS_SLAB_LOCAL_L(448) = r7_1;
2022 HS_SLAB_LOCAL_R(480) = r7_2;
2023 }
2024 }
2025 HS_BLOCK_BARRIER();
2026 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2027 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2028 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2029 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2030 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2031 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2032 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2033 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2034 {
2035 {
2036 HS_SLAB_HALF_PREAMBLE(16);
2037 HS_CMP_HALF(0, r1);
2038 HS_CMP_HALF(1, r2);
2039 HS_CMP_HALF(2, r3);
2040 HS_CMP_HALF(3, r4);
2041 HS_CMP_HALF(4, r5);
2042 HS_CMP_HALF(5, r6);
2043 HS_CMP_HALF(6, r7);
2044 HS_CMP_HALF(7, r8);
2045 }
2046 {
2047 HS_SLAB_HALF_PREAMBLE(8);
2048 HS_CMP_HALF(0, r1);
2049 HS_CMP_HALF(1, r2);
2050 HS_CMP_HALF(2, r3);
2051 HS_CMP_HALF(3, r4);
2052 HS_CMP_HALF(4, r5);
2053 HS_CMP_HALF(5, r6);
2054 HS_CMP_HALF(6, r7);
2055 HS_CMP_HALF(7, r8);
2056 }
2057 {
2058 HS_SLAB_HALF_PREAMBLE(4);
2059 HS_CMP_HALF(0, r1);
2060 HS_CMP_HALF(1, r2);
2061 HS_CMP_HALF(2, r3);
2062 HS_CMP_HALF(3, r4);
2063 HS_CMP_HALF(4, r5);
2064 HS_CMP_HALF(5, r6);
2065 HS_CMP_HALF(6, r7);
2066 HS_CMP_HALF(7, r8);
2067 }
2068 {
2069 HS_SLAB_HALF_PREAMBLE(2);
2070 HS_CMP_HALF(0, r1);
2071 HS_CMP_HALF(1, r2);
2072 HS_CMP_HALF(2, r3);
2073 HS_CMP_HALF(3, r4);
2074 HS_CMP_HALF(4, r5);
2075 HS_CMP_HALF(5, r6);
2076 HS_CMP_HALF(6, r7);
2077 HS_CMP_HALF(7, r8);
2078 }
2079 {
2080 HS_SLAB_HALF_PREAMBLE(1);
2081 HS_CMP_HALF(0, r1);
2082 HS_CMP_HALF(1, r2);
2083 HS_CMP_HALF(2, r3);
2084 HS_CMP_HALF(3, r4);
2085 HS_CMP_HALF(4, r5);
2086 HS_CMP_HALF(5, r6);
2087 HS_CMP_HALF(6, r7);
2088 HS_CMP_HALF(7, r8);
2089 }
2090 HS_CMP_XCHG(r1, r5);
2091 HS_CMP_XCHG(r3, r7);
2092 HS_CMP_XCHG(r1, r3);
2093 HS_CMP_XCHG(r5, r7);
2094 HS_CMP_XCHG(r2, r6);
2095 HS_CMP_XCHG(r4, r8);
2096 HS_CMP_XCHG(r2, r4);
2097 HS_CMP_XCHG(r6, r8);
2098 HS_CMP_XCHG(r1, r2);
2099 HS_CMP_XCHG(r3, r4);
2100 HS_CMP_XCHG(r5, r6);
2101 HS_CMP_XCHG(r7, r8);
2102 }
2103 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
2104 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
2105 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
2106 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
2107 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
2108 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
2109 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
2110 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
2111 HS_BLOCK_BARRIER();
2112 if (HS_WARP_ID_X() < 8) {
2113 {
2114 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2115 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
2116 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64);
2117 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96);
2118 HS_CMP_XCHG(r0_2, r0_3);
2119 HS_CMP_XCHG(r0_1, r0_4);
2120 HS_CMP_XCHG(r0_3, r0_4);
2121 HS_CMP_XCHG(r0_1, r0_2);
2122 HS_SLAB_LOCAL_L(0) = r0_1;
2123 HS_SLAB_LOCAL_L(32) = r0_2;
2124 HS_SLAB_LOCAL_R(64) = r0_3;
2125 HS_SLAB_LOCAL_R(96) = r0_4;
2126 }
2127 {
2128 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128);
2129 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(160);
2130 HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(192);
2131 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(224);
2132 HS_CMP_XCHG(r1_2, r1_3);
2133 HS_CMP_XCHG(r1_1, r1_4);
2134 HS_CMP_XCHG(r1_3, r1_4);
2135 HS_CMP_XCHG(r1_1, r1_2);
2136 HS_SLAB_LOCAL_L(128) = r1_1;
2137 HS_SLAB_LOCAL_L(160) = r1_2;
2138 HS_SLAB_LOCAL_R(192) = r1_3;
2139 HS_SLAB_LOCAL_R(224) = r1_4;
2140 }
2141 {
2142 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(256);
2143 HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(288);
2144 HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(320);
2145 HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(352);
2146 HS_CMP_XCHG(r2_2, r2_3);
2147 HS_CMP_XCHG(r2_1, r2_4);
2148 HS_CMP_XCHG(r2_3, r2_4);
2149 HS_CMP_XCHG(r2_1, r2_2);
2150 HS_SLAB_LOCAL_L(256) = r2_1;
2151 HS_SLAB_LOCAL_L(288) = r2_2;
2152 HS_SLAB_LOCAL_R(320) = r2_3;
2153 HS_SLAB_LOCAL_R(352) = r2_4;
2154 }
2155 {
2156 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(384);
2157 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(416);
2158 HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(448);
2159 HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(480);
2160 HS_CMP_XCHG(r3_2, r3_3);
2161 HS_CMP_XCHG(r3_1, r3_4);
2162 HS_CMP_XCHG(r3_3, r3_4);
2163 HS_CMP_XCHG(r3_1, r3_2);
2164 HS_SLAB_LOCAL_L(384) = r3_1;
2165 HS_SLAB_LOCAL_L(416) = r3_2;
2166 HS_SLAB_LOCAL_R(448) = r3_3;
2167 HS_SLAB_LOCAL_R(480) = r3_4;
2168 }
2169 }
2170 HS_BLOCK_BARRIER();
2171 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2172 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2173 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2174 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2175 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2176 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2177 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2178 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2179 {
2180 {
2181 HS_SLAB_HALF_PREAMBLE(16);
2182 HS_CMP_HALF(0, r1);
2183 HS_CMP_HALF(1, r2);
2184 HS_CMP_HALF(2, r3);
2185 HS_CMP_HALF(3, r4);
2186 HS_CMP_HALF(4, r5);
2187 HS_CMP_HALF(5, r6);
2188 HS_CMP_HALF(6, r7);
2189 HS_CMP_HALF(7, r8);
2190 }
2191 {
2192 HS_SLAB_HALF_PREAMBLE(8);
2193 HS_CMP_HALF(0, r1);
2194 HS_CMP_HALF(1, r2);
2195 HS_CMP_HALF(2, r3);
2196 HS_CMP_HALF(3, r4);
2197 HS_CMP_HALF(4, r5);
2198 HS_CMP_HALF(5, r6);
2199 HS_CMP_HALF(6, r7);
2200 HS_CMP_HALF(7, r8);
2201 }
2202 {
2203 HS_SLAB_HALF_PREAMBLE(4);
2204 HS_CMP_HALF(0, r1);
2205 HS_CMP_HALF(1, r2);
2206 HS_CMP_HALF(2, r3);
2207 HS_CMP_HALF(3, r4);
2208 HS_CMP_HALF(4, r5);
2209 HS_CMP_HALF(5, r6);
2210 HS_CMP_HALF(6, r7);
2211 HS_CMP_HALF(7, r8);
2212 }
2213 {
2214 HS_SLAB_HALF_PREAMBLE(2);
2215 HS_CMP_HALF(0, r1);
2216 HS_CMP_HALF(1, r2);
2217 HS_CMP_HALF(2, r3);
2218 HS_CMP_HALF(3, r4);
2219 HS_CMP_HALF(4, r5);
2220 HS_CMP_HALF(5, r6);
2221 HS_CMP_HALF(6, r7);
2222 HS_CMP_HALF(7, r8);
2223 }
2224 {
2225 HS_SLAB_HALF_PREAMBLE(1);
2226 HS_CMP_HALF(0, r1);
2227 HS_CMP_HALF(1, r2);
2228 HS_CMP_HALF(2, r3);
2229 HS_CMP_HALF(3, r4);
2230 HS_CMP_HALF(4, r5);
2231 HS_CMP_HALF(5, r6);
2232 HS_CMP_HALF(6, r7);
2233 HS_CMP_HALF(7, r8);
2234 }
2235 HS_CMP_XCHG(r1, r5);
2236 HS_CMP_XCHG(r3, r7);
2237 HS_CMP_XCHG(r1, r3);
2238 HS_CMP_XCHG(r5, r7);
2239 HS_CMP_XCHG(r2, r6);
2240 HS_CMP_XCHG(r4, r8);
2241 HS_CMP_XCHG(r2, r4);
2242 HS_CMP_XCHG(r6, r8);
2243 HS_CMP_XCHG(r1, r2);
2244 HS_CMP_XCHG(r3, r4);
2245 HS_CMP_XCHG(r5, r6);
2246 HS_CMP_XCHG(r7, r8);
2247 }
2248 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
2249 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
2250 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
2251 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
2252 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
2253 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
2254 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
2255 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
2256 HS_BLOCK_BARRIER();
2257 if (HS_WARP_ID_X() < 8) {
2258 {
2259 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2260 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
2261 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64);
2262 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96);
2263 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(128);
2264 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(160);
2265 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(192);
2266 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(224);
2267 HS_CMP_XCHG(r0_4, r0_5);
2268 HS_CMP_XCHG(r0_3, r0_6);
2269 HS_CMP_XCHG(r0_2, r0_7);
2270 HS_CMP_XCHG(r0_1, r0_8);
2271 HS_CMP_XCHG(r0_5, r0_7);
2272 HS_CMP_XCHG(r0_6, r0_8);
2273 HS_CMP_XCHG(r0_5, r0_6);
2274 HS_CMP_XCHG(r0_7, r0_8);
2275 HS_CMP_XCHG(r0_1, r0_3);
2276 HS_CMP_XCHG(r0_2, r0_4);
2277 HS_CMP_XCHG(r0_1, r0_2);
2278 HS_CMP_XCHG(r0_3, r0_4);
2279 HS_SLAB_LOCAL_L(0) = r0_1;
2280 HS_SLAB_LOCAL_L(32) = r0_2;
2281 HS_SLAB_LOCAL_L(64) = r0_3;
2282 HS_SLAB_LOCAL_L(96) = r0_4;
2283 HS_SLAB_LOCAL_R(128) = r0_5;
2284 HS_SLAB_LOCAL_R(160) = r0_6;
2285 HS_SLAB_LOCAL_R(192) = r0_7;
2286 HS_SLAB_LOCAL_R(224) = r0_8;
2287 }
2288 {
2289 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(256);
2290 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(288);
2291 HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(320);
2292 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(352);
2293 HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(384);
2294 HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(416);
2295 HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(448);
2296 HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(480);
2297 HS_CMP_XCHG(r1_4, r1_5);
2298 HS_CMP_XCHG(r1_3, r1_6);
2299 HS_CMP_XCHG(r1_2, r1_7);
2300 HS_CMP_XCHG(r1_1, r1_8);
2301 HS_CMP_XCHG(r1_5, r1_7);
2302 HS_CMP_XCHG(r1_6, r1_8);
2303 HS_CMP_XCHG(r1_5, r1_6);
2304 HS_CMP_XCHG(r1_7, r1_8);
2305 HS_CMP_XCHG(r1_1, r1_3);
2306 HS_CMP_XCHG(r1_2, r1_4);
2307 HS_CMP_XCHG(r1_1, r1_2);
2308 HS_CMP_XCHG(r1_3, r1_4);
2309 HS_SLAB_LOCAL_L(256) = r1_1;
2310 HS_SLAB_LOCAL_L(288) = r1_2;
2311 HS_SLAB_LOCAL_L(320) = r1_3;
2312 HS_SLAB_LOCAL_L(352) = r1_4;
2313 HS_SLAB_LOCAL_R(384) = r1_5;
2314 HS_SLAB_LOCAL_R(416) = r1_6;
2315 HS_SLAB_LOCAL_R(448) = r1_7;
2316 HS_SLAB_LOCAL_R(480) = r1_8;
2317 }
2318 }
2319 HS_BLOCK_BARRIER();
2320 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2321 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2322 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2323 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2324 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2325 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2326 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2327 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2328 {
2329 {
2330 HS_SLAB_HALF_PREAMBLE(16);
2331 HS_CMP_HALF(0, r1);
2332 HS_CMP_HALF(1, r2);
2333 HS_CMP_HALF(2, r3);
2334 HS_CMP_HALF(3, r4);
2335 HS_CMP_HALF(4, r5);
2336 HS_CMP_HALF(5, r6);
2337 HS_CMP_HALF(6, r7);
2338 HS_CMP_HALF(7, r8);
2339 }
2340 {
2341 HS_SLAB_HALF_PREAMBLE(8);
2342 HS_CMP_HALF(0, r1);
2343 HS_CMP_HALF(1, r2);
2344 HS_CMP_HALF(2, r3);
2345 HS_CMP_HALF(3, r4);
2346 HS_CMP_HALF(4, r5);
2347 HS_CMP_HALF(5, r6);
2348 HS_CMP_HALF(6, r7);
2349 HS_CMP_HALF(7, r8);
2350 }
2351 {
2352 HS_SLAB_HALF_PREAMBLE(4);
2353 HS_CMP_HALF(0, r1);
2354 HS_CMP_HALF(1, r2);
2355 HS_CMP_HALF(2, r3);
2356 HS_CMP_HALF(3, r4);
2357 HS_CMP_HALF(4, r5);
2358 HS_CMP_HALF(5, r6);
2359 HS_CMP_HALF(6, r7);
2360 HS_CMP_HALF(7, r8);
2361 }
2362 {
2363 HS_SLAB_HALF_PREAMBLE(2);
2364 HS_CMP_HALF(0, r1);
2365 HS_CMP_HALF(1, r2);
2366 HS_CMP_HALF(2, r3);
2367 HS_CMP_HALF(3, r4);
2368 HS_CMP_HALF(4, r5);
2369 HS_CMP_HALF(5, r6);
2370 HS_CMP_HALF(6, r7);
2371 HS_CMP_HALF(7, r8);
2372 }
2373 {
2374 HS_SLAB_HALF_PREAMBLE(1);
2375 HS_CMP_HALF(0, r1);
2376 HS_CMP_HALF(1, r2);
2377 HS_CMP_HALF(2, r3);
2378 HS_CMP_HALF(3, r4);
2379 HS_CMP_HALF(4, r5);
2380 HS_CMP_HALF(5, r6);
2381 HS_CMP_HALF(6, r7);
2382 HS_CMP_HALF(7, r8);
2383 }
2384 HS_CMP_XCHG(r1, r5);
2385 HS_CMP_XCHG(r3, r7);
2386 HS_CMP_XCHG(r1, r3);
2387 HS_CMP_XCHG(r5, r7);
2388 HS_CMP_XCHG(r2, r6);
2389 HS_CMP_XCHG(r4, r8);
2390 HS_CMP_XCHG(r2, r4);
2391 HS_CMP_XCHG(r6, r8);
2392 HS_CMP_XCHG(r1, r2);
2393 HS_CMP_XCHG(r3, r4);
2394 HS_CMP_XCHG(r5, r6);
2395 HS_CMP_XCHG(r7, r8);
2396 }
2397 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
2398 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
2399 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
2400 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
2401 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
2402 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
2403 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
2404 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
2405 HS_BLOCK_BARRIER();
2406 if (HS_WARP_ID_X() < 8) {
2407 {
2408 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2409 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
2410 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64);
2411 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96);
2412 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(128);
2413 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(160);
2414 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(192);
2415 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(224);
2416 HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(256);
2417 HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(288);
2418 HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(320);
2419 HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(352);
2420 HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(384);
2421 HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(416);
2422 HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(448);
2423 HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(480);
2424 HS_CMP_XCHG(r0_8, r0_9);
2425 HS_CMP_XCHG(r0_7, r0_10);
2426 HS_CMP_XCHG(r0_6, r0_11);
2427 HS_CMP_XCHG(r0_5, r0_12);
2428 HS_CMP_XCHG(r0_4, r0_13);
2429 HS_CMP_XCHG(r0_3, r0_14);
2430 HS_CMP_XCHG(r0_2, r0_15);
2431 HS_CMP_XCHG(r0_1, r0_16);
2432 HS_CMP_XCHG(r0_9, r0_13);
2433 HS_CMP_XCHG(r0_11, r0_15);
2434 HS_CMP_XCHG(r0_9, r0_11);
2435 HS_CMP_XCHG(r0_13, r0_15);
2436 HS_CMP_XCHG(r0_10, r0_14);
2437 HS_CMP_XCHG(r0_12, r0_16);
2438 HS_CMP_XCHG(r0_10, r0_12);
2439 HS_CMP_XCHG(r0_14, r0_16);
2440 HS_CMP_XCHG(r0_9, r0_10);
2441 HS_CMP_XCHG(r0_11, r0_12);
2442 HS_CMP_XCHG(r0_13, r0_14);
2443 HS_CMP_XCHG(r0_15, r0_16);
2444 HS_CMP_XCHG(r0_1, r0_5);
2445 HS_CMP_XCHG(r0_3, r0_7);
2446 HS_CMP_XCHG(r0_1, r0_3);
2447 HS_CMP_XCHG(r0_5, r0_7);
2448 HS_CMP_XCHG(r0_2, r0_6);
2449 HS_CMP_XCHG(r0_4, r0_8);
2450 HS_CMP_XCHG(r0_2, r0_4);
2451 HS_CMP_XCHG(r0_6, r0_8);
2452 HS_CMP_XCHG(r0_1, r0_2);
2453 HS_CMP_XCHG(r0_3, r0_4);
2454 HS_CMP_XCHG(r0_5, r0_6);
2455 HS_CMP_XCHG(r0_7, r0_8);
2456 HS_SLAB_LOCAL_L(0) = r0_1;
2457 HS_SLAB_LOCAL_L(32) = r0_2;
2458 HS_SLAB_LOCAL_L(64) = r0_3;
2459 HS_SLAB_LOCAL_L(96) = r0_4;
2460 HS_SLAB_LOCAL_L(128) = r0_5;
2461 HS_SLAB_LOCAL_L(160) = r0_6;
2462 HS_SLAB_LOCAL_L(192) = r0_7;
2463 HS_SLAB_LOCAL_L(224) = r0_8;
2464 HS_SLAB_LOCAL_R(256) = r0_9;
2465 HS_SLAB_LOCAL_R(288) = r0_10;
2466 HS_SLAB_LOCAL_R(320) = r0_11;
2467 HS_SLAB_LOCAL_R(352) = r0_12;
2468 HS_SLAB_LOCAL_R(384) = r0_13;
2469 HS_SLAB_LOCAL_R(416) = r0_14;
2470 HS_SLAB_LOCAL_R(448) = r0_15;
2471 HS_SLAB_LOCAL_R(480) = r0_16;
2472 }
2473 }
2474 HS_BLOCK_BARRIER();
2475 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2476 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2477 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2478 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2479 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2480 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2481 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2482 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2483 {
2484 {
2485 HS_SLAB_HALF_PREAMBLE(16);
2486 HS_CMP_HALF(0, r1);
2487 HS_CMP_HALF(1, r2);
2488 HS_CMP_HALF(2, r3);
2489 HS_CMP_HALF(3, r4);
2490 HS_CMP_HALF(4, r5);
2491 HS_CMP_HALF(5, r6);
2492 HS_CMP_HALF(6, r7);
2493 HS_CMP_HALF(7, r8);
2494 }
2495 {
2496 HS_SLAB_HALF_PREAMBLE(8);
2497 HS_CMP_HALF(0, r1);
2498 HS_CMP_HALF(1, r2);
2499 HS_CMP_HALF(2, r3);
2500 HS_CMP_HALF(3, r4);
2501 HS_CMP_HALF(4, r5);
2502 HS_CMP_HALF(5, r6);
2503 HS_CMP_HALF(6, r7);
2504 HS_CMP_HALF(7, r8);
2505 }
2506 {
2507 HS_SLAB_HALF_PREAMBLE(4);
2508 HS_CMP_HALF(0, r1);
2509 HS_CMP_HALF(1, r2);
2510 HS_CMP_HALF(2, r3);
2511 HS_CMP_HALF(3, r4);
2512 HS_CMP_HALF(4, r5);
2513 HS_CMP_HALF(5, r6);
2514 HS_CMP_HALF(6, r7);
2515 HS_CMP_HALF(7, r8);
2516 }
2517 {
2518 HS_SLAB_HALF_PREAMBLE(2);
2519 HS_CMP_HALF(0, r1);
2520 HS_CMP_HALF(1, r2);
2521 HS_CMP_HALF(2, r3);
2522 HS_CMP_HALF(3, r4);
2523 HS_CMP_HALF(4, r5);
2524 HS_CMP_HALF(5, r6);
2525 HS_CMP_HALF(6, r7);
2526 HS_CMP_HALF(7, r8);
2527 }
2528 {
2529 HS_SLAB_HALF_PREAMBLE(1);
2530 HS_CMP_HALF(0, r1);
2531 HS_CMP_HALF(1, r2);
2532 HS_CMP_HALF(2, r3);
2533 HS_CMP_HALF(3, r4);
2534 HS_CMP_HALF(4, r5);
2535 HS_CMP_HALF(5, r6);
2536 HS_CMP_HALF(6, r7);
2537 HS_CMP_HALF(7, r8);
2538 }
2539 HS_CMP_XCHG(r1, r5);
2540 HS_CMP_XCHG(r3, r7);
2541 HS_CMP_XCHG(r1, r3);
2542 HS_CMP_XCHG(r5, r7);
2543 HS_CMP_XCHG(r2, r6);
2544 HS_CMP_XCHG(r4, r8);
2545 HS_CMP_XCHG(r2, r4);
2546 HS_CMP_XCHG(r6, r8);
2547 HS_CMP_XCHG(r1, r2);
2548 HS_CMP_XCHG(r3, r4);
2549 HS_CMP_XCHG(r5, r6);
2550 HS_CMP_XCHG(r7, r8);
2551 }
2552 HS_SLAB_GLOBAL_STORE(0, r1);
2553 HS_SLAB_GLOBAL_STORE(1, r2);
2554 HS_SLAB_GLOBAL_STORE(2, r3);
2555 HS_SLAB_GLOBAL_STORE(3, r4);
2556 HS_SLAB_GLOBAL_STORE(4, r5);
2557 HS_SLAB_GLOBAL_STORE(5, r6);
2558 HS_SLAB_GLOBAL_STORE(6, r7);
2559 HS_SLAB_GLOBAL_STORE(7, r8);
2560 }
2561
//
// Kernel emitted through HS_BC_KERNEL_PROTO(1, 0).
//
// Per thread it:
//   1. loads eight keys r1..r8 from `vout` (rows 0..7) via HS_SLAB_GLOBAL_LOAD,
//   2. runs five HS_CMP_HALF stages whose preamble arguments are 16, 8, 4, 2, 1,
//   3. runs a fixed 12-step HS_CMP_XCHG sequence on the eight registers,
//   4. writes r1..r8 back to rows 0..7 via HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): every HS_* macro is defined outside this chunk (the file
// includes ../hs_cuda_macros.h); "BC" presumably stands for "block clean",
// i.e. the merge of an already bitonic slab -- confirm against the macro header.
//
HS_BC_KERNEL_PROTO(1, 0)
{
  HS_SLAB_GLOBAL_PREAMBLE();

  // One key per slab row, held in a named register for the whole kernel.
  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
  {
    // HS_CMP_HALF stages, preamble argument halving 16 -> 1. Each stage is
    // scoped so the names introduced by its preamble do not collide.
    // NOTE(review): presumably cross-lane compare-exchanges at that lane
    // distance -- confirm in HS_SLAB_HALF_PREAMBLE / HS_CMP_HALF.
    {
      HS_SLAB_HALF_PREAMBLE(16);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(8);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(4);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(2);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(1);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    // In-register network: odd registers at index distance 4 then 2, even
    // registers likewise, then every adjacent pair. Order is significant.
    HS_CMP_XCHG(r1, r5);
    HS_CMP_XCHG(r3, r7);
    HS_CMP_XCHG(r1, r3);
    HS_CMP_XCHG(r5, r7);
    HS_CMP_XCHG(r2, r6);
    HS_CMP_XCHG(r4, r8);
    HS_CMP_XCHG(r2, r4);
    HS_CMP_XCHG(r6, r8);
    HS_CMP_XCHG(r1, r2);
    HS_CMP_XCHG(r3, r4);
    HS_CMP_XCHG(r5, r6);
    HS_CMP_XCHG(r7, r8);
  }
  // Write the eight keys back to the rows they were loaded from.
  HS_SLAB_GLOBAL_STORE(0, r1);
  HS_SLAB_GLOBAL_STORE(1, r2);
  HS_SLAB_GLOBAL_STORE(2, r3);
  HS_SLAB_GLOBAL_STORE(3, r4);
  HS_SLAB_GLOBAL_STORE(4, r5);
  HS_SLAB_GLOBAL_STORE(5, r6);
  HS_SLAB_GLOBAL_STORE(6, r7);
  HS_SLAB_GLOBAL_STORE(7, r8);
}
2651
//
// Kernel emitted through HS_BC_KERNEL_PROTO(2, 1).
//
// Phase 1: four scoped groups each load two keys with HS_BC_GLOBAL_LOAD_L
//          (rows k and k+8), compare-exchange them, and park them in the
//          block-local buffer via HS_SLAB_LOCAL_L.
// Phase 2: after HS_BLOCK_BARRIER, each thread pulls eight keys r1..r8 out of
//          the block-local buffer via HS_BX_LOCAL_V, runs five HS_CMP_HALF
//          stages (16, 8, 4, 2, 1) and the 12-step HS_CMP_XCHG sequence, and
//          stores rows 0..7 with HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): all HS_* macros live outside this chunk (../hs_cuda_macros.h);
// HS_BLOCK_LOCAL_MEM_DECL presumably declares the shared-memory scratch buffer
// and HS_BLOCK_BARRIER presumably is a block-wide barrier -- confirm.
//
HS_BC_KERNEL_PROTO(2, 1)
{
  HS_BLOCK_LOCAL_MEM_DECL(64, 8);

  HS_SLAB_GLOBAL_PREAMBLE();
  HS_BC_MERGE_H_PREAMBLE(2);
  {
    // Each group is scoped so r0_1 / r0_2 can be reused as names.
    {
      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
      HS_CMP_XCHG(r0_1, r0_2);
      HS_SLAB_LOCAL_L(0) = r0_1;
      HS_SLAB_LOCAL_L(32) = r0_2;
    }
    {
      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(2);
      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(10);
      HS_CMP_XCHG(r0_1, r0_2);
      HS_SLAB_LOCAL_L(128) = r0_1;
      HS_SLAB_LOCAL_L(160) = r0_2;
    }
    {
      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12);
      HS_CMP_XCHG(r0_1, r0_2);
      HS_SLAB_LOCAL_L(256) = r0_1;
      HS_SLAB_LOCAL_L(288) = r0_2;
    }
    {
      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(6);
      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(14);
      HS_CMP_XCHG(r0_1, r0_2);
      HS_SLAB_LOCAL_L(384) = r0_1;
      HS_SLAB_LOCAL_L(416) = r0_2;
    }
  }
  // Separates the HS_SLAB_LOCAL_L writes above from the HS_BX_LOCAL_V reads
  // below.
  HS_BLOCK_BARRIER();
  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
  {
    // HS_CMP_HALF stages, preamble argument halving 16 -> 1.
    // NOTE(review): presumably cross-lane compare-exchanges -- confirm.
    {
      HS_SLAB_HALF_PREAMBLE(16);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(8);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(4);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(2);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(1);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    // In-register network: odd registers at index distance 4 then 2, even
    // registers likewise, then every adjacent pair. Order is significant.
    HS_CMP_XCHG(r1, r5);
    HS_CMP_XCHG(r3, r7);
    HS_CMP_XCHG(r1, r3);
    HS_CMP_XCHG(r5, r7);
    HS_CMP_XCHG(r2, r6);
    HS_CMP_XCHG(r4, r8);
    HS_CMP_XCHG(r2, r4);
    HS_CMP_XCHG(r6, r8);
    HS_CMP_XCHG(r1, r2);
    HS_CMP_XCHG(r3, r4);
    HS_CMP_XCHG(r5, r6);
    HS_CMP_XCHG(r7, r8);
  }
  HS_SLAB_GLOBAL_STORE(0, r1);
  HS_SLAB_GLOBAL_STORE(1, r2);
  HS_SLAB_GLOBAL_STORE(2, r3);
  HS_SLAB_GLOBAL_STORE(3, r4);
  HS_SLAB_GLOBAL_STORE(4, r5);
  HS_SLAB_GLOBAL_STORE(5, r6);
  HS_SLAB_GLOBAL_STORE(6, r7);
  HS_SLAB_GLOBAL_STORE(7, r8);
}
2775
//
// Kernel emitted through HS_BC_KERNEL_PROTO(4, 2).
//
// Phase 1: two scoped groups each load four keys with HS_BC_GLOBAL_LOAD_L
//          (rows k, k+8, k+16, k+24), run a 4-input compare-exchange network
//          on them, and park them in the block-local buffer via
//          HS_SLAB_LOCAL_L.
// Phase 2: after HS_BLOCK_BARRIER, each thread pulls eight keys r1..r8 out of
//          the block-local buffer via HS_BX_LOCAL_V, runs five HS_CMP_HALF
//          stages (16, 8, 4, 2, 1) and the 12-step HS_CMP_XCHG sequence, and
//          stores rows 0..7 with HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): all HS_* macros live outside this chunk (../hs_cuda_macros.h);
// HS_BLOCK_LOCAL_MEM_DECL presumably declares the shared-memory scratch buffer
// and HS_BLOCK_BARRIER presumably is a block-wide barrier -- confirm.
//
HS_BC_KERNEL_PROTO(4, 2)
{
  HS_BLOCK_LOCAL_MEM_DECL(128, 8);

  HS_SLAB_GLOBAL_PREAMBLE();
  HS_BC_MERGE_H_PREAMBLE(4);
  {
    // Each group is scoped so r0_1..r0_4 can be reused as names.
    {
      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16);
      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24);
      // Index distance 2, then distance 1.
      HS_CMP_XCHG(r0_1, r0_3);
      HS_CMP_XCHG(r0_2, r0_4);
      HS_CMP_XCHG(r0_1, r0_2);
      HS_CMP_XCHG(r0_3, r0_4);
      HS_SLAB_LOCAL_L(0) = r0_1;
      HS_SLAB_LOCAL_L(32) = r0_2;
      HS_SLAB_LOCAL_L(64) = r0_3;
      HS_SLAB_LOCAL_L(96) = r0_4;
    }
    {
      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12);
      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(20);
      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(28);
      HS_CMP_XCHG(r0_1, r0_3);
      HS_CMP_XCHG(r0_2, r0_4);
      HS_CMP_XCHG(r0_1, r0_2);
      HS_CMP_XCHG(r0_3, r0_4);
      HS_SLAB_LOCAL_L(512) = r0_1;
      HS_SLAB_LOCAL_L(544) = r0_2;
      HS_SLAB_LOCAL_L(576) = r0_3;
      HS_SLAB_LOCAL_L(608) = r0_4;
    }
  }
  // Separates the HS_SLAB_LOCAL_L writes above from the HS_BX_LOCAL_V reads
  // below.
  HS_BLOCK_BARRIER();
  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
  {
    // HS_CMP_HALF stages, preamble argument halving 16 -> 1.
    // NOTE(review): presumably cross-lane compare-exchanges -- confirm.
    {
      HS_SLAB_HALF_PREAMBLE(16);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(8);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(4);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(2);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(1);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    // In-register network: odd registers at index distance 4 then 2, even
    // registers likewise, then every adjacent pair. Order is significant.
    HS_CMP_XCHG(r1, r5);
    HS_CMP_XCHG(r3, r7);
    HS_CMP_XCHG(r1, r3);
    HS_CMP_XCHG(r5, r7);
    HS_CMP_XCHG(r2, r6);
    HS_CMP_XCHG(r4, r8);
    HS_CMP_XCHG(r2, r4);
    HS_CMP_XCHG(r6, r8);
    HS_CMP_XCHG(r1, r2);
    HS_CMP_XCHG(r3, r4);
    HS_CMP_XCHG(r5, r6);
    HS_CMP_XCHG(r7, r8);
  }
  HS_SLAB_GLOBAL_STORE(0, r1);
  HS_SLAB_GLOBAL_STORE(1, r2);
  HS_SLAB_GLOBAL_STORE(2, r3);
  HS_SLAB_GLOBAL_STORE(3, r4);
  HS_SLAB_GLOBAL_STORE(4, r5);
  HS_SLAB_GLOBAL_STORE(5, r6);
  HS_SLAB_GLOBAL_STORE(6, r7);
  HS_SLAB_GLOBAL_STORE(7, r8);
}
2899
//
// Kernel emitted through HS_BC_KERNEL_PROTO(8, 3).
//
// Phase 1: loads eight keys with HS_BC_GLOBAL_LOAD_L (rows 0, 8, ..., 56),
//          runs the 12-step compare-exchange sequence on them, and parks them
//          in the block-local buffer via HS_SLAB_LOCAL_L.
// Phase 2: after HS_BLOCK_BARRIER, each thread pulls eight keys r1..r8 out of
//          the block-local buffer via HS_BX_LOCAL_V, runs five HS_CMP_HALF
//          stages (16, 8, 4, 2, 1) and the same 12-step sequence, and stores
//          rows 0..7 with HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): all HS_* macros live outside this chunk (../hs_cuda_macros.h);
// HS_BLOCK_LOCAL_MEM_DECL presumably declares the shared-memory scratch buffer
// and HS_BLOCK_BARRIER presumably is a block-wide barrier -- confirm.
//
HS_BC_KERNEL_PROTO(8, 3)
{
  HS_BLOCK_LOCAL_MEM_DECL(256, 8);

  HS_SLAB_GLOBAL_PREAMBLE();
  HS_BC_MERGE_H_PREAMBLE(8);
  {
    {
      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16);
      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24);
      HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32);
      HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40);
      HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48);
      HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56);
      // Odd keys at index distance 4 then 2, even keys likewise, then every
      // adjacent pair.
      HS_CMP_XCHG(r0_1, r0_5);
      HS_CMP_XCHG(r0_3, r0_7);
      HS_CMP_XCHG(r0_1, r0_3);
      HS_CMP_XCHG(r0_5, r0_7);
      HS_CMP_XCHG(r0_2, r0_6);
      HS_CMP_XCHG(r0_4, r0_8);
      HS_CMP_XCHG(r0_2, r0_4);
      HS_CMP_XCHG(r0_6, r0_8);
      HS_CMP_XCHG(r0_1, r0_2);
      HS_CMP_XCHG(r0_3, r0_4);
      HS_CMP_XCHG(r0_5, r0_6);
      HS_CMP_XCHG(r0_7, r0_8);
      HS_SLAB_LOCAL_L(0) = r0_1;
      HS_SLAB_LOCAL_L(32) = r0_2;
      HS_SLAB_LOCAL_L(64) = r0_3;
      HS_SLAB_LOCAL_L(96) = r0_4;
      HS_SLAB_LOCAL_L(128) = r0_5;
      HS_SLAB_LOCAL_L(160) = r0_6;
      HS_SLAB_LOCAL_L(192) = r0_7;
      HS_SLAB_LOCAL_L(224) = r0_8;
    }
  }
  // Separates the HS_SLAB_LOCAL_L writes above from the HS_BX_LOCAL_V reads
  // below.
  HS_BLOCK_BARRIER();
  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
  {
    // HS_CMP_HALF stages, preamble argument halving 16 -> 1.
    // NOTE(review): presumably cross-lane compare-exchanges -- confirm.
    {
      HS_SLAB_HALF_PREAMBLE(16);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(8);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(4);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(2);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(1);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    // In-register network: odd registers at index distance 4 then 2, even
    // registers likewise, then every adjacent pair. Order is significant.
    HS_CMP_XCHG(r1, r5);
    HS_CMP_XCHG(r3, r7);
    HS_CMP_XCHG(r1, r3);
    HS_CMP_XCHG(r5, r7);
    HS_CMP_XCHG(r2, r6);
    HS_CMP_XCHG(r4, r8);
    HS_CMP_XCHG(r2, r4);
    HS_CMP_XCHG(r6, r8);
    HS_CMP_XCHG(r1, r2);
    HS_CMP_XCHG(r3, r4);
    HS_CMP_XCHG(r5, r6);
    HS_CMP_XCHG(r7, r8);
  }
  HS_SLAB_GLOBAL_STORE(0, r1);
  HS_SLAB_GLOBAL_STORE(1, r2);
  HS_SLAB_GLOBAL_STORE(2, r3);
  HS_SLAB_GLOBAL_STORE(3, r4);
  HS_SLAB_GLOBAL_STORE(4, r5);
  HS_SLAB_GLOBAL_STORE(5, r6);
  HS_SLAB_GLOBAL_STORE(6, r7);
  HS_SLAB_GLOBAL_STORE(7, r8);
}
3025
//
// Kernel emitted through HS_BC_KERNEL_PROTO(16, 4).
//
// Phase 1: only threads for which HS_WARP_ID_X() < 8 take part. Each loads
//          sixteen keys with HS_BC_GLOBAL_LOAD_L (rows 0, 8, ..., 120), runs a
//          32-step compare-exchange sequence on them, and parks them in the
//          block-local buffer via HS_SLAB_LOCAL_L.
// Phase 2: after HS_BLOCK_BARRIER (placed outside the conditional, so every
//          thread reaches it), each thread pulls eight keys r1..r8 out of the
//          block-local buffer via HS_BX_LOCAL_V, runs five HS_CMP_HALF stages
//          (16, 8, 4, 2, 1) and the 12-step HS_CMP_XCHG sequence, and stores
//          rows 0..7 with HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): all HS_* macros live outside this chunk (../hs_cuda_macros.h);
// HS_BLOCK_LOCAL_MEM_DECL presumably declares the shared-memory scratch buffer
// and HS_BLOCK_BARRIER presumably is a block-wide barrier -- confirm.
//
HS_BC_KERNEL_PROTO(16, 4)
{
  HS_BLOCK_LOCAL_MEM_DECL(512, 8);

  HS_SLAB_GLOBAL_PREAMBLE();
  HS_BC_MERGE_H_PREAMBLE(16);
  if (HS_WARP_ID_X() < 8) {
    {
      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16);
      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24);
      HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32);
      HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40);
      HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48);
      HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56);
      HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(64);
      HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(72);
      HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(80);
      HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(88);
      HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(96);
      HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(104);
      HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(112);
      HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(120);
      // Odd keys: index distance 8, 4, then 2.
      HS_CMP_XCHG(r0_1, r0_9);
      HS_CMP_XCHG(r0_5, r0_13);
      HS_CMP_XCHG(r0_1, r0_5);
      HS_CMP_XCHG(r0_9, r0_13);
      HS_CMP_XCHG(r0_3, r0_11);
      HS_CMP_XCHG(r0_7, r0_15);
      HS_CMP_XCHG(r0_3, r0_7);
      HS_CMP_XCHG(r0_11, r0_15);
      HS_CMP_XCHG(r0_1, r0_3);
      HS_CMP_XCHG(r0_5, r0_7);
      HS_CMP_XCHG(r0_9, r0_11);
      HS_CMP_XCHG(r0_13, r0_15);
      // Even keys: index distance 8, 4, then 2.
      HS_CMP_XCHG(r0_2, r0_10);
      HS_CMP_XCHG(r0_6, r0_14);
      HS_CMP_XCHG(r0_2, r0_6);
      HS_CMP_XCHG(r0_10, r0_14);
      HS_CMP_XCHG(r0_4, r0_12);
      HS_CMP_XCHG(r0_8, r0_16);
      HS_CMP_XCHG(r0_4, r0_8);
      HS_CMP_XCHG(r0_12, r0_16);
      HS_CMP_XCHG(r0_2, r0_4);
      HS_CMP_XCHG(r0_6, r0_8);
      HS_CMP_XCHG(r0_10, r0_12);
      HS_CMP_XCHG(r0_14, r0_16);
      // Every adjacent pair.
      HS_CMP_XCHG(r0_1, r0_2);
      HS_CMP_XCHG(r0_3, r0_4);
      HS_CMP_XCHG(r0_5, r0_6);
      HS_CMP_XCHG(r0_7, r0_8);
      HS_CMP_XCHG(r0_9, r0_10);
      HS_CMP_XCHG(r0_11, r0_12);
      HS_CMP_XCHG(r0_13, r0_14);
      HS_CMP_XCHG(r0_15, r0_16);
      HS_SLAB_LOCAL_L(0) = r0_1;
      HS_SLAB_LOCAL_L(32) = r0_2;
      HS_SLAB_LOCAL_L(64) = r0_3;
      HS_SLAB_LOCAL_L(96) = r0_4;
      HS_SLAB_LOCAL_L(128) = r0_5;
      HS_SLAB_LOCAL_L(160) = r0_6;
      HS_SLAB_LOCAL_L(192) = r0_7;
      HS_SLAB_LOCAL_L(224) = r0_8;
      HS_SLAB_LOCAL_L(256) = r0_9;
      HS_SLAB_LOCAL_L(288) = r0_10;
      HS_SLAB_LOCAL_L(320) = r0_11;
      HS_SLAB_LOCAL_L(352) = r0_12;
      HS_SLAB_LOCAL_L(384) = r0_13;
      HS_SLAB_LOCAL_L(416) = r0_14;
      HS_SLAB_LOCAL_L(448) = r0_15;
      HS_SLAB_LOCAL_L(480) = r0_16;
    }
  }
  // Kept outside the conditional above so that all threads execute it; it
  // separates the HS_SLAB_LOCAL_L writes from the HS_BX_LOCAL_V reads below.
  HS_BLOCK_BARRIER();
  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
  {
    // HS_CMP_HALF stages, preamble argument halving 16 -> 1.
    // NOTE(review): presumably cross-lane compare-exchanges -- confirm.
    {
      HS_SLAB_HALF_PREAMBLE(16);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(8);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(4);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(2);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    {
      HS_SLAB_HALF_PREAMBLE(1);
      HS_CMP_HALF(0, r1);
      HS_CMP_HALF(1, r2);
      HS_CMP_HALF(2, r3);
      HS_CMP_HALF(3, r4);
      HS_CMP_HALF(4, r5);
      HS_CMP_HALF(5, r6);
      HS_CMP_HALF(6, r7);
      HS_CMP_HALF(7, r8);
    }
    // In-register network: odd registers at index distance 4 then 2, even
    // registers likewise, then every adjacent pair. Order is significant.
    HS_CMP_XCHG(r1, r5);
    HS_CMP_XCHG(r3, r7);
    HS_CMP_XCHG(r1, r3);
    HS_CMP_XCHG(r5, r7);
    HS_CMP_XCHG(r2, r6);
    HS_CMP_XCHG(r4, r8);
    HS_CMP_XCHG(r2, r4);
    HS_CMP_XCHG(r6, r8);
    HS_CMP_XCHG(r1, r2);
    HS_CMP_XCHG(r3, r4);
    HS_CMP_XCHG(r5, r6);
    HS_CMP_XCHG(r7, r8);
  }
  HS_SLAB_GLOBAL_STORE(0, r1);
  HS_SLAB_GLOBAL_STORE(1, r2);
  HS_SLAB_GLOBAL_STORE(2, r3);
  HS_SLAB_GLOBAL_STORE(3, r4);
  HS_SLAB_GLOBAL_STORE(4, r5);
  HS_SLAB_GLOBAL_STORE(5, r6);
  HS_SLAB_GLOBAL_STORE(6, r7);
  HS_SLAB_GLOBAL_STORE(7, r8);
}
3187
//
// Kernel emitted through HS_OFFSET_FM_KERNEL_PROTO(0, 0).
//
// Loads eight "left" keys r1..r8 (HS_XM_GLOBAL_LOAD_L rows 0..7) and one
// "right" key r9 (HS_FM_GLOBAL_LOAD_R row 0), compare-exchanges r8 with r9,
// runs the 12-step HS_CMP_XCHG sequence on r1..r8, and stores all nine keys
// back through the matching store macros.
//
// NOTE(review): the HS_* macros are defined outside this chunk
// (../hs_cuda_macros.h); "FM" presumably stands for "flip merge" -- confirm.
//
HS_OFFSET_FM_KERNEL_PROTO(0, 0)
{
  HS_OFFSET_FM_PREAMBLE(8);
  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
  HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);

  // Last left key against the single right key.
  HS_CMP_XCHG(r8, r9);

  // Left side: odd registers at index distance 4 then 2, even registers
  // likewise, then every adjacent pair.
  HS_CMP_XCHG(r1, r5);
  HS_CMP_XCHG(r3, r7);
  HS_CMP_XCHG(r1, r3);
  HS_CMP_XCHG(r5, r7);
  HS_CMP_XCHG(r2, r6);
  HS_CMP_XCHG(r4, r8);
  HS_CMP_XCHG(r2, r4);
  HS_CMP_XCHG(r6, r8);
  HS_CMP_XCHG(r1, r2);
  HS_CMP_XCHG(r3, r4);
  HS_CMP_XCHG(r5, r6);
  HS_CMP_XCHG(r7, r8);

  HS_XM_GLOBAL_STORE_L(0, r1);
  HS_XM_GLOBAL_STORE_L(1, r2);
  HS_XM_GLOBAL_STORE_L(2, r3);
  HS_XM_GLOBAL_STORE_L(3, r4);
  HS_XM_GLOBAL_STORE_L(4, r5);
  HS_XM_GLOBAL_STORE_L(5, r6);
  HS_XM_GLOBAL_STORE_L(6, r7);
  HS_XM_GLOBAL_STORE_L(7, r8);
  HS_FM_GLOBAL_STORE_R(0, r9);
}
3223
//
// Kernel emitted through HS_OFFSET_FM_KERNEL_PROTO(0, 1).
//
// Loads eight "left" keys r1..r8 (HS_XM_GLOBAL_LOAD_L rows 0..7) and two
// "right" keys r9..r10 (HS_FM_GLOBAL_LOAD_R rows 0..1). It first
// compare-exchanges mirrored pairs (r8,r9) and (r7,r10), then runs the 12-step
// sequence on the left keys and a single compare-exchange on the right keys,
// and finally stores all ten keys back.
//
// NOTE(review): the HS_* macros are defined outside this chunk
// (../hs_cuda_macros.h); "FM" presumably stands for "flip merge" -- confirm.
//
HS_OFFSET_FM_KERNEL_PROTO(0, 1)
{
  HS_OFFSET_FM_PREAMBLE(8);
  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
  HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
  HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);

  // Mirrored left/right pairs: the k-th right key meets the k-th left key
  // counted from the end.
  HS_CMP_XCHG(r8, r9);
  HS_CMP_XCHG(r7, r10);

  // Left side: odd registers at index distance 4 then 2, even registers
  // likewise, then every adjacent pair.
  HS_CMP_XCHG(r1, r5);
  HS_CMP_XCHG(r3, r7);
  HS_CMP_XCHG(r1, r3);
  HS_CMP_XCHG(r5, r7);
  HS_CMP_XCHG(r2, r6);
  HS_CMP_XCHG(r4, r8);
  HS_CMP_XCHG(r2, r4);
  HS_CMP_XCHG(r6, r8);
  HS_CMP_XCHG(r1, r2);
  HS_CMP_XCHG(r3, r4);
  HS_CMP_XCHG(r5, r6);
  HS_CMP_XCHG(r7, r8);

  // Right side: only two keys, so one compare-exchange.
  HS_CMP_XCHG(r9, r10);

  HS_XM_GLOBAL_STORE_L(0, r1);
  HS_XM_GLOBAL_STORE_L(1, r2);
  HS_XM_GLOBAL_STORE_L(2, r3);
  HS_XM_GLOBAL_STORE_L(3, r4);
  HS_XM_GLOBAL_STORE_L(4, r5);
  HS_XM_GLOBAL_STORE_L(5, r6);
  HS_XM_GLOBAL_STORE_L(6, r7);
  HS_XM_GLOBAL_STORE_L(7, r8);
  HS_FM_GLOBAL_STORE_R(0, r9);
  HS_FM_GLOBAL_STORE_R(1, r10);
}
3263
//
// Kernel emitted through HS_OFFSET_FM_KERNEL_PROTO(0, 2).
//
// Loads eight "left" keys r1..r8 (HS_XM_GLOBAL_LOAD_L rows 0..7) and four
// "right" keys r9..r12 (HS_FM_GLOBAL_LOAD_R rows 0..3). It first
// compare-exchanges the mirrored pairs (r8,r9), (r7,r10), (r6,r11), (r5,r12),
// then runs the 12-step sequence on the left keys and a 4-input sequence on
// the right keys, and finally stores all twelve keys back.
//
// NOTE(review): the HS_* macros are defined outside this chunk
// (../hs_cuda_macros.h); "FM" presumably stands for "flip merge" -- confirm.
//
HS_OFFSET_FM_KERNEL_PROTO(0, 2)
{
  HS_OFFSET_FM_PREAMBLE(8);
  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
  HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
  HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
  HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2);
  HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3);

  // Mirrored left/right pairs: the k-th right key meets the k-th left key
  // counted from the end.
  HS_CMP_XCHG(r8, r9);
  HS_CMP_XCHG(r7, r10);
  HS_CMP_XCHG(r6, r11);
  HS_CMP_XCHG(r5, r12);

  // Left side: odd registers at index distance 4 then 2, even registers
  // likewise, then every adjacent pair.
  HS_CMP_XCHG(r1, r5);
  HS_CMP_XCHG(r3, r7);
  HS_CMP_XCHG(r1, r3);
  HS_CMP_XCHG(r5, r7);
  HS_CMP_XCHG(r2, r6);
  HS_CMP_XCHG(r4, r8);
  HS_CMP_XCHG(r2, r4);
  HS_CMP_XCHG(r6, r8);
  HS_CMP_XCHG(r1, r2);
  HS_CMP_XCHG(r3, r4);
  HS_CMP_XCHG(r5, r6);
  HS_CMP_XCHG(r7, r8);

  // Right side: index distance 2, then adjacent pairs.
  HS_CMP_XCHG(r9, r11);
  HS_CMP_XCHG(r10, r12);
  HS_CMP_XCHG(r9, r10);
  HS_CMP_XCHG(r11, r12);

  HS_XM_GLOBAL_STORE_L(0, r1);
  HS_XM_GLOBAL_STORE_L(1, r2);
  HS_XM_GLOBAL_STORE_L(2, r3);
  HS_XM_GLOBAL_STORE_L(3, r4);
  HS_XM_GLOBAL_STORE_L(4, r5);
  HS_XM_GLOBAL_STORE_L(5, r6);
  HS_XM_GLOBAL_STORE_L(6, r7);
  HS_XM_GLOBAL_STORE_L(7, r8);
  HS_FM_GLOBAL_STORE_R(0, r9);
  HS_FM_GLOBAL_STORE_R(1, r10);
  HS_FM_GLOBAL_STORE_R(2, r11);
  HS_FM_GLOBAL_STORE_R(3, r12);
}
3312
//
// Kernel emitted through HS_FM_KERNEL_PROTO(0, 3).
//
// Loads eight "left" keys r1..r8 (HS_XM_GLOBAL_LOAD_L rows 0..7) and eight
// "right" keys r9..r16 (HS_FM_GLOBAL_LOAD_R rows 0..7). It first
// compare-exchanges the eight mirrored pairs (r8,r9) ... (r1,r16), then runs
// the same 12-step sequence independently on the left and on the right keys,
// and finally stores all sixteen keys back.
//
// NOTE(review): the HS_* macros are defined outside this chunk
// (../hs_cuda_macros.h); "FM" presumably stands for "flip merge" -- confirm.
// Unlike the three preceding kernels this one uses the non-OFFSET prototype
// and preamble macros; the difference between them is not visible here.
//
HS_FM_KERNEL_PROTO(0, 3)
{
  HS_FM_PREAMBLE(8);
  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
  HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
  HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
  HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2);
  HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3);
  HS_KEY_TYPE r13 = HS_FM_GLOBAL_LOAD_R(4);
  HS_KEY_TYPE r14 = HS_FM_GLOBAL_LOAD_R(5);
  HS_KEY_TYPE r15 = HS_FM_GLOBAL_LOAD_R(6);
  HS_KEY_TYPE r16 = HS_FM_GLOBAL_LOAD_R(7);

  // Mirrored left/right pairs: the k-th right key meets the k-th left key
  // counted from the end.
  HS_CMP_XCHG(r8, r9);
  HS_CMP_XCHG(r7, r10);
  HS_CMP_XCHG(r6, r11);
  HS_CMP_XCHG(r5, r12);
  HS_CMP_XCHG(r4, r13);
  HS_CMP_XCHG(r3, r14);
  HS_CMP_XCHG(r2, r15);
  HS_CMP_XCHG(r1, r16);

  // Left side: odd registers at index distance 4 then 2, even registers
  // likewise, then every adjacent pair.
  HS_CMP_XCHG(r1, r5);
  HS_CMP_XCHG(r3, r7);
  HS_CMP_XCHG(r1, r3);
  HS_CMP_XCHG(r5, r7);
  HS_CMP_XCHG(r2, r6);
  HS_CMP_XCHG(r4, r8);
  HS_CMP_XCHG(r2, r4);
  HS_CMP_XCHG(r6, r8);
  HS_CMP_XCHG(r1, r2);
  HS_CMP_XCHG(r3, r4);
  HS_CMP_XCHG(r5, r6);
  HS_CMP_XCHG(r7, r8);

  // Right side: the same pattern shifted by eight registers.
  HS_CMP_XCHG(r9, r13);
  HS_CMP_XCHG(r11, r15);
  HS_CMP_XCHG(r9, r11);
  HS_CMP_XCHG(r13, r15);
  HS_CMP_XCHG(r10, r14);
  HS_CMP_XCHG(r12, r16);
  HS_CMP_XCHG(r10, r12);
  HS_CMP_XCHG(r14, r16);
  HS_CMP_XCHG(r9, r10);
  HS_CMP_XCHG(r11, r12);
  HS_CMP_XCHG(r13, r14);
  HS_CMP_XCHG(r15, r16);

  HS_XM_GLOBAL_STORE_L(0, r1);
  HS_XM_GLOBAL_STORE_L(1, r2);
  HS_XM_GLOBAL_STORE_L(2, r3);
  HS_XM_GLOBAL_STORE_L(3, r4);
  HS_XM_GLOBAL_STORE_L(4, r5);
  HS_XM_GLOBAL_STORE_L(5, r6);
  HS_XM_GLOBAL_STORE_L(6, r7);
  HS_XM_GLOBAL_STORE_L(7, r8);
  HS_FM_GLOBAL_STORE_R(0, r9);
  HS_FM_GLOBAL_STORE_R(1, r10);
  HS_FM_GLOBAL_STORE_R(2, r11);
  HS_FM_GLOBAL_STORE_R(3, r12);
  HS_FM_GLOBAL_STORE_R(4, r13);
  HS_FM_GLOBAL_STORE_R(5, r14);
  HS_FM_GLOBAL_STORE_R(6, r15);
  HS_FM_GLOBAL_STORE_R(7, r16);
}
3381
//
// Kernel emitted through HS_HM_KERNEL_PROTO(0).
//
// Loads sixteen keys r1..r16 (HS_XM_GLOBAL_LOAD_L rows 0..15), runs a 32-step
// HS_CMP_XCHG sequence on them, and stores them back to the same rows with
// HS_XM_GLOBAL_STORE_L. There is no mirrored left/right step in this kernel.
//
// NOTE(review): the HS_* macros are defined outside this chunk
// (../hs_cuda_macros.h); "HM" presumably stands for "half merge" -- confirm.
//
HS_HM_KERNEL_PROTO(0)
{
  HS_HM_PREAMBLE(8);
  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
  HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
  HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
  HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
  HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
  HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
  HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
  HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
  HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);

  // Odd registers: index distance 8, 4, then 2.
  HS_CMP_XCHG(r1, r9);
  HS_CMP_XCHG(r5, r13);
  HS_CMP_XCHG(r1, r5);
  HS_CMP_XCHG(r9, r13);
  HS_CMP_XCHG(r3, r11);
  HS_CMP_XCHG(r7, r15);
  HS_CMP_XCHG(r3, r7);
  HS_CMP_XCHG(r11, r15);
  HS_CMP_XCHG(r1, r3);
  HS_CMP_XCHG(r5, r7);
  HS_CMP_XCHG(r9, r11);
  HS_CMP_XCHG(r13, r15);

  // Even registers: index distance 8, 4, then 2.
  HS_CMP_XCHG(r2, r10);
  HS_CMP_XCHG(r6, r14);
  HS_CMP_XCHG(r2, r6);
  HS_CMP_XCHG(r10, r14);
  HS_CMP_XCHG(r4, r12);
  HS_CMP_XCHG(r8, r16);
  HS_CMP_XCHG(r4, r8);
  HS_CMP_XCHG(r12, r16);
  HS_CMP_XCHG(r2, r4);
  HS_CMP_XCHG(r6, r8);
  HS_CMP_XCHG(r10, r12);
  HS_CMP_XCHG(r14, r16);

  // Every adjacent pair.
  HS_CMP_XCHG(r1, r2);
  HS_CMP_XCHG(r3, r4);
  HS_CMP_XCHG(r5, r6);
  HS_CMP_XCHG(r7, r8);
  HS_CMP_XCHG(r9, r10);
  HS_CMP_XCHG(r11, r12);
  HS_CMP_XCHG(r13, r14);
  HS_CMP_XCHG(r15, r16);

  HS_XM_GLOBAL_STORE_L(0, r1);
  HS_XM_GLOBAL_STORE_L(1, r2);
  HS_XM_GLOBAL_STORE_L(2, r3);
  HS_XM_GLOBAL_STORE_L(3, r4);
  HS_XM_GLOBAL_STORE_L(4, r5);
  HS_XM_GLOBAL_STORE_L(5, r6);
  HS_XM_GLOBAL_STORE_L(6, r7);
  HS_XM_GLOBAL_STORE_L(7, r8);
  HS_XM_GLOBAL_STORE_L(8, r9);
  HS_XM_GLOBAL_STORE_L(9, r10);
  HS_XM_GLOBAL_STORE_L(10, r11);
  HS_XM_GLOBAL_STORE_L(11, r12);
  HS_XM_GLOBAL_STORE_L(12, r13);
  HS_XM_GLOBAL_STORE_L(13, r14);
  HS_XM_GLOBAL_STORE_L(14, r15);
  HS_XM_GLOBAL_STORE_L(15, r16);
}
3450
//
// Kernel emitted through HS_TRANSPOSE_KERNEL_PROTO().
//
// Loads eight keys r1..r8 from `vout` (rows 0..7) via HS_SLAB_GLOBAL_LOAD and
// then invokes HS_TRANSPOSE_SLAB(). No explicit store appears in this body.
//
// NOTE(review): HS_TRANSPOSE_SLAB takes no arguments, so it presumably refers
// to r1..r8 by name and performs the write-back itself; the register names
// must therefore stay exactly as they are -- confirm in the macro definition.
//
HS_TRANSPOSE_KERNEL_PROTO()
{
  HS_SLAB_GLOBAL_PREAMBLE();
  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
  HS_TRANSPOSE_SLAB();
}
3464
3465 //
3466 //
3467 //
3468
3469 #include "../../hs_cuda.inl"
3470
3471 //
3472 //
3473 //
3474