• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright 2016 Google Inc.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file.
6 //
7 
8 #ifdef __cplusplus
9 extern "C"
10 {
11 #endif
12 
13 #include "hs_cuda.h"
14 
15 #ifdef __cplusplus
16 }
17 #endif
18 
19 #include "hs_cuda_config.h"
20 
21 #include "../hs_cuda_macros.h"
22 
23 //
24 //
25 //
26 
// Block-sort kernel, variant (1, 0).
//
// Each thread loads eight keys into registers r1..r8, pushes them through a
// fixed sequence of compare-exchange (HS_CMP_XCHG), flip (HS_CMP_FLIP) and
// half (HS_CMP_HALF) stages, and stores the eight registers back. This
// variant declares no block-local memory and issues no HS_BLOCK_BARRIER.
//
// NOTE(review): every HS_* macro is defined in hs_cuda_macros.h /
// hs_cuda_config.h, which are not visible here. The stage descriptions below
// are read off the macro names and arguments only — confirm against those
// headers. The statement order is generated and must not be changed.
27 HS_OFFSET_BS_KERNEL_PROTO(1, 0)
28 {
29   HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
     // Load eight keys from vin (indices 0..7), one per register.
30   HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
31   HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
32   HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
33   HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
34   HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
35   HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
36   HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
37   HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
     // 19 in-register compare-exchanges over r1..r8.
     // Presumably an 8-input sorting network — TODO confirm.
38   HS_CMP_XCHG(r1, r5);
39   HS_CMP_XCHG(r2, r6);
40   HS_CMP_XCHG(r3, r7);
41   HS_CMP_XCHG(r4, r8);
42   HS_CMP_XCHG(r1, r3);
43   HS_CMP_XCHG(r2, r4);
44   HS_CMP_XCHG(r5, r7);
45   HS_CMP_XCHG(r6, r8);
46   HS_CMP_XCHG(r3, r5);
47   HS_CMP_XCHG(r4, r6);
48   HS_CMP_XCHG(r1, r2);
49   HS_CMP_XCHG(r3, r4);
50   HS_CMP_XCHG(r5, r6);
51   HS_CMP_XCHG(r7, r8);
52   HS_CMP_XCHG(r2, r5);
53   HS_CMP_XCHG(r4, r7);
54   HS_CMP_XCHG(r2, r3);
55   HS_CMP_XCHG(r4, r5);
56   HS_CMP_XCHG(r6, r7);
     // Flip stage, preamble argument 1. Registers are paired from the
     // outside in: r1/r8, r2/r7, r3/r6, r4/r5.
57   {
58     HS_SLAB_FLIP_PREAMBLE(1);
59     HS_CMP_FLIP(0, r1, r8);
60     HS_CMP_FLIP(1, r2, r7);
61     HS_CMP_FLIP(2, r3, r6);
62     HS_CMP_FLIP(3, r4, r5);
63   }
     // 12 compare-exchanges: odd registers (r1,r3,r5,r7) among themselves,
     // even registers (r2,r4,r6,r8) among themselves, then adjacent pairs.
     // This same pattern closes every flip/half round below.
64   HS_CMP_XCHG(r1, r5);
65   HS_CMP_XCHG(r3, r7);
66   HS_CMP_XCHG(r1, r3);
67   HS_CMP_XCHG(r5, r7);
68   HS_CMP_XCHG(r2, r6);
69   HS_CMP_XCHG(r4, r8);
70   HS_CMP_XCHG(r2, r4);
71   HS_CMP_XCHG(r6, r8);
72   HS_CMP_XCHG(r1, r2);
73   HS_CMP_XCHG(r3, r4);
74   HS_CMP_XCHG(r5, r6);
75   HS_CMP_XCHG(r7, r8);
     // Round: flip (preamble 3), then half (preamble 1).
76   {
77     HS_SLAB_FLIP_PREAMBLE(3);
78     HS_CMP_FLIP(0, r1, r8);
79     HS_CMP_FLIP(1, r2, r7);
80     HS_CMP_FLIP(2, r3, r6);
81     HS_CMP_FLIP(3, r4, r5);
82   }
83   {
84     HS_SLAB_HALF_PREAMBLE(1);
85     HS_CMP_HALF(0, r1);
86     HS_CMP_HALF(1, r2);
87     HS_CMP_HALF(2, r3);
88     HS_CMP_HALF(3, r4);
89     HS_CMP_HALF(4, r5);
90     HS_CMP_HALF(5, r6);
91     HS_CMP_HALF(6, r7);
92     HS_CMP_HALF(7, r8);
93   }
     // Closing 12 compare-exchanges (same pattern as above).
94   HS_CMP_XCHG(r1, r5);
95   HS_CMP_XCHG(r3, r7);
96   HS_CMP_XCHG(r1, r3);
97   HS_CMP_XCHG(r5, r7);
98   HS_CMP_XCHG(r2, r6);
99   HS_CMP_XCHG(r4, r8);
100   HS_CMP_XCHG(r2, r4);
101   HS_CMP_XCHG(r6, r8);
102   HS_CMP_XCHG(r1, r2);
103   HS_CMP_XCHG(r3, r4);
104   HS_CMP_XCHG(r5, r6);
105   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 7), then halves with preambles 2, 1.
106   {
107     HS_SLAB_FLIP_PREAMBLE(7);
108     HS_CMP_FLIP(0, r1, r8);
109     HS_CMP_FLIP(1, r2, r7);
110     HS_CMP_FLIP(2, r3, r6);
111     HS_CMP_FLIP(3, r4, r5);
112   }
113   {
114     HS_SLAB_HALF_PREAMBLE(2);
115     HS_CMP_HALF(0, r1);
116     HS_CMP_HALF(1, r2);
117     HS_CMP_HALF(2, r3);
118     HS_CMP_HALF(3, r4);
119     HS_CMP_HALF(4, r5);
120     HS_CMP_HALF(5, r6);
121     HS_CMP_HALF(6, r7);
122     HS_CMP_HALF(7, r8);
123   }
124   {
125     HS_SLAB_HALF_PREAMBLE(1);
126     HS_CMP_HALF(0, r1);
127     HS_CMP_HALF(1, r2);
128     HS_CMP_HALF(2, r3);
129     HS_CMP_HALF(3, r4);
130     HS_CMP_HALF(4, r5);
131     HS_CMP_HALF(5, r6);
132     HS_CMP_HALF(6, r7);
133     HS_CMP_HALF(7, r8);
134   }
      // Closing 12 compare-exchanges.
135   HS_CMP_XCHG(r1, r5);
136   HS_CMP_XCHG(r3, r7);
137   HS_CMP_XCHG(r1, r3);
138   HS_CMP_XCHG(r5, r7);
139   HS_CMP_XCHG(r2, r6);
140   HS_CMP_XCHG(r4, r8);
141   HS_CMP_XCHG(r2, r4);
142   HS_CMP_XCHG(r6, r8);
143   HS_CMP_XCHG(r1, r2);
144   HS_CMP_XCHG(r3, r4);
145   HS_CMP_XCHG(r5, r6);
146   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 15), then halves with preambles 4, 2, 1.
147   {
148     HS_SLAB_FLIP_PREAMBLE(15);
149     HS_CMP_FLIP(0, r1, r8);
150     HS_CMP_FLIP(1, r2, r7);
151     HS_CMP_FLIP(2, r3, r6);
152     HS_CMP_FLIP(3, r4, r5);
153   }
154   {
155     HS_SLAB_HALF_PREAMBLE(4);
156     HS_CMP_HALF(0, r1);
157     HS_CMP_HALF(1, r2);
158     HS_CMP_HALF(2, r3);
159     HS_CMP_HALF(3, r4);
160     HS_CMP_HALF(4, r5);
161     HS_CMP_HALF(5, r6);
162     HS_CMP_HALF(6, r7);
163     HS_CMP_HALF(7, r8);
164   }
165   {
166     HS_SLAB_HALF_PREAMBLE(2);
167     HS_CMP_HALF(0, r1);
168     HS_CMP_HALF(1, r2);
169     HS_CMP_HALF(2, r3);
170     HS_CMP_HALF(3, r4);
171     HS_CMP_HALF(4, r5);
172     HS_CMP_HALF(5, r6);
173     HS_CMP_HALF(6, r7);
174     HS_CMP_HALF(7, r8);
175   }
176   {
177     HS_SLAB_HALF_PREAMBLE(1);
178     HS_CMP_HALF(0, r1);
179     HS_CMP_HALF(1, r2);
180     HS_CMP_HALF(2, r3);
181     HS_CMP_HALF(3, r4);
182     HS_CMP_HALF(4, r5);
183     HS_CMP_HALF(5, r6);
184     HS_CMP_HALF(6, r7);
185     HS_CMP_HALF(7, r8);
186   }
      // Closing 12 compare-exchanges.
187   HS_CMP_XCHG(r1, r5);
188   HS_CMP_XCHG(r3, r7);
189   HS_CMP_XCHG(r1, r3);
190   HS_CMP_XCHG(r5, r7);
191   HS_CMP_XCHG(r2, r6);
192   HS_CMP_XCHG(r4, r8);
193   HS_CMP_XCHG(r2, r4);
194   HS_CMP_XCHG(r6, r8);
195   HS_CMP_XCHG(r1, r2);
196   HS_CMP_XCHG(r3, r4);
197   HS_CMP_XCHG(r5, r6);
198   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 31), then halves with preambles 8, 4, 2, 1.
199   {
200     HS_SLAB_FLIP_PREAMBLE(31);
201     HS_CMP_FLIP(0, r1, r8);
202     HS_CMP_FLIP(1, r2, r7);
203     HS_CMP_FLIP(2, r3, r6);
204     HS_CMP_FLIP(3, r4, r5);
205   }
206   {
207     HS_SLAB_HALF_PREAMBLE(8);
208     HS_CMP_HALF(0, r1);
209     HS_CMP_HALF(1, r2);
210     HS_CMP_HALF(2, r3);
211     HS_CMP_HALF(3, r4);
212     HS_CMP_HALF(4, r5);
213     HS_CMP_HALF(5, r6);
214     HS_CMP_HALF(6, r7);
215     HS_CMP_HALF(7, r8);
216   }
217   {
218     HS_SLAB_HALF_PREAMBLE(4);
219     HS_CMP_HALF(0, r1);
220     HS_CMP_HALF(1, r2);
221     HS_CMP_HALF(2, r3);
222     HS_CMP_HALF(3, r4);
223     HS_CMP_HALF(4, r5);
224     HS_CMP_HALF(5, r6);
225     HS_CMP_HALF(6, r7);
226     HS_CMP_HALF(7, r8);
227   }
228   {
229     HS_SLAB_HALF_PREAMBLE(2);
230     HS_CMP_HALF(0, r1);
231     HS_CMP_HALF(1, r2);
232     HS_CMP_HALF(2, r3);
233     HS_CMP_HALF(3, r4);
234     HS_CMP_HALF(4, r5);
235     HS_CMP_HALF(5, r6);
236     HS_CMP_HALF(6, r7);
237     HS_CMP_HALF(7, r8);
238   }
239   {
240     HS_SLAB_HALF_PREAMBLE(1);
241     HS_CMP_HALF(0, r1);
242     HS_CMP_HALF(1, r2);
243     HS_CMP_HALF(2, r3);
244     HS_CMP_HALF(3, r4);
245     HS_CMP_HALF(4, r5);
246     HS_CMP_HALF(5, r6);
247     HS_CMP_HALF(6, r7);
248     HS_CMP_HALF(7, r8);
249   }
      // Final closing 12 compare-exchanges.
250   HS_CMP_XCHG(r1, r5);
251   HS_CMP_XCHG(r3, r7);
252   HS_CMP_XCHG(r1, r3);
253   HS_CMP_XCHG(r5, r7);
254   HS_CMP_XCHG(r2, r6);
255   HS_CMP_XCHG(r4, r8);
256   HS_CMP_XCHG(r2, r4);
257   HS_CMP_XCHG(r6, r8);
258   HS_CMP_XCHG(r1, r2);
259   HS_CMP_XCHG(r3, r4);
260   HS_CMP_XCHG(r5, r6);
261   HS_CMP_XCHG(r7, r8);
      // Store the eight registers back at indices 0..7.
262   HS_SLAB_GLOBAL_STORE(0, r1);
263   HS_SLAB_GLOBAL_STORE(1, r2);
264   HS_SLAB_GLOBAL_STORE(2, r3);
265   HS_SLAB_GLOBAL_STORE(3, r4);
266   HS_SLAB_GLOBAL_STORE(4, r5);
267   HS_SLAB_GLOBAL_STORE(5, r6);
268   HS_SLAB_GLOBAL_STORE(6, r7);
269   HS_SLAB_GLOBAL_STORE(7, r8);
270 }
271 
// Block-sort kernel, variant (2, 1).
//
// Runs the same register-level sequence as the (1, 0) variant (load eight
// keys, 19 compare-exchanges, flip/half rounds with flip preambles
// 1, 3, 7, 15, 31), then performs one merge step through block-local memory
// declared with HS_BLOCK_LOCAL_MEM_DECL(64, 8), followed by a last run of
// half stages (16, 8, 4, 2, 1) before storing the registers back.
//
// NOTE(review): every HS_* macro is defined in hs_cuda_macros.h /
// hs_cuda_config.h, which are not visible here. The stage descriptions are
// read off the macro names and arguments only — confirm against those
// headers. The statement order is generated and must not be changed.
272 HS_OFFSET_BS_KERNEL_PROTO(2, 1)
273 {
274   HS_BLOCK_LOCAL_MEM_DECL(64, 8);
275 
276   HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
      // Load eight keys from vin (indices 0..7), one per register.
277   HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
278   HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
279   HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
280   HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
281   HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
282   HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
283   HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
284   HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
      // 19 in-register compare-exchanges over r1..r8.
      // Presumably an 8-input sorting network — TODO confirm.
285   HS_CMP_XCHG(r1, r5);
286   HS_CMP_XCHG(r2, r6);
287   HS_CMP_XCHG(r3, r7);
288   HS_CMP_XCHG(r4, r8);
289   HS_CMP_XCHG(r1, r3);
290   HS_CMP_XCHG(r2, r4);
291   HS_CMP_XCHG(r5, r7);
292   HS_CMP_XCHG(r6, r8);
293   HS_CMP_XCHG(r3, r5);
294   HS_CMP_XCHG(r4, r6);
295   HS_CMP_XCHG(r1, r2);
296   HS_CMP_XCHG(r3, r4);
297   HS_CMP_XCHG(r5, r6);
298   HS_CMP_XCHG(r7, r8);
299   HS_CMP_XCHG(r2, r5);
300   HS_CMP_XCHG(r4, r7);
301   HS_CMP_XCHG(r2, r3);
302   HS_CMP_XCHG(r4, r5);
303   HS_CMP_XCHG(r6, r7);
      // Flip stage, preamble argument 1. Registers are paired from the
      // outside in: r1/r8, r2/r7, r3/r6, r4/r5.
304   {
305     HS_SLAB_FLIP_PREAMBLE(1);
306     HS_CMP_FLIP(0, r1, r8);
307     HS_CMP_FLIP(1, r2, r7);
308     HS_CMP_FLIP(2, r3, r6);
309     HS_CMP_FLIP(3, r4, r5);
310   }
      // 12 compare-exchanges: odd registers (r1,r3,r5,r7) among themselves,
      // even registers (r2,r4,r6,r8) among themselves, then adjacent pairs.
      // This same pattern closes every flip/half round below.
311   HS_CMP_XCHG(r1, r5);
312   HS_CMP_XCHG(r3, r7);
313   HS_CMP_XCHG(r1, r3);
314   HS_CMP_XCHG(r5, r7);
315   HS_CMP_XCHG(r2, r6);
316   HS_CMP_XCHG(r4, r8);
317   HS_CMP_XCHG(r2, r4);
318   HS_CMP_XCHG(r6, r8);
319   HS_CMP_XCHG(r1, r2);
320   HS_CMP_XCHG(r3, r4);
321   HS_CMP_XCHG(r5, r6);
322   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 3), then half (preamble 1).
323   {
324     HS_SLAB_FLIP_PREAMBLE(3);
325     HS_CMP_FLIP(0, r1, r8);
326     HS_CMP_FLIP(1, r2, r7);
327     HS_CMP_FLIP(2, r3, r6);
328     HS_CMP_FLIP(3, r4, r5);
329   }
330   {
331     HS_SLAB_HALF_PREAMBLE(1);
332     HS_CMP_HALF(0, r1);
333     HS_CMP_HALF(1, r2);
334     HS_CMP_HALF(2, r3);
335     HS_CMP_HALF(3, r4);
336     HS_CMP_HALF(4, r5);
337     HS_CMP_HALF(5, r6);
338     HS_CMP_HALF(6, r7);
339     HS_CMP_HALF(7, r8);
340   }
      // Closing 12 compare-exchanges.
341   HS_CMP_XCHG(r1, r5);
342   HS_CMP_XCHG(r3, r7);
343   HS_CMP_XCHG(r1, r3);
344   HS_CMP_XCHG(r5, r7);
345   HS_CMP_XCHG(r2, r6);
346   HS_CMP_XCHG(r4, r8);
347   HS_CMP_XCHG(r2, r4);
348   HS_CMP_XCHG(r6, r8);
349   HS_CMP_XCHG(r1, r2);
350   HS_CMP_XCHG(r3, r4);
351   HS_CMP_XCHG(r5, r6);
352   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 7), then halves with preambles 2, 1.
353   {
354     HS_SLAB_FLIP_PREAMBLE(7);
355     HS_CMP_FLIP(0, r1, r8);
356     HS_CMP_FLIP(1, r2, r7);
357     HS_CMP_FLIP(2, r3, r6);
358     HS_CMP_FLIP(3, r4, r5);
359   }
360   {
361     HS_SLAB_HALF_PREAMBLE(2);
362     HS_CMP_HALF(0, r1);
363     HS_CMP_HALF(1, r2);
364     HS_CMP_HALF(2, r3);
365     HS_CMP_HALF(3, r4);
366     HS_CMP_HALF(4, r5);
367     HS_CMP_HALF(5, r6);
368     HS_CMP_HALF(6, r7);
369     HS_CMP_HALF(7, r8);
370   }
371   {
372     HS_SLAB_HALF_PREAMBLE(1);
373     HS_CMP_HALF(0, r1);
374     HS_CMP_HALF(1, r2);
375     HS_CMP_HALF(2, r3);
376     HS_CMP_HALF(3, r4);
377     HS_CMP_HALF(4, r5);
378     HS_CMP_HALF(5, r6);
379     HS_CMP_HALF(6, r7);
380     HS_CMP_HALF(7, r8);
381   }
      // Closing 12 compare-exchanges.
382   HS_CMP_XCHG(r1, r5);
383   HS_CMP_XCHG(r3, r7);
384   HS_CMP_XCHG(r1, r3);
385   HS_CMP_XCHG(r5, r7);
386   HS_CMP_XCHG(r2, r6);
387   HS_CMP_XCHG(r4, r8);
388   HS_CMP_XCHG(r2, r4);
389   HS_CMP_XCHG(r6, r8);
390   HS_CMP_XCHG(r1, r2);
391   HS_CMP_XCHG(r3, r4);
392   HS_CMP_XCHG(r5, r6);
393   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 15), then halves with preambles 4, 2, 1.
394   {
395     HS_SLAB_FLIP_PREAMBLE(15);
396     HS_CMP_FLIP(0, r1, r8);
397     HS_CMP_FLIP(1, r2, r7);
398     HS_CMP_FLIP(2, r3, r6);
399     HS_CMP_FLIP(3, r4, r5);
400   }
401   {
402     HS_SLAB_HALF_PREAMBLE(4);
403     HS_CMP_HALF(0, r1);
404     HS_CMP_HALF(1, r2);
405     HS_CMP_HALF(2, r3);
406     HS_CMP_HALF(3, r4);
407     HS_CMP_HALF(4, r5);
408     HS_CMP_HALF(5, r6);
409     HS_CMP_HALF(6, r7);
410     HS_CMP_HALF(7, r8);
411   }
412   {
413     HS_SLAB_HALF_PREAMBLE(2);
414     HS_CMP_HALF(0, r1);
415     HS_CMP_HALF(1, r2);
416     HS_CMP_HALF(2, r3);
417     HS_CMP_HALF(3, r4);
418     HS_CMP_HALF(4, r5);
419     HS_CMP_HALF(5, r6);
420     HS_CMP_HALF(6, r7);
421     HS_CMP_HALF(7, r8);
422   }
423   {
424     HS_SLAB_HALF_PREAMBLE(1);
425     HS_CMP_HALF(0, r1);
426     HS_CMP_HALF(1, r2);
427     HS_CMP_HALF(2, r3);
428     HS_CMP_HALF(3, r4);
429     HS_CMP_HALF(4, r5);
430     HS_CMP_HALF(5, r6);
431     HS_CMP_HALF(6, r7);
432     HS_CMP_HALF(7, r8);
433   }
      // Closing 12 compare-exchanges.
434   HS_CMP_XCHG(r1, r5);
435   HS_CMP_XCHG(r3, r7);
436   HS_CMP_XCHG(r1, r3);
437   HS_CMP_XCHG(r5, r7);
438   HS_CMP_XCHG(r2, r6);
439   HS_CMP_XCHG(r4, r8);
440   HS_CMP_XCHG(r2, r4);
441   HS_CMP_XCHG(r6, r8);
442   HS_CMP_XCHG(r1, r2);
443   HS_CMP_XCHG(r3, r4);
444   HS_CMP_XCHG(r5, r6);
445   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 31), then halves with preambles 8, 4, 2, 1.
446   {
447     HS_SLAB_FLIP_PREAMBLE(31);
448     HS_CMP_FLIP(0, r1, r8);
449     HS_CMP_FLIP(1, r2, r7);
450     HS_CMP_FLIP(2, r3, r6);
451     HS_CMP_FLIP(3, r4, r5);
452   }
453   {
454     HS_SLAB_HALF_PREAMBLE(8);
455     HS_CMP_HALF(0, r1);
456     HS_CMP_HALF(1, r2);
457     HS_CMP_HALF(2, r3);
458     HS_CMP_HALF(3, r4);
459     HS_CMP_HALF(4, r5);
460     HS_CMP_HALF(5, r6);
461     HS_CMP_HALF(6, r7);
462     HS_CMP_HALF(7, r8);
463   }
464   {
465     HS_SLAB_HALF_PREAMBLE(4);
466     HS_CMP_HALF(0, r1);
467     HS_CMP_HALF(1, r2);
468     HS_CMP_HALF(2, r3);
469     HS_CMP_HALF(3, r4);
470     HS_CMP_HALF(4, r5);
471     HS_CMP_HALF(5, r6);
472     HS_CMP_HALF(6, r7);
473     HS_CMP_HALF(7, r8);
474   }
475   {
476     HS_SLAB_HALF_PREAMBLE(2);
477     HS_CMP_HALF(0, r1);
478     HS_CMP_HALF(1, r2);
479     HS_CMP_HALF(2, r3);
480     HS_CMP_HALF(3, r4);
481     HS_CMP_HALF(4, r5);
482     HS_CMP_HALF(5, r6);
483     HS_CMP_HALF(6, r7);
484     HS_CMP_HALF(7, r8);
485   }
486   {
487     HS_SLAB_HALF_PREAMBLE(1);
488     HS_CMP_HALF(0, r1);
489     HS_CMP_HALF(1, r2);
490     HS_CMP_HALF(2, r3);
491     HS_CMP_HALF(3, r4);
492     HS_CMP_HALF(4, r5);
493     HS_CMP_HALF(5, r6);
494     HS_CMP_HALF(6, r7);
495     HS_CMP_HALF(7, r8);
496   }
      // Closing 12 compare-exchanges.
497   HS_CMP_XCHG(r1, r5);
498   HS_CMP_XCHG(r3, r7);
499   HS_CMP_XCHG(r1, r3);
500   HS_CMP_XCHG(r5, r7);
501   HS_CMP_XCHG(r2, r6);
502   HS_CMP_XCHG(r4, r8);
503   HS_CMP_XCHG(r2, r4);
504   HS_CMP_XCHG(r6, r8);
505   HS_CMP_XCHG(r1, r2);
506   HS_CMP_XCHG(r3, r4);
507   HS_CMP_XCHG(r5, r6);
508   HS_CMP_XCHG(r7, r8);
      // Merge step through block-local memory. Registers are written to the
      // eight HS_BX_LOCAL_V slots in the interleaved order
      // r1, r8, r2, r7, r3, r6, r4, r5 and read back in that same order.
509   HS_BS_MERGE_H_PREAMBLE(2);
510   HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0) = r1;
511   HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1) = r8;
512   HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2) = r2;
513   HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3) = r7;
514   HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4) = r3;
515   HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5) = r6;
516   HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6) = r4;
517   HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7) = r5;
      // Barrier between the local writes above and the local reads below.
518   HS_BLOCK_BARRIER();
      // Four independent compare-exchanges on local memory, each pairing an
      // HS_SLAB_LOCAL_L slot with the HS_SLAB_LOCAL_R slot 32 further on.
      // Base offsets are 0, 128, 256 and 384.
519   {
520     {
521       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
522       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
523       HS_CMP_XCHG(r0_1, r0_2);
524       HS_SLAB_LOCAL_L(0) = r0_1;
525       HS_SLAB_LOCAL_R(32) = r0_2;
526     }
527     {
528       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
529       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(160);
530       HS_CMP_XCHG(r0_1, r0_2);
531       HS_SLAB_LOCAL_L(128) = r0_1;
532       HS_SLAB_LOCAL_R(160) = r0_2;
533     }
534     {
535       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256);
536       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(288);
537       HS_CMP_XCHG(r0_1, r0_2);
538       HS_SLAB_LOCAL_L(256) = r0_1;
539       HS_SLAB_LOCAL_R(288) = r0_2;
540     }
541     {
542       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384);
543       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(416);
544       HS_CMP_XCHG(r0_1, r0_2);
545       HS_SLAB_LOCAL_L(384) = r0_1;
546       HS_SLAB_LOCAL_R(416) = r0_2;
547     }
548   }
      // Barrier before the merged values are read back into registers.
549   HS_BLOCK_BARRIER();
550   r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
551   r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
552   r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
553   r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
554   r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
555   r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
556   r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
557   r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
      // Post-merge clean-up: halves with preambles 16, 8, 4, 2, 1, then the
      // closing 12 compare-exchanges.
558   {
559     {
560       HS_SLAB_HALF_PREAMBLE(16);
561       HS_CMP_HALF(0, r1);
562       HS_CMP_HALF(1, r2);
563       HS_CMP_HALF(2, r3);
564       HS_CMP_HALF(3, r4);
565       HS_CMP_HALF(4, r5);
566       HS_CMP_HALF(5, r6);
567       HS_CMP_HALF(6, r7);
568       HS_CMP_HALF(7, r8);
569     }
570     {
571       HS_SLAB_HALF_PREAMBLE(8);
572       HS_CMP_HALF(0, r1);
573       HS_CMP_HALF(1, r2);
574       HS_CMP_HALF(2, r3);
575       HS_CMP_HALF(3, r4);
576       HS_CMP_HALF(4, r5);
577       HS_CMP_HALF(5, r6);
578       HS_CMP_HALF(6, r7);
579       HS_CMP_HALF(7, r8);
580     }
581     {
582       HS_SLAB_HALF_PREAMBLE(4);
583       HS_CMP_HALF(0, r1);
584       HS_CMP_HALF(1, r2);
585       HS_CMP_HALF(2, r3);
586       HS_CMP_HALF(3, r4);
587       HS_CMP_HALF(4, r5);
588       HS_CMP_HALF(5, r6);
589       HS_CMP_HALF(6, r7);
590       HS_CMP_HALF(7, r8);
591     }
592     {
593       HS_SLAB_HALF_PREAMBLE(2);
594       HS_CMP_HALF(0, r1);
595       HS_CMP_HALF(1, r2);
596       HS_CMP_HALF(2, r3);
597       HS_CMP_HALF(3, r4);
598       HS_CMP_HALF(4, r5);
599       HS_CMP_HALF(5, r6);
600       HS_CMP_HALF(6, r7);
601       HS_CMP_HALF(7, r8);
602     }
603     {
604       HS_SLAB_HALF_PREAMBLE(1);
605       HS_CMP_HALF(0, r1);
606       HS_CMP_HALF(1, r2);
607       HS_CMP_HALF(2, r3);
608       HS_CMP_HALF(3, r4);
609       HS_CMP_HALF(4, r5);
610       HS_CMP_HALF(5, r6);
611       HS_CMP_HALF(6, r7);
612       HS_CMP_HALF(7, r8);
613     }
614     HS_CMP_XCHG(r1, r5);
615     HS_CMP_XCHG(r3, r7);
616     HS_CMP_XCHG(r1, r3);
617     HS_CMP_XCHG(r5, r7);
618     HS_CMP_XCHG(r2, r6);
619     HS_CMP_XCHG(r4, r8);
620     HS_CMP_XCHG(r2, r4);
621     HS_CMP_XCHG(r6, r8);
622     HS_CMP_XCHG(r1, r2);
623     HS_CMP_XCHG(r3, r4);
624     HS_CMP_XCHG(r5, r6);
625     HS_CMP_XCHG(r7, r8);
626   }
      // Store the eight registers back at indices 0..7.
627   HS_SLAB_GLOBAL_STORE(0, r1);
628   HS_SLAB_GLOBAL_STORE(1, r2);
629   HS_SLAB_GLOBAL_STORE(2, r3);
630   HS_SLAB_GLOBAL_STORE(3, r4);
631   HS_SLAB_GLOBAL_STORE(4, r5);
632   HS_SLAB_GLOBAL_STORE(5, r6);
633   HS_SLAB_GLOBAL_STORE(6, r7);
634   HS_SLAB_GLOBAL_STORE(7, r8);
635 }
636 
// Block-sort kernel, variant (4, 2).
//
// Runs the same register-level sequence as the (1, 0) variant (load eight
// keys, 19 compare-exchanges, flip/half rounds with flip preambles
// 1, 3, 7, 15, 31), then performs two merge rounds through block-local
// memory declared with HS_BLOCK_LOCAL_MEM_DECL(128, 8):
//   round 1 — two-element compare-exchanges on local memory;
//   round 2 — four-element compare-exchange groups on local memory.
// Each round is bracketed by HS_BLOCK_BARRIER and followed by half stages
// (16, 8, 4, 2, 1) plus the closing 12 compare-exchanges. The registers are
// stored back at the end.
//
// NOTE(review): every HS_* macro is defined in hs_cuda_macros.h /
// hs_cuda_config.h, which are not visible here. The stage descriptions are
// read off the macro names and arguments only — confirm against those
// headers. The statement order is generated and must not be changed.
637 HS_OFFSET_BS_KERNEL_PROTO(4, 2)
638 {
639   HS_BLOCK_LOCAL_MEM_DECL(128, 8);
640 
641   HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
      // Load eight keys from vin (indices 0..7), one per register.
642   HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
643   HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
644   HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
645   HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
646   HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
647   HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
648   HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
649   HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
      // 19 in-register compare-exchanges over r1..r8.
      // Presumably an 8-input sorting network — TODO confirm.
650   HS_CMP_XCHG(r1, r5);
651   HS_CMP_XCHG(r2, r6);
652   HS_CMP_XCHG(r3, r7);
653   HS_CMP_XCHG(r4, r8);
654   HS_CMP_XCHG(r1, r3);
655   HS_CMP_XCHG(r2, r4);
656   HS_CMP_XCHG(r5, r7);
657   HS_CMP_XCHG(r6, r8);
658   HS_CMP_XCHG(r3, r5);
659   HS_CMP_XCHG(r4, r6);
660   HS_CMP_XCHG(r1, r2);
661   HS_CMP_XCHG(r3, r4);
662   HS_CMP_XCHG(r5, r6);
663   HS_CMP_XCHG(r7, r8);
664   HS_CMP_XCHG(r2, r5);
665   HS_CMP_XCHG(r4, r7);
666   HS_CMP_XCHG(r2, r3);
667   HS_CMP_XCHG(r4, r5);
668   HS_CMP_XCHG(r6, r7);
      // Flip stage, preamble argument 1. Registers are paired from the
      // outside in: r1/r8, r2/r7, r3/r6, r4/r5.
669   {
670     HS_SLAB_FLIP_PREAMBLE(1);
671     HS_CMP_FLIP(0, r1, r8);
672     HS_CMP_FLIP(1, r2, r7);
673     HS_CMP_FLIP(2, r3, r6);
674     HS_CMP_FLIP(3, r4, r5);
675   }
      // 12 compare-exchanges: odd registers (r1,r3,r5,r7) among themselves,
      // even registers (r2,r4,r6,r8) among themselves, then adjacent pairs.
      // This same pattern closes every flip/half round below.
676   HS_CMP_XCHG(r1, r5);
677   HS_CMP_XCHG(r3, r7);
678   HS_CMP_XCHG(r1, r3);
679   HS_CMP_XCHG(r5, r7);
680   HS_CMP_XCHG(r2, r6);
681   HS_CMP_XCHG(r4, r8);
682   HS_CMP_XCHG(r2, r4);
683   HS_CMP_XCHG(r6, r8);
684   HS_CMP_XCHG(r1, r2);
685   HS_CMP_XCHG(r3, r4);
686   HS_CMP_XCHG(r5, r6);
687   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 3), then half (preamble 1).
688   {
689     HS_SLAB_FLIP_PREAMBLE(3);
690     HS_CMP_FLIP(0, r1, r8);
691     HS_CMP_FLIP(1, r2, r7);
692     HS_CMP_FLIP(2, r3, r6);
693     HS_CMP_FLIP(3, r4, r5);
694   }
695   {
696     HS_SLAB_HALF_PREAMBLE(1);
697     HS_CMP_HALF(0, r1);
698     HS_CMP_HALF(1, r2);
699     HS_CMP_HALF(2, r3);
700     HS_CMP_HALF(3, r4);
701     HS_CMP_HALF(4, r5);
702     HS_CMP_HALF(5, r6);
703     HS_CMP_HALF(6, r7);
704     HS_CMP_HALF(7, r8);
705   }
      // Closing 12 compare-exchanges.
706   HS_CMP_XCHG(r1, r5);
707   HS_CMP_XCHG(r3, r7);
708   HS_CMP_XCHG(r1, r3);
709   HS_CMP_XCHG(r5, r7);
710   HS_CMP_XCHG(r2, r6);
711   HS_CMP_XCHG(r4, r8);
712   HS_CMP_XCHG(r2, r4);
713   HS_CMP_XCHG(r6, r8);
714   HS_CMP_XCHG(r1, r2);
715   HS_CMP_XCHG(r3, r4);
716   HS_CMP_XCHG(r5, r6);
717   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 7), then halves with preambles 2, 1.
718   {
719     HS_SLAB_FLIP_PREAMBLE(7);
720     HS_CMP_FLIP(0, r1, r8);
721     HS_CMP_FLIP(1, r2, r7);
722     HS_CMP_FLIP(2, r3, r6);
723     HS_CMP_FLIP(3, r4, r5);
724   }
725   {
726     HS_SLAB_HALF_PREAMBLE(2);
727     HS_CMP_HALF(0, r1);
728     HS_CMP_HALF(1, r2);
729     HS_CMP_HALF(2, r3);
730     HS_CMP_HALF(3, r4);
731     HS_CMP_HALF(4, r5);
732     HS_CMP_HALF(5, r6);
733     HS_CMP_HALF(6, r7);
734     HS_CMP_HALF(7, r8);
735   }
736   {
737     HS_SLAB_HALF_PREAMBLE(1);
738     HS_CMP_HALF(0, r1);
739     HS_CMP_HALF(1, r2);
740     HS_CMP_HALF(2, r3);
741     HS_CMP_HALF(3, r4);
742     HS_CMP_HALF(4, r5);
743     HS_CMP_HALF(5, r6);
744     HS_CMP_HALF(6, r7);
745     HS_CMP_HALF(7, r8);
746   }
      // Closing 12 compare-exchanges.
747   HS_CMP_XCHG(r1, r5);
748   HS_CMP_XCHG(r3, r7);
749   HS_CMP_XCHG(r1, r3);
750   HS_CMP_XCHG(r5, r7);
751   HS_CMP_XCHG(r2, r6);
752   HS_CMP_XCHG(r4, r8);
753   HS_CMP_XCHG(r2, r4);
754   HS_CMP_XCHG(r6, r8);
755   HS_CMP_XCHG(r1, r2);
756   HS_CMP_XCHG(r3, r4);
757   HS_CMP_XCHG(r5, r6);
758   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 15), then halves with preambles 4, 2, 1.
759   {
760     HS_SLAB_FLIP_PREAMBLE(15);
761     HS_CMP_FLIP(0, r1, r8);
762     HS_CMP_FLIP(1, r2, r7);
763     HS_CMP_FLIP(2, r3, r6);
764     HS_CMP_FLIP(3, r4, r5);
765   }
766   {
767     HS_SLAB_HALF_PREAMBLE(4);
768     HS_CMP_HALF(0, r1);
769     HS_CMP_HALF(1, r2);
770     HS_CMP_HALF(2, r3);
771     HS_CMP_HALF(3, r4);
772     HS_CMP_HALF(4, r5);
773     HS_CMP_HALF(5, r6);
774     HS_CMP_HALF(6, r7);
775     HS_CMP_HALF(7, r8);
776   }
777   {
778     HS_SLAB_HALF_PREAMBLE(2);
779     HS_CMP_HALF(0, r1);
780     HS_CMP_HALF(1, r2);
781     HS_CMP_HALF(2, r3);
782     HS_CMP_HALF(3, r4);
783     HS_CMP_HALF(4, r5);
784     HS_CMP_HALF(5, r6);
785     HS_CMP_HALF(6, r7);
786     HS_CMP_HALF(7, r8);
787   }
788   {
789     HS_SLAB_HALF_PREAMBLE(1);
790     HS_CMP_HALF(0, r1);
791     HS_CMP_HALF(1, r2);
792     HS_CMP_HALF(2, r3);
793     HS_CMP_HALF(3, r4);
794     HS_CMP_HALF(4, r5);
795     HS_CMP_HALF(5, r6);
796     HS_CMP_HALF(6, r7);
797     HS_CMP_HALF(7, r8);
798   }
      // Closing 12 compare-exchanges.
799   HS_CMP_XCHG(r1, r5);
800   HS_CMP_XCHG(r3, r7);
801   HS_CMP_XCHG(r1, r3);
802   HS_CMP_XCHG(r5, r7);
803   HS_CMP_XCHG(r2, r6);
804   HS_CMP_XCHG(r4, r8);
805   HS_CMP_XCHG(r2, r4);
806   HS_CMP_XCHG(r6, r8);
807   HS_CMP_XCHG(r1, r2);
808   HS_CMP_XCHG(r3, r4);
809   HS_CMP_XCHG(r5, r6);
810   HS_CMP_XCHG(r7, r8);
      // Round: flip (preamble 31), then halves with preambles 8, 4, 2, 1.
811   {
812     HS_SLAB_FLIP_PREAMBLE(31);
813     HS_CMP_FLIP(0, r1, r8);
814     HS_CMP_FLIP(1, r2, r7);
815     HS_CMP_FLIP(2, r3, r6);
816     HS_CMP_FLIP(3, r4, r5);
817   }
818   {
819     HS_SLAB_HALF_PREAMBLE(8);
820     HS_CMP_HALF(0, r1);
821     HS_CMP_HALF(1, r2);
822     HS_CMP_HALF(2, r3);
823     HS_CMP_HALF(3, r4);
824     HS_CMP_HALF(4, r5);
825     HS_CMP_HALF(5, r6);
826     HS_CMP_HALF(6, r7);
827     HS_CMP_HALF(7, r8);
828   }
829   {
830     HS_SLAB_HALF_PREAMBLE(4);
831     HS_CMP_HALF(0, r1);
832     HS_CMP_HALF(1, r2);
833     HS_CMP_HALF(2, r3);
834     HS_CMP_HALF(3, r4);
835     HS_CMP_HALF(4, r5);
836     HS_CMP_HALF(5, r6);
837     HS_CMP_HALF(6, r7);
838     HS_CMP_HALF(7, r8);
839   }
840   {
841     HS_SLAB_HALF_PREAMBLE(2);
842     HS_CMP_HALF(0, r1);
843     HS_CMP_HALF(1, r2);
844     HS_CMP_HALF(2, r3);
845     HS_CMP_HALF(3, r4);
846     HS_CMP_HALF(4, r5);
847     HS_CMP_HALF(5, r6);
848     HS_CMP_HALF(6, r7);
849     HS_CMP_HALF(7, r8);
850   }
851   {
852     HS_SLAB_HALF_PREAMBLE(1);
853     HS_CMP_HALF(0, r1);
854     HS_CMP_HALF(1, r2);
855     HS_CMP_HALF(2, r3);
856     HS_CMP_HALF(3, r4);
857     HS_CMP_HALF(4, r5);
858     HS_CMP_HALF(5, r6);
859     HS_CMP_HALF(6, r7);
860     HS_CMP_HALF(7, r8);
861   }
      // Closing 12 compare-exchanges.
862   HS_CMP_XCHG(r1, r5);
863   HS_CMP_XCHG(r3, r7);
864   HS_CMP_XCHG(r1, r3);
865   HS_CMP_XCHG(r5, r7);
866   HS_CMP_XCHG(r2, r6);
867   HS_CMP_XCHG(r4, r8);
868   HS_CMP_XCHG(r2, r4);
869   HS_CMP_XCHG(r6, r8);
870   HS_CMP_XCHG(r1, r2);
871   HS_CMP_XCHG(r3, r4);
872   HS_CMP_XCHG(r5, r6);
873   HS_CMP_XCHG(r7, r8);
      // Merge round 1 through block-local memory. Registers are written to
      // the eight HS_BX_LOCAL_V slots in the interleaved order
      // r1, r8, r2, r7, r3, r6, r4, r5 and read back in that same order.
874   HS_BS_MERGE_H_PREAMBLE(4);
875   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
876   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8;
877   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
878   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7;
879   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
880   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6;
881   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
882   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5;
      // Barrier between the local writes above and the local reads below.
883   HS_BLOCK_BARRIER();
      // Four independent two-element compare-exchanges on local memory, each
      // pairing an HS_SLAB_LOCAL_L slot with the HS_SLAB_LOCAL_R slot 32
      // further on. Base offsets are 0, 64, 512 and 576. The r0_*/r1_*
      // temporaries are scoped to their own braces.
884   {
885     {
886       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
887       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
888       HS_CMP_XCHG(r0_1, r0_2);
889       HS_SLAB_LOCAL_L(0) = r0_1;
890       HS_SLAB_LOCAL_R(32) = r0_2;
891     }
892     {
893       HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
894       HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96);
895       HS_CMP_XCHG(r1_1, r1_2);
896       HS_SLAB_LOCAL_L(64) = r1_1;
897       HS_SLAB_LOCAL_R(96) = r1_2;
898     }
899     {
900       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
901       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(544);
902       HS_CMP_XCHG(r0_1, r0_2);
903       HS_SLAB_LOCAL_L(512) = r0_1;
904       HS_SLAB_LOCAL_R(544) = r0_2;
905     }
906     {
907       HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(576);
908       HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(608);
909       HS_CMP_XCHG(r1_1, r1_2);
910       HS_SLAB_LOCAL_L(576) = r1_1;
911       HS_SLAB_LOCAL_R(608) = r1_2;
912     }
913   }
      // Barrier before the merged values are read back into registers.
914   HS_BLOCK_BARRIER();
915   r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
916   r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
917   r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
918   r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
919   r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
920   r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
921   r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
922   r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
      // Clean-up after round 1: halves with preambles 16, 8, 4, 2, 1, then
      // the closing 12 compare-exchanges.
923   {
924     {
925       HS_SLAB_HALF_PREAMBLE(16);
926       HS_CMP_HALF(0, r1);
927       HS_CMP_HALF(1, r2);
928       HS_CMP_HALF(2, r3);
929       HS_CMP_HALF(3, r4);
930       HS_CMP_HALF(4, r5);
931       HS_CMP_HALF(5, r6);
932       HS_CMP_HALF(6, r7);
933       HS_CMP_HALF(7, r8);
934     }
935     {
936       HS_SLAB_HALF_PREAMBLE(8);
937       HS_CMP_HALF(0, r1);
938       HS_CMP_HALF(1, r2);
939       HS_CMP_HALF(2, r3);
940       HS_CMP_HALF(3, r4);
941       HS_CMP_HALF(4, r5);
942       HS_CMP_HALF(5, r6);
943       HS_CMP_HALF(6, r7);
944       HS_CMP_HALF(7, r8);
945     }
946     {
947       HS_SLAB_HALF_PREAMBLE(4);
948       HS_CMP_HALF(0, r1);
949       HS_CMP_HALF(1, r2);
950       HS_CMP_HALF(2, r3);
951       HS_CMP_HALF(3, r4);
952       HS_CMP_HALF(4, r5);
953       HS_CMP_HALF(5, r6);
954       HS_CMP_HALF(6, r7);
955       HS_CMP_HALF(7, r8);
956     }
957     {
958       HS_SLAB_HALF_PREAMBLE(2);
959       HS_CMP_HALF(0, r1);
960       HS_CMP_HALF(1, r2);
961       HS_CMP_HALF(2, r3);
962       HS_CMP_HALF(3, r4);
963       HS_CMP_HALF(4, r5);
964       HS_CMP_HALF(5, r6);
965       HS_CMP_HALF(6, r7);
966       HS_CMP_HALF(7, r8);
967     }
968     {
969       HS_SLAB_HALF_PREAMBLE(1);
970       HS_CMP_HALF(0, r1);
971       HS_CMP_HALF(1, r2);
972       HS_CMP_HALF(2, r3);
973       HS_CMP_HALF(3, r4);
974       HS_CMP_HALF(4, r5);
975       HS_CMP_HALF(5, r6);
976       HS_CMP_HALF(6, r7);
977       HS_CMP_HALF(7, r8);
978     }
979     HS_CMP_XCHG(r1, r5);
980     HS_CMP_XCHG(r3, r7);
981     HS_CMP_XCHG(r1, r3);
982     HS_CMP_XCHG(r5, r7);
983     HS_CMP_XCHG(r2, r6);
984     HS_CMP_XCHG(r4, r8);
985     HS_CMP_XCHG(r2, r4);
986     HS_CMP_XCHG(r6, r8);
987     HS_CMP_XCHG(r1, r2);
988     HS_CMP_XCHG(r3, r4);
989     HS_CMP_XCHG(r5, r6);
990     HS_CMP_XCHG(r7, r8);
991   }
      // Merge round 2: write the registers to local memory again, in the
      // same interleaved slot order as round 1.
992   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
993   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8;
994   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
995   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7;
996   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
997   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6;
998   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
999   HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5;
       // Barrier between the local writes above and the local reads below.
1000   HS_BLOCK_BARRIER();
       // Two independent four-element groups (base offsets 0 and 512). Each
       // reads two HS_SLAB_LOCAL_L slots and two HS_SLAB_LOCAL_R slots, then
       // compare-exchanges inner pair, outer pair, upper pair, lower pair —
       // in that order — before writing all four back.
1001   {
1002     {
1003       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1004       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
1005       HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64);
1006       HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96);
1007       HS_CMP_XCHG(r0_2, r0_3);
1008       HS_CMP_XCHG(r0_1, r0_4);
1009       HS_CMP_XCHG(r0_3, r0_4);
1010       HS_CMP_XCHG(r0_1, r0_2);
1011       HS_SLAB_LOCAL_L(0) = r0_1;
1012       HS_SLAB_LOCAL_L(32) = r0_2;
1013       HS_SLAB_LOCAL_R(64) = r0_3;
1014       HS_SLAB_LOCAL_R(96) = r0_4;
1015     }
1016     {
1017       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
1018       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(544);
1019       HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(576);
1020       HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(608);
1021       HS_CMP_XCHG(r0_2, r0_3);
1022       HS_CMP_XCHG(r0_1, r0_4);
1023       HS_CMP_XCHG(r0_3, r0_4);
1024       HS_CMP_XCHG(r0_1, r0_2);
1025       HS_SLAB_LOCAL_L(512) = r0_1;
1026       HS_SLAB_LOCAL_L(544) = r0_2;
1027       HS_SLAB_LOCAL_R(576) = r0_3;
1028       HS_SLAB_LOCAL_R(608) = r0_4;
1029     }
1030   }
       // Barrier before the merged values are read back into registers.
1031   HS_BLOCK_BARRIER();
1032   r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
1033   r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
1034   r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
1035   r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
1036   r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
1037   r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
1038   r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
1039   r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
       // Clean-up after round 2: halves with preambles 16, 8, 4, 2, 1, then
       // the closing 12 compare-exchanges.
1040   {
1041     {
1042       HS_SLAB_HALF_PREAMBLE(16);
1043       HS_CMP_HALF(0, r1);
1044       HS_CMP_HALF(1, r2);
1045       HS_CMP_HALF(2, r3);
1046       HS_CMP_HALF(3, r4);
1047       HS_CMP_HALF(4, r5);
1048       HS_CMP_HALF(5, r6);
1049       HS_CMP_HALF(6, r7);
1050       HS_CMP_HALF(7, r8);
1051     }
1052     {
1053       HS_SLAB_HALF_PREAMBLE(8);
1054       HS_CMP_HALF(0, r1);
1055       HS_CMP_HALF(1, r2);
1056       HS_CMP_HALF(2, r3);
1057       HS_CMP_HALF(3, r4);
1058       HS_CMP_HALF(4, r5);
1059       HS_CMP_HALF(5, r6);
1060       HS_CMP_HALF(6, r7);
1061       HS_CMP_HALF(7, r8);
1062     }
1063     {
1064       HS_SLAB_HALF_PREAMBLE(4);
1065       HS_CMP_HALF(0, r1);
1066       HS_CMP_HALF(1, r2);
1067       HS_CMP_HALF(2, r3);
1068       HS_CMP_HALF(3, r4);
1069       HS_CMP_HALF(4, r5);
1070       HS_CMP_HALF(5, r6);
1071       HS_CMP_HALF(6, r7);
1072       HS_CMP_HALF(7, r8);
1073     }
1074     {
1075       HS_SLAB_HALF_PREAMBLE(2);
1076       HS_CMP_HALF(0, r1);
1077       HS_CMP_HALF(1, r2);
1078       HS_CMP_HALF(2, r3);
1079       HS_CMP_HALF(3, r4);
1080       HS_CMP_HALF(4, r5);
1081       HS_CMP_HALF(5, r6);
1082       HS_CMP_HALF(6, r7);
1083       HS_CMP_HALF(7, r8);
1084     }
1085     {
1086       HS_SLAB_HALF_PREAMBLE(1);
1087       HS_CMP_HALF(0, r1);
1088       HS_CMP_HALF(1, r2);
1089       HS_CMP_HALF(2, r3);
1090       HS_CMP_HALF(3, r4);
1091       HS_CMP_HALF(4, r5);
1092       HS_CMP_HALF(5, r6);
1093       HS_CMP_HALF(6, r7);
1094       HS_CMP_HALF(7, r8);
1095     }
1096     HS_CMP_XCHG(r1, r5);
1097     HS_CMP_XCHG(r3, r7);
1098     HS_CMP_XCHG(r1, r3);
1099     HS_CMP_XCHG(r5, r7);
1100     HS_CMP_XCHG(r2, r6);
1101     HS_CMP_XCHG(r4, r8);
1102     HS_CMP_XCHG(r2, r4);
1103     HS_CMP_XCHG(r6, r8);
1104     HS_CMP_XCHG(r1, r2);
1105     HS_CMP_XCHG(r3, r4);
1106     HS_CMP_XCHG(r5, r6);
1107     HS_CMP_XCHG(r7, r8);
1108   }
       // Store the eight registers back at indices 0..7.
1109   HS_SLAB_GLOBAL_STORE(0, r1);
1110   HS_SLAB_GLOBAL_STORE(1, r2);
1111   HS_SLAB_GLOBAL_STORE(2, r3);
1112   HS_SLAB_GLOBAL_STORE(3, r4);
1113   HS_SLAB_GLOBAL_STORE(4, r5);
1114   HS_SLAB_GLOBAL_STORE(5, r6);
1115   HS_SLAB_GLOBAL_STORE(6, r7);
1116   HS_SLAB_GLOBAL_STORE(7, r8);
1117 }
1118 
//
// Kernel declared through HS_OFFSET_BS_KERNEL_PROTO(8, 3).
//
// Flow visible in this body:
//   1. Load eight keys (locals r1..r8) from `vin` at indices 0..7.
//   2. Run compare-exchange, flip and half stages on those locals,
//      with flip arguments 1, 3, 7, 15 and 31.
//   3. Three rounds of: write the locals out through HS_BX_LOCAL_V,
//      HS_BLOCK_BARRIER(), compare-exchange values accessed through
//      HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R, HS_BLOCK_BARRIER(), reload,
//      then re-run half stages 16, 8, 4, 2, 1 and the exchanges.
//   4. Write r1..r8 back with HS_SLAB_GLOBAL_STORE at indices 0..7.
//
// NOTE(review): presumably this sorts a block of 8 (= 2^3) slabs
// cooperatively; the macro semantics are defined in hs_cuda_macros.h
// and are not visible here -- confirm before relying on this.
//
1119 HS_OFFSET_BS_KERNEL_PROTO(8, 3)
1120 {
       // Scratch storage declared with arguments (256, 8).
       // NOTE(review): presumably block-local memory of 256 x 8 keys -- confirm.
1121   HS_BLOCK_LOCAL_MEM_DECL(256, 8);
1122 
       // Load the eight keys this thread operates on.
1123   HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
1124   HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
1125   HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
1126   HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
1127   HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
1128   HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
1129   HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
1130   HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
1131   HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
       // 19 compare-exchanges over r1..r8.
       // NOTE(review): looks like a sorting network for 8 keys -- confirm.
1132   HS_CMP_XCHG(r1, r5);
1133   HS_CMP_XCHG(r2, r6);
1134   HS_CMP_XCHG(r3, r7);
1135   HS_CMP_XCHG(r4, r8);
1136   HS_CMP_XCHG(r1, r3);
1137   HS_CMP_XCHG(r2, r4);
1138   HS_CMP_XCHG(r5, r7);
1139   HS_CMP_XCHG(r6, r8);
1140   HS_CMP_XCHG(r3, r5);
1141   HS_CMP_XCHG(r4, r6);
1142   HS_CMP_XCHG(r1, r2);
1143   HS_CMP_XCHG(r3, r4);
1144   HS_CMP_XCHG(r5, r6);
1145   HS_CMP_XCHG(r7, r8);
1146   HS_CMP_XCHG(r2, r5);
1147   HS_CMP_XCHG(r4, r7);
1148   HS_CMP_XCHG(r2, r3);
1149   HS_CMP_XCHG(r4, r5);
1150   HS_CMP_XCHG(r6, r7);
       // Flip stage, argument 1: pairs r1/r8, r2/r7, r3/r6, r4/r5.
1151   {
1152     HS_SLAB_FLIP_PREAMBLE(1);
1153     HS_CMP_FLIP(0, r1, r8);
1154     HS_CMP_FLIP(1, r2, r7);
1155     HS_CMP_FLIP(2, r3, r6);
1156     HS_CMP_FLIP(3, r4, r5);
1157   }
       // 12 compare-exchanges on r1..r8 (index distances 4, 2, then 1).
       // The same 12-step sequence closes every flip/half group below.
1158   HS_CMP_XCHG(r1, r5);
1159   HS_CMP_XCHG(r3, r7);
1160   HS_CMP_XCHG(r1, r3);
1161   HS_CMP_XCHG(r5, r7);
1162   HS_CMP_XCHG(r2, r6);
1163   HS_CMP_XCHG(r4, r8);
1164   HS_CMP_XCHG(r2, r4);
1165   HS_CMP_XCHG(r6, r8);
1166   HS_CMP_XCHG(r1, r2);
1167   HS_CMP_XCHG(r3, r4);
1168   HS_CMP_XCHG(r5, r6);
1169   HS_CMP_XCHG(r7, r8);
       // Flip stage, argument 3, followed by half stage 1.
1170   {
1171     HS_SLAB_FLIP_PREAMBLE(3);
1172     HS_CMP_FLIP(0, r1, r8);
1173     HS_CMP_FLIP(1, r2, r7);
1174     HS_CMP_FLIP(2, r3, r6);
1175     HS_CMP_FLIP(3, r4, r5);
1176   }
1177   {
1178     HS_SLAB_HALF_PREAMBLE(1);
1179     HS_CMP_HALF(0, r1);
1180     HS_CMP_HALF(1, r2);
1181     HS_CMP_HALF(2, r3);
1182     HS_CMP_HALF(3, r4);
1183     HS_CMP_HALF(4, r5);
1184     HS_CMP_HALF(5, r6);
1185     HS_CMP_HALF(6, r7);
1186     HS_CMP_HALF(7, r8);
1187   }
1188   HS_CMP_XCHG(r1, r5);
1189   HS_CMP_XCHG(r3, r7);
1190   HS_CMP_XCHG(r1, r3);
1191   HS_CMP_XCHG(r5, r7);
1192   HS_CMP_XCHG(r2, r6);
1193   HS_CMP_XCHG(r4, r8);
1194   HS_CMP_XCHG(r2, r4);
1195   HS_CMP_XCHG(r6, r8);
1196   HS_CMP_XCHG(r1, r2);
1197   HS_CMP_XCHG(r3, r4);
1198   HS_CMP_XCHG(r5, r6);
1199   HS_CMP_XCHG(r7, r8);
       // Flip stage, argument 7, followed by half stages 2 and 1.
1200   {
1201     HS_SLAB_FLIP_PREAMBLE(7);
1202     HS_CMP_FLIP(0, r1, r8);
1203     HS_CMP_FLIP(1, r2, r7);
1204     HS_CMP_FLIP(2, r3, r6);
1205     HS_CMP_FLIP(3, r4, r5);
1206   }
1207   {
1208     HS_SLAB_HALF_PREAMBLE(2);
1209     HS_CMP_HALF(0, r1);
1210     HS_CMP_HALF(1, r2);
1211     HS_CMP_HALF(2, r3);
1212     HS_CMP_HALF(3, r4);
1213     HS_CMP_HALF(4, r5);
1214     HS_CMP_HALF(5, r6);
1215     HS_CMP_HALF(6, r7);
1216     HS_CMP_HALF(7, r8);
1217   }
1218   {
1219     HS_SLAB_HALF_PREAMBLE(1);
1220     HS_CMP_HALF(0, r1);
1221     HS_CMP_HALF(1, r2);
1222     HS_CMP_HALF(2, r3);
1223     HS_CMP_HALF(3, r4);
1224     HS_CMP_HALF(4, r5);
1225     HS_CMP_HALF(5, r6);
1226     HS_CMP_HALF(6, r7);
1227     HS_CMP_HALF(7, r8);
1228   }
1229   HS_CMP_XCHG(r1, r5);
1230   HS_CMP_XCHG(r3, r7);
1231   HS_CMP_XCHG(r1, r3);
1232   HS_CMP_XCHG(r5, r7);
1233   HS_CMP_XCHG(r2, r6);
1234   HS_CMP_XCHG(r4, r8);
1235   HS_CMP_XCHG(r2, r4);
1236   HS_CMP_XCHG(r6, r8);
1237   HS_CMP_XCHG(r1, r2);
1238   HS_CMP_XCHG(r3, r4);
1239   HS_CMP_XCHG(r5, r6);
1240   HS_CMP_XCHG(r7, r8);
       // Flip stage, argument 15, followed by half stages 4, 2 and 1.
1241   {
1242     HS_SLAB_FLIP_PREAMBLE(15);
1243     HS_CMP_FLIP(0, r1, r8);
1244     HS_CMP_FLIP(1, r2, r7);
1245     HS_CMP_FLIP(2, r3, r6);
1246     HS_CMP_FLIP(3, r4, r5);
1247   }
1248   {
1249     HS_SLAB_HALF_PREAMBLE(4);
1250     HS_CMP_HALF(0, r1);
1251     HS_CMP_HALF(1, r2);
1252     HS_CMP_HALF(2, r3);
1253     HS_CMP_HALF(3, r4);
1254     HS_CMP_HALF(4, r5);
1255     HS_CMP_HALF(5, r6);
1256     HS_CMP_HALF(6, r7);
1257     HS_CMP_HALF(7, r8);
1258   }
1259   {
1260     HS_SLAB_HALF_PREAMBLE(2);
1261     HS_CMP_HALF(0, r1);
1262     HS_CMP_HALF(1, r2);
1263     HS_CMP_HALF(2, r3);
1264     HS_CMP_HALF(3, r4);
1265     HS_CMP_HALF(4, r5);
1266     HS_CMP_HALF(5, r6);
1267     HS_CMP_HALF(6, r7);
1268     HS_CMP_HALF(7, r8);
1269   }
1270   {
1271     HS_SLAB_HALF_PREAMBLE(1);
1272     HS_CMP_HALF(0, r1);
1273     HS_CMP_HALF(1, r2);
1274     HS_CMP_HALF(2, r3);
1275     HS_CMP_HALF(3, r4);
1276     HS_CMP_HALF(4, r5);
1277     HS_CMP_HALF(5, r6);
1278     HS_CMP_HALF(6, r7);
1279     HS_CMP_HALF(7, r8);
1280   }
1281   HS_CMP_XCHG(r1, r5);
1282   HS_CMP_XCHG(r3, r7);
1283   HS_CMP_XCHG(r1, r3);
1284   HS_CMP_XCHG(r5, r7);
1285   HS_CMP_XCHG(r2, r6);
1286   HS_CMP_XCHG(r4, r8);
1287   HS_CMP_XCHG(r2, r4);
1288   HS_CMP_XCHG(r6, r8);
1289   HS_CMP_XCHG(r1, r2);
1290   HS_CMP_XCHG(r3, r4);
1291   HS_CMP_XCHG(r5, r6);
1292   HS_CMP_XCHG(r7, r8);
       // Flip stage, argument 31, followed by half stages 8, 4, 2 and 1.
1293   {
1294     HS_SLAB_FLIP_PREAMBLE(31);
1295     HS_CMP_FLIP(0, r1, r8);
1296     HS_CMP_FLIP(1, r2, r7);
1297     HS_CMP_FLIP(2, r3, r6);
1298     HS_CMP_FLIP(3, r4, r5);
1299   }
1300   {
1301     HS_SLAB_HALF_PREAMBLE(8);
1302     HS_CMP_HALF(0, r1);
1303     HS_CMP_HALF(1, r2);
1304     HS_CMP_HALF(2, r3);
1305     HS_CMP_HALF(3, r4);
1306     HS_CMP_HALF(4, r5);
1307     HS_CMP_HALF(5, r6);
1308     HS_CMP_HALF(6, r7);
1309     HS_CMP_HALF(7, r8);
1310   }
1311   {
1312     HS_SLAB_HALF_PREAMBLE(4);
1313     HS_CMP_HALF(0, r1);
1314     HS_CMP_HALF(1, r2);
1315     HS_CMP_HALF(2, r3);
1316     HS_CMP_HALF(3, r4);
1317     HS_CMP_HALF(4, r5);
1318     HS_CMP_HALF(5, r6);
1319     HS_CMP_HALF(6, r7);
1320     HS_CMP_HALF(7, r8);
1321   }
1322   {
1323     HS_SLAB_HALF_PREAMBLE(2);
1324     HS_CMP_HALF(0, r1);
1325     HS_CMP_HALF(1, r2);
1326     HS_CMP_HALF(2, r3);
1327     HS_CMP_HALF(3, r4);
1328     HS_CMP_HALF(4, r5);
1329     HS_CMP_HALF(5, r6);
1330     HS_CMP_HALF(6, r7);
1331     HS_CMP_HALF(7, r8);
1332   }
1333   {
1334     HS_SLAB_HALF_PREAMBLE(1);
1335     HS_CMP_HALF(0, r1);
1336     HS_CMP_HALF(1, r2);
1337     HS_CMP_HALF(2, r3);
1338     HS_CMP_HALF(3, r4);
1339     HS_CMP_HALF(4, r5);
1340     HS_CMP_HALF(5, r6);
1341     HS_CMP_HALF(6, r7);
1342     HS_CMP_HALF(7, r8);
1343   }
1344   HS_CMP_XCHG(r1, r5);
1345   HS_CMP_XCHG(r3, r7);
1346   HS_CMP_XCHG(r1, r3);
1347   HS_CMP_XCHG(r5, r7);
1348   HS_CMP_XCHG(r2, r6);
1349   HS_CMP_XCHG(r4, r8);
1350   HS_CMP_XCHG(r2, r4);
1351   HS_CMP_XCHG(r6, r8);
1352   HS_CMP_XCHG(r1, r2);
1353   HS_CMP_XCHG(r3, r4);
1354   HS_CMP_XCHG(r5, r6);
1355   HS_CMP_XCHG(r7, r8);
       // Merge round 1. Set up with argument 8, then write the locals out
       // through HS_BX_LOCAL_V. Slots 0..7 receive r1, r8, r2, r7, r3, r6,
       // r4, r5 (interleaved order); every reload below uses the same mapping.
1356   HS_BS_MERGE_H_PREAMBLE(8);
1357   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1358   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8;
1359   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1360   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7;
1361   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1362   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6;
1363   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1364   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5;
       // NOTE(review): presumably a block-wide barrier so the writes above
       // are visible before the cross-slab reads -- confirm macro definition.
1365   HS_BLOCK_BARRIER();
       // Four independent pairs: the value read via HS_SLAB_LOCAL_L(k) is
       // compare-exchanged with the one read via HS_SLAB_LOCAL_R(k + 32),
       // and both are written back in place.
1366   {
1367     {
1368       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1369       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
1370       HS_CMP_XCHG(r0_1, r0_2);
1371       HS_SLAB_LOCAL_L(0) = r0_1;
1372       HS_SLAB_LOCAL_R(32) = r0_2;
1373     }
1374     {
1375       HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
1376       HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96);
1377       HS_CMP_XCHG(r1_1, r1_2);
1378       HS_SLAB_LOCAL_L(64) = r1_1;
1379       HS_SLAB_LOCAL_R(96) = r1_2;
1380     }
1381     {
1382       HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128);
1383       HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(160);
1384       HS_CMP_XCHG(r2_1, r2_2);
1385       HS_SLAB_LOCAL_L(128) = r2_1;
1386       HS_SLAB_LOCAL_R(160) = r2_2;
1387     }
1388     {
1389       HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192);
1390       HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(224);
1391       HS_CMP_XCHG(r3_1, r3_2);
1392       HS_SLAB_LOCAL_L(192) = r3_1;
1393       HS_SLAB_LOCAL_R(224) = r3_2;
1394     }
1395   }
1396   HS_BLOCK_BARRIER();
       // Reload the locals using the same slot mapping as the write-out above.
1397   r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1398   r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1399   r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1400   r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1401   r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1402   r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1403   r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1404   r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
       // Half stages 16, 8, 4, 2, 1 followed by the 12 compare-exchanges.
1405   {
1406     {
1407       HS_SLAB_HALF_PREAMBLE(16);
1408       HS_CMP_HALF(0, r1);
1409       HS_CMP_HALF(1, r2);
1410       HS_CMP_HALF(2, r3);
1411       HS_CMP_HALF(3, r4);
1412       HS_CMP_HALF(4, r5);
1413       HS_CMP_HALF(5, r6);
1414       HS_CMP_HALF(6, r7);
1415       HS_CMP_HALF(7, r8);
1416     }
1417     {
1418       HS_SLAB_HALF_PREAMBLE(8);
1419       HS_CMP_HALF(0, r1);
1420       HS_CMP_HALF(1, r2);
1421       HS_CMP_HALF(2, r3);
1422       HS_CMP_HALF(3, r4);
1423       HS_CMP_HALF(4, r5);
1424       HS_CMP_HALF(5, r6);
1425       HS_CMP_HALF(6, r7);
1426       HS_CMP_HALF(7, r8);
1427     }
1428     {
1429       HS_SLAB_HALF_PREAMBLE(4);
1430       HS_CMP_HALF(0, r1);
1431       HS_CMP_HALF(1, r2);
1432       HS_CMP_HALF(2, r3);
1433       HS_CMP_HALF(3, r4);
1434       HS_CMP_HALF(4, r5);
1435       HS_CMP_HALF(5, r6);
1436       HS_CMP_HALF(6, r7);
1437       HS_CMP_HALF(7, r8);
1438     }
1439     {
1440       HS_SLAB_HALF_PREAMBLE(2);
1441       HS_CMP_HALF(0, r1);
1442       HS_CMP_HALF(1, r2);
1443       HS_CMP_HALF(2, r3);
1444       HS_CMP_HALF(3, r4);
1445       HS_CMP_HALF(4, r5);
1446       HS_CMP_HALF(5, r6);
1447       HS_CMP_HALF(6, r7);
1448       HS_CMP_HALF(7, r8);
1449     }
1450     {
1451       HS_SLAB_HALF_PREAMBLE(1);
1452       HS_CMP_HALF(0, r1);
1453       HS_CMP_HALF(1, r2);
1454       HS_CMP_HALF(2, r3);
1455       HS_CMP_HALF(3, r4);
1456       HS_CMP_HALF(4, r5);
1457       HS_CMP_HALF(5, r6);
1458       HS_CMP_HALF(6, r7);
1459       HS_CMP_HALF(7, r8);
1460     }
1461     HS_CMP_XCHG(r1, r5);
1462     HS_CMP_XCHG(r3, r7);
1463     HS_CMP_XCHG(r1, r3);
1464     HS_CMP_XCHG(r5, r7);
1465     HS_CMP_XCHG(r2, r6);
1466     HS_CMP_XCHG(r4, r8);
1467     HS_CMP_XCHG(r2, r4);
1468     HS_CMP_XCHG(r6, r8);
1469     HS_CMP_XCHG(r1, r2);
1470     HS_CMP_XCHG(r3, r4);
1471     HS_CMP_XCHG(r5, r6);
1472     HS_CMP_XCHG(r7, r8);
1473   }
       // Merge round 2: write the locals out again (same interleaved mapping).
1474   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1475   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8;
1476   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1477   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7;
1478   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1479   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6;
1480   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1481   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5;
1482   HS_BLOCK_BARRIER();
       // Two groups of four values (two read via _L, two via _R); each
       // group runs four compare-exchanges and is written back in place.
1483   {
1484     {
1485       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1486       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
1487       HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64);
1488       HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96);
1489       HS_CMP_XCHG(r0_2, r0_3);
1490       HS_CMP_XCHG(r0_1, r0_4);
1491       HS_CMP_XCHG(r0_3, r0_4);
1492       HS_CMP_XCHG(r0_1, r0_2);
1493       HS_SLAB_LOCAL_L(0) = r0_1;
1494       HS_SLAB_LOCAL_L(32) = r0_2;
1495       HS_SLAB_LOCAL_R(64) = r0_3;
1496       HS_SLAB_LOCAL_R(96) = r0_4;
1497     }
1498     {
1499       HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128);
1500       HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(160);
1501       HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(192);
1502       HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(224);
1503       HS_CMP_XCHG(r1_2, r1_3);
1504       HS_CMP_XCHG(r1_1, r1_4);
1505       HS_CMP_XCHG(r1_3, r1_4);
1506       HS_CMP_XCHG(r1_1, r1_2);
1507       HS_SLAB_LOCAL_L(128) = r1_1;
1508       HS_SLAB_LOCAL_L(160) = r1_2;
1509       HS_SLAB_LOCAL_R(192) = r1_3;
1510       HS_SLAB_LOCAL_R(224) = r1_4;
1511     }
1512   }
1513   HS_BLOCK_BARRIER();
       // Reload the locals (same slot mapping).
1514   r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1515   r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1516   r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1517   r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1518   r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1519   r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1520   r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1521   r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
       // Half stages 16, 8, 4, 2, 1 followed by the 12 compare-exchanges.
1522   {
1523     {
1524       HS_SLAB_HALF_PREAMBLE(16);
1525       HS_CMP_HALF(0, r1);
1526       HS_CMP_HALF(1, r2);
1527       HS_CMP_HALF(2, r3);
1528       HS_CMP_HALF(3, r4);
1529       HS_CMP_HALF(4, r5);
1530       HS_CMP_HALF(5, r6);
1531       HS_CMP_HALF(6, r7);
1532       HS_CMP_HALF(7, r8);
1533     }
1534     {
1535       HS_SLAB_HALF_PREAMBLE(8);
1536       HS_CMP_HALF(0, r1);
1537       HS_CMP_HALF(1, r2);
1538       HS_CMP_HALF(2, r3);
1539       HS_CMP_HALF(3, r4);
1540       HS_CMP_HALF(4, r5);
1541       HS_CMP_HALF(5, r6);
1542       HS_CMP_HALF(6, r7);
1543       HS_CMP_HALF(7, r8);
1544     }
1545     {
1546       HS_SLAB_HALF_PREAMBLE(4);
1547       HS_CMP_HALF(0, r1);
1548       HS_CMP_HALF(1, r2);
1549       HS_CMP_HALF(2, r3);
1550       HS_CMP_HALF(3, r4);
1551       HS_CMP_HALF(4, r5);
1552       HS_CMP_HALF(5, r6);
1553       HS_CMP_HALF(6, r7);
1554       HS_CMP_HALF(7, r8);
1555     }
1556     {
1557       HS_SLAB_HALF_PREAMBLE(2);
1558       HS_CMP_HALF(0, r1);
1559       HS_CMP_HALF(1, r2);
1560       HS_CMP_HALF(2, r3);
1561       HS_CMP_HALF(3, r4);
1562       HS_CMP_HALF(4, r5);
1563       HS_CMP_HALF(5, r6);
1564       HS_CMP_HALF(6, r7);
1565       HS_CMP_HALF(7, r8);
1566     }
1567     {
1568       HS_SLAB_HALF_PREAMBLE(1);
1569       HS_CMP_HALF(0, r1);
1570       HS_CMP_HALF(1, r2);
1571       HS_CMP_HALF(2, r3);
1572       HS_CMP_HALF(3, r4);
1573       HS_CMP_HALF(4, r5);
1574       HS_CMP_HALF(5, r6);
1575       HS_CMP_HALF(6, r7);
1576       HS_CMP_HALF(7, r8);
1577     }
1578     HS_CMP_XCHG(r1, r5);
1579     HS_CMP_XCHG(r3, r7);
1580     HS_CMP_XCHG(r1, r3);
1581     HS_CMP_XCHG(r5, r7);
1582     HS_CMP_XCHG(r2, r6);
1583     HS_CMP_XCHG(r4, r8);
1584     HS_CMP_XCHG(r2, r4);
1585     HS_CMP_XCHG(r6, r8);
1586     HS_CMP_XCHG(r1, r2);
1587     HS_CMP_XCHG(r3, r4);
1588     HS_CMP_XCHG(r5, r6);
1589     HS_CMP_XCHG(r7, r8);
1590   }
       // Merge round 3: write the locals out again (same interleaved mapping).
1591   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1592   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8;
1593   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1594   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7;
1595   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1596   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6;
1597   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1598   HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5;
1599   HS_BLOCK_BARRIER();
       // One group of eight values (four read via _L, four via _R) run
       // through twelve compare-exchanges and written back in place.
1600   {
1601     {
1602       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1603       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
1604       HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64);
1605       HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96);
1606       HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(128);
1607       HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(160);
1608       HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(192);
1609       HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(224);
1610       HS_CMP_XCHG(r0_4, r0_5);
1611       HS_CMP_XCHG(r0_3, r0_6);
1612       HS_CMP_XCHG(r0_2, r0_7);
1613       HS_CMP_XCHG(r0_1, r0_8);
1614       HS_CMP_XCHG(r0_5, r0_7);
1615       HS_CMP_XCHG(r0_6, r0_8);
1616       HS_CMP_XCHG(r0_5, r0_6);
1617       HS_CMP_XCHG(r0_7, r0_8);
1618       HS_CMP_XCHG(r0_1, r0_3);
1619       HS_CMP_XCHG(r0_2, r0_4);
1620       HS_CMP_XCHG(r0_1, r0_2);
1621       HS_CMP_XCHG(r0_3, r0_4);
1622       HS_SLAB_LOCAL_L(0) = r0_1;
1623       HS_SLAB_LOCAL_L(32) = r0_2;
1624       HS_SLAB_LOCAL_L(64) = r0_3;
1625       HS_SLAB_LOCAL_L(96) = r0_4;
1626       HS_SLAB_LOCAL_R(128) = r0_5;
1627       HS_SLAB_LOCAL_R(160) = r0_6;
1628       HS_SLAB_LOCAL_R(192) = r0_7;
1629       HS_SLAB_LOCAL_R(224) = r0_8;
1630     }
1631   }
1632   HS_BLOCK_BARRIER();
       // Reload the locals (same slot mapping).
1633   r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1634   r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1635   r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1636   r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1637   r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1638   r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1639   r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1640   r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
       // Final half stages 16, 8, 4, 2, 1 followed by the 12 compare-exchanges.
1641   {
1642     {
1643       HS_SLAB_HALF_PREAMBLE(16);
1644       HS_CMP_HALF(0, r1);
1645       HS_CMP_HALF(1, r2);
1646       HS_CMP_HALF(2, r3);
1647       HS_CMP_HALF(3, r4);
1648       HS_CMP_HALF(4, r5);
1649       HS_CMP_HALF(5, r6);
1650       HS_CMP_HALF(6, r7);
1651       HS_CMP_HALF(7, r8);
1652     }
1653     {
1654       HS_SLAB_HALF_PREAMBLE(8);
1655       HS_CMP_HALF(0, r1);
1656       HS_CMP_HALF(1, r2);
1657       HS_CMP_HALF(2, r3);
1658       HS_CMP_HALF(3, r4);
1659       HS_CMP_HALF(4, r5);
1660       HS_CMP_HALF(5, r6);
1661       HS_CMP_HALF(6, r7);
1662       HS_CMP_HALF(7, r8);
1663     }
1664     {
1665       HS_SLAB_HALF_PREAMBLE(4);
1666       HS_CMP_HALF(0, r1);
1667       HS_CMP_HALF(1, r2);
1668       HS_CMP_HALF(2, r3);
1669       HS_CMP_HALF(3, r4);
1670       HS_CMP_HALF(4, r5);
1671       HS_CMP_HALF(5, r6);
1672       HS_CMP_HALF(6, r7);
1673       HS_CMP_HALF(7, r8);
1674     }
1675     {
1676       HS_SLAB_HALF_PREAMBLE(2);
1677       HS_CMP_HALF(0, r1);
1678       HS_CMP_HALF(1, r2);
1679       HS_CMP_HALF(2, r3);
1680       HS_CMP_HALF(3, r4);
1681       HS_CMP_HALF(4, r5);
1682       HS_CMP_HALF(5, r6);
1683       HS_CMP_HALF(6, r7);
1684       HS_CMP_HALF(7, r8);
1685     }
1686     {
1687       HS_SLAB_HALF_PREAMBLE(1);
1688       HS_CMP_HALF(0, r1);
1689       HS_CMP_HALF(1, r2);
1690       HS_CMP_HALF(2, r3);
1691       HS_CMP_HALF(3, r4);
1692       HS_CMP_HALF(4, r5);
1693       HS_CMP_HALF(5, r6);
1694       HS_CMP_HALF(6, r7);
1695       HS_CMP_HALF(7, r8);
1696     }
1697     HS_CMP_XCHG(r1, r5);
1698     HS_CMP_XCHG(r3, r7);
1699     HS_CMP_XCHG(r1, r3);
1700     HS_CMP_XCHG(r5, r7);
1701     HS_CMP_XCHG(r2, r6);
1702     HS_CMP_XCHG(r4, r8);
1703     HS_CMP_XCHG(r2, r4);
1704     HS_CMP_XCHG(r6, r8);
1705     HS_CMP_XCHG(r1, r2);
1706     HS_CMP_XCHG(r3, r4);
1707     HS_CMP_XCHG(r5, r6);
1708     HS_CMP_XCHG(r7, r8);
1709   }
       // Write the eight keys back out at indices 0..7.
1710   HS_SLAB_GLOBAL_STORE(0, r1);
1711   HS_SLAB_GLOBAL_STORE(1, r2);
1712   HS_SLAB_GLOBAL_STORE(2, r3);
1713   HS_SLAB_GLOBAL_STORE(3, r4);
1714   HS_SLAB_GLOBAL_STORE(4, r5);
1715   HS_SLAB_GLOBAL_STORE(5, r6);
1716   HS_SLAB_GLOBAL_STORE(6, r7);
1717   HS_SLAB_GLOBAL_STORE(7, r8);
1718 }
1719 
1720 HS_BS_KERNEL_PROTO(16, 4)
1721 {
1722   HS_BLOCK_LOCAL_MEM_DECL(512, 8);
1723 
1724   HS_SLAB_GLOBAL_PREAMBLE();
1725   HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
1726   HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
1727   HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
1728   HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
1729   HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
1730   HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
1731   HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
1732   HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
1733   HS_CMP_XCHG(r1, r5);
1734   HS_CMP_XCHG(r2, r6);
1735   HS_CMP_XCHG(r3, r7);
1736   HS_CMP_XCHG(r4, r8);
1737   HS_CMP_XCHG(r1, r3);
1738   HS_CMP_XCHG(r2, r4);
1739   HS_CMP_XCHG(r5, r7);
1740   HS_CMP_XCHG(r6, r8);
1741   HS_CMP_XCHG(r3, r5);
1742   HS_CMP_XCHG(r4, r6);
1743   HS_CMP_XCHG(r1, r2);
1744   HS_CMP_XCHG(r3, r4);
1745   HS_CMP_XCHG(r5, r6);
1746   HS_CMP_XCHG(r7, r8);
1747   HS_CMP_XCHG(r2, r5);
1748   HS_CMP_XCHG(r4, r7);
1749   HS_CMP_XCHG(r2, r3);
1750   HS_CMP_XCHG(r4, r5);
1751   HS_CMP_XCHG(r6, r7);
1752   {
1753     HS_SLAB_FLIP_PREAMBLE(1);
1754     HS_CMP_FLIP(0, r1, r8);
1755     HS_CMP_FLIP(1, r2, r7);
1756     HS_CMP_FLIP(2, r3, r6);
1757     HS_CMP_FLIP(3, r4, r5);
1758   }
1759   HS_CMP_XCHG(r1, r5);
1760   HS_CMP_XCHG(r3, r7);
1761   HS_CMP_XCHG(r1, r3);
1762   HS_CMP_XCHG(r5, r7);
1763   HS_CMP_XCHG(r2, r6);
1764   HS_CMP_XCHG(r4, r8);
1765   HS_CMP_XCHG(r2, r4);
1766   HS_CMP_XCHG(r6, r8);
1767   HS_CMP_XCHG(r1, r2);
1768   HS_CMP_XCHG(r3, r4);
1769   HS_CMP_XCHG(r5, r6);
1770   HS_CMP_XCHG(r7, r8);
1771   {
1772     HS_SLAB_FLIP_PREAMBLE(3);
1773     HS_CMP_FLIP(0, r1, r8);
1774     HS_CMP_FLIP(1, r2, r7);
1775     HS_CMP_FLIP(2, r3, r6);
1776     HS_CMP_FLIP(3, r4, r5);
1777   }
1778   {
1779     HS_SLAB_HALF_PREAMBLE(1);
1780     HS_CMP_HALF(0, r1);
1781     HS_CMP_HALF(1, r2);
1782     HS_CMP_HALF(2, r3);
1783     HS_CMP_HALF(3, r4);
1784     HS_CMP_HALF(4, r5);
1785     HS_CMP_HALF(5, r6);
1786     HS_CMP_HALF(6, r7);
1787     HS_CMP_HALF(7, r8);
1788   }
1789   HS_CMP_XCHG(r1, r5);
1790   HS_CMP_XCHG(r3, r7);
1791   HS_CMP_XCHG(r1, r3);
1792   HS_CMP_XCHG(r5, r7);
1793   HS_CMP_XCHG(r2, r6);
1794   HS_CMP_XCHG(r4, r8);
1795   HS_CMP_XCHG(r2, r4);
1796   HS_CMP_XCHG(r6, r8);
1797   HS_CMP_XCHG(r1, r2);
1798   HS_CMP_XCHG(r3, r4);
1799   HS_CMP_XCHG(r5, r6);
1800   HS_CMP_XCHG(r7, r8);
1801   {
1802     HS_SLAB_FLIP_PREAMBLE(7);
1803     HS_CMP_FLIP(0, r1, r8);
1804     HS_CMP_FLIP(1, r2, r7);
1805     HS_CMP_FLIP(2, r3, r6);
1806     HS_CMP_FLIP(3, r4, r5);
1807   }
1808   {
1809     HS_SLAB_HALF_PREAMBLE(2);
1810     HS_CMP_HALF(0, r1);
1811     HS_CMP_HALF(1, r2);
1812     HS_CMP_HALF(2, r3);
1813     HS_CMP_HALF(3, r4);
1814     HS_CMP_HALF(4, r5);
1815     HS_CMP_HALF(5, r6);
1816     HS_CMP_HALF(6, r7);
1817     HS_CMP_HALF(7, r8);
1818   }
1819   {
1820     HS_SLAB_HALF_PREAMBLE(1);
1821     HS_CMP_HALF(0, r1);
1822     HS_CMP_HALF(1, r2);
1823     HS_CMP_HALF(2, r3);
1824     HS_CMP_HALF(3, r4);
1825     HS_CMP_HALF(4, r5);
1826     HS_CMP_HALF(5, r6);
1827     HS_CMP_HALF(6, r7);
1828     HS_CMP_HALF(7, r8);
1829   }
1830   HS_CMP_XCHG(r1, r5);
1831   HS_CMP_XCHG(r3, r7);
1832   HS_CMP_XCHG(r1, r3);
1833   HS_CMP_XCHG(r5, r7);
1834   HS_CMP_XCHG(r2, r6);
1835   HS_CMP_XCHG(r4, r8);
1836   HS_CMP_XCHG(r2, r4);
1837   HS_CMP_XCHG(r6, r8);
1838   HS_CMP_XCHG(r1, r2);
1839   HS_CMP_XCHG(r3, r4);
1840   HS_CMP_XCHG(r5, r6);
1841   HS_CMP_XCHG(r7, r8);
1842   {
1843     HS_SLAB_FLIP_PREAMBLE(15);
1844     HS_CMP_FLIP(0, r1, r8);
1845     HS_CMP_FLIP(1, r2, r7);
1846     HS_CMP_FLIP(2, r3, r6);
1847     HS_CMP_FLIP(3, r4, r5);
1848   }
1849   {
1850     HS_SLAB_HALF_PREAMBLE(4);
1851     HS_CMP_HALF(0, r1);
1852     HS_CMP_HALF(1, r2);
1853     HS_CMP_HALF(2, r3);
1854     HS_CMP_HALF(3, r4);
1855     HS_CMP_HALF(4, r5);
1856     HS_CMP_HALF(5, r6);
1857     HS_CMP_HALF(6, r7);
1858     HS_CMP_HALF(7, r8);
1859   }
1860   {
1861     HS_SLAB_HALF_PREAMBLE(2);
1862     HS_CMP_HALF(0, r1);
1863     HS_CMP_HALF(1, r2);
1864     HS_CMP_HALF(2, r3);
1865     HS_CMP_HALF(3, r4);
1866     HS_CMP_HALF(4, r5);
1867     HS_CMP_HALF(5, r6);
1868     HS_CMP_HALF(6, r7);
1869     HS_CMP_HALF(7, r8);
1870   }
1871   {
1872     HS_SLAB_HALF_PREAMBLE(1);
1873     HS_CMP_HALF(0, r1);
1874     HS_CMP_HALF(1, r2);
1875     HS_CMP_HALF(2, r3);
1876     HS_CMP_HALF(3, r4);
1877     HS_CMP_HALF(4, r5);
1878     HS_CMP_HALF(5, r6);
1879     HS_CMP_HALF(6, r7);
1880     HS_CMP_HALF(7, r8);
1881   }
1882   HS_CMP_XCHG(r1, r5);
1883   HS_CMP_XCHG(r3, r7);
1884   HS_CMP_XCHG(r1, r3);
1885   HS_CMP_XCHG(r5, r7);
1886   HS_CMP_XCHG(r2, r6);
1887   HS_CMP_XCHG(r4, r8);
1888   HS_CMP_XCHG(r2, r4);
1889   HS_CMP_XCHG(r6, r8);
1890   HS_CMP_XCHG(r1, r2);
1891   HS_CMP_XCHG(r3, r4);
1892   HS_CMP_XCHG(r5, r6);
1893   HS_CMP_XCHG(r7, r8);
1894   {
1895     HS_SLAB_FLIP_PREAMBLE(31);
1896     HS_CMP_FLIP(0, r1, r8);
1897     HS_CMP_FLIP(1, r2, r7);
1898     HS_CMP_FLIP(2, r3, r6);
1899     HS_CMP_FLIP(3, r4, r5);
1900   }
1901   {
1902     HS_SLAB_HALF_PREAMBLE(8);
1903     HS_CMP_HALF(0, r1);
1904     HS_CMP_HALF(1, r2);
1905     HS_CMP_HALF(2, r3);
1906     HS_CMP_HALF(3, r4);
1907     HS_CMP_HALF(4, r5);
1908     HS_CMP_HALF(5, r6);
1909     HS_CMP_HALF(6, r7);
1910     HS_CMP_HALF(7, r8);
1911   }
1912   {
1913     HS_SLAB_HALF_PREAMBLE(4);
1914     HS_CMP_HALF(0, r1);
1915     HS_CMP_HALF(1, r2);
1916     HS_CMP_HALF(2, r3);
1917     HS_CMP_HALF(3, r4);
1918     HS_CMP_HALF(4, r5);
1919     HS_CMP_HALF(5, r6);
1920     HS_CMP_HALF(6, r7);
1921     HS_CMP_HALF(7, r8);
1922   }
1923   {
1924     HS_SLAB_HALF_PREAMBLE(2);
1925     HS_CMP_HALF(0, r1);
1926     HS_CMP_HALF(1, r2);
1927     HS_CMP_HALF(2, r3);
1928     HS_CMP_HALF(3, r4);
1929     HS_CMP_HALF(4, r5);
1930     HS_CMP_HALF(5, r6);
1931     HS_CMP_HALF(6, r7);
1932     HS_CMP_HALF(7, r8);
1933   }
1934   {
1935     HS_SLAB_HALF_PREAMBLE(1);
1936     HS_CMP_HALF(0, r1);
1937     HS_CMP_HALF(1, r2);
1938     HS_CMP_HALF(2, r3);
1939     HS_CMP_HALF(3, r4);
1940     HS_CMP_HALF(4, r5);
1941     HS_CMP_HALF(5, r6);
1942     HS_CMP_HALF(6, r7);
1943     HS_CMP_HALF(7, r8);
1944   }
1945   HS_CMP_XCHG(r1, r5);
1946   HS_CMP_XCHG(r3, r7);
1947   HS_CMP_XCHG(r1, r3);
1948   HS_CMP_XCHG(r5, r7);
1949   HS_CMP_XCHG(r2, r6);
1950   HS_CMP_XCHG(r4, r8);
1951   HS_CMP_XCHG(r2, r4);
1952   HS_CMP_XCHG(r6, r8);
1953   HS_CMP_XCHG(r1, r2);
1954   HS_CMP_XCHG(r3, r4);
1955   HS_CMP_XCHG(r5, r6);
1956   HS_CMP_XCHG(r7, r8);
1957   HS_BS_MERGE_H_PREAMBLE(16);
1958   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
1959   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
1960   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
1961   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
1962   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
1963   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
1964   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
1965   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
1966   HS_BLOCK_BARRIER();
1967   if (HS_WARP_ID_X() < 8) {
1968     {
1969       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1970       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
1971       HS_CMP_XCHG(r0_1, r0_2);
1972       HS_SLAB_LOCAL_L(0) = r0_1;
1973       HS_SLAB_LOCAL_R(32) = r0_2;
1974     }
1975     {
1976       HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
1977       HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96);
1978       HS_CMP_XCHG(r1_1, r1_2);
1979       HS_SLAB_LOCAL_L(64) = r1_1;
1980       HS_SLAB_LOCAL_R(96) = r1_2;
1981     }
1982     {
1983       HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128);
1984       HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(160);
1985       HS_CMP_XCHG(r2_1, r2_2);
1986       HS_SLAB_LOCAL_L(128) = r2_1;
1987       HS_SLAB_LOCAL_R(160) = r2_2;
1988     }
1989     {
1990       HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192);
1991       HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(224);
1992       HS_CMP_XCHG(r3_1, r3_2);
1993       HS_SLAB_LOCAL_L(192) = r3_1;
1994       HS_SLAB_LOCAL_R(224) = r3_2;
1995     }
1996     {
1997       HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(256);
1998       HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(288);
1999       HS_CMP_XCHG(r4_1, r4_2);
2000       HS_SLAB_LOCAL_L(256) = r4_1;
2001       HS_SLAB_LOCAL_R(288) = r4_2;
2002     }
2003     {
2004       HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(320);
2005       HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(352);
2006       HS_CMP_XCHG(r5_1, r5_2);
2007       HS_SLAB_LOCAL_L(320) = r5_1;
2008       HS_SLAB_LOCAL_R(352) = r5_2;
2009     }
2010     {
2011       HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(384);
2012       HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(416);
2013       HS_CMP_XCHG(r6_1, r6_2);
2014       HS_SLAB_LOCAL_L(384) = r6_1;
2015       HS_SLAB_LOCAL_R(416) = r6_2;
2016     }
2017     {
2018       HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(448);
2019       HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(480);
2020       HS_CMP_XCHG(r7_1, r7_2);
2021       HS_SLAB_LOCAL_L(448) = r7_1;
2022       HS_SLAB_LOCAL_R(480) = r7_2;
2023     }
2024   }
2025   HS_BLOCK_BARRIER();
2026   r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2027   r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2028   r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2029   r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2030   r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2031   r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2032   r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2033   r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2034   {
2035     {
2036       HS_SLAB_HALF_PREAMBLE(16);
2037       HS_CMP_HALF(0, r1);
2038       HS_CMP_HALF(1, r2);
2039       HS_CMP_HALF(2, r3);
2040       HS_CMP_HALF(3, r4);
2041       HS_CMP_HALF(4, r5);
2042       HS_CMP_HALF(5, r6);
2043       HS_CMP_HALF(6, r7);
2044       HS_CMP_HALF(7, r8);
2045     }
2046     {
2047       HS_SLAB_HALF_PREAMBLE(8);
2048       HS_CMP_HALF(0, r1);
2049       HS_CMP_HALF(1, r2);
2050       HS_CMP_HALF(2, r3);
2051       HS_CMP_HALF(3, r4);
2052       HS_CMP_HALF(4, r5);
2053       HS_CMP_HALF(5, r6);
2054       HS_CMP_HALF(6, r7);
2055       HS_CMP_HALF(7, r8);
2056     }
2057     {
2058       HS_SLAB_HALF_PREAMBLE(4);
2059       HS_CMP_HALF(0, r1);
2060       HS_CMP_HALF(1, r2);
2061       HS_CMP_HALF(2, r3);
2062       HS_CMP_HALF(3, r4);
2063       HS_CMP_HALF(4, r5);
2064       HS_CMP_HALF(5, r6);
2065       HS_CMP_HALF(6, r7);
2066       HS_CMP_HALF(7, r8);
2067     }
2068     {
2069       HS_SLAB_HALF_PREAMBLE(2);
2070       HS_CMP_HALF(0, r1);
2071       HS_CMP_HALF(1, r2);
2072       HS_CMP_HALF(2, r3);
2073       HS_CMP_HALF(3, r4);
2074       HS_CMP_HALF(4, r5);
2075       HS_CMP_HALF(5, r6);
2076       HS_CMP_HALF(6, r7);
2077       HS_CMP_HALF(7, r8);
2078     }
2079     {
2080       HS_SLAB_HALF_PREAMBLE(1);
2081       HS_CMP_HALF(0, r1);
2082       HS_CMP_HALF(1, r2);
2083       HS_CMP_HALF(2, r3);
2084       HS_CMP_HALF(3, r4);
2085       HS_CMP_HALF(4, r5);
2086       HS_CMP_HALF(5, r6);
2087       HS_CMP_HALF(6, r7);
2088       HS_CMP_HALF(7, r8);
2089     }
2090     HS_CMP_XCHG(r1, r5);
2091     HS_CMP_XCHG(r3, r7);
2092     HS_CMP_XCHG(r1, r3);
2093     HS_CMP_XCHG(r5, r7);
2094     HS_CMP_XCHG(r2, r6);
2095     HS_CMP_XCHG(r4, r8);
2096     HS_CMP_XCHG(r2, r4);
2097     HS_CMP_XCHG(r6, r8);
2098     HS_CMP_XCHG(r1, r2);
2099     HS_CMP_XCHG(r3, r4);
2100     HS_CMP_XCHG(r5, r6);
2101     HS_CMP_XCHG(r7, r8);
2102   }
2103   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
2104   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
2105   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
2106   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
2107   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
2108   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
2109   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
2110   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
2111   HS_BLOCK_BARRIER();
2112   if (HS_WARP_ID_X() < 8) {
2113     {
2114       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2115       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
2116       HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64);
2117       HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96);
2118       HS_CMP_XCHG(r0_2, r0_3);
2119       HS_CMP_XCHG(r0_1, r0_4);
2120       HS_CMP_XCHG(r0_3, r0_4);
2121       HS_CMP_XCHG(r0_1, r0_2);
2122       HS_SLAB_LOCAL_L(0) = r0_1;
2123       HS_SLAB_LOCAL_L(32) = r0_2;
2124       HS_SLAB_LOCAL_R(64) = r0_3;
2125       HS_SLAB_LOCAL_R(96) = r0_4;
2126     }
2127     {
2128       HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128);
2129       HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(160);
2130       HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(192);
2131       HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(224);
2132       HS_CMP_XCHG(r1_2, r1_3);
2133       HS_CMP_XCHG(r1_1, r1_4);
2134       HS_CMP_XCHG(r1_3, r1_4);
2135       HS_CMP_XCHG(r1_1, r1_2);
2136       HS_SLAB_LOCAL_L(128) = r1_1;
2137       HS_SLAB_LOCAL_L(160) = r1_2;
2138       HS_SLAB_LOCAL_R(192) = r1_3;
2139       HS_SLAB_LOCAL_R(224) = r1_4;
2140     }
2141     {
2142       HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(256);
2143       HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(288);
2144       HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(320);
2145       HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(352);
2146       HS_CMP_XCHG(r2_2, r2_3);
2147       HS_CMP_XCHG(r2_1, r2_4);
2148       HS_CMP_XCHG(r2_3, r2_4);
2149       HS_CMP_XCHG(r2_1, r2_2);
2150       HS_SLAB_LOCAL_L(256) = r2_1;
2151       HS_SLAB_LOCAL_L(288) = r2_2;
2152       HS_SLAB_LOCAL_R(320) = r2_3;
2153       HS_SLAB_LOCAL_R(352) = r2_4;
2154     }
2155     {
2156       HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(384);
2157       HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(416);
2158       HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(448);
2159       HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(480);
2160       HS_CMP_XCHG(r3_2, r3_3);
2161       HS_CMP_XCHG(r3_1, r3_4);
2162       HS_CMP_XCHG(r3_3, r3_4);
2163       HS_CMP_XCHG(r3_1, r3_2);
2164       HS_SLAB_LOCAL_L(384) = r3_1;
2165       HS_SLAB_LOCAL_L(416) = r3_2;
2166       HS_SLAB_LOCAL_R(448) = r3_3;
2167       HS_SLAB_LOCAL_R(480) = r3_4;
2168     }
2169   }
2170   HS_BLOCK_BARRIER();
2171   r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2172   r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2173   r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2174   r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2175   r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2176   r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2177   r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2178   r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2179   {
2180     {
2181       HS_SLAB_HALF_PREAMBLE(16);
2182       HS_CMP_HALF(0, r1);
2183       HS_CMP_HALF(1, r2);
2184       HS_CMP_HALF(2, r3);
2185       HS_CMP_HALF(3, r4);
2186       HS_CMP_HALF(4, r5);
2187       HS_CMP_HALF(5, r6);
2188       HS_CMP_HALF(6, r7);
2189       HS_CMP_HALF(7, r8);
2190     }
2191     {
2192       HS_SLAB_HALF_PREAMBLE(8);
2193       HS_CMP_HALF(0, r1);
2194       HS_CMP_HALF(1, r2);
2195       HS_CMP_HALF(2, r3);
2196       HS_CMP_HALF(3, r4);
2197       HS_CMP_HALF(4, r5);
2198       HS_CMP_HALF(5, r6);
2199       HS_CMP_HALF(6, r7);
2200       HS_CMP_HALF(7, r8);
2201     }
2202     {
2203       HS_SLAB_HALF_PREAMBLE(4);
2204       HS_CMP_HALF(0, r1);
2205       HS_CMP_HALF(1, r2);
2206       HS_CMP_HALF(2, r3);
2207       HS_CMP_HALF(3, r4);
2208       HS_CMP_HALF(4, r5);
2209       HS_CMP_HALF(5, r6);
2210       HS_CMP_HALF(6, r7);
2211       HS_CMP_HALF(7, r8);
2212     }
2213     {
2214       HS_SLAB_HALF_PREAMBLE(2);
2215       HS_CMP_HALF(0, r1);
2216       HS_CMP_HALF(1, r2);
2217       HS_CMP_HALF(2, r3);
2218       HS_CMP_HALF(3, r4);
2219       HS_CMP_HALF(4, r5);
2220       HS_CMP_HALF(5, r6);
2221       HS_CMP_HALF(6, r7);
2222       HS_CMP_HALF(7, r8);
2223     }
2224     {
2225       HS_SLAB_HALF_PREAMBLE(1);
2226       HS_CMP_HALF(0, r1);
2227       HS_CMP_HALF(1, r2);
2228       HS_CMP_HALF(2, r3);
2229       HS_CMP_HALF(3, r4);
2230       HS_CMP_HALF(4, r5);
2231       HS_CMP_HALF(5, r6);
2232       HS_CMP_HALF(6, r7);
2233       HS_CMP_HALF(7, r8);
2234     }
2235     HS_CMP_XCHG(r1, r5);
2236     HS_CMP_XCHG(r3, r7);
2237     HS_CMP_XCHG(r1, r3);
2238     HS_CMP_XCHG(r5, r7);
2239     HS_CMP_XCHG(r2, r6);
2240     HS_CMP_XCHG(r4, r8);
2241     HS_CMP_XCHG(r2, r4);
2242     HS_CMP_XCHG(r6, r8);
2243     HS_CMP_XCHG(r1, r2);
2244     HS_CMP_XCHG(r3, r4);
2245     HS_CMP_XCHG(r5, r6);
2246     HS_CMP_XCHG(r7, r8);
2247   }
2248   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
2249   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
2250   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
2251   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
2252   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
2253   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
2254   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
2255   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
2256   HS_BLOCK_BARRIER();
2257   if (HS_WARP_ID_X() < 8) {
2258     {
2259       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2260       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
2261       HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64);
2262       HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96);
2263       HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(128);
2264       HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(160);
2265       HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(192);
2266       HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(224);
2267       HS_CMP_XCHG(r0_4, r0_5);
2268       HS_CMP_XCHG(r0_3, r0_6);
2269       HS_CMP_XCHG(r0_2, r0_7);
2270       HS_CMP_XCHG(r0_1, r0_8);
2271       HS_CMP_XCHG(r0_5, r0_7);
2272       HS_CMP_XCHG(r0_6, r0_8);
2273       HS_CMP_XCHG(r0_5, r0_6);
2274       HS_CMP_XCHG(r0_7, r0_8);
2275       HS_CMP_XCHG(r0_1, r0_3);
2276       HS_CMP_XCHG(r0_2, r0_4);
2277       HS_CMP_XCHG(r0_1, r0_2);
2278       HS_CMP_XCHG(r0_3, r0_4);
2279       HS_SLAB_LOCAL_L(0) = r0_1;
2280       HS_SLAB_LOCAL_L(32) = r0_2;
2281       HS_SLAB_LOCAL_L(64) = r0_3;
2282       HS_SLAB_LOCAL_L(96) = r0_4;
2283       HS_SLAB_LOCAL_R(128) = r0_5;
2284       HS_SLAB_LOCAL_R(160) = r0_6;
2285       HS_SLAB_LOCAL_R(192) = r0_7;
2286       HS_SLAB_LOCAL_R(224) = r0_8;
2287     }
2288     {
2289       HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(256);
2290       HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(288);
2291       HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(320);
2292       HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(352);
2293       HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(384);
2294       HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(416);
2295       HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(448);
2296       HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(480);
2297       HS_CMP_XCHG(r1_4, r1_5);
2298       HS_CMP_XCHG(r1_3, r1_6);
2299       HS_CMP_XCHG(r1_2, r1_7);
2300       HS_CMP_XCHG(r1_1, r1_8);
2301       HS_CMP_XCHG(r1_5, r1_7);
2302       HS_CMP_XCHG(r1_6, r1_8);
2303       HS_CMP_XCHG(r1_5, r1_6);
2304       HS_CMP_XCHG(r1_7, r1_8);
2305       HS_CMP_XCHG(r1_1, r1_3);
2306       HS_CMP_XCHG(r1_2, r1_4);
2307       HS_CMP_XCHG(r1_1, r1_2);
2308       HS_CMP_XCHG(r1_3, r1_4);
2309       HS_SLAB_LOCAL_L(256) = r1_1;
2310       HS_SLAB_LOCAL_L(288) = r1_2;
2311       HS_SLAB_LOCAL_L(320) = r1_3;
2312       HS_SLAB_LOCAL_L(352) = r1_4;
2313       HS_SLAB_LOCAL_R(384) = r1_5;
2314       HS_SLAB_LOCAL_R(416) = r1_6;
2315       HS_SLAB_LOCAL_R(448) = r1_7;
2316       HS_SLAB_LOCAL_R(480) = r1_8;
2317     }
2318   }
2319   HS_BLOCK_BARRIER();
2320   r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2321   r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2322   r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2323   r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2324   r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2325   r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2326   r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2327   r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2328   {
2329     {
2330       HS_SLAB_HALF_PREAMBLE(16);
2331       HS_CMP_HALF(0, r1);
2332       HS_CMP_HALF(1, r2);
2333       HS_CMP_HALF(2, r3);
2334       HS_CMP_HALF(3, r4);
2335       HS_CMP_HALF(4, r5);
2336       HS_CMP_HALF(5, r6);
2337       HS_CMP_HALF(6, r7);
2338       HS_CMP_HALF(7, r8);
2339     }
2340     {
2341       HS_SLAB_HALF_PREAMBLE(8);
2342       HS_CMP_HALF(0, r1);
2343       HS_CMP_HALF(1, r2);
2344       HS_CMP_HALF(2, r3);
2345       HS_CMP_HALF(3, r4);
2346       HS_CMP_HALF(4, r5);
2347       HS_CMP_HALF(5, r6);
2348       HS_CMP_HALF(6, r7);
2349       HS_CMP_HALF(7, r8);
2350     }
2351     {
2352       HS_SLAB_HALF_PREAMBLE(4);
2353       HS_CMP_HALF(0, r1);
2354       HS_CMP_HALF(1, r2);
2355       HS_CMP_HALF(2, r3);
2356       HS_CMP_HALF(3, r4);
2357       HS_CMP_HALF(4, r5);
2358       HS_CMP_HALF(5, r6);
2359       HS_CMP_HALF(6, r7);
2360       HS_CMP_HALF(7, r8);
2361     }
2362     {
2363       HS_SLAB_HALF_PREAMBLE(2);
2364       HS_CMP_HALF(0, r1);
2365       HS_CMP_HALF(1, r2);
2366       HS_CMP_HALF(2, r3);
2367       HS_CMP_HALF(3, r4);
2368       HS_CMP_HALF(4, r5);
2369       HS_CMP_HALF(5, r6);
2370       HS_CMP_HALF(6, r7);
2371       HS_CMP_HALF(7, r8);
2372     }
2373     {
2374       HS_SLAB_HALF_PREAMBLE(1);
2375       HS_CMP_HALF(0, r1);
2376       HS_CMP_HALF(1, r2);
2377       HS_CMP_HALF(2, r3);
2378       HS_CMP_HALF(3, r4);
2379       HS_CMP_HALF(4, r5);
2380       HS_CMP_HALF(5, r6);
2381       HS_CMP_HALF(6, r7);
2382       HS_CMP_HALF(7, r8);
2383     }
2384     HS_CMP_XCHG(r1, r5);
2385     HS_CMP_XCHG(r3, r7);
2386     HS_CMP_XCHG(r1, r3);
2387     HS_CMP_XCHG(r5, r7);
2388     HS_CMP_XCHG(r2, r6);
2389     HS_CMP_XCHG(r4, r8);
2390     HS_CMP_XCHG(r2, r4);
2391     HS_CMP_XCHG(r6, r8);
2392     HS_CMP_XCHG(r1, r2);
2393     HS_CMP_XCHG(r3, r4);
2394     HS_CMP_XCHG(r5, r6);
2395     HS_CMP_XCHG(r7, r8);
2396   }
2397   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
2398   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
2399   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
2400   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
2401   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
2402   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
2403   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
2404   HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
2405   HS_BLOCK_BARRIER();
2406   if (HS_WARP_ID_X() < 8) {
2407     {
2408       HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2409       HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
2410       HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64);
2411       HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96);
2412       HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(128);
2413       HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(160);
2414       HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(192);
2415       HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(224);
2416       HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(256);
2417       HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(288);
2418       HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(320);
2419       HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(352);
2420       HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(384);
2421       HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(416);
2422       HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(448);
2423       HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(480);
2424       HS_CMP_XCHG(r0_8, r0_9);
2425       HS_CMP_XCHG(r0_7, r0_10);
2426       HS_CMP_XCHG(r0_6, r0_11);
2427       HS_CMP_XCHG(r0_5, r0_12);
2428       HS_CMP_XCHG(r0_4, r0_13);
2429       HS_CMP_XCHG(r0_3, r0_14);
2430       HS_CMP_XCHG(r0_2, r0_15);
2431       HS_CMP_XCHG(r0_1, r0_16);
2432       HS_CMP_XCHG(r0_9, r0_13);
2433       HS_CMP_XCHG(r0_11, r0_15);
2434       HS_CMP_XCHG(r0_9, r0_11);
2435       HS_CMP_XCHG(r0_13, r0_15);
2436       HS_CMP_XCHG(r0_10, r0_14);
2437       HS_CMP_XCHG(r0_12, r0_16);
2438       HS_CMP_XCHG(r0_10, r0_12);
2439       HS_CMP_XCHG(r0_14, r0_16);
2440       HS_CMP_XCHG(r0_9, r0_10);
2441       HS_CMP_XCHG(r0_11, r0_12);
2442       HS_CMP_XCHG(r0_13, r0_14);
2443       HS_CMP_XCHG(r0_15, r0_16);
2444       HS_CMP_XCHG(r0_1, r0_5);
2445       HS_CMP_XCHG(r0_3, r0_7);
2446       HS_CMP_XCHG(r0_1, r0_3);
2447       HS_CMP_XCHG(r0_5, r0_7);
2448       HS_CMP_XCHG(r0_2, r0_6);
2449       HS_CMP_XCHG(r0_4, r0_8);
2450       HS_CMP_XCHG(r0_2, r0_4);
2451       HS_CMP_XCHG(r0_6, r0_8);
2452       HS_CMP_XCHG(r0_1, r0_2);
2453       HS_CMP_XCHG(r0_3, r0_4);
2454       HS_CMP_XCHG(r0_5, r0_6);
2455       HS_CMP_XCHG(r0_7, r0_8);
2456       HS_SLAB_LOCAL_L(0) = r0_1;
2457       HS_SLAB_LOCAL_L(32) = r0_2;
2458       HS_SLAB_LOCAL_L(64) = r0_3;
2459       HS_SLAB_LOCAL_L(96) = r0_4;
2460       HS_SLAB_LOCAL_L(128) = r0_5;
2461       HS_SLAB_LOCAL_L(160) = r0_6;
2462       HS_SLAB_LOCAL_L(192) = r0_7;
2463       HS_SLAB_LOCAL_L(224) = r0_8;
2464       HS_SLAB_LOCAL_R(256) = r0_9;
2465       HS_SLAB_LOCAL_R(288) = r0_10;
2466       HS_SLAB_LOCAL_R(320) = r0_11;
2467       HS_SLAB_LOCAL_R(352) = r0_12;
2468       HS_SLAB_LOCAL_R(384) = r0_13;
2469       HS_SLAB_LOCAL_R(416) = r0_14;
2470       HS_SLAB_LOCAL_R(448) = r0_15;
2471       HS_SLAB_LOCAL_R(480) = r0_16;
2472     }
2473   }
2474   HS_BLOCK_BARRIER();
2475   r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2476   r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2477   r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2478   r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2479   r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2480   r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2481   r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2482   r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2483   {
2484     {
2485       HS_SLAB_HALF_PREAMBLE(16);
2486       HS_CMP_HALF(0, r1);
2487       HS_CMP_HALF(1, r2);
2488       HS_CMP_HALF(2, r3);
2489       HS_CMP_HALF(3, r4);
2490       HS_CMP_HALF(4, r5);
2491       HS_CMP_HALF(5, r6);
2492       HS_CMP_HALF(6, r7);
2493       HS_CMP_HALF(7, r8);
2494     }
2495     {
2496       HS_SLAB_HALF_PREAMBLE(8);
2497       HS_CMP_HALF(0, r1);
2498       HS_CMP_HALF(1, r2);
2499       HS_CMP_HALF(2, r3);
2500       HS_CMP_HALF(3, r4);
2501       HS_CMP_HALF(4, r5);
2502       HS_CMP_HALF(5, r6);
2503       HS_CMP_HALF(6, r7);
2504       HS_CMP_HALF(7, r8);
2505     }
2506     {
2507       HS_SLAB_HALF_PREAMBLE(4);
2508       HS_CMP_HALF(0, r1);
2509       HS_CMP_HALF(1, r2);
2510       HS_CMP_HALF(2, r3);
2511       HS_CMP_HALF(3, r4);
2512       HS_CMP_HALF(4, r5);
2513       HS_CMP_HALF(5, r6);
2514       HS_CMP_HALF(6, r7);
2515       HS_CMP_HALF(7, r8);
2516     }
2517     {
2518       HS_SLAB_HALF_PREAMBLE(2);
2519       HS_CMP_HALF(0, r1);
2520       HS_CMP_HALF(1, r2);
2521       HS_CMP_HALF(2, r3);
2522       HS_CMP_HALF(3, r4);
2523       HS_CMP_HALF(4, r5);
2524       HS_CMP_HALF(5, r6);
2525       HS_CMP_HALF(6, r7);
2526       HS_CMP_HALF(7, r8);
2527     }
2528     {
2529       HS_SLAB_HALF_PREAMBLE(1);
2530       HS_CMP_HALF(0, r1);
2531       HS_CMP_HALF(1, r2);
2532       HS_CMP_HALF(2, r3);
2533       HS_CMP_HALF(3, r4);
2534       HS_CMP_HALF(4, r5);
2535       HS_CMP_HALF(5, r6);
2536       HS_CMP_HALF(6, r7);
2537       HS_CMP_HALF(7, r8);
2538     }
2539     HS_CMP_XCHG(r1, r5);
2540     HS_CMP_XCHG(r3, r7);
2541     HS_CMP_XCHG(r1, r3);
2542     HS_CMP_XCHG(r5, r7);
2543     HS_CMP_XCHG(r2, r6);
2544     HS_CMP_XCHG(r4, r8);
2545     HS_CMP_XCHG(r2, r4);
2546     HS_CMP_XCHG(r6, r8);
2547     HS_CMP_XCHG(r1, r2);
2548     HS_CMP_XCHG(r3, r4);
2549     HS_CMP_XCHG(r5, r6);
2550     HS_CMP_XCHG(r7, r8);
2551   }
2552   HS_SLAB_GLOBAL_STORE(0, r1);
2553   HS_SLAB_GLOBAL_STORE(1, r2);
2554   HS_SLAB_GLOBAL_STORE(2, r3);
2555   HS_SLAB_GLOBAL_STORE(3, r4);
2556   HS_SLAB_GLOBAL_STORE(4, r5);
2557   HS_SLAB_GLOBAL_STORE(5, r6);
2558   HS_SLAB_GLOBAL_STORE(6, r7);
2559   HS_SLAB_GLOBAL_STORE(7, r8);
2560 }
2561 
//
// Kernel declared by HS_BC_KERNEL_PROTO(1, 0).
//
// Loads r1..r8 with HS_SLAB_GLOBAL_LOAD(vout, i) for i = 0..7, runs five
// HS_CMP_HALF passes (preamble arguments 16, 8, 4, 2, 1) followed by a
// fixed sequence of twelve HS_CMP_XCHG compare-exchanges, and writes
// r1..r8 back with HS_SLAB_GLOBAL_STORE at indices 0..7.
//
// This variant declares no block-local memory and issues no
// HS_BLOCK_BARRIER; everything happens on the eight r* values.
//
// NOTE(review): every HS_* macro is defined outside this chunk
// (hs_cuda_macros.h / hs_cuda_config.h). This looks like the single-slab
// bitonic "clean" step of the sort -- confirm against the macro definitions.
//
2562 HS_BC_KERNEL_PROTO(1, 0)
2563 {
2564   HS_SLAB_GLOBAL_PREAMBLE();
       // Fetch the eight values this thread works on (indices 0..7 of vout).
2565   HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
2566   HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
2567   HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
2568   HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
2569   HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
2570   HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
2571   HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
2572   HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
2573   {
         // Five HS_CMP_HALF passes over r1..r8. The preamble argument halves
         // each pass (16 -> 1); presumably it is the lane distance of a
         // cross-lane exchange -- TODO confirm in hs_cuda_macros.h.
         // Each pass sits in its own scope, so whatever the preamble
         // introduces is not visible to the next pass.
2574     {
2575       HS_SLAB_HALF_PREAMBLE(16);
2576       HS_CMP_HALF(0, r1);
2577       HS_CMP_HALF(1, r2);
2578       HS_CMP_HALF(2, r3);
2579       HS_CMP_HALF(3, r4);
2580       HS_CMP_HALF(4, r5);
2581       HS_CMP_HALF(5, r6);
2582       HS_CMP_HALF(6, r7);
2583       HS_CMP_HALF(7, r8);
2584     }
2585     {
2586       HS_SLAB_HALF_PREAMBLE(8);
2587       HS_CMP_HALF(0, r1);
2588       HS_CMP_HALF(1, r2);
2589       HS_CMP_HALF(2, r3);
2590       HS_CMP_HALF(3, r4);
2591       HS_CMP_HALF(4, r5);
2592       HS_CMP_HALF(5, r6);
2593       HS_CMP_HALF(6, r7);
2594       HS_CMP_HALF(7, r8);
2595     }
2596     {
2597       HS_SLAB_HALF_PREAMBLE(4);
2598       HS_CMP_HALF(0, r1);
2599       HS_CMP_HALF(1, r2);
2600       HS_CMP_HALF(2, r3);
2601       HS_CMP_HALF(3, r4);
2602       HS_CMP_HALF(4, r5);
2603       HS_CMP_HALF(5, r6);
2604       HS_CMP_HALF(6, r7);
2605       HS_CMP_HALF(7, r8);
2606     }
2607     {
2608       HS_SLAB_HALF_PREAMBLE(2);
2609       HS_CMP_HALF(0, r1);
2610       HS_CMP_HALF(1, r2);
2611       HS_CMP_HALF(2, r3);
2612       HS_CMP_HALF(3, r4);
2613       HS_CMP_HALF(4, r5);
2614       HS_CMP_HALF(5, r6);
2615       HS_CMP_HALF(6, r7);
2616       HS_CMP_HALF(7, r8);
2617     }
2618     {
2619       HS_SLAB_HALF_PREAMBLE(1);
2620       HS_CMP_HALF(0, r1);
2621       HS_CMP_HALF(1, r2);
2622       HS_CMP_HALF(2, r3);
2623       HS_CMP_HALF(3, r4);
2624       HS_CMP_HALF(4, r5);
2625       HS_CMP_HALF(5, r6);
2626       HS_CMP_HALF(6, r7);
2627       HS_CMP_HALF(7, r8);
2628     }
         // Compare-exchange among r1..r8: the odd-numbered values
         // (r1, r3, r5, r7) and the even-numbered values (r2, r4, r6, r8) are
         // each paired at index distance 4 and then 2; the last four steps
         // pair adjacent values. Statement order is significant.
2629     HS_CMP_XCHG(r1, r5);
2630     HS_CMP_XCHG(r3, r7);
2631     HS_CMP_XCHG(r1, r3);
2632     HS_CMP_XCHG(r5, r7);
2633     HS_CMP_XCHG(r2, r6);
2634     HS_CMP_XCHG(r4, r8);
2635     HS_CMP_XCHG(r2, r4);
2636     HS_CMP_XCHG(r6, r8);
2637     HS_CMP_XCHG(r1, r2);
2638     HS_CMP_XCHG(r3, r4);
2639     HS_CMP_XCHG(r5, r6);
2640     HS_CMP_XCHG(r7, r8);
2641   }
       // Write the eight values back out at indices 0..7.
2642   HS_SLAB_GLOBAL_STORE(0, r1);
2643   HS_SLAB_GLOBAL_STORE(1, r2);
2644   HS_SLAB_GLOBAL_STORE(2, r3);
2645   HS_SLAB_GLOBAL_STORE(3, r4);
2646   HS_SLAB_GLOBAL_STORE(4, r5);
2647   HS_SLAB_GLOBAL_STORE(5, r6);
2648   HS_SLAB_GLOBAL_STORE(6, r7);
2649   HS_SLAB_GLOBAL_STORE(7, r8);
2650 }
2651 
//
// Kernel declared by HS_BC_KERNEL_PROTO(2, 1).
//
// Phase 1: four independent groups each fetch two values with
// HS_BC_GLOBAL_LOAD_L (indices k and k + 8 for k = 0, 2, 4, 6),
// compare-exchange them once, and park them through HS_SLAB_LOCAL_L at
// offsets 128 * j and 128 * j + 32 (j = 0..3).
// Phase 2: after HS_BLOCK_BARRIER, r1..r8 are read back through
// HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * i), pushed through five HS_CMP_HALF
// passes and twelve HS_CMP_XCHG steps, then written out with
// HS_SLAB_GLOBAL_STORE at indices 0..7.
//
// NOTE(review): every HS_* macro is defined outside this chunk. Judging by
// the prototype/preamble argument (2) this presumably merges two slabs
// through the buffer declared by HS_BLOCK_LOCAL_MEM_DECL(64, 8) -- confirm
// against hs_cuda_macros.h.
//
2652 HS_BC_KERNEL_PROTO(2, 1)
2653 {
2654   HS_BLOCK_LOCAL_MEM_DECL(64, 8);
2655 
2656   HS_SLAB_GLOBAL_PREAMBLE();
2657   HS_BC_MERGE_H_PREAMBLE(2);
2658   {
         // Each scope below handles one pair; r0_1 / r0_2 are redeclared per
         // scope, so the four groups do not share state.
2659     {
2660       HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
2661       HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
2662       HS_CMP_XCHG(r0_1, r0_2);
2663       HS_SLAB_LOCAL_L(0) = r0_1;
2664       HS_SLAB_LOCAL_L(32) = r0_2;
2665     }
2666     {
2667       HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(2);
2668       HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(10);
2669       HS_CMP_XCHG(r0_1, r0_2);
2670       HS_SLAB_LOCAL_L(128) = r0_1;
2671       HS_SLAB_LOCAL_L(160) = r0_2;
2672     }
2673     {
2674       HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
2675       HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12);
2676       HS_CMP_XCHG(r0_1, r0_2);
2677       HS_SLAB_LOCAL_L(256) = r0_1;
2678       HS_SLAB_LOCAL_L(288) = r0_2;
2679     }
2680     {
2681       HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(6);
2682       HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(14);
2683       HS_CMP_XCHG(r0_1, r0_2);
2684       HS_SLAB_LOCAL_L(384) = r0_1;
2685       HS_SLAB_LOCAL_L(416) = r0_2;
2686     }
2687   }
       // Separates the HS_SLAB_LOCAL_L writes above from the HS_BX_LOCAL_V
       // reads below (presumably a block-wide barrier -- TODO confirm).
2688   HS_BLOCK_BARRIER();
       // Read the eight working values back with a different access macro and
       // a stride of 2 * HS_SLAB_THREADS between consecutive values.
2689   HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
2690   HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
2691   HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
2692   HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
2693   HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
2694   HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
2695   HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
2696   HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
2697   {
         // Five HS_CMP_HALF passes over r1..r8; the preamble argument halves
         // each pass (16 -> 1). Presumably the lane distance of a cross-lane
         // exchange -- TODO confirm in hs_cuda_macros.h.
2698     {
2699       HS_SLAB_HALF_PREAMBLE(16);
2700       HS_CMP_HALF(0, r1);
2701       HS_CMP_HALF(1, r2);
2702       HS_CMP_HALF(2, r3);
2703       HS_CMP_HALF(3, r4);
2704       HS_CMP_HALF(4, r5);
2705       HS_CMP_HALF(5, r6);
2706       HS_CMP_HALF(6, r7);
2707       HS_CMP_HALF(7, r8);
2708     }
2709     {
2710       HS_SLAB_HALF_PREAMBLE(8);
2711       HS_CMP_HALF(0, r1);
2712       HS_CMP_HALF(1, r2);
2713       HS_CMP_HALF(2, r3);
2714       HS_CMP_HALF(3, r4);
2715       HS_CMP_HALF(4, r5);
2716       HS_CMP_HALF(5, r6);
2717       HS_CMP_HALF(6, r7);
2718       HS_CMP_HALF(7, r8);
2719     }
2720     {
2721       HS_SLAB_HALF_PREAMBLE(4);
2722       HS_CMP_HALF(0, r1);
2723       HS_CMP_HALF(1, r2);
2724       HS_CMP_HALF(2, r3);
2725       HS_CMP_HALF(3, r4);
2726       HS_CMP_HALF(4, r5);
2727       HS_CMP_HALF(5, r6);
2728       HS_CMP_HALF(6, r7);
2729       HS_CMP_HALF(7, r8);
2730     }
2731     {
2732       HS_SLAB_HALF_PREAMBLE(2);
2733       HS_CMP_HALF(0, r1);
2734       HS_CMP_HALF(1, r2);
2735       HS_CMP_HALF(2, r3);
2736       HS_CMP_HALF(3, r4);
2737       HS_CMP_HALF(4, r5);
2738       HS_CMP_HALF(5, r6);
2739       HS_CMP_HALF(6, r7);
2740       HS_CMP_HALF(7, r8);
2741     }
2742     {
2743       HS_SLAB_HALF_PREAMBLE(1);
2744       HS_CMP_HALF(0, r1);
2745       HS_CMP_HALF(1, r2);
2746       HS_CMP_HALF(2, r3);
2747       HS_CMP_HALF(3, r4);
2748       HS_CMP_HALF(4, r5);
2749       HS_CMP_HALF(5, r6);
2750       HS_CMP_HALF(6, r7);
2751       HS_CMP_HALF(7, r8);
2752     }
         // Compare-exchange among r1..r8: odd- and even-numbered values are
         // each paired at index distance 4 and then 2; the last four steps
         // pair adjacent values. Statement order is significant.
2753     HS_CMP_XCHG(r1, r5);
2754     HS_CMP_XCHG(r3, r7);
2755     HS_CMP_XCHG(r1, r3);
2756     HS_CMP_XCHG(r5, r7);
2757     HS_CMP_XCHG(r2, r6);
2758     HS_CMP_XCHG(r4, r8);
2759     HS_CMP_XCHG(r2, r4);
2760     HS_CMP_XCHG(r6, r8);
2761     HS_CMP_XCHG(r1, r2);
2762     HS_CMP_XCHG(r3, r4);
2763     HS_CMP_XCHG(r5, r6);
2764     HS_CMP_XCHG(r7, r8);
2765   }
       // Write the eight values out at indices 0..7.
2766   HS_SLAB_GLOBAL_STORE(0, r1);
2767   HS_SLAB_GLOBAL_STORE(1, r2);
2768   HS_SLAB_GLOBAL_STORE(2, r3);
2769   HS_SLAB_GLOBAL_STORE(3, r4);
2770   HS_SLAB_GLOBAL_STORE(4, r5);
2771   HS_SLAB_GLOBAL_STORE(5, r6);
2772   HS_SLAB_GLOBAL_STORE(6, r7);
2773   HS_SLAB_GLOBAL_STORE(7, r8);
2774 }
2775 
//
// Kernel declared by HS_BC_KERNEL_PROTO(4, 2).
//
// Phase 1: two independent groups each fetch four values with
// HS_BC_GLOBAL_LOAD_L (indices k, k + 8, k + 16, k + 24 for k = 0 and 4),
// run four HS_CMP_XCHG steps on them, and park them through
// HS_SLAB_LOCAL_L at offsets base, base + 32, base + 64, base + 96
// (base = 0 and 512).
// Phase 2: after HS_BLOCK_BARRIER, r1..r8 are read back through
// HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * i), pushed through five HS_CMP_HALF
// passes and twelve HS_CMP_XCHG steps, then written out with
// HS_SLAB_GLOBAL_STORE at indices 0..7.
//
// NOTE(review): every HS_* macro is defined outside this chunk. Judging by
// the prototype/preamble argument (4) this presumably merges four slabs
// through the buffer declared by HS_BLOCK_LOCAL_MEM_DECL(128, 8) -- confirm
// against hs_cuda_macros.h.
//
2776 HS_BC_KERNEL_PROTO(4, 2)
2777 {
2778   HS_BLOCK_LOCAL_MEM_DECL(128, 8);
2779 
2780   HS_SLAB_GLOBAL_PREAMBLE();
2781   HS_BC_MERGE_H_PREAMBLE(4);
2782   {
         // Each scope handles one group of four; r0_1..r0_4 are redeclared per
         // scope. Within a group the exchanges pair values at index distance 2
         // first, then adjacent values.
2783     {
2784       HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
2785       HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
2786       HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16);
2787       HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24);
2788       HS_CMP_XCHG(r0_1, r0_3);
2789       HS_CMP_XCHG(r0_2, r0_4);
2790       HS_CMP_XCHG(r0_1, r0_2);
2791       HS_CMP_XCHG(r0_3, r0_4);
2792       HS_SLAB_LOCAL_L(0) = r0_1;
2793       HS_SLAB_LOCAL_L(32) = r0_2;
2794       HS_SLAB_LOCAL_L(64) = r0_3;
2795       HS_SLAB_LOCAL_L(96) = r0_4;
2796     }
2797     {
2798       HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
2799       HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12);
2800       HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(20);
2801       HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(28);
2802       HS_CMP_XCHG(r0_1, r0_3);
2803       HS_CMP_XCHG(r0_2, r0_4);
2804       HS_CMP_XCHG(r0_1, r0_2);
2805       HS_CMP_XCHG(r0_3, r0_4);
2806       HS_SLAB_LOCAL_L(512) = r0_1;
2807       HS_SLAB_LOCAL_L(544) = r0_2;
2808       HS_SLAB_LOCAL_L(576) = r0_3;
2809       HS_SLAB_LOCAL_L(608) = r0_4;
2810     }
2811   }
       // Separates the HS_SLAB_LOCAL_L writes above from the HS_BX_LOCAL_V
       // reads below (presumably a block-wide barrier -- TODO confirm).
2812   HS_BLOCK_BARRIER();
       // Read the eight working values back with a different access macro and
       // a stride of 4 * HS_SLAB_THREADS between consecutive values.
2813   HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
2814   HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
2815   HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
2816   HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
2817   HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
2818   HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
2819   HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
2820   HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
2821   {
         // Five HS_CMP_HALF passes over r1..r8; the preamble argument halves
         // each pass (16 -> 1). Presumably the lane distance of a cross-lane
         // exchange -- TODO confirm in hs_cuda_macros.h.
2822     {
2823       HS_SLAB_HALF_PREAMBLE(16);
2824       HS_CMP_HALF(0, r1);
2825       HS_CMP_HALF(1, r2);
2826       HS_CMP_HALF(2, r3);
2827       HS_CMP_HALF(3, r4);
2828       HS_CMP_HALF(4, r5);
2829       HS_CMP_HALF(5, r6);
2830       HS_CMP_HALF(6, r7);
2831       HS_CMP_HALF(7, r8);
2832     }
2833     {
2834       HS_SLAB_HALF_PREAMBLE(8);
2835       HS_CMP_HALF(0, r1);
2836       HS_CMP_HALF(1, r2);
2837       HS_CMP_HALF(2, r3);
2838       HS_CMP_HALF(3, r4);
2839       HS_CMP_HALF(4, r5);
2840       HS_CMP_HALF(5, r6);
2841       HS_CMP_HALF(6, r7);
2842       HS_CMP_HALF(7, r8);
2843     }
2844     {
2845       HS_SLAB_HALF_PREAMBLE(4);
2846       HS_CMP_HALF(0, r1);
2847       HS_CMP_HALF(1, r2);
2848       HS_CMP_HALF(2, r3);
2849       HS_CMP_HALF(3, r4);
2850       HS_CMP_HALF(4, r5);
2851       HS_CMP_HALF(5, r6);
2852       HS_CMP_HALF(6, r7);
2853       HS_CMP_HALF(7, r8);
2854     }
2855     {
2856       HS_SLAB_HALF_PREAMBLE(2);
2857       HS_CMP_HALF(0, r1);
2858       HS_CMP_HALF(1, r2);
2859       HS_CMP_HALF(2, r3);
2860       HS_CMP_HALF(3, r4);
2861       HS_CMP_HALF(4, r5);
2862       HS_CMP_HALF(5, r6);
2863       HS_CMP_HALF(6, r7);
2864       HS_CMP_HALF(7, r8);
2865     }
2866     {
2867       HS_SLAB_HALF_PREAMBLE(1);
2868       HS_CMP_HALF(0, r1);
2869       HS_CMP_HALF(1, r2);
2870       HS_CMP_HALF(2, r3);
2871       HS_CMP_HALF(3, r4);
2872       HS_CMP_HALF(4, r5);
2873       HS_CMP_HALF(5, r6);
2874       HS_CMP_HALF(6, r7);
2875       HS_CMP_HALF(7, r8);
2876     }
         // Compare-exchange among r1..r8: odd- and even-numbered values are
         // each paired at index distance 4 and then 2; the last four steps
         // pair adjacent values. Statement order is significant.
2877     HS_CMP_XCHG(r1, r5);
2878     HS_CMP_XCHG(r3, r7);
2879     HS_CMP_XCHG(r1, r3);
2880     HS_CMP_XCHG(r5, r7);
2881     HS_CMP_XCHG(r2, r6);
2882     HS_CMP_XCHG(r4, r8);
2883     HS_CMP_XCHG(r2, r4);
2884     HS_CMP_XCHG(r6, r8);
2885     HS_CMP_XCHG(r1, r2);
2886     HS_CMP_XCHG(r3, r4);
2887     HS_CMP_XCHG(r5, r6);
2888     HS_CMP_XCHG(r7, r8);
2889   }
       // Write the eight values out at indices 0..7.
2890   HS_SLAB_GLOBAL_STORE(0, r1);
2891   HS_SLAB_GLOBAL_STORE(1, r2);
2892   HS_SLAB_GLOBAL_STORE(2, r3);
2893   HS_SLAB_GLOBAL_STORE(3, r4);
2894   HS_SLAB_GLOBAL_STORE(4, r5);
2895   HS_SLAB_GLOBAL_STORE(5, r6);
2896   HS_SLAB_GLOBAL_STORE(6, r7);
2897   HS_SLAB_GLOBAL_STORE(7, r8);
2898 }
2899 
//
// Kernel declared by HS_BC_KERNEL_PROTO(8, 3).
//
// Phase 1: a single group fetches eight values with HS_BC_GLOBAL_LOAD_L
// (indices 0, 8, ..., 56), runs twelve HS_CMP_XCHG steps on them, and
// parks them through HS_SLAB_LOCAL_L at offsets 0, 32, ..., 224.
// Phase 2: after HS_BLOCK_BARRIER, r1..r8 are read back through
// HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * i), pushed through five HS_CMP_HALF
// passes and the same twelve-step HS_CMP_XCHG sequence, then written out
// with HS_SLAB_GLOBAL_STORE at indices 0..7.
//
// NOTE(review): every HS_* macro is defined outside this chunk. Judging by
// the prototype/preamble argument (8) this presumably merges eight slabs
// through the buffer declared by HS_BLOCK_LOCAL_MEM_DECL(256, 8) -- confirm
// against hs_cuda_macros.h.
//
2900 HS_BC_KERNEL_PROTO(8, 3)
2901 {
2902   HS_BLOCK_LOCAL_MEM_DECL(256, 8);
2903 
2904   HS_SLAB_GLOBAL_PREAMBLE();
2905   HS_BC_MERGE_H_PREAMBLE(8);
2906   {
2907     {
           // Eight values, loaded at a stride of 8 in the load index.
2908       HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
2909       HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
2910       HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16);
2911       HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24);
2912       HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32);
2913       HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40);
2914       HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48);
2915       HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56);
           // Same pairing pattern as the final r1..r8 network further down:
           // index distance 4, then 2 (odd and even values separately), then
           // adjacent pairs.
2916       HS_CMP_XCHG(r0_1, r0_5);
2917       HS_CMP_XCHG(r0_3, r0_7);
2918       HS_CMP_XCHG(r0_1, r0_3);
2919       HS_CMP_XCHG(r0_5, r0_7);
2920       HS_CMP_XCHG(r0_2, r0_6);
2921       HS_CMP_XCHG(r0_4, r0_8);
2922       HS_CMP_XCHG(r0_2, r0_4);
2923       HS_CMP_XCHG(r0_6, r0_8);
2924       HS_CMP_XCHG(r0_1, r0_2);
2925       HS_CMP_XCHG(r0_3, r0_4);
2926       HS_CMP_XCHG(r0_5, r0_6);
2927       HS_CMP_XCHG(r0_7, r0_8);
2928       HS_SLAB_LOCAL_L(0) = r0_1;
2929       HS_SLAB_LOCAL_L(32) = r0_2;
2930       HS_SLAB_LOCAL_L(64) = r0_3;
2931       HS_SLAB_LOCAL_L(96) = r0_4;
2932       HS_SLAB_LOCAL_L(128) = r0_5;
2933       HS_SLAB_LOCAL_L(160) = r0_6;
2934       HS_SLAB_LOCAL_L(192) = r0_7;
2935       HS_SLAB_LOCAL_L(224) = r0_8;
2936     }
2937   }
       // Separates the HS_SLAB_LOCAL_L writes above from the HS_BX_LOCAL_V
       // reads below (presumably a block-wide barrier -- TODO confirm).
2938   HS_BLOCK_BARRIER();
       // Read the eight working values back with a different access macro and
       // a stride of 8 * HS_SLAB_THREADS between consecutive values.
2939   HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
2940   HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
2941   HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
2942   HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
2943   HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
2944   HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
2945   HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
2946   HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
2947   {
         // Five HS_CMP_HALF passes over r1..r8; the preamble argument halves
         // each pass (16 -> 1). Presumably the lane distance of a cross-lane
         // exchange -- TODO confirm in hs_cuda_macros.h.
2948     {
2949       HS_SLAB_HALF_PREAMBLE(16);
2950       HS_CMP_HALF(0, r1);
2951       HS_CMP_HALF(1, r2);
2952       HS_CMP_HALF(2, r3);
2953       HS_CMP_HALF(3, r4);
2954       HS_CMP_HALF(4, r5);
2955       HS_CMP_HALF(5, r6);
2956       HS_CMP_HALF(6, r7);
2957       HS_CMP_HALF(7, r8);
2958     }
2959     {
2960       HS_SLAB_HALF_PREAMBLE(8);
2961       HS_CMP_HALF(0, r1);
2962       HS_CMP_HALF(1, r2);
2963       HS_CMP_HALF(2, r3);
2964       HS_CMP_HALF(3, r4);
2965       HS_CMP_HALF(4, r5);
2966       HS_CMP_HALF(5, r6);
2967       HS_CMP_HALF(6, r7);
2968       HS_CMP_HALF(7, r8);
2969     }
2970     {
2971       HS_SLAB_HALF_PREAMBLE(4);
2972       HS_CMP_HALF(0, r1);
2973       HS_CMP_HALF(1, r2);
2974       HS_CMP_HALF(2, r3);
2975       HS_CMP_HALF(3, r4);
2976       HS_CMP_HALF(4, r5);
2977       HS_CMP_HALF(5, r6);
2978       HS_CMP_HALF(6, r7);
2979       HS_CMP_HALF(7, r8);
2980     }
2981     {
2982       HS_SLAB_HALF_PREAMBLE(2);
2983       HS_CMP_HALF(0, r1);
2984       HS_CMP_HALF(1, r2);
2985       HS_CMP_HALF(2, r3);
2986       HS_CMP_HALF(3, r4);
2987       HS_CMP_HALF(4, r5);
2988       HS_CMP_HALF(5, r6);
2989       HS_CMP_HALF(6, r7);
2990       HS_CMP_HALF(7, r8);
2991     }
2992     {
2993       HS_SLAB_HALF_PREAMBLE(1);
2994       HS_CMP_HALF(0, r1);
2995       HS_CMP_HALF(1, r2);
2996       HS_CMP_HALF(2, r3);
2997       HS_CMP_HALF(3, r4);
2998       HS_CMP_HALF(4, r5);
2999       HS_CMP_HALF(5, r6);
3000       HS_CMP_HALF(6, r7);
3001       HS_CMP_HALF(7, r8);
3002     }
         // Compare-exchange among r1..r8: odd- and even-numbered values are
         // each paired at index distance 4 and then 2; the last four steps
         // pair adjacent values. Statement order is significant.
3003     HS_CMP_XCHG(r1, r5);
3004     HS_CMP_XCHG(r3, r7);
3005     HS_CMP_XCHG(r1, r3);
3006     HS_CMP_XCHG(r5, r7);
3007     HS_CMP_XCHG(r2, r6);
3008     HS_CMP_XCHG(r4, r8);
3009     HS_CMP_XCHG(r2, r4);
3010     HS_CMP_XCHG(r6, r8);
3011     HS_CMP_XCHG(r1, r2);
3012     HS_CMP_XCHG(r3, r4);
3013     HS_CMP_XCHG(r5, r6);
3014     HS_CMP_XCHG(r7, r8);
3015   }
       // Write the eight values out at indices 0..7.
3016   HS_SLAB_GLOBAL_STORE(0, r1);
3017   HS_SLAB_GLOBAL_STORE(1, r2);
3018   HS_SLAB_GLOBAL_STORE(2, r3);
3019   HS_SLAB_GLOBAL_STORE(3, r4);
3020   HS_SLAB_GLOBAL_STORE(4, r5);
3021   HS_SLAB_GLOBAL_STORE(5, r6);
3022   HS_SLAB_GLOBAL_STORE(6, r7);
3023   HS_SLAB_GLOBAL_STORE(7, r8);
3024 }
3025 
//
// Kernel declared by HS_BC_KERNEL_PROTO(16, 4).
//
// Phase 1 (only where HS_WARP_ID_X() < 8): fetch sixteen values with
// HS_BC_GLOBAL_LOAD_L (indices 0, 8, ..., 120), run thirty-two
// HS_CMP_XCHG steps on them, and park them through HS_SLAB_LOCAL_L at
// offsets 0, 32, ..., 480.
// Phase 2 (all threads): after HS_BLOCK_BARRIER, r1..r8 are read back
// through HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * i), pushed through five
// HS_CMP_HALF passes and twelve HS_CMP_XCHG steps, then written out with
// HS_SLAB_GLOBAL_STORE at indices 0..7.
//
// NOTE(review): every HS_* macro is defined outside this chunk. Judging by
// the prototype/preamble argument (16) this presumably merges sixteen slabs
// through the buffer declared by HS_BLOCK_LOCAL_MEM_DECL(512, 8), with only
// half of the warps needed for phase 1 because each participating thread
// handles sixteen values instead of eight -- confirm against
// hs_cuda_macros.h and the launch configuration.
//
3026 HS_BC_KERNEL_PROTO(16, 4)
3027 {
3028   HS_BLOCK_LOCAL_MEM_DECL(512, 8);
3029 
3030   HS_SLAB_GLOBAL_PREAMBLE();
3031   HS_BC_MERGE_H_PREAMBLE(16);
       // Only threads with HS_WARP_ID_X() < 8 take part in phase 1. The
       // barrier that follows is placed after this conditional, not inside it.
3032   if (HS_WARP_ID_X() < 8) {
3033     {
           // Sixteen values, loaded at a stride of 8 in the load index.
3034       HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
3035       HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
3036       HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16);
3037       HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24);
3038       HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32);
3039       HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40);
3040       HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48);
3041       HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56);
3042       HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(64);
3043       HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(72);
3044       HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(80);
3045       HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(88);
3046       HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(96);
3047       HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(104);
3048       HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(112);
3049       HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(120);
           // Odd-numbered values (r0_1, r0_3, ..., r0_15): pairs at index
           // distance 8, then 4, then 2.
3050       HS_CMP_XCHG(r0_1, r0_9);
3051       HS_CMP_XCHG(r0_5, r0_13);
3052       HS_CMP_XCHG(r0_1, r0_5);
3053       HS_CMP_XCHG(r0_9, r0_13);
3054       HS_CMP_XCHG(r0_3, r0_11);
3055       HS_CMP_XCHG(r0_7, r0_15);
3056       HS_CMP_XCHG(r0_3, r0_7);
3057       HS_CMP_XCHG(r0_11, r0_15);
3058       HS_CMP_XCHG(r0_1, r0_3);
3059       HS_CMP_XCHG(r0_5, r0_7);
3060       HS_CMP_XCHG(r0_9, r0_11);
3061       HS_CMP_XCHG(r0_13, r0_15);
           // Even-numbered values (r0_2, r0_4, ..., r0_16): same pattern.
3062       HS_CMP_XCHG(r0_2, r0_10);
3063       HS_CMP_XCHG(r0_6, r0_14);
3064       HS_CMP_XCHG(r0_2, r0_6);
3065       HS_CMP_XCHG(r0_10, r0_14);
3066       HS_CMP_XCHG(r0_4, r0_12);
3067       HS_CMP_XCHG(r0_8, r0_16);
3068       HS_CMP_XCHG(r0_4, r0_8);
3069       HS_CMP_XCHG(r0_12, r0_16);
3070       HS_CMP_XCHG(r0_2, r0_4);
3071       HS_CMP_XCHG(r0_6, r0_8);
3072       HS_CMP_XCHG(r0_10, r0_12);
3073       HS_CMP_XCHG(r0_14, r0_16);
           // Final pass: adjacent pairs.
3074       HS_CMP_XCHG(r0_1, r0_2);
3075       HS_CMP_XCHG(r0_3, r0_4);
3076       HS_CMP_XCHG(r0_5, r0_6);
3077       HS_CMP_XCHG(r0_7, r0_8);
3078       HS_CMP_XCHG(r0_9, r0_10);
3079       HS_CMP_XCHG(r0_11, r0_12);
3080       HS_CMP_XCHG(r0_13, r0_14);
3081       HS_CMP_XCHG(r0_15, r0_16);
3082       HS_SLAB_LOCAL_L(0) = r0_1;
3083       HS_SLAB_LOCAL_L(32) = r0_2;
3084       HS_SLAB_LOCAL_L(64) = r0_3;
3085       HS_SLAB_LOCAL_L(96) = r0_4;
3086       HS_SLAB_LOCAL_L(128) = r0_5;
3087       HS_SLAB_LOCAL_L(160) = r0_6;
3088       HS_SLAB_LOCAL_L(192) = r0_7;
3089       HS_SLAB_LOCAL_L(224) = r0_8;
3090       HS_SLAB_LOCAL_L(256) = r0_9;
3091       HS_SLAB_LOCAL_L(288) = r0_10;
3092       HS_SLAB_LOCAL_L(320) = r0_11;
3093       HS_SLAB_LOCAL_L(352) = r0_12;
3094       HS_SLAB_LOCAL_L(384) = r0_13;
3095       HS_SLAB_LOCAL_L(416) = r0_14;
3096       HS_SLAB_LOCAL_L(448) = r0_15;
3097       HS_SLAB_LOCAL_L(480) = r0_16;
3098     }
3099   }
       // Executed by every thread, including those that skipped phase 1.
       // Separates the HS_SLAB_LOCAL_L writes above from the HS_BX_LOCAL_V
       // reads below (presumably a block-wide barrier -- TODO confirm).
3100   HS_BLOCK_BARRIER();
       // Read the eight working values back with a different access macro and
       // a stride of 16 * HS_SLAB_THREADS between consecutive values.
3101   HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
3102   HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
3103   HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
3104   HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
3105   HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
3106   HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
3107   HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
3108   HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
3109   {
         // Five HS_CMP_HALF passes over r1..r8; the preamble argument halves
         // each pass (16 -> 1). Presumably the lane distance of a cross-lane
         // exchange -- TODO confirm in hs_cuda_macros.h.
3110     {
3111       HS_SLAB_HALF_PREAMBLE(16);
3112       HS_CMP_HALF(0, r1);
3113       HS_CMP_HALF(1, r2);
3114       HS_CMP_HALF(2, r3);
3115       HS_CMP_HALF(3, r4);
3116       HS_CMP_HALF(4, r5);
3117       HS_CMP_HALF(5, r6);
3118       HS_CMP_HALF(6, r7);
3119       HS_CMP_HALF(7, r8);
3120     }
3121     {
3122       HS_SLAB_HALF_PREAMBLE(8);
3123       HS_CMP_HALF(0, r1);
3124       HS_CMP_HALF(1, r2);
3125       HS_CMP_HALF(2, r3);
3126       HS_CMP_HALF(3, r4);
3127       HS_CMP_HALF(4, r5);
3128       HS_CMP_HALF(5, r6);
3129       HS_CMP_HALF(6, r7);
3130       HS_CMP_HALF(7, r8);
3131     }
3132     {
3133       HS_SLAB_HALF_PREAMBLE(4);
3134       HS_CMP_HALF(0, r1);
3135       HS_CMP_HALF(1, r2);
3136       HS_CMP_HALF(2, r3);
3137       HS_CMP_HALF(3, r4);
3138       HS_CMP_HALF(4, r5);
3139       HS_CMP_HALF(5, r6);
3140       HS_CMP_HALF(6, r7);
3141       HS_CMP_HALF(7, r8);
3142     }
3143     {
3144       HS_SLAB_HALF_PREAMBLE(2);
3145       HS_CMP_HALF(0, r1);
3146       HS_CMP_HALF(1, r2);
3147       HS_CMP_HALF(2, r3);
3148       HS_CMP_HALF(3, r4);
3149       HS_CMP_HALF(4, r5);
3150       HS_CMP_HALF(5, r6);
3151       HS_CMP_HALF(6, r7);
3152       HS_CMP_HALF(7, r8);
3153     }
3154     {
3155       HS_SLAB_HALF_PREAMBLE(1);
3156       HS_CMP_HALF(0, r1);
3157       HS_CMP_HALF(1, r2);
3158       HS_CMP_HALF(2, r3);
3159       HS_CMP_HALF(3, r4);
3160       HS_CMP_HALF(4, r5);
3161       HS_CMP_HALF(5, r6);
3162       HS_CMP_HALF(6, r7);
3163       HS_CMP_HALF(7, r8);
3164     }
         // Compare-exchange among r1..r8: odd- and even-numbered values are
         // each paired at index distance 4 and then 2; the last four steps
         // pair adjacent values. Statement order is significant.
3165     HS_CMP_XCHG(r1, r5);
3166     HS_CMP_XCHG(r3, r7);
3167     HS_CMP_XCHG(r1, r3);
3168     HS_CMP_XCHG(r5, r7);
3169     HS_CMP_XCHG(r2, r6);
3170     HS_CMP_XCHG(r4, r8);
3171     HS_CMP_XCHG(r2, r4);
3172     HS_CMP_XCHG(r6, r8);
3173     HS_CMP_XCHG(r1, r2);
3174     HS_CMP_XCHG(r3, r4);
3175     HS_CMP_XCHG(r5, r6);
3176     HS_CMP_XCHG(r7, r8);
3177   }
       // Write the eight values out at indices 0..7.
3178   HS_SLAB_GLOBAL_STORE(0, r1);
3179   HS_SLAB_GLOBAL_STORE(1, r2);
3180   HS_SLAB_GLOBAL_STORE(2, r3);
3181   HS_SLAB_GLOBAL_STORE(3, r4);
3182   HS_SLAB_GLOBAL_STORE(4, r5);
3183   HS_SLAB_GLOBAL_STORE(5, r6);
3184   HS_SLAB_GLOBAL_STORE(6, r7);
3185   HS_SLAB_GLOBAL_STORE(7, r8);
3186 }
3187 
// Kernel declared through HS_OFFSET_FM_KERNEL_PROTO(0, 0).
//
// Reads eight keys with HS_XM_GLOBAL_LOAD_L (r1..r8) and one key with
// HS_FM_GLOBAL_LOAD_R (r9), applies a fixed sequence of HS_CMP_XCHG steps,
// and writes all nine registers back through the matching STORE macros.
//
// NOTE(review): "FM" presumably stands for "flip merge", with L/R naming the
// left and right sides being merged; the meaning of the two prototype
// arguments is defined by the included macro headers -- confirm there.
// NOTE(review): HS_CMP_XCHG(a, b) is presumably a compare-exchange that
// orders the pair in place -- confirm its exact definition.
3188 HS_OFFSET_FM_KERNEL_PROTO(0, 0)
3189 {
       // Preamble macro; presumably sets up the indexing state used by the
       // LOAD/STORE macros below -- TODO confirm in the macro headers.
3190   HS_OFFSET_FM_PREAMBLE(8);
       // Left side: rows 0..7 -> r1..r8.
3191   HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
3192   HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
3193   HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
3194   HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
3195   HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
3196   HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
3197   HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
3198   HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
       // Right side: a single row -> r9.
3199   HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
       // Cross step: the last left register is paired with the only right
       // register. r9 is not touched again before it is stored.
3200   HS_CMP_XCHG(r8, r9);
       // Twelve compare-exchanges confined to r1..r8: first among the odd
       // registers (r1,r3,r5,r7), then among the even ones (r2,r4,r6,r8),
       // then on adjacent pairs. Statement order is significant.
3201   HS_CMP_XCHG(r1, r5);
3202   HS_CMP_XCHG(r3, r7);
3203   HS_CMP_XCHG(r1, r3);
3204   HS_CMP_XCHG(r5, r7);
3205   HS_CMP_XCHG(r2, r6);
3206   HS_CMP_XCHG(r4, r8);
3207   HS_CMP_XCHG(r2, r4);
3208   HS_CMP_XCHG(r6, r8);
3209   HS_CMP_XCHG(r1, r2);
3210   HS_CMP_XCHG(r3, r4);
3211   HS_CMP_XCHG(r5, r6);
3212   HS_CMP_XCHG(r7, r8);
       // Write every register back to the row index it was loaded from.
3213   HS_XM_GLOBAL_STORE_L(0, r1);
3214   HS_XM_GLOBAL_STORE_L(1, r2);
3215   HS_XM_GLOBAL_STORE_L(2, r3);
3216   HS_XM_GLOBAL_STORE_L(3, r4);
3217   HS_XM_GLOBAL_STORE_L(4, r5);
3218   HS_XM_GLOBAL_STORE_L(5, r6);
3219   HS_XM_GLOBAL_STORE_L(6, r7);
3220   HS_XM_GLOBAL_STORE_L(7, r8);
3221   HS_FM_GLOBAL_STORE_R(0, r9);
3222 }
3223 
// Kernel declared through HS_OFFSET_FM_KERNEL_PROTO(0, 1).
//
// Reads eight keys with HS_XM_GLOBAL_LOAD_L (r1..r8) and two keys with
// HS_FM_GLOBAL_LOAD_R (r9, r10), applies a fixed sequence of HS_CMP_XCHG
// steps, and writes all ten registers back through the matching STORE macros.
//
// NOTE(review): "FM" presumably stands for "flip merge", with L/R naming the
// left and right sides being merged; the meaning of the two prototype
// arguments is defined by the included macro headers -- confirm there.
// NOTE(review): HS_CMP_XCHG(a, b) is presumably a compare-exchange that
// orders the pair in place -- confirm its exact definition.
3224 HS_OFFSET_FM_KERNEL_PROTO(0, 1)
3225 {
       // Preamble macro; presumably sets up the indexing state used by the
       // LOAD/STORE macros below -- TODO confirm in the macro headers.
3226   HS_OFFSET_FM_PREAMBLE(8);
       // Left side: rows 0..7 -> r1..r8.
3227   HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
3228   HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
3229   HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
3230   HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
3231   HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
3232   HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
3233   HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
3234   HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
       // Right side: rows 0..1 -> r9, r10.
3235   HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
3236   HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
       // Cross steps: right registers are paired with left registers in
       // mirrored order (r8 with r9, r7 with r10).
3237   HS_CMP_XCHG(r8, r9);
3238   HS_CMP_XCHG(r7, r10);
       // Twelve compare-exchanges confined to r1..r8: first among the odd
       // registers (r1,r3,r5,r7), then among the even ones (r2,r4,r6,r8),
       // then on adjacent pairs. Statement order is significant.
3239   HS_CMP_XCHG(r1, r5);
3240   HS_CMP_XCHG(r3, r7);
3241   HS_CMP_XCHG(r1, r3);
3242   HS_CMP_XCHG(r5, r7);
3243   HS_CMP_XCHG(r2, r6);
3244   HS_CMP_XCHG(r4, r8);
3245   HS_CMP_XCHG(r2, r4);
3246   HS_CMP_XCHG(r6, r8);
3247   HS_CMP_XCHG(r1, r2);
3248   HS_CMP_XCHG(r3, r4);
3249   HS_CMP_XCHG(r5, r6);
3250   HS_CMP_XCHG(r7, r8);
       // Single compare-exchange confined to the two right registers.
3251   HS_CMP_XCHG(r9, r10);
       // Write every register back to the row index it was loaded from.
3252   HS_XM_GLOBAL_STORE_L(0, r1);
3253   HS_XM_GLOBAL_STORE_L(1, r2);
3254   HS_XM_GLOBAL_STORE_L(2, r3);
3255   HS_XM_GLOBAL_STORE_L(3, r4);
3256   HS_XM_GLOBAL_STORE_L(4, r5);
3257   HS_XM_GLOBAL_STORE_L(5, r6);
3258   HS_XM_GLOBAL_STORE_L(6, r7);
3259   HS_XM_GLOBAL_STORE_L(7, r8);
3260   HS_FM_GLOBAL_STORE_R(0, r9);
3261   HS_FM_GLOBAL_STORE_R(1, r10);
3262 }
3263 
// Kernel declared through HS_OFFSET_FM_KERNEL_PROTO(0, 2).
//
// Reads eight keys with HS_XM_GLOBAL_LOAD_L (r1..r8) and four keys with
// HS_FM_GLOBAL_LOAD_R (r9..r12), applies a fixed sequence of HS_CMP_XCHG
// steps, and writes all twelve registers back through the matching STORE
// macros.
//
// NOTE(review): "FM" presumably stands for "flip merge", with L/R naming the
// left and right sides being merged; the meaning of the two prototype
// arguments is defined by the included macro headers -- confirm there.
// NOTE(review): HS_CMP_XCHG(a, b) is presumably a compare-exchange that
// orders the pair in place -- confirm its exact definition.
3264 HS_OFFSET_FM_KERNEL_PROTO(0, 2)
3265 {
       // Preamble macro; presumably sets up the indexing state used by the
       // LOAD/STORE macros below -- TODO confirm in the macro headers.
3266   HS_OFFSET_FM_PREAMBLE(8);
       // Left side: rows 0..7 -> r1..r8.
3267   HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
3268   HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
3269   HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
3270   HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
3271   HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
3272   HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
3273   HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
3274   HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
       // Right side: rows 0..3 -> r9..r12.
3275   HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
3276   HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
3277   HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2);
3278   HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3);
       // Cross steps: right registers are paired with left registers in
       // mirrored order (r8 with r9, r7 with r10, r6 with r11, r5 with r12).
3279   HS_CMP_XCHG(r8, r9);
3280   HS_CMP_XCHG(r7, r10);
3281   HS_CMP_XCHG(r6, r11);
3282   HS_CMP_XCHG(r5, r12);
       // Twelve compare-exchanges confined to r1..r8: first among the odd
       // registers (r1,r3,r5,r7), then among the even ones (r2,r4,r6,r8),
       // then on adjacent pairs. Statement order is significant.
3283   HS_CMP_XCHG(r1, r5);
3284   HS_CMP_XCHG(r3, r7);
3285   HS_CMP_XCHG(r1, r3);
3286   HS_CMP_XCHG(r5, r7);
3287   HS_CMP_XCHG(r2, r6);
3288   HS_CMP_XCHG(r4, r8);
3289   HS_CMP_XCHG(r2, r4);
3290   HS_CMP_XCHG(r6, r8);
3291   HS_CMP_XCHG(r1, r2);
3292   HS_CMP_XCHG(r3, r4);
3293   HS_CMP_XCHG(r5, r6);
3294   HS_CMP_XCHG(r7, r8);
       // Four compare-exchanges confined to r9..r12: distance-two pairs
       // first, then adjacent pairs.
3295   HS_CMP_XCHG(r9, r11);
3296   HS_CMP_XCHG(r10, r12);
3297   HS_CMP_XCHG(r9, r10);
3298   HS_CMP_XCHG(r11, r12);
       // Write every register back to the row index it was loaded from.
3299   HS_XM_GLOBAL_STORE_L(0, r1);
3300   HS_XM_GLOBAL_STORE_L(1, r2);
3301   HS_XM_GLOBAL_STORE_L(2, r3);
3302   HS_XM_GLOBAL_STORE_L(3, r4);
3303   HS_XM_GLOBAL_STORE_L(4, r5);
3304   HS_XM_GLOBAL_STORE_L(5, r6);
3305   HS_XM_GLOBAL_STORE_L(6, r7);
3306   HS_XM_GLOBAL_STORE_L(7, r8);
3307   HS_FM_GLOBAL_STORE_R(0, r9);
3308   HS_FM_GLOBAL_STORE_R(1, r10);
3309   HS_FM_GLOBAL_STORE_R(2, r11);
3310   HS_FM_GLOBAL_STORE_R(3, r12);
3311 }
3312 
// Kernel declared through HS_FM_KERNEL_PROTO(0, 3).
//
// Reads eight keys with HS_XM_GLOBAL_LOAD_L (r1..r8) and eight keys with
// HS_FM_GLOBAL_LOAD_R (r9..r16), applies a fixed sequence of HS_CMP_XCHG
// steps, and writes all sixteen registers back through the matching STORE
// macros.
//
// NOTE(review): this kernel uses HS_FM_KERNEL_PROTO / HS_FM_PREAMBLE rather
// than the HS_OFFSET_FM_* forms used by the (0, 0)..(0, 2) kernels in this
// file; that looks deliberate in the generated code, but confirm against the
// macro definitions.
// NOTE(review): "FM" presumably stands for "flip merge", with L/R naming the
// left and right sides being merged -- confirm in the macro headers.
// NOTE(review): HS_CMP_XCHG(a, b) is presumably a compare-exchange that
// orders the pair in place -- confirm its exact definition.
3313 HS_FM_KERNEL_PROTO(0, 3)
3314 {
       // Preamble macro; presumably sets up the indexing state used by the
       // LOAD/STORE macros below -- TODO confirm in the macro headers.
3315   HS_FM_PREAMBLE(8);
       // Left side: rows 0..7 -> r1..r8.
3316   HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
3317   HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
3318   HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
3319   HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
3320   HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
3321   HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
3322   HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
3323   HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
       // Right side: rows 0..7 -> r9..r16.
3324   HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
3325   HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
3326   HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2);
3327   HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3);
3328   HS_KEY_TYPE r13 = HS_FM_GLOBAL_LOAD_R(4);
3329   HS_KEY_TYPE r14 = HS_FM_GLOBAL_LOAD_R(5);
3330   HS_KEY_TYPE r15 = HS_FM_GLOBAL_LOAD_R(6);
3331   HS_KEY_TYPE r16 = HS_FM_GLOBAL_LOAD_R(7);
       // Cross steps: every left register is paired with a right register in
       // mirrored order (r8 with r9, r7 with r10, ..., r1 with r16).
3332   HS_CMP_XCHG(r8, r9);
3333   HS_CMP_XCHG(r7, r10);
3334   HS_CMP_XCHG(r6, r11);
3335   HS_CMP_XCHG(r5, r12);
3336   HS_CMP_XCHG(r4, r13);
3337   HS_CMP_XCHG(r3, r14);
3338   HS_CMP_XCHG(r2, r15);
3339   HS_CMP_XCHG(r1, r16);
       // Twelve compare-exchanges confined to r1..r8: first among the odd
       // registers (r1,r3,r5,r7), then among the even ones (r2,r4,r6,r8),
       // then on adjacent pairs. Statement order is significant.
3340   HS_CMP_XCHG(r1, r5);
3341   HS_CMP_XCHG(r3, r7);
3342   HS_CMP_XCHG(r1, r3);
3343   HS_CMP_XCHG(r5, r7);
3344   HS_CMP_XCHG(r2, r6);
3345   HS_CMP_XCHG(r4, r8);
3346   HS_CMP_XCHG(r2, r4);
3347   HS_CMP_XCHG(r6, r8);
3348   HS_CMP_XCHG(r1, r2);
3349   HS_CMP_XCHG(r3, r4);
3350   HS_CMP_XCHG(r5, r6);
3351   HS_CMP_XCHG(r7, r8);
       // The same twelve-step pattern applied to r9..r16 (register indices
       // shifted by eight).
3352   HS_CMP_XCHG(r9, r13);
3353   HS_CMP_XCHG(r11, r15);
3354   HS_CMP_XCHG(r9, r11);
3355   HS_CMP_XCHG(r13, r15);
3356   HS_CMP_XCHG(r10, r14);
3357   HS_CMP_XCHG(r12, r16);
3358   HS_CMP_XCHG(r10, r12);
3359   HS_CMP_XCHG(r14, r16);
3360   HS_CMP_XCHG(r9, r10);
3361   HS_CMP_XCHG(r11, r12);
3362   HS_CMP_XCHG(r13, r14);
3363   HS_CMP_XCHG(r15, r16);
       // Write every register back to the row index it was loaded from.
3364   HS_XM_GLOBAL_STORE_L(0, r1);
3365   HS_XM_GLOBAL_STORE_L(1, r2);
3366   HS_XM_GLOBAL_STORE_L(2, r3);
3367   HS_XM_GLOBAL_STORE_L(3, r4);
3368   HS_XM_GLOBAL_STORE_L(4, r5);
3369   HS_XM_GLOBAL_STORE_L(5, r6);
3370   HS_XM_GLOBAL_STORE_L(6, r7);
3371   HS_XM_GLOBAL_STORE_L(7, r8);
3372   HS_FM_GLOBAL_STORE_R(0, r9);
3373   HS_FM_GLOBAL_STORE_R(1, r10);
3374   HS_FM_GLOBAL_STORE_R(2, r11);
3375   HS_FM_GLOBAL_STORE_R(3, r12);
3376   HS_FM_GLOBAL_STORE_R(4, r13);
3377   HS_FM_GLOBAL_STORE_R(5, r14);
3378   HS_FM_GLOBAL_STORE_R(6, r15);
3379   HS_FM_GLOBAL_STORE_R(7, r16);
3380 }
3381 
// Kernel declared through HS_HM_KERNEL_PROTO(0).
//
// Reads sixteen keys with HS_XM_GLOBAL_LOAD_L (rows 0..15 -> r1..r16),
// applies a fixed 32-step HS_CMP_XCHG sequence, and writes the registers
// back with HS_XM_GLOBAL_STORE_L to the same row indices.
//
// NOTE(review): "HM" presumably stands for "half merge"; the meaning of the
// prototype argument is defined by the included macro headers -- confirm.
// NOTE(review): HS_CMP_XCHG(a, b) is presumably a compare-exchange that
// orders the pair in place -- confirm its exact definition.
3382 HS_HM_KERNEL_PROTO(0)
3383 {
       // Preamble macro; presumably sets up the indexing state used by the
       // LOAD/STORE macros below -- TODO confirm in the macro headers.
3384   HS_HM_PREAMBLE(8);
3385   HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
3386   HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
3387   HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
3388   HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
3389   HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
3390   HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
3391   HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
3392   HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
3393   HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
3394   HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
3395   HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
3396   HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
3397   HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
3398   HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
3399   HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
3400   HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
       // Twelve compare-exchanges among the odd registers (r1,r3,...,r15).
       // Statement order is significant.
3401   HS_CMP_XCHG(r1, r9);
3402   HS_CMP_XCHG(r5, r13);
3403   HS_CMP_XCHG(r1, r5);
3404   HS_CMP_XCHG(r9, r13);
3405   HS_CMP_XCHG(r3, r11);
3406   HS_CMP_XCHG(r7, r15);
3407   HS_CMP_XCHG(r3, r7);
3408   HS_CMP_XCHG(r11, r15);
3409   HS_CMP_XCHG(r1, r3);
3410   HS_CMP_XCHG(r5, r7);
3411   HS_CMP_XCHG(r9, r11);
3412   HS_CMP_XCHG(r13, r15);
       // The same twelve-step pattern among the even registers
       // (r2,r4,...,r16).
3413   HS_CMP_XCHG(r2, r10);
3414   HS_CMP_XCHG(r6, r14);
3415   HS_CMP_XCHG(r2, r6);
3416   HS_CMP_XCHG(r10, r14);
3417   HS_CMP_XCHG(r4, r12);
3418   HS_CMP_XCHG(r8, r16);
3419   HS_CMP_XCHG(r4, r8);
3420   HS_CMP_XCHG(r12, r16);
3421   HS_CMP_XCHG(r2, r4);
3422   HS_CMP_XCHG(r6, r8);
3423   HS_CMP_XCHG(r10, r12);
3424   HS_CMP_XCHG(r14, r16);
       // Final eight compare-exchanges on adjacent (odd, even) pairs.
3425   HS_CMP_XCHG(r1, r2);
3426   HS_CMP_XCHG(r3, r4);
3427   HS_CMP_XCHG(r5, r6);
3428   HS_CMP_XCHG(r7, r8);
3429   HS_CMP_XCHG(r9, r10);
3430   HS_CMP_XCHG(r11, r12);
3431   HS_CMP_XCHG(r13, r14);
3432   HS_CMP_XCHG(r15, r16);
       // Write every register back to the row index it was loaded from.
3433   HS_XM_GLOBAL_STORE_L(0, r1);
3434   HS_XM_GLOBAL_STORE_L(1, r2);
3435   HS_XM_GLOBAL_STORE_L(2, r3);
3436   HS_XM_GLOBAL_STORE_L(3, r4);
3437   HS_XM_GLOBAL_STORE_L(4, r5);
3438   HS_XM_GLOBAL_STORE_L(5, r6);
3439   HS_XM_GLOBAL_STORE_L(6, r7);
3440   HS_XM_GLOBAL_STORE_L(7, r8);
3441   HS_XM_GLOBAL_STORE_L(8, r9);
3442   HS_XM_GLOBAL_STORE_L(9, r10);
3443   HS_XM_GLOBAL_STORE_L(10, r11);
3444   HS_XM_GLOBAL_STORE_L(11, r12);
3445   HS_XM_GLOBAL_STORE_L(12, r13);
3446   HS_XM_GLOBAL_STORE_L(13, r14);
3447   HS_XM_GLOBAL_STORE_L(14, r15);
3448   HS_XM_GLOBAL_STORE_L(15, r16);
3449 }
3450 
// Kernel declared through HS_TRANSPOSE_KERNEL_PROTO().
//
// Loads eight keys (rows 0..7 -> r1..r8) from `vout` with
// HS_SLAB_GLOBAL_LOAD and then invokes HS_TRANSPOSE_SLAB().
//
// NOTE(review): HS_TRANSPOSE_SLAB() takes no arguments, so it presumably
// refers to r1..r8 by name and performs the stores itself -- confirm in the
// macro headers. The register names must therefore stay exactly r1..r8.
3451 HS_TRANSPOSE_KERNEL_PROTO()
3452 {
       // Preamble macro; presumably sets up the indexing state used by the
       // LOAD macro below -- TODO confirm in the macro headers.
3453   HS_SLAB_GLOBAL_PREAMBLE();
       // The source is `vout` (not `vin`, which the HS_OFFSET_BS kernel at
       // the top of this file reads from).
3454   HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
3455   HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
3456   HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
3457   HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
3458   HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
3459   HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
3460   HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
3461   HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
3462   HS_TRANSPOSE_SLAB();
3463 }
3464 
3465 //
3466 //
3467 //
3468 
3469 #include "../../hs_cuda.inl"
3470 
3471 //
3472 //
3473 //
3474