• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1
2
// ---------------------------------------------------------------------------
// hs_kernel_bs_0
//
// Each work-item reads 16 ulong keys from `vin`, runs a fixed sequence of
// compare-exchange stages over them (some purely per-lane, some exchanging
// values with another lane of the same sub-group), and writes the 16 results
// to the same indices of `vout`.
//
// NOTE(review): the comparator pattern looks like a 16-input sorting network
// followed by cross-lane merge stages, so that the keys handled by one
// 8-lane sub-group end up ordered - confirm against the generator.
//
// The helper macros below are private to this kernel and are #undef'd after
// it. The stage macros expand in the kernel's scope and operate directly on
// its locals r1..r16.
// ---------------------------------------------------------------------------

// Sub-group width required by the kernel attributes, and keys per lane.
#define HS_BS0_LANES (1 << 3)
#define HS_BS0_ROWS 16

// Global index of this lane's k-th key (keys are HS_BS0_LANES apart).
#define HS_BS0_AT(k) (base + HS_BS0_LANES * (k))

// Compare-exchange: afterwards `a` holds the smaller value, `b` the larger.
#define HS_BS0_CX(a, b)                                                        \
  do {                                                                         \
    if ((b) <= (a)) {                                                          \
      ulong const swap_ = (a);                                                 \
      (a) = (b);                                                               \
      (b) = swap_;                                                             \
    }                                                                          \
  } while (0)

// Cross-lane exchange of the pair (a, b) with lane `partner`: `a` is compared
// with the partner's `b` and `b` with the partner's `a`. Both shuffles read
// the values from before either assignment. With keep_min == 1 the smaller
// value is kept, with keep_min == 0 the larger one.
#define HS_BS0_FLIP_PAIR(a, b, partner, keep_min)                              \
  do {                                                                         \
    ulong const peer_a_ = intel_sub_group_shuffle((a), (partner));             \
    ulong const peer_b_ = intel_sub_group_shuffle((b), (partner));             \
    (a) = ((((a) <= peer_b_) ^ (keep_min)) ? peer_b_ : (a));                   \
    (b) = ((((b) <= peer_a_) ^ (keep_min)) ? peer_a_ : (b));                   \
  } while (0)

// Cross-lane exchange of a single value with the same slot in lane `partner`.
#define HS_BS0_HALF_ONE(a, partner, keep_min)                                  \
  do {                                                                         \
    ulong const peer_ = intel_sub_group_shuffle((a), (partner));               \
    (a) = ((((a) <= peer_) ^ (keep_min)) ? peer_ : (a));                       \
  } while (0)

// Flip stage: pair this lane with lane (id ^ mask) and exchange the mirrored
// slots (r1,r16), (r2,r15), ... (r8,r9). The lower-numbered lane of each pair
// keeps the smaller values.
#define HS_BS0_FLIP(mask)                                                      \
  do {                                                                         \
    uint const partner_ = get_sub_group_local_id() ^ (mask);                   \
    int const keep_min_ = get_sub_group_local_id() < partner_;                 \
    HS_BS0_FLIP_PAIR(r1, r16, partner_, keep_min_);                            \
    HS_BS0_FLIP_PAIR(r2, r15, partner_, keep_min_);                            \
    HS_BS0_FLIP_PAIR(r3, r14, partner_, keep_min_);                            \
    HS_BS0_FLIP_PAIR(r4, r13, partner_, keep_min_);                            \
    HS_BS0_FLIP_PAIR(r5, r12, partner_, keep_min_);                            \
    HS_BS0_FLIP_PAIR(r6, r11, partner_, keep_min_);                            \
    HS_BS0_FLIP_PAIR(r7, r10, partner_, keep_min_);                            \
    HS_BS0_FLIP_PAIR(r8, r9, partner_, keep_min_);                             \
  } while (0)

// Half stage: pair this lane with lane (id ^ mask) and exchange every slot
// r1..r16 with the same slot of the partner lane.
#define HS_BS0_HALF(mask)                                                      \
  do {                                                                         \
    uint const partner_ = get_sub_group_local_id() ^ (mask);                   \
    int const keep_min_ = get_sub_group_local_id() < partner_;                 \
    HS_BS0_HALF_ONE(r1, partner_, keep_min_);                                  \
    HS_BS0_HALF_ONE(r2, partner_, keep_min_);                                  \
    HS_BS0_HALF_ONE(r3, partner_, keep_min_);                                  \
    HS_BS0_HALF_ONE(r4, partner_, keep_min_);                                  \
    HS_BS0_HALF_ONE(r5, partner_, keep_min_);                                  \
    HS_BS0_HALF_ONE(r6, partner_, keep_min_);                                  \
    HS_BS0_HALF_ONE(r7, partner_, keep_min_);                                  \
    HS_BS0_HALF_ONE(r8, partner_, keep_min_);                                  \
    HS_BS0_HALF_ONE(r9, partner_, keep_min_);                                  \
    HS_BS0_HALF_ONE(r10, partner_, keep_min_);                                 \
    HS_BS0_HALF_ONE(r11, partner_, keep_min_);                                 \
    HS_BS0_HALF_ONE(r12, partner_, keep_min_);                                 \
    HS_BS0_HALF_ONE(r13, partner_, keep_min_);                                 \
    HS_BS0_HALF_ONE(r14, partner_, keep_min_);                                 \
    HS_BS0_HALF_ONE(r15, partner_, keep_min_);                                 \
    HS_BS0_HALF_ONE(r16, partner_, keep_min_);                                 \
  } while (0)

// Per-lane stage that follows each group of cross-lane stages: 32
// compare-exchanges over r1..r16 (odd slots, then even slots, then
// neighbouring pairs). The order of the comparators is significant.
#define HS_BS0_MERGE()                                                         \
  do {                                                                         \
    HS_BS0_CX(r1, r9);                                                         \
    HS_BS0_CX(r5, r13);                                                        \
    HS_BS0_CX(r1, r5);                                                         \
    HS_BS0_CX(r9, r13);                                                        \
    HS_BS0_CX(r3, r11);                                                        \
    HS_BS0_CX(r7, r15);                                                        \
    HS_BS0_CX(r3, r7);                                                         \
    HS_BS0_CX(r11, r15);                                                       \
    HS_BS0_CX(r1, r3);                                                         \
    HS_BS0_CX(r5, r7);                                                         \
    HS_BS0_CX(r9, r11);                                                        \
    HS_BS0_CX(r13, r15);                                                       \
    HS_BS0_CX(r2, r10);                                                        \
    HS_BS0_CX(r6, r14);                                                        \
    HS_BS0_CX(r2, r6);                                                         \
    HS_BS0_CX(r10, r14);                                                       \
    HS_BS0_CX(r4, r12);                                                        \
    HS_BS0_CX(r8, r16);                                                        \
    HS_BS0_CX(r4, r8);                                                         \
    HS_BS0_CX(r12, r16);                                                       \
    HS_BS0_CX(r2, r4);                                                         \
    HS_BS0_CX(r6, r8);                                                         \
    HS_BS0_CX(r10, r12);                                                       \
    HS_BS0_CX(r14, r16);                                                       \
    HS_BS0_CX(r1, r2);                                                         \
    HS_BS0_CX(r3, r4);                                                         \
    HS_BS0_CX(r5, r6);                                                         \
    HS_BS0_CX(r7, r8);                                                         \
    HS_BS0_CX(r9, r10);                                                        \
    HS_BS0_CX(r11, r12);                                                       \
    HS_BS0_CX(r13, r14);                                                       \
    HS_BS0_CX(r15, r16);                                                       \
  } while (0)

__kernel __attribute__((intel_reqd_sub_group_size(HS_BS0_LANES)))
__attribute__((reqd_work_group_size(HS_BS0_LANES * 1, 1, 1))) void
hs_kernel_bs_0(__global ulong const* const restrict vin,
               __global ulong* const restrict vout)
{
  // Index of this lane's first key: the global id rounded down to a multiple
  // of HS_BS0_LANES, scaled by HS_BS0_ROWS, plus the lane offset taken from
  // the low bits of the local id.
  uint const base = (get_global_id(0) & ~(HS_BS0_LANES - 1)) * HS_BS0_ROWS +
                    (get_local_id(0) & (HS_BS0_LANES - 1));

  // Load the lane's 16 keys.
  ulong r1 = vin[HS_BS0_AT(0)];
  ulong r2 = vin[HS_BS0_AT(1)];
  ulong r3 = vin[HS_BS0_AT(2)];
  ulong r4 = vin[HS_BS0_AT(3)];
  ulong r5 = vin[HS_BS0_AT(4)];
  ulong r6 = vin[HS_BS0_AT(5)];
  ulong r7 = vin[HS_BS0_AT(6)];
  ulong r8 = vin[HS_BS0_AT(7)];
  ulong r9 = vin[HS_BS0_AT(8)];
  ulong r10 = vin[HS_BS0_AT(9)];
  ulong r11 = vin[HS_BS0_AT(10)];
  ulong r12 = vin[HS_BS0_AT(11)];
  ulong r13 = vin[HS_BS0_AT(12)];
  ulong r14 = vin[HS_BS0_AT(13)];
  ulong r15 = vin[HS_BS0_AT(14)];
  ulong r16 = vin[HS_BS0_AT(15)];

  // Per-lane stage: 60 compare-exchanges over r1..r16, applied strictly in
  // this order (read row by row, left to right).
  HS_BS0_CX(r1, r2);   HS_BS0_CX(r3, r4);   HS_BS0_CX(r5, r6);   HS_BS0_CX(r7, r8);
  HS_BS0_CX(r9, r10);  HS_BS0_CX(r11, r12); HS_BS0_CX(r13, r14); HS_BS0_CX(r15, r16);
  HS_BS0_CX(r1, r3);   HS_BS0_CX(r5, r7);   HS_BS0_CX(r9, r11);  HS_BS0_CX(r13, r15);
  HS_BS0_CX(r2, r4);   HS_BS0_CX(r6, r8);   HS_BS0_CX(r10, r12); HS_BS0_CX(r14, r16);
  HS_BS0_CX(r1, r5);   HS_BS0_CX(r9, r13);  HS_BS0_CX(r2, r6);   HS_BS0_CX(r10, r14);
  HS_BS0_CX(r3, r7);   HS_BS0_CX(r11, r15); HS_BS0_CX(r4, r8);   HS_BS0_CX(r12, r16);
  HS_BS0_CX(r1, r9);   HS_BS0_CX(r2, r10);  HS_BS0_CX(r3, r11);  HS_BS0_CX(r4, r12);
  HS_BS0_CX(r5, r13);  HS_BS0_CX(r6, r14);  HS_BS0_CX(r7, r15);  HS_BS0_CX(r8, r16);
  HS_BS0_CX(r6, r11);  HS_BS0_CX(r7, r10);  HS_BS0_CX(r4, r13);  HS_BS0_CX(r14, r15);
  HS_BS0_CX(r8, r12);  HS_BS0_CX(r2, r3);   HS_BS0_CX(r5, r9);   HS_BS0_CX(r2, r5);
  HS_BS0_CX(r8, r14);  HS_BS0_CX(r3, r9);   HS_BS0_CX(r12, r15); HS_BS0_CX(r3, r5);
  HS_BS0_CX(r6, r7);   HS_BS0_CX(r10, r11); HS_BS0_CX(r12, r14); HS_BS0_CX(r4, r9);
  HS_BS0_CX(r8, r13);  HS_BS0_CX(r7, r9);   HS_BS0_CX(r11, r13); HS_BS0_CX(r4, r6);
  HS_BS0_CX(r8, r10);  HS_BS0_CX(r4, r5);   HS_BS0_CX(r6, r7);   HS_BS0_CX(r8, r9);
  HS_BS0_CX(r10, r11); HS_BS0_CX(r12, r13); HS_BS0_CX(r7, r8);   HS_BS0_CX(r9, r10);

  // Cross-lane stages with partner masks 1, then 3/1, then 7/2/1; each group
  // is followed by the per-lane merge stage.
  HS_BS0_FLIP(1);
  HS_BS0_MERGE();

  HS_BS0_FLIP(3);
  HS_BS0_HALF(1);
  HS_BS0_MERGE();

  HS_BS0_FLIP(7);
  HS_BS0_HALF(2);
  HS_BS0_HALF(1);
  HS_BS0_MERGE();

  // Store the 16 results at the same indices they were loaded from.
  vout[HS_BS0_AT(0)] = r1;
  vout[HS_BS0_AT(1)] = r2;
  vout[HS_BS0_AT(2)] = r3;
  vout[HS_BS0_AT(3)] = r4;
  vout[HS_BS0_AT(4)] = r5;
  vout[HS_BS0_AT(5)] = r6;
  vout[HS_BS0_AT(6)] = r7;
  vout[HS_BS0_AT(7)] = r8;
  vout[HS_BS0_AT(8)] = r9;
  vout[HS_BS0_AT(9)] = r10;
  vout[HS_BS0_AT(10)] = r11;
  vout[HS_BS0_AT(11)] = r12;
  vout[HS_BS0_AT(12)] = r13;
  vout[HS_BS0_AT(13)] = r14;
  vout[HS_BS0_AT(14)] = r15;
  vout[HS_BS0_AT(15)] = r16;
}

#undef HS_BS0_MERGE
#undef HS_BS0_HALF
#undef HS_BS0_FLIP
#undef HS_BS0_HALF_ONE
#undef HS_BS0_FLIP_PAIR
#undef HS_BS0_CX
#undef HS_BS0_AT
#undef HS_BS0_ROWS
#undef HS_BS0_LANES
1189
1190__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
1191__attribute__((reqd_work_group_size((1 << 3) * 2, 1, 1))) void
1192hs_kernel_bs_1(__global ulong const* const restrict vin,
1193               __global ulong* const restrict vout)
1194{
1195  __local struct
1196  {
1197    ulong m[16 * 16];
1198  } shared;
1199
1200  uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 +
1201                        (get_local_id(0) & ((1 << 3) - 1));
1202  ulong r1 = vin[gmem_idx + (1 << 3) * 0];
1203  ulong r2 = vin[gmem_idx + (1 << 3) * 1];
1204  ulong r3 = vin[gmem_idx + (1 << 3) * 2];
1205  ulong r4 = vin[gmem_idx + (1 << 3) * 3];
1206  ulong r5 = vin[gmem_idx + (1 << 3) * 4];
1207  ulong r6 = vin[gmem_idx + (1 << 3) * 5];
1208  ulong r7 = vin[gmem_idx + (1 << 3) * 6];
1209  ulong r8 = vin[gmem_idx + (1 << 3) * 7];
1210  ulong r9 = vin[gmem_idx + (1 << 3) * 8];
1211  ulong r10 = vin[gmem_idx + (1 << 3) * 9];
1212  ulong r11 = vin[gmem_idx + (1 << 3) * 10];
1213  ulong r12 = vin[gmem_idx + (1 << 3) * 11];
1214  ulong r13 = vin[gmem_idx + (1 << 3) * 12];
1215  ulong r14 = vin[gmem_idx + (1 << 3) * 13];
1216  ulong r15 = vin[gmem_idx + (1 << 3) * 14];
1217  ulong r16 = vin[gmem_idx + (1 << 3) * 15];
1218  if (r1 >= r2) {
1219    ulong const t = r1;
1220    r1 = r2;
1221    r2 = t;
1222  };
1223  if (r3 >= r4) {
1224    ulong const t = r3;
1225    r3 = r4;
1226    r4 = t;
1227  };
1228  if (r5 >= r6) {
1229    ulong const t = r5;
1230    r5 = r6;
1231    r6 = t;
1232  };
1233  if (r7 >= r8) {
1234    ulong const t = r7;
1235    r7 = r8;
1236    r8 = t;
1237  };
1238  if (r9 >= r10) {
1239    ulong const t = r9;
1240    r9 = r10;
1241    r10 = t;
1242  };
1243  if (r11 >= r12) {
1244    ulong const t = r11;
1245    r11 = r12;
1246    r12 = t;
1247  };
1248  if (r13 >= r14) {
1249    ulong const t = r13;
1250    r13 = r14;
1251    r14 = t;
1252  };
1253  if (r15 >= r16) {
1254    ulong const t = r15;
1255    r15 = r16;
1256    r16 = t;
1257  };
1258  if (r1 >= r3) {
1259    ulong const t = r1;
1260    r1 = r3;
1261    r3 = t;
1262  };
1263  if (r5 >= r7) {
1264    ulong const t = r5;
1265    r5 = r7;
1266    r7 = t;
1267  };
1268  if (r9 >= r11) {
1269    ulong const t = r9;
1270    r9 = r11;
1271    r11 = t;
1272  };
1273  if (r13 >= r15) {
1274    ulong const t = r13;
1275    r13 = r15;
1276    r15 = t;
1277  };
1278  if (r2 >= r4) {
1279    ulong const t = r2;
1280    r2 = r4;
1281    r4 = t;
1282  };
1283  if (r6 >= r8) {
1284    ulong const t = r6;
1285    r6 = r8;
1286    r8 = t;
1287  };
1288  if (r10 >= r12) {
1289    ulong const t = r10;
1290    r10 = r12;
1291    r12 = t;
1292  };
1293  if (r14 >= r16) {
1294    ulong const t = r14;
1295    r14 = r16;
1296    r16 = t;
1297  };
1298  if (r1 >= r5) {
1299    ulong const t = r1;
1300    r1 = r5;
1301    r5 = t;
1302  };
1303  if (r9 >= r13) {
1304    ulong const t = r9;
1305    r9 = r13;
1306    r13 = t;
1307  };
1308  if (r2 >= r6) {
1309    ulong const t = r2;
1310    r2 = r6;
1311    r6 = t;
1312  };
1313  if (r10 >= r14) {
1314    ulong const t = r10;
1315    r10 = r14;
1316    r14 = t;
1317  };
1318  if (r3 >= r7) {
1319    ulong const t = r3;
1320    r3 = r7;
1321    r7 = t;
1322  };
1323  if (r11 >= r15) {
1324    ulong const t = r11;
1325    r11 = r15;
1326    r15 = t;
1327  };
1328  if (r4 >= r8) {
1329    ulong const t = r4;
1330    r4 = r8;
1331    r8 = t;
1332  };
1333  if (r12 >= r16) {
1334    ulong const t = r12;
1335    r12 = r16;
1336    r16 = t;
1337  };
1338  if (r1 >= r9) {
1339    ulong const t = r1;
1340    r1 = r9;
1341    r9 = t;
1342  };
1343  if (r2 >= r10) {
1344    ulong const t = r2;
1345    r2 = r10;
1346    r10 = t;
1347  };
1348  if (r3 >= r11) {
1349    ulong const t = r3;
1350    r3 = r11;
1351    r11 = t;
1352  };
1353  if (r4 >= r12) {
1354    ulong const t = r4;
1355    r4 = r12;
1356    r12 = t;
1357  };
1358  if (r5 >= r13) {
1359    ulong const t = r5;
1360    r5 = r13;
1361    r13 = t;
1362  };
1363  if (r6 >= r14) {
1364    ulong const t = r6;
1365    r6 = r14;
1366    r14 = t;
1367  };
1368  if (r7 >= r15) {
1369    ulong const t = r7;
1370    r7 = r15;
1371    r15 = t;
1372  };
1373  if (r8 >= r16) {
1374    ulong const t = r8;
1375    r8 = r16;
1376    r16 = t;
1377  };
1378  if (r6 >= r11) {
1379    ulong const t = r6;
1380    r6 = r11;
1381    r11 = t;
1382  };
1383  if (r7 >= r10) {
1384    ulong const t = r7;
1385    r7 = r10;
1386    r10 = t;
1387  };
1388  if (r4 >= r13) {
1389    ulong const t = r4;
1390    r4 = r13;
1391    r13 = t;
1392  };
1393  if (r14 >= r15) {
1394    ulong const t = r14;
1395    r14 = r15;
1396    r15 = t;
1397  };
1398  if (r8 >= r12) {
1399    ulong const t = r8;
1400    r8 = r12;
1401    r12 = t;
1402  };
1403  if (r2 >= r3) {
1404    ulong const t = r2;
1405    r2 = r3;
1406    r3 = t;
1407  };
1408  if (r5 >= r9) {
1409    ulong const t = r5;
1410    r5 = r9;
1411    r9 = t;
1412  };
1413  if (r2 >= r5) {
1414    ulong const t = r2;
1415    r2 = r5;
1416    r5 = t;
1417  };
1418  if (r8 >= r14) {
1419    ulong const t = r8;
1420    r8 = r14;
1421    r14 = t;
1422  };
1423  if (r3 >= r9) {
1424    ulong const t = r3;
1425    r3 = r9;
1426    r9 = t;
1427  };
1428  if (r12 >= r15) {
1429    ulong const t = r12;
1430    r12 = r15;
1431    r15 = t;
1432  };
1433  if (r3 >= r5) {
1434    ulong const t = r3;
1435    r3 = r5;
1436    r5 = t;
1437  };
1438  if (r6 >= r7) {
1439    ulong const t = r6;
1440    r6 = r7;
1441    r7 = t;
1442  };
1443  if (r10 >= r11) {
1444    ulong const t = r10;
1445    r10 = r11;
1446    r11 = t;
1447  };
1448  if (r12 >= r14) {
1449    ulong const t = r12;
1450    r12 = r14;
1451    r14 = t;
1452  };
1453  if (r4 >= r9) {
1454    ulong const t = r4;
1455    r4 = r9;
1456    r9 = t;
1457  };
1458  if (r8 >= r13) {
1459    ulong const t = r8;
1460    r8 = r13;
1461    r13 = t;
1462  };
1463  if (r7 >= r9) {
1464    ulong const t = r7;
1465    r7 = r9;
1466    r9 = t;
1467  };
1468  if (r11 >= r13) {
1469    ulong const t = r11;
1470    r11 = r13;
1471    r13 = t;
1472  };
1473  if (r4 >= r6) {
1474    ulong const t = r4;
1475    r4 = r6;
1476    r6 = t;
1477  };
1478  if (r8 >= r10) {
1479    ulong const t = r8;
1480    r8 = r10;
1481    r10 = t;
1482  };
1483  if (r4 >= r5) {
1484    ulong const t = r4;
1485    r4 = r5;
1486    r5 = t;
1487  };
1488  if (r6 >= r7) {
1489    ulong const t = r6;
1490    r6 = r7;
1491    r7 = t;
1492  };
1493  if (r8 >= r9) {
1494    ulong const t = r8;
1495    r8 = r9;
1496    r9 = t;
1497  };
1498  if (r10 >= r11) {
1499    ulong const t = r10;
1500    r10 = r11;
1501    r11 = t;
1502  };
1503  if (r12 >= r13) {
1504    ulong const t = r12;
1505    r12 = r13;
1506    r13 = t;
1507  };
1508  if (r7 >= r8) {
1509    ulong const t = r7;
1510    r7 = r8;
1511    r8 = t;
1512  };
1513  if (r9 >= r10) {
1514    ulong const t = r9;
1515    r9 = r10;
1516    r10 = t;
1517  };
1518  {
1519    uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
1520    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
1521    ;
1522    {
1523      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
1524      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
1525      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
1526      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
1527    };
1528    {
1529      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
1530      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
1531      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
1532      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
1533    };
1534    {
1535      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
1536      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
1537      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
1538      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
1539    };
1540    {
1541      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
1542      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
1543      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
1544      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
1545    };
1546    {
1547      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
1548      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
1549      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
1550      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
1551    };
1552    {
1553      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
1554      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
1555      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
1556      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
1557    };
1558    {
1559      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
1560      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
1561      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
1562      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
1563    };
1564    {
1565      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
1566      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
1567      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
1568      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
1569    };
1570  }
1571  if (r1 >= r9) {
1572    ulong const t = r1;
1573    r1 = r9;
1574    r9 = t;
1575  };
1576  if (r5 >= r13) {
1577    ulong const t = r5;
1578    r5 = r13;
1579    r13 = t;
1580  };
1581  if (r1 >= r5) {
1582    ulong const t = r1;
1583    r1 = r5;
1584    r5 = t;
1585  };
1586  if (r9 >= r13) {
1587    ulong const t = r9;
1588    r9 = r13;
1589    r13 = t;
1590  };
1591  if (r3 >= r11) {
1592    ulong const t = r3;
1593    r3 = r11;
1594    r11 = t;
1595  };
1596  if (r7 >= r15) {
1597    ulong const t = r7;
1598    r7 = r15;
1599    r15 = t;
1600  };
1601  if (r3 >= r7) {
1602    ulong const t = r3;
1603    r3 = r7;
1604    r7 = t;
1605  };
1606  if (r11 >= r15) {
1607    ulong const t = r11;
1608    r11 = r15;
1609    r15 = t;
1610  };
1611  if (r1 >= r3) {
1612    ulong const t = r1;
1613    r1 = r3;
1614    r3 = t;
1615  };
1616  if (r5 >= r7) {
1617    ulong const t = r5;
1618    r5 = r7;
1619    r7 = t;
1620  };
1621  if (r9 >= r11) {
1622    ulong const t = r9;
1623    r9 = r11;
1624    r11 = t;
1625  };
1626  if (r13 >= r15) {
1627    ulong const t = r13;
1628    r13 = r15;
1629    r15 = t;
1630  };
1631  if (r2 >= r10) {
1632    ulong const t = r2;
1633    r2 = r10;
1634    r10 = t;
1635  };
1636  if (r6 >= r14) {
1637    ulong const t = r6;
1638    r6 = r14;
1639    r14 = t;
1640  };
1641  if (r2 >= r6) {
1642    ulong const t = r2;
1643    r2 = r6;
1644    r6 = t;
1645  };
1646  if (r10 >= r14) {
1647    ulong const t = r10;
1648    r10 = r14;
1649    r14 = t;
1650  };
1651  if (r4 >= r12) {
1652    ulong const t = r4;
1653    r4 = r12;
1654    r12 = t;
1655  };
1656  if (r8 >= r16) {
1657    ulong const t = r8;
1658    r8 = r16;
1659    r16 = t;
1660  };
1661  if (r4 >= r8) {
1662    ulong const t = r4;
1663    r4 = r8;
1664    r8 = t;
1665  };
1666  if (r12 >= r16) {
1667    ulong const t = r12;
1668    r12 = r16;
1669    r16 = t;
1670  };
1671  if (r2 >= r4) {
1672    ulong const t = r2;
1673    r2 = r4;
1674    r4 = t;
1675  };
1676  if (r6 >= r8) {
1677    ulong const t = r6;
1678    r6 = r8;
1679    r8 = t;
1680  };
1681  if (r10 >= r12) {
1682    ulong const t = r10;
1683    r10 = r12;
1684    r12 = t;
1685  };
1686  if (r14 >= r16) {
1687    ulong const t = r14;
1688    r14 = r16;
1689    r16 = t;
1690  };
1691  if (r1 >= r2) {
1692    ulong const t = r1;
1693    r1 = r2;
1694    r2 = t;
1695  };
1696  if (r3 >= r4) {
1697    ulong const t = r3;
1698    r3 = r4;
1699    r4 = t;
1700  };
1701  if (r5 >= r6) {
1702    ulong const t = r5;
1703    r5 = r6;
1704    r6 = t;
1705  };
1706  if (r7 >= r8) {
1707    ulong const t = r7;
1708    r7 = r8;
1709    r8 = t;
1710  };
1711  if (r9 >= r10) {
1712    ulong const t = r9;
1713    r9 = r10;
1714    r10 = t;
1715  };
1716  if (r11 >= r12) {
1717    ulong const t = r11;
1718    r11 = r12;
1719    r12 = t;
1720  };
1721  if (r13 >= r14) {
1722    ulong const t = r13;
1723    r13 = r14;
1724    r14 = t;
1725  };
1726  if (r15 >= r16) {
1727    ulong const t = r15;
1728    r15 = r16;
1729    r16 = t;
1730  };
1731  {
1732    uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
1733    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
1734    ;
1735    {
1736      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
1737      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
1738      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
1739      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
1740    };
1741    {
1742      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
1743      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
1744      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
1745      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
1746    };
1747    {
1748      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
1749      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
1750      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
1751      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
1752    };
1753    {
1754      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
1755      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
1756      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
1757      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
1758    };
1759    {
1760      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
1761      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
1762      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
1763      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
1764    };
1765    {
1766      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
1767      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
1768      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
1769      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
1770    };
1771    {
1772      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
1773      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
1774      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
1775      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
1776    };
1777    {
1778      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
1779      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
1780      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
1781      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
1782    };
1783  }
1784  {
1785    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
1786    int const t_lt = get_sub_group_local_id() < half_lane_idx;
1787    ;
1788    {
1789      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1790      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1791    };
1792    {
1793      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1794      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1795    };
1796    {
1797      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1798      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1799    };
1800    {
1801      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1802      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1803    };
1804    {
1805      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1806      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1807    };
1808    {
1809      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1810      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1811    };
1812    {
1813      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1814      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1815    };
1816    {
1817      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1818      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1819    };
1820    {
1821      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
1822      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
1823    };
1824    {
1825      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
1826      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
1827    };
1828    {
1829      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
1830      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
1831    };
1832    {
1833      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
1834      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
1835    };
1836    {
1837      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
1838      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
1839    };
1840    {
1841      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
1842      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
1843    };
1844    {
1845      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
1846      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
1847    };
1848    {
1849      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
1850      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
1851    };
1852  }
1853  if (r1 >= r9) {
1854    ulong const t = r1;
1855    r1 = r9;
1856    r9 = t;
1857  };
1858  if (r5 >= r13) {
1859    ulong const t = r5;
1860    r5 = r13;
1861    r13 = t;
1862  };
1863  if (r1 >= r5) {
1864    ulong const t = r1;
1865    r1 = r5;
1866    r5 = t;
1867  };
1868  if (r9 >= r13) {
1869    ulong const t = r9;
1870    r9 = r13;
1871    r13 = t;
1872  };
1873  if (r3 >= r11) {
1874    ulong const t = r3;
1875    r3 = r11;
1876    r11 = t;
1877  };
1878  if (r7 >= r15) {
1879    ulong const t = r7;
1880    r7 = r15;
1881    r15 = t;
1882  };
1883  if (r3 >= r7) {
1884    ulong const t = r3;
1885    r3 = r7;
1886    r7 = t;
1887  };
1888  if (r11 >= r15) {
1889    ulong const t = r11;
1890    r11 = r15;
1891    r15 = t;
1892  };
1893  if (r1 >= r3) {
1894    ulong const t = r1;
1895    r1 = r3;
1896    r3 = t;
1897  };
1898  if (r5 >= r7) {
1899    ulong const t = r5;
1900    r5 = r7;
1901    r7 = t;
1902  };
1903  if (r9 >= r11) {
1904    ulong const t = r9;
1905    r9 = r11;
1906    r11 = t;
1907  };
1908  if (r13 >= r15) {
1909    ulong const t = r13;
1910    r13 = r15;
1911    r15 = t;
1912  };
1913  if (r2 >= r10) {
1914    ulong const t = r2;
1915    r2 = r10;
1916    r10 = t;
1917  };
1918  if (r6 >= r14) {
1919    ulong const t = r6;
1920    r6 = r14;
1921    r14 = t;
1922  };
1923  if (r2 >= r6) {
1924    ulong const t = r2;
1925    r2 = r6;
1926    r6 = t;
1927  };
1928  if (r10 >= r14) {
1929    ulong const t = r10;
1930    r10 = r14;
1931    r14 = t;
1932  };
1933  if (r4 >= r12) {
1934    ulong const t = r4;
1935    r4 = r12;
1936    r12 = t;
1937  };
1938  if (r8 >= r16) {
1939    ulong const t = r8;
1940    r8 = r16;
1941    r16 = t;
1942  };
1943  if (r4 >= r8) {
1944    ulong const t = r4;
1945    r4 = r8;
1946    r8 = t;
1947  };
1948  if (r12 >= r16) {
1949    ulong const t = r12;
1950    r12 = r16;
1951    r16 = t;
1952  };
1953  if (r2 >= r4) {
1954    ulong const t = r2;
1955    r2 = r4;
1956    r4 = t;
1957  };
1958  if (r6 >= r8) {
1959    ulong const t = r6;
1960    r6 = r8;
1961    r8 = t;
1962  };
1963  if (r10 >= r12) {
1964    ulong const t = r10;
1965    r10 = r12;
1966    r12 = t;
1967  };
1968  if (r14 >= r16) {
1969    ulong const t = r14;
1970    r14 = r16;
1971    r16 = t;
1972  };
1973  if (r1 >= r2) {
1974    ulong const t = r1;
1975    r1 = r2;
1976    r2 = t;
1977  };
1978  if (r3 >= r4) {
1979    ulong const t = r3;
1980    r3 = r4;
1981    r4 = t;
1982  };
1983  if (r5 >= r6) {
1984    ulong const t = r5;
1985    r5 = r6;
1986    r6 = t;
1987  };
1988  if (r7 >= r8) {
1989    ulong const t = r7;
1990    r7 = r8;
1991    r8 = t;
1992  };
1993  if (r9 >= r10) {
1994    ulong const t = r9;
1995    r9 = r10;
1996    r10 = t;
1997  };
1998  if (r11 >= r12) {
1999    ulong const t = r11;
2000    r11 = r12;
2001    r12 = t;
2002  };
2003  if (r13 >= r14) {
2004    ulong const t = r13;
2005    r13 = r14;
2006    r14 = t;
2007  };
2008  if (r15 >= r16) {
2009    ulong const t = r15;
2010    r15 = r16;
2011    r16 = t;
2012  };
2013  {
2014    uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
2015    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
2016    ;
2017    {
2018      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
2019      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
2020      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
2021      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
2022    };
2023    {
2024      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
2025      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
2026      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
2027      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
2028    };
2029    {
2030      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
2031      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
2032      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
2033      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
2034    };
2035    {
2036      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
2037      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
2038      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
2039      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
2040    };
2041    {
2042      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
2043      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
2044      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
2045      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
2046    };
2047    {
2048      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
2049      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
2050      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
2051      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
2052    };
2053    {
2054      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
2055      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
2056      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
2057      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
2058    };
2059    {
2060      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
2061      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
2062      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
2063      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
2064    };
2065  }
2066  {
2067    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
2068    int const t_lt = get_sub_group_local_id() < half_lane_idx;
2069    ;
2070    {
2071      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2072      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2073    };
2074    {
2075      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2076      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2077    };
2078    {
2079      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2080      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2081    };
2082    {
2083      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2084      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2085    };
2086    {
2087      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2088      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2089    };
2090    {
2091      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2092      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2093    };
2094    {
2095      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2096      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2097    };
2098    {
2099      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2100      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2101    };
2102    {
2103      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
2104      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
2105    };
2106    {
2107      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
2108      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
2109    };
2110    {
2111      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
2112      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
2113    };
2114    {
2115      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
2116      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
2117    };
2118    {
2119      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
2120      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
2121    };
2122    {
2123      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
2124      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
2125    };
2126    {
2127      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
2128      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
2129    };
2130    {
2131      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
2132      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
2133    };
2134  }
2135  {
2136    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
2137    int const t_lt = get_sub_group_local_id() < half_lane_idx;
2138    ;
2139    {
2140      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2141      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2142    };
2143    {
2144      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2145      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2146    };
2147    {
2148      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2149      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2150    };
2151    {
2152      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2153      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2154    };
2155    {
2156      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2157      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2158    };
2159    {
2160      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2161      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2162    };
2163    {
2164      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2165      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2166    };
2167    {
2168      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2169      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2170    };
2171    {
2172      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
2173      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
2174    };
2175    {
2176      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
2177      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
2178    };
2179    {
2180      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
2181      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
2182    };
2183    {
2184      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
2185      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
2186    };
2187    {
2188      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
2189      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
2190    };
2191    {
2192      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
2193      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
2194    };
2195    {
2196      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
2197      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
2198    };
2199    {
2200      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
2201      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
2202    };
2203  }
2204  if (r1 >= r9) {
2205    ulong const t = r1;
2206    r1 = r9;
2207    r9 = t;
2208  };
2209  if (r5 >= r13) {
2210    ulong const t = r5;
2211    r5 = r13;
2212    r13 = t;
2213  };
2214  if (r1 >= r5) {
2215    ulong const t = r1;
2216    r1 = r5;
2217    r5 = t;
2218  };
2219  if (r9 >= r13) {
2220    ulong const t = r9;
2221    r9 = r13;
2222    r13 = t;
2223  };
2224  if (r3 >= r11) {
2225    ulong const t = r3;
2226    r3 = r11;
2227    r11 = t;
2228  };
2229  if (r7 >= r15) {
2230    ulong const t = r7;
2231    r7 = r15;
2232    r15 = t;
2233  };
2234  if (r3 >= r7) {
2235    ulong const t = r3;
2236    r3 = r7;
2237    r7 = t;
2238  };
2239  if (r11 >= r15) {
2240    ulong const t = r11;
2241    r11 = r15;
2242    r15 = t;
2243  };
2244  if (r1 >= r3) {
2245    ulong const t = r1;
2246    r1 = r3;
2247    r3 = t;
2248  };
2249  if (r5 >= r7) {
2250    ulong const t = r5;
2251    r5 = r7;
2252    r7 = t;
2253  };
2254  if (r9 >= r11) {
2255    ulong const t = r9;
2256    r9 = r11;
2257    r11 = t;
2258  };
2259  if (r13 >= r15) {
2260    ulong const t = r13;
2261    r13 = r15;
2262    r15 = t;
2263  };
2264  if (r2 >= r10) {
2265    ulong const t = r2;
2266    r2 = r10;
2267    r10 = t;
2268  };
2269  if (r6 >= r14) {
2270    ulong const t = r6;
2271    r6 = r14;
2272    r14 = t;
2273  };
2274  if (r2 >= r6) {
2275    ulong const t = r2;
2276    r2 = r6;
2277    r6 = t;
2278  };
2279  if (r10 >= r14) {
2280    ulong const t = r10;
2281    r10 = r14;
2282    r14 = t;
2283  };
2284  if (r4 >= r12) {
2285    ulong const t = r4;
2286    r4 = r12;
2287    r12 = t;
2288  };
2289  if (r8 >= r16) {
2290    ulong const t = r8;
2291    r8 = r16;
2292    r16 = t;
2293  };
2294  if (r4 >= r8) {
2295    ulong const t = r4;
2296    r4 = r8;
2297    r8 = t;
2298  };
2299  if (r12 >= r16) {
2300    ulong const t = r12;
2301    r12 = r16;
2302    r16 = t;
2303  };
2304  if (r2 >= r4) {
2305    ulong const t = r2;
2306    r2 = r4;
2307    r4 = t;
2308  };
2309  if (r6 >= r8) {
2310    ulong const t = r6;
2311    r6 = r8;
2312    r8 = t;
2313  };
2314  if (r10 >= r12) {
2315    ulong const t = r10;
2316    r10 = r12;
2317    r12 = t;
2318  };
2319  if (r14 >= r16) {
2320    ulong const t = r14;
2321    r14 = r16;
2322    r16 = t;
2323  };
2324  if (r1 >= r2) {
2325    ulong const t = r1;
2326    r1 = r2;
2327    r2 = t;
2328  };
2329  if (r3 >= r4) {
2330    ulong const t = r3;
2331    r3 = r4;
2332    r4 = t;
2333  };
2334  if (r5 >= r6) {
2335    ulong const t = r5;
2336    r5 = r6;
2337    r6 = t;
2338  };
2339  if (r7 >= r8) {
2340    ulong const t = r7;
2341    r7 = r8;
2342    r8 = t;
2343  };
2344  if (r9 >= r10) {
2345    ulong const t = r9;
2346    r9 = r10;
2347    r10 = t;
2348  };
2349  if (r11 >= r12) {
2350    ulong const t = r11;
2351    r11 = r12;
2352    r12 = t;
2353  };
2354  if (r13 >= r14) {
2355    ulong const t = r13;
2356    r13 = r14;
2357    r14 = t;
2358  };
2359  if (r15 >= r16) {
2360    ulong const t = r15;
2361    r15 = r16;
2362    r16 = t;
2363  };
2364  uint const smem_l_idx =
2365    get_sub_group_id() * ((1 << 3) * 2) + get_sub_group_local_id();
2366  uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 3) * 2) +
2367                          (get_sub_group_local_id() ^ ((1 << 3) - 1));
2368  shared.m[get_local_id(0) + (2 * (1 << 3) * 0)] = r1;
2369  shared.m[get_local_id(0) + (2 * (1 << 3) * 1)] = r16;
2370  shared.m[get_local_id(0) + (2 * (1 << 3) * 2)] = r2;
2371  shared.m[get_local_id(0) + (2 * (1 << 3) * 3)] = r15;
2372  shared.m[get_local_id(0) + (2 * (1 << 3) * 4)] = r3;
2373  shared.m[get_local_id(0) + (2 * (1 << 3) * 5)] = r14;
2374  shared.m[get_local_id(0) + (2 * (1 << 3) * 6)] = r4;
2375  shared.m[get_local_id(0) + (2 * (1 << 3) * 7)] = r13;
2376  shared.m[get_local_id(0) + (2 * (1 << 3) * 8)] = r5;
2377  shared.m[get_local_id(0) + (2 * (1 << 3) * 9)] = r12;
2378  shared.m[get_local_id(0) + (2 * (1 << 3) * 10)] = r6;
2379  shared.m[get_local_id(0) + (2 * (1 << 3) * 11)] = r11;
2380  shared.m[get_local_id(0) + (2 * (1 << 3) * 12)] = r7;
2381  shared.m[get_local_id(0) + (2 * (1 << 3) * 13)] = r10;
2382  shared.m[get_local_id(0) + (2 * (1 << 3) * 14)] = r8;
2383  shared.m[get_local_id(0) + (2 * (1 << 3) * 15)] = r9;
2384  barrier(CLK_LOCAL_MEM_FENCE);
2385  {
2386    {
2387      ulong r0_1 = shared.m[smem_l_idx + (0)];
2388      ulong r0_2 = shared.m[smem_r_idx + (8)];
2389      if (r0_1 >= r0_2) {
2390        ulong const t = r0_1;
2391        r0_1 = r0_2;
2392        r0_2 = t;
2393      };
2394      shared.m[smem_l_idx + (0)] = r0_1;
2395      shared.m[smem_r_idx + (8)] = r0_2;
2396    }
2397    {
2398      ulong r0_1 = shared.m[smem_l_idx + (32)];
2399      ulong r0_2 = shared.m[smem_r_idx + (40)];
2400      if (r0_1 >= r0_2) {
2401        ulong const t = r0_1;
2402        r0_1 = r0_2;
2403        r0_2 = t;
2404      };
2405      shared.m[smem_l_idx + (32)] = r0_1;
2406      shared.m[smem_r_idx + (40)] = r0_2;
2407    }
2408    {
2409      ulong r0_1 = shared.m[smem_l_idx + (64)];
2410      ulong r0_2 = shared.m[smem_r_idx + (72)];
2411      if (r0_1 >= r0_2) {
2412        ulong const t = r0_1;
2413        r0_1 = r0_2;
2414        r0_2 = t;
2415      };
2416      shared.m[smem_l_idx + (64)] = r0_1;
2417      shared.m[smem_r_idx + (72)] = r0_2;
2418    }
2419    {
2420      ulong r0_1 = shared.m[smem_l_idx + (96)];
2421      ulong r0_2 = shared.m[smem_r_idx + (104)];
2422      if (r0_1 >= r0_2) {
2423        ulong const t = r0_1;
2424        r0_1 = r0_2;
2425        r0_2 = t;
2426      };
2427      shared.m[smem_l_idx + (96)] = r0_1;
2428      shared.m[smem_r_idx + (104)] = r0_2;
2429    }
2430    {
2431      ulong r0_1 = shared.m[smem_l_idx + (128)];
2432      ulong r0_2 = shared.m[smem_r_idx + (136)];
2433      if (r0_1 >= r0_2) {
2434        ulong const t = r0_1;
2435        r0_1 = r0_2;
2436        r0_2 = t;
2437      };
2438      shared.m[smem_l_idx + (128)] = r0_1;
2439      shared.m[smem_r_idx + (136)] = r0_2;
2440    }
2441    {
2442      ulong r0_1 = shared.m[smem_l_idx + (160)];
2443      ulong r0_2 = shared.m[smem_r_idx + (168)];
2444      if (r0_1 >= r0_2) {
2445        ulong const t = r0_1;
2446        r0_1 = r0_2;
2447        r0_2 = t;
2448      };
2449      shared.m[smem_l_idx + (160)] = r0_1;
2450      shared.m[smem_r_idx + (168)] = r0_2;
2451    }
2452    {
2453      ulong r0_1 = shared.m[smem_l_idx + (192)];
2454      ulong r0_2 = shared.m[smem_r_idx + (200)];
2455      if (r0_1 >= r0_2) {
2456        ulong const t = r0_1;
2457        r0_1 = r0_2;
2458        r0_2 = t;
2459      };
2460      shared.m[smem_l_idx + (192)] = r0_1;
2461      shared.m[smem_r_idx + (200)] = r0_2;
2462    }
2463    {
2464      ulong r0_1 = shared.m[smem_l_idx + (224)];
2465      ulong r0_2 = shared.m[smem_r_idx + (232)];
2466      if (r0_1 >= r0_2) {
2467        ulong const t = r0_1;
2468        r0_1 = r0_2;
2469        r0_2 = t;
2470      };
2471      shared.m[smem_l_idx + (224)] = r0_1;
2472      shared.m[smem_r_idx + (232)] = r0_2;
2473    }
2474  }
2475  barrier(CLK_LOCAL_MEM_FENCE);
2476  r1 = shared.m[get_local_id(0) + (2 * (1 << 3) * 0)];
2477  r16 = shared.m[get_local_id(0) + (2 * (1 << 3) * 1)];
2478  r2 = shared.m[get_local_id(0) + (2 * (1 << 3) * 2)];
2479  r15 = shared.m[get_local_id(0) + (2 * (1 << 3) * 3)];
2480  r3 = shared.m[get_local_id(0) + (2 * (1 << 3) * 4)];
2481  r14 = shared.m[get_local_id(0) + (2 * (1 << 3) * 5)];
2482  r4 = shared.m[get_local_id(0) + (2 * (1 << 3) * 6)];
2483  r13 = shared.m[get_local_id(0) + (2 * (1 << 3) * 7)];
2484  r5 = shared.m[get_local_id(0) + (2 * (1 << 3) * 8)];
2485  r12 = shared.m[get_local_id(0) + (2 * (1 << 3) * 9)];
2486  r6 = shared.m[get_local_id(0) + (2 * (1 << 3) * 10)];
2487  r11 = shared.m[get_local_id(0) + (2 * (1 << 3) * 11)];
2488  r7 = shared.m[get_local_id(0) + (2 * (1 << 3) * 12)];
2489  r10 = shared.m[get_local_id(0) + (2 * (1 << 3) * 13)];
2490  r8 = shared.m[get_local_id(0) + (2 * (1 << 3) * 14)];
2491  r9 = shared.m[get_local_id(0) + (2 * (1 << 3) * 15)];
2492  {
2493    {
2494      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
2495      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2496      ;
2497      {
2498        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2499        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2500      };
2501      {
2502        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2503        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2504      };
2505      {
2506        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2507        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2508      };
2509      {
2510        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2511        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2512      };
2513      {
2514        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2515        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2516      };
2517      {
2518        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2519        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2520      };
2521      {
2522        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2523        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2524      };
2525      {
2526        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2527        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2528      };
2529      {
2530        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
2531        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
2532      };
2533      {
2534        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
2535        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
2536      };
2537      {
2538        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
2539        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
2540      };
2541      {
2542        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
2543        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
2544      };
2545      {
2546        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
2547        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
2548      };
2549      {
2550        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
2551        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
2552      };
2553      {
2554        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
2555        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
2556      };
2557      {
2558        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
2559        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
2560      };
2561    }
2562    {
2563      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
2564      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2565      ;
2566      {
2567        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2568        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2569      };
2570      {
2571        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2572        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2573      };
2574      {
2575        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2576        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2577      };
2578      {
2579        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2580        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2581      };
2582      {
2583        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2584        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2585      };
2586      {
2587        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2588        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2589      };
2590      {
2591        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2592        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2593      };
2594      {
2595        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2596        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2597      };
2598      {
2599        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
2600        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
2601      };
2602      {
2603        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
2604        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
2605      };
2606      {
2607        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
2608        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
2609      };
2610      {
2611        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
2612        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
2613      };
2614      {
2615        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
2616        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
2617      };
2618      {
2619        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
2620        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
2621      };
2622      {
2623        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
2624        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
2625      };
2626      {
2627        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
2628        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
2629      };
2630    }
2631    {
2632      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
2633      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2634      ;
2635      {
2636        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2637        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2638      };
2639      {
2640        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2641        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2642      };
2643      {
2644        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2645        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2646      };
2647      {
2648        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2649        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2650      };
2651      {
2652        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2653        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2654      };
2655      {
2656        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2657        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2658      };
2659      {
2660        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2661        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2662      };
2663      {
2664        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2665        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2666      };
2667      {
2668        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
2669        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
2670      };
2671      {
2672        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
2673        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
2674      };
2675      {
2676        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
2677        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
2678      };
2679      {
2680        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
2681        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
2682      };
2683      {
2684        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
2685        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
2686      };
2687      {
2688        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
2689        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
2690      };
2691      {
2692        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
2693        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
2694      };
2695      {
2696        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
2697        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
2698      };
2699    }
2700    if (r1 >= r9) {
2701      ulong const t = r1;
2702      r1 = r9;
2703      r9 = t;
2704    };
2705    if (r5 >= r13) {
2706      ulong const t = r5;
2707      r5 = r13;
2708      r13 = t;
2709    };
2710    if (r1 >= r5) {
2711      ulong const t = r1;
2712      r1 = r5;
2713      r5 = t;
2714    };
2715    if (r9 >= r13) {
2716      ulong const t = r9;
2717      r9 = r13;
2718      r13 = t;
2719    };
2720    if (r3 >= r11) {
2721      ulong const t = r3;
2722      r3 = r11;
2723      r11 = t;
2724    };
2725    if (r7 >= r15) {
2726      ulong const t = r7;
2727      r7 = r15;
2728      r15 = t;
2729    };
2730    if (r3 >= r7) {
2731      ulong const t = r3;
2732      r3 = r7;
2733      r7 = t;
2734    };
2735    if (r11 >= r15) {
2736      ulong const t = r11;
2737      r11 = r15;
2738      r15 = t;
2739    };
2740    if (r1 >= r3) {
2741      ulong const t = r1;
2742      r1 = r3;
2743      r3 = t;
2744    };
2745    if (r5 >= r7) {
2746      ulong const t = r5;
2747      r5 = r7;
2748      r7 = t;
2749    };
2750    if (r9 >= r11) {
2751      ulong const t = r9;
2752      r9 = r11;
2753      r11 = t;
2754    };
2755    if (r13 >= r15) {
2756      ulong const t = r13;
2757      r13 = r15;
2758      r15 = t;
2759    };
2760    if (r2 >= r10) {
2761      ulong const t = r2;
2762      r2 = r10;
2763      r10 = t;
2764    };
2765    if (r6 >= r14) {
2766      ulong const t = r6;
2767      r6 = r14;
2768      r14 = t;
2769    };
2770    if (r2 >= r6) {
2771      ulong const t = r2;
2772      r2 = r6;
2773      r6 = t;
2774    };
2775    if (r10 >= r14) {
2776      ulong const t = r10;
2777      r10 = r14;
2778      r14 = t;
2779    };
2780    if (r4 >= r12) {
2781      ulong const t = r4;
2782      r4 = r12;
2783      r12 = t;
2784    };
2785    if (r8 >= r16) {
2786      ulong const t = r8;
2787      r8 = r16;
2788      r16 = t;
2789    };
2790    if (r4 >= r8) {
2791      ulong const t = r4;
2792      r4 = r8;
2793      r8 = t;
2794    };
2795    if (r12 >= r16) {
2796      ulong const t = r12;
2797      r12 = r16;
2798      r16 = t;
2799    };
2800    if (r2 >= r4) {
2801      ulong const t = r2;
2802      r2 = r4;
2803      r4 = t;
2804    };
2805    if (r6 >= r8) {
2806      ulong const t = r6;
2807      r6 = r8;
2808      r8 = t;
2809    };
2810    if (r10 >= r12) {
2811      ulong const t = r10;
2812      r10 = r12;
2813      r12 = t;
2814    };
2815    if (r14 >= r16) {
2816      ulong const t = r14;
2817      r14 = r16;
2818      r16 = t;
2819    };
2820    if (r1 >= r2) {
2821      ulong const t = r1;
2822      r1 = r2;
2823      r2 = t;
2824    };
2825    if (r3 >= r4) {
2826      ulong const t = r3;
2827      r3 = r4;
2828      r4 = t;
2829    };
2830    if (r5 >= r6) {
2831      ulong const t = r5;
2832      r5 = r6;
2833      r6 = t;
2834    };
2835    if (r7 >= r8) {
2836      ulong const t = r7;
2837      r7 = r8;
2838      r8 = t;
2839    };
2840    if (r9 >= r10) {
2841      ulong const t = r9;
2842      r9 = r10;
2843      r10 = t;
2844    };
2845    if (r11 >= r12) {
2846      ulong const t = r11;
2847      r11 = r12;
2848      r12 = t;
2849    };
2850    if (r13 >= r14) {
2851      ulong const t = r13;
2852      r13 = r14;
2853      r14 = t;
2854    };
2855    if (r15 >= r16) {
2856      ulong const t = r15;
2857      r15 = r16;
2858      r16 = t;
2859    };
2860  }
2861  vout[gmem_idx + (1 << 3) * 0] = r1;
2862  vout[gmem_idx + (1 << 3) * 1] = r2;
2863  vout[gmem_idx + (1 << 3) * 2] = r3;
2864  vout[gmem_idx + (1 << 3) * 3] = r4;
2865  vout[gmem_idx + (1 << 3) * 4] = r5;
2866  vout[gmem_idx + (1 << 3) * 5] = r6;
2867  vout[gmem_idx + (1 << 3) * 6] = r7;
2868  vout[gmem_idx + (1 << 3) * 7] = r8;
2869  vout[gmem_idx + (1 << 3) * 8] = r9;
2870  vout[gmem_idx + (1 << 3) * 9] = r10;
2871  vout[gmem_idx + (1 << 3) * 10] = r11;
2872  vout[gmem_idx + (1 << 3) * 11] = r12;
2873  vout[gmem_idx + (1 << 3) * 12] = r13;
2874  vout[gmem_idx + (1 << 3) * 13] = r14;
2875  vout[gmem_idx + (1 << 3) * 14] = r15;
2876  vout[gmem_idx + (1 << 3) * 15] = r16;
2877}
2878
2879__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
2880__attribute__((reqd_work_group_size((1 << 3) * 4, 1, 1))) void
2881hs_kernel_bs_2(__global ulong const* const restrict vin,
2882               __global ulong* const restrict vout)
2883{
2884  __local struct
2885  {
2886    ulong m[32 * 16];
2887  } shared;
2888
2889  uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 +
2890                        (get_local_id(0) & ((1 << 3) - 1));
2891  ulong r1 = vin[gmem_idx + (1 << 3) * 0];
2892  ulong r2 = vin[gmem_idx + (1 << 3) * 1];
2893  ulong r3 = vin[gmem_idx + (1 << 3) * 2];
2894  ulong r4 = vin[gmem_idx + (1 << 3) * 3];
2895  ulong r5 = vin[gmem_idx + (1 << 3) * 4];
2896  ulong r6 = vin[gmem_idx + (1 << 3) * 5];
2897  ulong r7 = vin[gmem_idx + (1 << 3) * 6];
2898  ulong r8 = vin[gmem_idx + (1 << 3) * 7];
2899  ulong r9 = vin[gmem_idx + (1 << 3) * 8];
2900  ulong r10 = vin[gmem_idx + (1 << 3) * 9];
2901  ulong r11 = vin[gmem_idx + (1 << 3) * 10];
2902  ulong r12 = vin[gmem_idx + (1 << 3) * 11];
2903  ulong r13 = vin[gmem_idx + (1 << 3) * 12];
2904  ulong r14 = vin[gmem_idx + (1 << 3) * 13];
2905  ulong r15 = vin[gmem_idx + (1 << 3) * 14];
2906  ulong r16 = vin[gmem_idx + (1 << 3) * 15];
2907  if (r1 >= r2) {
2908    ulong const t = r1;
2909    r1 = r2;
2910    r2 = t;
2911  };
2912  if (r3 >= r4) {
2913    ulong const t = r3;
2914    r3 = r4;
2915    r4 = t;
2916  };
2917  if (r5 >= r6) {
2918    ulong const t = r5;
2919    r5 = r6;
2920    r6 = t;
2921  };
2922  if (r7 >= r8) {
2923    ulong const t = r7;
2924    r7 = r8;
2925    r8 = t;
2926  };
2927  if (r9 >= r10) {
2928    ulong const t = r9;
2929    r9 = r10;
2930    r10 = t;
2931  };
2932  if (r11 >= r12) {
2933    ulong const t = r11;
2934    r11 = r12;
2935    r12 = t;
2936  };
2937  if (r13 >= r14) {
2938    ulong const t = r13;
2939    r13 = r14;
2940    r14 = t;
2941  };
2942  if (r15 >= r16) {
2943    ulong const t = r15;
2944    r15 = r16;
2945    r16 = t;
2946  };
2947  if (r1 >= r3) {
2948    ulong const t = r1;
2949    r1 = r3;
2950    r3 = t;
2951  };
2952  if (r5 >= r7) {
2953    ulong const t = r5;
2954    r5 = r7;
2955    r7 = t;
2956  };
2957  if (r9 >= r11) {
2958    ulong const t = r9;
2959    r9 = r11;
2960    r11 = t;
2961  };
2962  if (r13 >= r15) {
2963    ulong const t = r13;
2964    r13 = r15;
2965    r15 = t;
2966  };
2967  if (r2 >= r4) {
2968    ulong const t = r2;
2969    r2 = r4;
2970    r4 = t;
2971  };
2972  if (r6 >= r8) {
2973    ulong const t = r6;
2974    r6 = r8;
2975    r8 = t;
2976  };
2977  if (r10 >= r12) {
2978    ulong const t = r10;
2979    r10 = r12;
2980    r12 = t;
2981  };
2982  if (r14 >= r16) {
2983    ulong const t = r14;
2984    r14 = r16;
2985    r16 = t;
2986  };
2987  if (r1 >= r5) {
2988    ulong const t = r1;
2989    r1 = r5;
2990    r5 = t;
2991  };
2992  if (r9 >= r13) {
2993    ulong const t = r9;
2994    r9 = r13;
2995    r13 = t;
2996  };
2997  if (r2 >= r6) {
2998    ulong const t = r2;
2999    r2 = r6;
3000    r6 = t;
3001  };
3002  if (r10 >= r14) {
3003    ulong const t = r10;
3004    r10 = r14;
3005    r14 = t;
3006  };
3007  if (r3 >= r7) {
3008    ulong const t = r3;
3009    r3 = r7;
3010    r7 = t;
3011  };
3012  if (r11 >= r15) {
3013    ulong const t = r11;
3014    r11 = r15;
3015    r15 = t;
3016  };
3017  if (r4 >= r8) {
3018    ulong const t = r4;
3019    r4 = r8;
3020    r8 = t;
3021  };
3022  if (r12 >= r16) {
3023    ulong const t = r12;
3024    r12 = r16;
3025    r16 = t;
3026  };
3027  if (r1 >= r9) {
3028    ulong const t = r1;
3029    r1 = r9;
3030    r9 = t;
3031  };
3032  if (r2 >= r10) {
3033    ulong const t = r2;
3034    r2 = r10;
3035    r10 = t;
3036  };
3037  if (r3 >= r11) {
3038    ulong const t = r3;
3039    r3 = r11;
3040    r11 = t;
3041  };
3042  if (r4 >= r12) {
3043    ulong const t = r4;
3044    r4 = r12;
3045    r12 = t;
3046  };
3047  if (r5 >= r13) {
3048    ulong const t = r5;
3049    r5 = r13;
3050    r13 = t;
3051  };
3052  if (r6 >= r14) {
3053    ulong const t = r6;
3054    r6 = r14;
3055    r14 = t;
3056  };
3057  if (r7 >= r15) {
3058    ulong const t = r7;
3059    r7 = r15;
3060    r15 = t;
3061  };
3062  if (r8 >= r16) {
3063    ulong const t = r8;
3064    r8 = r16;
3065    r16 = t;
3066  };
3067  if (r6 >= r11) {
3068    ulong const t = r6;
3069    r6 = r11;
3070    r11 = t;
3071  };
3072  if (r7 >= r10) {
3073    ulong const t = r7;
3074    r7 = r10;
3075    r10 = t;
3076  };
3077  if (r4 >= r13) {
3078    ulong const t = r4;
3079    r4 = r13;
3080    r13 = t;
3081  };
3082  if (r14 >= r15) {
3083    ulong const t = r14;
3084    r14 = r15;
3085    r15 = t;
3086  };
3087  if (r8 >= r12) {
3088    ulong const t = r8;
3089    r8 = r12;
3090    r12 = t;
3091  };
3092  if (r2 >= r3) {
3093    ulong const t = r2;
3094    r2 = r3;
3095    r3 = t;
3096  };
3097  if (r5 >= r9) {
3098    ulong const t = r5;
3099    r5 = r9;
3100    r9 = t;
3101  };
3102  if (r2 >= r5) {
3103    ulong const t = r2;
3104    r2 = r5;
3105    r5 = t;
3106  };
3107  if (r8 >= r14) {
3108    ulong const t = r8;
3109    r8 = r14;
3110    r14 = t;
3111  };
3112  if (r3 >= r9) {
3113    ulong const t = r3;
3114    r3 = r9;
3115    r9 = t;
3116  };
3117  if (r12 >= r15) {
3118    ulong const t = r12;
3119    r12 = r15;
3120    r15 = t;
3121  };
3122  if (r3 >= r5) {
3123    ulong const t = r3;
3124    r3 = r5;
3125    r5 = t;
3126  };
3127  if (r6 >= r7) {
3128    ulong const t = r6;
3129    r6 = r7;
3130    r7 = t;
3131  };
3132  if (r10 >= r11) {
3133    ulong const t = r10;
3134    r10 = r11;
3135    r11 = t;
3136  };
3137  if (r12 >= r14) {
3138    ulong const t = r12;
3139    r12 = r14;
3140    r14 = t;
3141  };
3142  if (r4 >= r9) {
3143    ulong const t = r4;
3144    r4 = r9;
3145    r9 = t;
3146  };
3147  if (r8 >= r13) {
3148    ulong const t = r8;
3149    r8 = r13;
3150    r13 = t;
3151  };
3152  if (r7 >= r9) {
3153    ulong const t = r7;
3154    r7 = r9;
3155    r9 = t;
3156  };
3157  if (r11 >= r13) {
3158    ulong const t = r11;
3159    r11 = r13;
3160    r13 = t;
3161  };
3162  if (r4 >= r6) {
3163    ulong const t = r4;
3164    r4 = r6;
3165    r6 = t;
3166  };
3167  if (r8 >= r10) {
3168    ulong const t = r8;
3169    r8 = r10;
3170    r10 = t;
3171  };
3172  if (r4 >= r5) {
3173    ulong const t = r4;
3174    r4 = r5;
3175    r5 = t;
3176  };
3177  if (r6 >= r7) {
3178    ulong const t = r6;
3179    r6 = r7;
3180    r7 = t;
3181  };
3182  if (r8 >= r9) {
3183    ulong const t = r8;
3184    r8 = r9;
3185    r9 = t;
3186  };
3187  if (r10 >= r11) {
3188    ulong const t = r10;
3189    r10 = r11;
3190    r11 = t;
3191  };
3192  if (r12 >= r13) {
3193    ulong const t = r12;
3194    r12 = r13;
3195    r13 = t;
3196  };
3197  if (r7 >= r8) {
3198    ulong const t = r7;
3199    r7 = r8;
3200    r8 = t;
3201  };
3202  if (r9 >= r10) {
3203    ulong const t = r9;
3204    r9 = r10;
3205    r10 = t;
3206  };
3207  {
3208    uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
3209    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
3210    ;
3211    {
3212      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
3213      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
3214      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
3215      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
3216    };
3217    {
3218      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
3219      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
3220      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
3221      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
3222    };
3223    {
3224      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
3225      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
3226      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
3227      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
3228    };
3229    {
3230      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
3231      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
3232      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
3233      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
3234    };
3235    {
3236      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
3237      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
3238      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
3239      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
3240    };
3241    {
3242      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
3243      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
3244      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
3245      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
3246    };
3247    {
3248      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
3249      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
3250      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
3251      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
3252    };
3253    {
3254      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
3255      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
3256      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
3257      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
3258    };
3259  }
3260  if (r1 >= r9) {
3261    ulong const t = r1;
3262    r1 = r9;
3263    r9 = t;
3264  };
3265  if (r5 >= r13) {
3266    ulong const t = r5;
3267    r5 = r13;
3268    r13 = t;
3269  };
3270  if (r1 >= r5) {
3271    ulong const t = r1;
3272    r1 = r5;
3273    r5 = t;
3274  };
3275  if (r9 >= r13) {
3276    ulong const t = r9;
3277    r9 = r13;
3278    r13 = t;
3279  };
3280  if (r3 >= r11) {
3281    ulong const t = r3;
3282    r3 = r11;
3283    r11 = t;
3284  };
3285  if (r7 >= r15) {
3286    ulong const t = r7;
3287    r7 = r15;
3288    r15 = t;
3289  };
3290  if (r3 >= r7) {
3291    ulong const t = r3;
3292    r3 = r7;
3293    r7 = t;
3294  };
3295  if (r11 >= r15) {
3296    ulong const t = r11;
3297    r11 = r15;
3298    r15 = t;
3299  };
3300  if (r1 >= r3) {
3301    ulong const t = r1;
3302    r1 = r3;
3303    r3 = t;
3304  };
3305  if (r5 >= r7) {
3306    ulong const t = r5;
3307    r5 = r7;
3308    r7 = t;
3309  };
3310  if (r9 >= r11) {
3311    ulong const t = r9;
3312    r9 = r11;
3313    r11 = t;
3314  };
3315  if (r13 >= r15) {
3316    ulong const t = r13;
3317    r13 = r15;
3318    r15 = t;
3319  };
3320  if (r2 >= r10) {
3321    ulong const t = r2;
3322    r2 = r10;
3323    r10 = t;
3324  };
3325  if (r6 >= r14) {
3326    ulong const t = r6;
3327    r6 = r14;
3328    r14 = t;
3329  };
3330  if (r2 >= r6) {
3331    ulong const t = r2;
3332    r2 = r6;
3333    r6 = t;
3334  };
3335  if (r10 >= r14) {
3336    ulong const t = r10;
3337    r10 = r14;
3338    r14 = t;
3339  };
3340  if (r4 >= r12) {
3341    ulong const t = r4;
3342    r4 = r12;
3343    r12 = t;
3344  };
3345  if (r8 >= r16) {
3346    ulong const t = r8;
3347    r8 = r16;
3348    r16 = t;
3349  };
3350  if (r4 >= r8) {
3351    ulong const t = r4;
3352    r4 = r8;
3353    r8 = t;
3354  };
3355  if (r12 >= r16) {
3356    ulong const t = r12;
3357    r12 = r16;
3358    r16 = t;
3359  };
3360  if (r2 >= r4) {
3361    ulong const t = r2;
3362    r2 = r4;
3363    r4 = t;
3364  };
3365  if (r6 >= r8) {
3366    ulong const t = r6;
3367    r6 = r8;
3368    r8 = t;
3369  };
3370  if (r10 >= r12) {
3371    ulong const t = r10;
3372    r10 = r12;
3373    r12 = t;
3374  };
3375  if (r14 >= r16) {
3376    ulong const t = r14;
3377    r14 = r16;
3378    r16 = t;
3379  };
3380  if (r1 >= r2) {
3381    ulong const t = r1;
3382    r1 = r2;
3383    r2 = t;
3384  };
3385  if (r3 >= r4) {
3386    ulong const t = r3;
3387    r3 = r4;
3388    r4 = t;
3389  };
3390  if (r5 >= r6) {
3391    ulong const t = r5;
3392    r5 = r6;
3393    r6 = t;
3394  };
3395  if (r7 >= r8) {
3396    ulong const t = r7;
3397    r7 = r8;
3398    r8 = t;
3399  };
3400  if (r9 >= r10) {
3401    ulong const t = r9;
3402    r9 = r10;
3403    r10 = t;
3404  };
3405  if (r11 >= r12) {
3406    ulong const t = r11;
3407    r11 = r12;
3408    r12 = t;
3409  };
3410  if (r13 >= r14) {
3411    ulong const t = r13;
3412    r13 = r14;
3413    r14 = t;
3414  };
3415  if (r15 >= r16) {
3416    ulong const t = r15;
3417    r15 = r16;
3418    r16 = t;
3419  };
3420  {
3421    uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
3422    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
3423    ;
3424    {
3425      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
3426      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
3427      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
3428      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
3429    };
3430    {
3431      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
3432      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
3433      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
3434      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
3435    };
3436    {
3437      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
3438      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
3439      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
3440      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
3441    };
3442    {
3443      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
3444      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
3445      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
3446      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
3447    };
3448    {
3449      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
3450      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
3451      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
3452      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
3453    };
3454    {
3455      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
3456      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
3457      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
3458      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
3459    };
3460    {
3461      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
3462      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
3463      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
3464      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
3465    };
3466    {
3467      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
3468      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
3469      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
3470      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
3471    };
3472  }
3473  {
3474    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
3475    int const t_lt = get_sub_group_local_id() < half_lane_idx;
3476    ;
3477    {
3478      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3479      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3480    };
3481    {
3482      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3483      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3484    };
3485    {
3486      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3487      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3488    };
3489    {
3490      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3491      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3492    };
3493    {
3494      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3495      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3496    };
3497    {
3498      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3499      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3500    };
3501    {
3502      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3503      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3504    };
3505    {
3506      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3507      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3508    };
3509    {
3510      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
3511      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
3512    };
3513    {
3514      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
3515      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
3516    };
3517    {
3518      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
3519      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
3520    };
3521    {
3522      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
3523      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
3524    };
3525    {
3526      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
3527      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
3528    };
3529    {
3530      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
3531      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
3532    };
3533    {
3534      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
3535      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
3536    };
3537    {
3538      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
3539      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
3540    };
3541  }
3542  if (r1 >= r9) {
3543    ulong const t = r1;
3544    r1 = r9;
3545    r9 = t;
3546  };
3547  if (r5 >= r13) {
3548    ulong const t = r5;
3549    r5 = r13;
3550    r13 = t;
3551  };
3552  if (r1 >= r5) {
3553    ulong const t = r1;
3554    r1 = r5;
3555    r5 = t;
3556  };
3557  if (r9 >= r13) {
3558    ulong const t = r9;
3559    r9 = r13;
3560    r13 = t;
3561  };
3562  if (r3 >= r11) {
3563    ulong const t = r3;
3564    r3 = r11;
3565    r11 = t;
3566  };
3567  if (r7 >= r15) {
3568    ulong const t = r7;
3569    r7 = r15;
3570    r15 = t;
3571  };
3572  if (r3 >= r7) {
3573    ulong const t = r3;
3574    r3 = r7;
3575    r7 = t;
3576  };
3577  if (r11 >= r15) {
3578    ulong const t = r11;
3579    r11 = r15;
3580    r15 = t;
3581  };
3582  if (r1 >= r3) {
3583    ulong const t = r1;
3584    r1 = r3;
3585    r3 = t;
3586  };
3587  if (r5 >= r7) {
3588    ulong const t = r5;
3589    r5 = r7;
3590    r7 = t;
3591  };
3592  if (r9 >= r11) {
3593    ulong const t = r9;
3594    r9 = r11;
3595    r11 = t;
3596  };
3597  if (r13 >= r15) {
3598    ulong const t = r13;
3599    r13 = r15;
3600    r15 = t;
3601  };
3602  if (r2 >= r10) {
3603    ulong const t = r2;
3604    r2 = r10;
3605    r10 = t;
3606  };
3607  if (r6 >= r14) {
3608    ulong const t = r6;
3609    r6 = r14;
3610    r14 = t;
3611  };
3612  if (r2 >= r6) {
3613    ulong const t = r2;
3614    r2 = r6;
3615    r6 = t;
3616  };
3617  if (r10 >= r14) {
3618    ulong const t = r10;
3619    r10 = r14;
3620    r14 = t;
3621  };
3622  if (r4 >= r12) {
3623    ulong const t = r4;
3624    r4 = r12;
3625    r12 = t;
3626  };
3627  if (r8 >= r16) {
3628    ulong const t = r8;
3629    r8 = r16;
3630    r16 = t;
3631  };
3632  if (r4 >= r8) {
3633    ulong const t = r4;
3634    r4 = r8;
3635    r8 = t;
3636  };
3637  if (r12 >= r16) {
3638    ulong const t = r12;
3639    r12 = r16;
3640    r16 = t;
3641  };
3642  if (r2 >= r4) {
3643    ulong const t = r2;
3644    r2 = r4;
3645    r4 = t;
3646  };
3647  if (r6 >= r8) {
3648    ulong const t = r6;
3649    r6 = r8;
3650    r8 = t;
3651  };
3652  if (r10 >= r12) {
3653    ulong const t = r10;
3654    r10 = r12;
3655    r12 = t;
3656  };
3657  if (r14 >= r16) {
3658    ulong const t = r14;
3659    r14 = r16;
3660    r16 = t;
3661  };
3662  if (r1 >= r2) {
3663    ulong const t = r1;
3664    r1 = r2;
3665    r2 = t;
3666  };
3667  if (r3 >= r4) {
3668    ulong const t = r3;
3669    r3 = r4;
3670    r4 = t;
3671  };
3672  if (r5 >= r6) {
3673    ulong const t = r5;
3674    r5 = r6;
3675    r6 = t;
3676  };
3677  if (r7 >= r8) {
3678    ulong const t = r7;
3679    r7 = r8;
3680    r8 = t;
3681  };
3682  if (r9 >= r10) {
3683    ulong const t = r9;
3684    r9 = r10;
3685    r10 = t;
3686  };
3687  if (r11 >= r12) {
3688    ulong const t = r11;
3689    r11 = r12;
3690    r12 = t;
3691  };
3692  if (r13 >= r14) {
3693    ulong const t = r13;
3694    r13 = r14;
3695    r14 = t;
3696  };
3697  if (r15 >= r16) {
3698    ulong const t = r15;
3699    r15 = r16;
3700    r16 = t;
3701  };
3702  {
3703    uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
3704    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
3705    ;
3706    {
3707      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
3708      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
3709      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
3710      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
3711    };
3712    {
3713      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
3714      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
3715      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
3716      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
3717    };
3718    {
3719      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
3720      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
3721      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
3722      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
3723    };
3724    {
3725      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
3726      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
3727      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
3728      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
3729    };
3730    {
3731      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
3732      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
3733      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
3734      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
3735    };
3736    {
3737      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
3738      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
3739      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
3740      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
3741    };
3742    {
3743      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
3744      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
3745      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
3746      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
3747    };
3748    {
3749      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
3750      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
3751      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
3752      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
3753    };
3754  }
3755  {
3756    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
3757    int const t_lt = get_sub_group_local_id() < half_lane_idx;
3758    ;
3759    {
3760      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3761      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3762    };
3763    {
3764      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3765      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3766    };
3767    {
3768      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3769      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3770    };
3771    {
3772      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3773      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3774    };
3775    {
3776      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3777      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3778    };
3779    {
3780      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3781      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3782    };
3783    {
3784      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3785      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3786    };
3787    {
3788      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3789      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3790    };
3791    {
3792      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
3793      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
3794    };
3795    {
3796      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
3797      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
3798    };
3799    {
3800      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
3801      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
3802    };
3803    {
3804      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
3805      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
3806    };
3807    {
3808      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
3809      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
3810    };
3811    {
3812      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
3813      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
3814    };
3815    {
3816      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
3817      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
3818    };
3819    {
3820      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
3821      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
3822    };
3823  }
3824  {
3825    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
3826    int const t_lt = get_sub_group_local_id() < half_lane_idx;
3827    ;
3828    {
3829      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3830      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3831    };
3832    {
3833      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3834      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3835    };
3836    {
3837      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3838      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3839    };
3840    {
3841      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3842      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3843    };
3844    {
3845      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3846      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3847    };
3848    {
3849      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3850      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3851    };
3852    {
3853      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3854      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3855    };
3856    {
3857      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3858      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3859    };
3860    {
3861      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
3862      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
3863    };
3864    {
3865      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
3866      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
3867    };
3868    {
3869      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
3870      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
3871    };
3872    {
3873      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
3874      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
3875    };
3876    {
3877      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
3878      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
3879    };
3880    {
3881      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
3882      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
3883    };
3884    {
3885      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
3886      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
3887    };
3888    {
3889      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
3890      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
3891    };
3892  }
3893  if (r1 >= r9) {
3894    ulong const t = r1;
3895    r1 = r9;
3896    r9 = t;
3897  };
3898  if (r5 >= r13) {
3899    ulong const t = r5;
3900    r5 = r13;
3901    r13 = t;
3902  };
3903  if (r1 >= r5) {
3904    ulong const t = r1;
3905    r1 = r5;
3906    r5 = t;
3907  };
3908  if (r9 >= r13) {
3909    ulong const t = r9;
3910    r9 = r13;
3911    r13 = t;
3912  };
3913  if (r3 >= r11) {
3914    ulong const t = r3;
3915    r3 = r11;
3916    r11 = t;
3917  };
3918  if (r7 >= r15) {
3919    ulong const t = r7;
3920    r7 = r15;
3921    r15 = t;
3922  };
3923  if (r3 >= r7) {
3924    ulong const t = r3;
3925    r3 = r7;
3926    r7 = t;
3927  };
3928  if (r11 >= r15) {
3929    ulong const t = r11;
3930    r11 = r15;
3931    r15 = t;
3932  };
3933  if (r1 >= r3) {
3934    ulong const t = r1;
3935    r1 = r3;
3936    r3 = t;
3937  };
3938  if (r5 >= r7) {
3939    ulong const t = r5;
3940    r5 = r7;
3941    r7 = t;
3942  };
3943  if (r9 >= r11) {
3944    ulong const t = r9;
3945    r9 = r11;
3946    r11 = t;
3947  };
3948  if (r13 >= r15) {
3949    ulong const t = r13;
3950    r13 = r15;
3951    r15 = t;
3952  };
3953  if (r2 >= r10) {
3954    ulong const t = r2;
3955    r2 = r10;
3956    r10 = t;
3957  };
3958  if (r6 >= r14) {
3959    ulong const t = r6;
3960    r6 = r14;
3961    r14 = t;
3962  };
3963  if (r2 >= r6) {
3964    ulong const t = r2;
3965    r2 = r6;
3966    r6 = t;
3967  };
3968  if (r10 >= r14) {
3969    ulong const t = r10;
3970    r10 = r14;
3971    r14 = t;
3972  };
3973  if (r4 >= r12) {
3974    ulong const t = r4;
3975    r4 = r12;
3976    r12 = t;
3977  };
3978  if (r8 >= r16) {
3979    ulong const t = r8;
3980    r8 = r16;
3981    r16 = t;
3982  };
3983  if (r4 >= r8) {
3984    ulong const t = r4;
3985    r4 = r8;
3986    r8 = t;
3987  };
3988  if (r12 >= r16) {
3989    ulong const t = r12;
3990    r12 = r16;
3991    r16 = t;
3992  };
3993  if (r2 >= r4) {
3994    ulong const t = r2;
3995    r2 = r4;
3996    r4 = t;
3997  };
3998  if (r6 >= r8) {
3999    ulong const t = r6;
4000    r6 = r8;
4001    r8 = t;
4002  };
4003  if (r10 >= r12) {
4004    ulong const t = r10;
4005    r10 = r12;
4006    r12 = t;
4007  };
4008  if (r14 >= r16) {
4009    ulong const t = r14;
4010    r14 = r16;
4011    r16 = t;
4012  };
4013  if (r1 >= r2) {
4014    ulong const t = r1;
4015    r1 = r2;
4016    r2 = t;
4017  };
4018  if (r3 >= r4) {
4019    ulong const t = r3;
4020    r3 = r4;
4021    r4 = t;
4022  };
4023  if (r5 >= r6) {
4024    ulong const t = r5;
4025    r5 = r6;
4026    r6 = t;
4027  };
4028  if (r7 >= r8) {
4029    ulong const t = r7;
4030    r7 = r8;
4031    r8 = t;
4032  };
4033  if (r9 >= r10) {
4034    ulong const t = r9;
4035    r9 = r10;
4036    r10 = t;
4037  };
4038  if (r11 >= r12) {
4039    ulong const t = r11;
4040    r11 = r12;
4041    r12 = t;
4042  };
4043  if (r13 >= r14) {
4044    ulong const t = r13;
4045    r13 = r14;
4046    r14 = t;
4047  };
4048  if (r15 >= r16) {
4049    ulong const t = r15;
4050    r15 = r16;
4051    r16 = t;
4052  };
4053  uint const smem_l_idx =
4054    get_sub_group_id() * ((1 << 3) * 4) + get_sub_group_local_id();
4055  uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 3) * 4) +
4056                          (get_sub_group_local_id() ^ ((1 << 3) - 1));
4057  shared.m[get_local_id(0) + (4 * (1 << 3) * 0)] = r1;
4058  shared.m[get_local_id(0) + (4 * (1 << 3) * 1)] = r16;
4059  shared.m[get_local_id(0) + (4 * (1 << 3) * 2)] = r2;
4060  shared.m[get_local_id(0) + (4 * (1 << 3) * 3)] = r15;
4061  shared.m[get_local_id(0) + (4 * (1 << 3) * 4)] = r3;
4062  shared.m[get_local_id(0) + (4 * (1 << 3) * 5)] = r14;
4063  shared.m[get_local_id(0) + (4 * (1 << 3) * 6)] = r4;
4064  shared.m[get_local_id(0) + (4 * (1 << 3) * 7)] = r13;
4065  shared.m[get_local_id(0) + (4 * (1 << 3) * 8)] = r5;
4066  shared.m[get_local_id(0) + (4 * (1 << 3) * 9)] = r12;
4067  shared.m[get_local_id(0) + (4 * (1 << 3) * 10)] = r6;
4068  shared.m[get_local_id(0) + (4 * (1 << 3) * 11)] = r11;
4069  shared.m[get_local_id(0) + (4 * (1 << 3) * 12)] = r7;
4070  shared.m[get_local_id(0) + (4 * (1 << 3) * 13)] = r10;
4071  shared.m[get_local_id(0) + (4 * (1 << 3) * 14)] = r8;
4072  shared.m[get_local_id(0) + (4 * (1 << 3) * 15)] = r9;
4073  barrier(CLK_LOCAL_MEM_FENCE);
4074  {
4075    {
4076      ulong r0_1 = shared.m[smem_l_idx + (0)];
4077      ulong r0_2 = shared.m[smem_r_idx + (8)];
4078      if (r0_1 >= r0_2) {
4079        ulong const t = r0_1;
4080        r0_1 = r0_2;
4081        r0_2 = t;
4082      };
4083      shared.m[smem_l_idx + (0)] = r0_1;
4084      shared.m[smem_r_idx + (8)] = r0_2;
4085    }
4086    {
4087      ulong r1_1 = shared.m[smem_l_idx + (16)];
4088      ulong r1_2 = shared.m[smem_r_idx + (24)];
4089      if (r1_1 >= r1_2) {
4090        ulong const t = r1_1;
4091        r1_1 = r1_2;
4092        r1_2 = t;
4093      };
4094      shared.m[smem_l_idx + (16)] = r1_1;
4095      shared.m[smem_r_idx + (24)] = r1_2;
4096    }
4097    {
4098      ulong r0_1 = shared.m[smem_l_idx + (128)];
4099      ulong r0_2 = shared.m[smem_r_idx + (136)];
4100      if (r0_1 >= r0_2) {
4101        ulong const t = r0_1;
4102        r0_1 = r0_2;
4103        r0_2 = t;
4104      };
4105      shared.m[smem_l_idx + (128)] = r0_1;
4106      shared.m[smem_r_idx + (136)] = r0_2;
4107    }
4108    {
4109      ulong r1_1 = shared.m[smem_l_idx + (144)];
4110      ulong r1_2 = shared.m[smem_r_idx + (152)];
4111      if (r1_1 >= r1_2) {
4112        ulong const t = r1_1;
4113        r1_1 = r1_2;
4114        r1_2 = t;
4115      };
4116      shared.m[smem_l_idx + (144)] = r1_1;
4117      shared.m[smem_r_idx + (152)] = r1_2;
4118    }
4119    {
4120      ulong r0_1 = shared.m[smem_l_idx + (256)];
4121      ulong r0_2 = shared.m[smem_r_idx + (264)];
4122      if (r0_1 >= r0_2) {
4123        ulong const t = r0_1;
4124        r0_1 = r0_2;
4125        r0_2 = t;
4126      };
4127      shared.m[smem_l_idx + (256)] = r0_1;
4128      shared.m[smem_r_idx + (264)] = r0_2;
4129    }
4130    {
4131      ulong r1_1 = shared.m[smem_l_idx + (272)];
4132      ulong r1_2 = shared.m[smem_r_idx + (280)];
4133      if (r1_1 >= r1_2) {
4134        ulong const t = r1_1;
4135        r1_1 = r1_2;
4136        r1_2 = t;
4137      };
4138      shared.m[smem_l_idx + (272)] = r1_1;
4139      shared.m[smem_r_idx + (280)] = r1_2;
4140    }
4141    {
4142      ulong r0_1 = shared.m[smem_l_idx + (384)];
4143      ulong r0_2 = shared.m[smem_r_idx + (392)];
4144      if (r0_1 >= r0_2) {
4145        ulong const t = r0_1;
4146        r0_1 = r0_2;
4147        r0_2 = t;
4148      };
4149      shared.m[smem_l_idx + (384)] = r0_1;
4150      shared.m[smem_r_idx + (392)] = r0_2;
4151    }
4152    {
4153      ulong r1_1 = shared.m[smem_l_idx + (400)];
4154      ulong r1_2 = shared.m[smem_r_idx + (408)];
4155      if (r1_1 >= r1_2) {
4156        ulong const t = r1_1;
4157        r1_1 = r1_2;
4158        r1_2 = t;
4159      };
4160      shared.m[smem_l_idx + (400)] = r1_1;
4161      shared.m[smem_r_idx + (408)] = r1_2;
4162    }
4163  }
4164  barrier(CLK_LOCAL_MEM_FENCE);
4165  r1 = shared.m[get_local_id(0) + (4 * (1 << 3) * 0)];
4166  r16 = shared.m[get_local_id(0) + (4 * (1 << 3) * 1)];
4167  r2 = shared.m[get_local_id(0) + (4 * (1 << 3) * 2)];
4168  r15 = shared.m[get_local_id(0) + (4 * (1 << 3) * 3)];
4169  r3 = shared.m[get_local_id(0) + (4 * (1 << 3) * 4)];
4170  r14 = shared.m[get_local_id(0) + (4 * (1 << 3) * 5)];
4171  r4 = shared.m[get_local_id(0) + (4 * (1 << 3) * 6)];
4172  r13 = shared.m[get_local_id(0) + (4 * (1 << 3) * 7)];
4173  r5 = shared.m[get_local_id(0) + (4 * (1 << 3) * 8)];
4174  r12 = shared.m[get_local_id(0) + (4 * (1 << 3) * 9)];
4175  r6 = shared.m[get_local_id(0) + (4 * (1 << 3) * 10)];
4176  r11 = shared.m[get_local_id(0) + (4 * (1 << 3) * 11)];
4177  r7 = shared.m[get_local_id(0) + (4 * (1 << 3) * 12)];
4178  r10 = shared.m[get_local_id(0) + (4 * (1 << 3) * 13)];
4179  r8 = shared.m[get_local_id(0) + (4 * (1 << 3) * 14)];
4180  r9 = shared.m[get_local_id(0) + (4 * (1 << 3) * 15)];
4181  {
4182    {
4183      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
4184      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4185      ;
4186      {
4187        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4188        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4189      };
4190      {
4191        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4192        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4193      };
4194      {
4195        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4196        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4197      };
4198      {
4199        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4200        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4201      };
4202      {
4203        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4204        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4205      };
4206      {
4207        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4208        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4209      };
4210      {
4211        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4212        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4213      };
4214      {
4215        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4216        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4217      };
4218      {
4219        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
4220        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
4221      };
4222      {
4223        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
4224        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
4225      };
4226      {
4227        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
4228        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
4229      };
4230      {
4231        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
4232        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
4233      };
4234      {
4235        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
4236        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
4237      };
4238      {
4239        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
4240        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
4241      };
4242      {
4243        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
4244        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
4245      };
4246      {
4247        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
4248        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
4249      };
4250    }
4251    {
4252      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
4253      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4254      ;
4255      {
4256        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4257        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4258      };
4259      {
4260        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4261        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4262      };
4263      {
4264        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4265        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4266      };
4267      {
4268        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4269        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4270      };
4271      {
4272        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4273        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4274      };
4275      {
4276        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4277        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4278      };
4279      {
4280        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4281        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4282      };
4283      {
4284        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4285        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4286      };
4287      {
4288        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
4289        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
4290      };
4291      {
4292        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
4293        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
4294      };
4295      {
4296        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
4297        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
4298      };
4299      {
4300        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
4301        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
4302      };
4303      {
4304        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
4305        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
4306      };
4307      {
4308        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
4309        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
4310      };
4311      {
4312        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
4313        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
4314      };
4315      {
4316        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
4317        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
4318      };
4319    }
4320    {
4321      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
4322      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4323      ;
4324      {
4325        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4326        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4327      };
4328      {
4329        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4330        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4331      };
4332      {
4333        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4334        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4335      };
4336      {
4337        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4338        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4339      };
4340      {
4341        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4342        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4343      };
4344      {
4345        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4346        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4347      };
4348      {
4349        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4350        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4351      };
4352      {
4353        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4354        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4355      };
4356      {
4357        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
4358        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
4359      };
4360      {
4361        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
4362        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
4363      };
4364      {
4365        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
4366        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
4367      };
4368      {
4369        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
4370        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
4371      };
4372      {
4373        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
4374        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
4375      };
4376      {
4377        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
4378        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
4379      };
4380      {
4381        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
4382        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
4383      };
4384      {
4385        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
4386        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
4387      };
4388    }
4389    if (r1 >= r9) {
4390      ulong const t = r1;
4391      r1 = r9;
4392      r9 = t;
4393    };
4394    if (r5 >= r13) {
4395      ulong const t = r5;
4396      r5 = r13;
4397      r13 = t;
4398    };
4399    if (r1 >= r5) {
4400      ulong const t = r1;
4401      r1 = r5;
4402      r5 = t;
4403    };
4404    if (r9 >= r13) {
4405      ulong const t = r9;
4406      r9 = r13;
4407      r13 = t;
4408    };
4409    if (r3 >= r11) {
4410      ulong const t = r3;
4411      r3 = r11;
4412      r11 = t;
4413    };
4414    if (r7 >= r15) {
4415      ulong const t = r7;
4416      r7 = r15;
4417      r15 = t;
4418    };
4419    if (r3 >= r7) {
4420      ulong const t = r3;
4421      r3 = r7;
4422      r7 = t;
4423    };
4424    if (r11 >= r15) {
4425      ulong const t = r11;
4426      r11 = r15;
4427      r15 = t;
4428    };
4429    if (r1 >= r3) {
4430      ulong const t = r1;
4431      r1 = r3;
4432      r3 = t;
4433    };
4434    if (r5 >= r7) {
4435      ulong const t = r5;
4436      r5 = r7;
4437      r7 = t;
4438    };
4439    if (r9 >= r11) {
4440      ulong const t = r9;
4441      r9 = r11;
4442      r11 = t;
4443    };
4444    if (r13 >= r15) {
4445      ulong const t = r13;
4446      r13 = r15;
4447      r15 = t;
4448    };
4449    if (r2 >= r10) {
4450      ulong const t = r2;
4451      r2 = r10;
4452      r10 = t;
4453    };
4454    if (r6 >= r14) {
4455      ulong const t = r6;
4456      r6 = r14;
4457      r14 = t;
4458    };
4459    if (r2 >= r6) {
4460      ulong const t = r2;
4461      r2 = r6;
4462      r6 = t;
4463    };
4464    if (r10 >= r14) {
4465      ulong const t = r10;
4466      r10 = r14;
4467      r14 = t;
4468    };
4469    if (r4 >= r12) {
4470      ulong const t = r4;
4471      r4 = r12;
4472      r12 = t;
4473    };
4474    if (r8 >= r16) {
4475      ulong const t = r8;
4476      r8 = r16;
4477      r16 = t;
4478    };
4479    if (r4 >= r8) {
4480      ulong const t = r4;
4481      r4 = r8;
4482      r8 = t;
4483    };
4484    if (r12 >= r16) {
4485      ulong const t = r12;
4486      r12 = r16;
4487      r16 = t;
4488    };
4489    if (r2 >= r4) {
4490      ulong const t = r2;
4491      r2 = r4;
4492      r4 = t;
4493    };
4494    if (r6 >= r8) {
4495      ulong const t = r6;
4496      r6 = r8;
4497      r8 = t;
4498    };
4499    if (r10 >= r12) {
4500      ulong const t = r10;
4501      r10 = r12;
4502      r12 = t;
4503    };
4504    if (r14 >= r16) {
4505      ulong const t = r14;
4506      r14 = r16;
4507      r16 = t;
4508    };
4509    if (r1 >= r2) {
4510      ulong const t = r1;
4511      r1 = r2;
4512      r2 = t;
4513    };
4514    if (r3 >= r4) {
4515      ulong const t = r3;
4516      r3 = r4;
4517      r4 = t;
4518    };
4519    if (r5 >= r6) {
4520      ulong const t = r5;
4521      r5 = r6;
4522      r6 = t;
4523    };
4524    if (r7 >= r8) {
4525      ulong const t = r7;
4526      r7 = r8;
4527      r8 = t;
4528    };
4529    if (r9 >= r10) {
4530      ulong const t = r9;
4531      r9 = r10;
4532      r10 = t;
4533    };
4534    if (r11 >= r12) {
4535      ulong const t = r11;
4536      r11 = r12;
4537      r12 = t;
4538    };
4539    if (r13 >= r14) {
4540      ulong const t = r13;
4541      r13 = r14;
4542      r14 = t;
4543    };
4544    if (r15 >= r16) {
4545      ulong const t = r15;
4546      r15 = r16;
4547      r16 = t;
4548    };
4549  }
4550  shared.m[get_local_id(0) + (4 * (1 << 3) * 0)] = r1;
4551  shared.m[get_local_id(0) + (4 * (1 << 3) * 1)] = r16;
4552  shared.m[get_local_id(0) + (4 * (1 << 3) * 2)] = r2;
4553  shared.m[get_local_id(0) + (4 * (1 << 3) * 3)] = r15;
4554  shared.m[get_local_id(0) + (4 * (1 << 3) * 4)] = r3;
4555  shared.m[get_local_id(0) + (4 * (1 << 3) * 5)] = r14;
4556  shared.m[get_local_id(0) + (4 * (1 << 3) * 6)] = r4;
4557  shared.m[get_local_id(0) + (4 * (1 << 3) * 7)] = r13;
4558  shared.m[get_local_id(0) + (4 * (1 << 3) * 8)] = r5;
4559  shared.m[get_local_id(0) + (4 * (1 << 3) * 9)] = r12;
4560  shared.m[get_local_id(0) + (4 * (1 << 3) * 10)] = r6;
4561  shared.m[get_local_id(0) + (4 * (1 << 3) * 11)] = r11;
4562  shared.m[get_local_id(0) + (4 * (1 << 3) * 12)] = r7;
4563  shared.m[get_local_id(0) + (4 * (1 << 3) * 13)] = r10;
4564  shared.m[get_local_id(0) + (4 * (1 << 3) * 14)] = r8;
4565  shared.m[get_local_id(0) + (4 * (1 << 3) * 15)] = r9;
4566  barrier(CLK_LOCAL_MEM_FENCE);
4567  {
4568    {
4569      ulong r0_1 = shared.m[smem_l_idx + (0)];
4570      ulong r0_2 = shared.m[smem_l_idx + (8)];
4571      ulong r0_3 = shared.m[smem_r_idx + (16)];
4572      ulong r0_4 = shared.m[smem_r_idx + (24)];
4573      if (r0_2 >= r0_3) {
4574        ulong const t = r0_2;
4575        r0_2 = r0_3;
4576        r0_3 = t;
4577      };
4578      if (r0_1 >= r0_4) {
4579        ulong const t = r0_1;
4580        r0_1 = r0_4;
4581        r0_4 = t;
4582      };
4583      if (r0_3 >= r0_4) {
4584        ulong const t = r0_3;
4585        r0_3 = r0_4;
4586        r0_4 = t;
4587      };
4588      if (r0_1 >= r0_2) {
4589        ulong const t = r0_1;
4590        r0_1 = r0_2;
4591        r0_2 = t;
4592      };
4593      shared.m[smem_l_idx + (0)] = r0_1;
4594      shared.m[smem_l_idx + (8)] = r0_2;
4595      shared.m[smem_r_idx + (16)] = r0_3;
4596      shared.m[smem_r_idx + (24)] = r0_4;
4597    }
4598    {
4599      ulong r0_1 = shared.m[smem_l_idx + (128)];
4600      ulong r0_2 = shared.m[smem_l_idx + (136)];
4601      ulong r0_3 = shared.m[smem_r_idx + (144)];
4602      ulong r0_4 = shared.m[smem_r_idx + (152)];
4603      if (r0_2 >= r0_3) {
4604        ulong const t = r0_2;
4605        r0_2 = r0_3;
4606        r0_3 = t;
4607      };
4608      if (r0_1 >= r0_4) {
4609        ulong const t = r0_1;
4610        r0_1 = r0_4;
4611        r0_4 = t;
4612      };
4613      if (r0_3 >= r0_4) {
4614        ulong const t = r0_3;
4615        r0_3 = r0_4;
4616        r0_4 = t;
4617      };
4618      if (r0_1 >= r0_2) {
4619        ulong const t = r0_1;
4620        r0_1 = r0_2;
4621        r0_2 = t;
4622      };
4623      shared.m[smem_l_idx + (128)] = r0_1;
4624      shared.m[smem_l_idx + (136)] = r0_2;
4625      shared.m[smem_r_idx + (144)] = r0_3;
4626      shared.m[smem_r_idx + (152)] = r0_4;
4627    }
4628    {
4629      ulong r0_1 = shared.m[smem_l_idx + (256)];
4630      ulong r0_2 = shared.m[smem_l_idx + (264)];
4631      ulong r0_3 = shared.m[smem_r_idx + (272)];
4632      ulong r0_4 = shared.m[smem_r_idx + (280)];
4633      if (r0_2 >= r0_3) {
4634        ulong const t = r0_2;
4635        r0_2 = r0_3;
4636        r0_3 = t;
4637      };
4638      if (r0_1 >= r0_4) {
4639        ulong const t = r0_1;
4640        r0_1 = r0_4;
4641        r0_4 = t;
4642      };
4643      if (r0_3 >= r0_4) {
4644        ulong const t = r0_3;
4645        r0_3 = r0_4;
4646        r0_4 = t;
4647      };
4648      if (r0_1 >= r0_2) {
4649        ulong const t = r0_1;
4650        r0_1 = r0_2;
4651        r0_2 = t;
4652      };
4653      shared.m[smem_l_idx + (256)] = r0_1;
4654      shared.m[smem_l_idx + (264)] = r0_2;
4655      shared.m[smem_r_idx + (272)] = r0_3;
4656      shared.m[smem_r_idx + (280)] = r0_4;
4657    }
4658    {
4659      ulong r0_1 = shared.m[smem_l_idx + (384)];
4660      ulong r0_2 = shared.m[smem_l_idx + (392)];
4661      ulong r0_3 = shared.m[smem_r_idx + (400)];
4662      ulong r0_4 = shared.m[smem_r_idx + (408)];
4663      if (r0_2 >= r0_3) {
4664        ulong const t = r0_2;
4665        r0_2 = r0_3;
4666        r0_3 = t;
4667      };
4668      if (r0_1 >= r0_4) {
4669        ulong const t = r0_1;
4670        r0_1 = r0_4;
4671        r0_4 = t;
4672      };
4673      if (r0_3 >= r0_4) {
4674        ulong const t = r0_3;
4675        r0_3 = r0_4;
4676        r0_4 = t;
4677      };
4678      if (r0_1 >= r0_2) {
4679        ulong const t = r0_1;
4680        r0_1 = r0_2;
4681        r0_2 = t;
4682      };
4683      shared.m[smem_l_idx + (384)] = r0_1;
4684      shared.m[smem_l_idx + (392)] = r0_2;
4685      shared.m[smem_r_idx + (400)] = r0_3;
4686      shared.m[smem_r_idx + (408)] = r0_4;
4687    }
4688  }
4689  barrier(CLK_LOCAL_MEM_FENCE);
4690  r1 = shared.m[get_local_id(0) + (4 * (1 << 3) * 0)];
4691  r16 = shared.m[get_local_id(0) + (4 * (1 << 3) * 1)];
4692  r2 = shared.m[get_local_id(0) + (4 * (1 << 3) * 2)];
4693  r15 = shared.m[get_local_id(0) + (4 * (1 << 3) * 3)];
4694  r3 = shared.m[get_local_id(0) + (4 * (1 << 3) * 4)];
4695  r14 = shared.m[get_local_id(0) + (4 * (1 << 3) * 5)];
4696  r4 = shared.m[get_local_id(0) + (4 * (1 << 3) * 6)];
4697  r13 = shared.m[get_local_id(0) + (4 * (1 << 3) * 7)];
4698  r5 = shared.m[get_local_id(0) + (4 * (1 << 3) * 8)];
4699  r12 = shared.m[get_local_id(0) + (4 * (1 << 3) * 9)];
4700  r6 = shared.m[get_local_id(0) + (4 * (1 << 3) * 10)];
4701  r11 = shared.m[get_local_id(0) + (4 * (1 << 3) * 11)];
4702  r7 = shared.m[get_local_id(0) + (4 * (1 << 3) * 12)];
4703  r10 = shared.m[get_local_id(0) + (4 * (1 << 3) * 13)];
4704  r8 = shared.m[get_local_id(0) + (4 * (1 << 3) * 14)];
4705  r9 = shared.m[get_local_id(0) + (4 * (1 << 3) * 15)];
4706  {
4707    {
4708      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
4709      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4710      ;
4711      {
4712        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4713        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4714      };
4715      {
4716        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4717        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4718      };
4719      {
4720        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4721        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4722      };
4723      {
4724        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4725        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4726      };
4727      {
4728        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4729        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4730      };
4731      {
4732        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4733        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4734      };
4735      {
4736        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4737        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4738      };
4739      {
4740        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4741        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4742      };
4743      {
4744        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
4745        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
4746      };
4747      {
4748        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
4749        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
4750      };
4751      {
4752        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
4753        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
4754      };
4755      {
4756        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
4757        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
4758      };
4759      {
4760        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
4761        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
4762      };
4763      {
4764        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
4765        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
4766      };
4767      {
4768        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
4769        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
4770      };
4771      {
4772        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
4773        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
4774      };
4775    }
4776    {
4777      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
4778      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4779      ;
4780      {
4781        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4782        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4783      };
4784      {
4785        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4786        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4787      };
4788      {
4789        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4790        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4791      };
4792      {
4793        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4794        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4795      };
4796      {
4797        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4798        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4799      };
4800      {
4801        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4802        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4803      };
4804      {
4805        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4806        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4807      };
4808      {
4809        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4810        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4811      };
4812      {
4813        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
4814        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
4815      };
4816      {
4817        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
4818        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
4819      };
4820      {
4821        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
4822        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
4823      };
4824      {
4825        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
4826        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
4827      };
4828      {
4829        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
4830        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
4831      };
4832      {
4833        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
4834        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
4835      };
4836      {
4837        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
4838        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
4839      };
4840      {
4841        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
4842        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
4843      };
4844    }
4845    {
4846      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
4847      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4848      ;
4849      {
4850        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4851        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4852      };
4853      {
4854        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4855        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4856      };
4857      {
4858        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4859        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4860      };
4861      {
4862        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4863        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4864      };
4865      {
4866        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4867        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4868      };
4869      {
4870        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4871        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4872      };
4873      {
4874        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4875        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4876      };
4877      {
4878        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4879        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4880      };
4881      {
4882        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
4883        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
4884      };
4885      {
4886        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
4887        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
4888      };
4889      {
4890        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
4891        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
4892      };
4893      {
4894        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
4895        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
4896      };
4897      {
4898        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
4899        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
4900      };
4901      {
4902        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
4903        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
4904      };
4905      {
4906        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
4907        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
4908      };
4909      {
4910        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
4911        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
4912      };
4913    }
4914    if (r1 >= r9) {
4915      ulong const t = r1;
4916      r1 = r9;
4917      r9 = t;
4918    };
4919    if (r5 >= r13) {
4920      ulong const t = r5;
4921      r5 = r13;
4922      r13 = t;
4923    };
4924    if (r1 >= r5) {
4925      ulong const t = r1;
4926      r1 = r5;
4927      r5 = t;
4928    };
4929    if (r9 >= r13) {
4930      ulong const t = r9;
4931      r9 = r13;
4932      r13 = t;
4933    };
4934    if (r3 >= r11) {
4935      ulong const t = r3;
4936      r3 = r11;
4937      r11 = t;
4938    };
4939    if (r7 >= r15) {
4940      ulong const t = r7;
4941      r7 = r15;
4942      r15 = t;
4943    };
4944    if (r3 >= r7) {
4945      ulong const t = r3;
4946      r3 = r7;
4947      r7 = t;
4948    };
4949    if (r11 >= r15) {
4950      ulong const t = r11;
4951      r11 = r15;
4952      r15 = t;
4953    };
4954    if (r1 >= r3) {
4955      ulong const t = r1;
4956      r1 = r3;
4957      r3 = t;
4958    };
4959    if (r5 >= r7) {
4960      ulong const t = r5;
4961      r5 = r7;
4962      r7 = t;
4963    };
4964    if (r9 >= r11) {
4965      ulong const t = r9;
4966      r9 = r11;
4967      r11 = t;
4968    };
4969    if (r13 >= r15) {
4970      ulong const t = r13;
4971      r13 = r15;
4972      r15 = t;
4973    };
4974    if (r2 >= r10) {
4975      ulong const t = r2;
4976      r2 = r10;
4977      r10 = t;
4978    };
4979    if (r6 >= r14) {
4980      ulong const t = r6;
4981      r6 = r14;
4982      r14 = t;
4983    };
4984    if (r2 >= r6) {
4985      ulong const t = r2;
4986      r2 = r6;
4987      r6 = t;
4988    };
4989    if (r10 >= r14) {
4990      ulong const t = r10;
4991      r10 = r14;
4992      r14 = t;
4993    };
4994    if (r4 >= r12) {
4995      ulong const t = r4;
4996      r4 = r12;
4997      r12 = t;
4998    };
4999    if (r8 >= r16) {
5000      ulong const t = r8;
5001      r8 = r16;
5002      r16 = t;
5003    };
5004    if (r4 >= r8) {
5005      ulong const t = r4;
5006      r4 = r8;
5007      r8 = t;
5008    };
5009    if (r12 >= r16) {
5010      ulong const t = r12;
5011      r12 = r16;
5012      r16 = t;
5013    };
5014    if (r2 >= r4) {
5015      ulong const t = r2;
5016      r2 = r4;
5017      r4 = t;
5018    };
5019    if (r6 >= r8) {
5020      ulong const t = r6;
5021      r6 = r8;
5022      r8 = t;
5023    };
5024    if (r10 >= r12) {
5025      ulong const t = r10;
5026      r10 = r12;
5027      r12 = t;
5028    };
5029    if (r14 >= r16) {
5030      ulong const t = r14;
5031      r14 = r16;
5032      r16 = t;
5033    };
5034    if (r1 >= r2) {
5035      ulong const t = r1;
5036      r1 = r2;
5037      r2 = t;
5038    };
5039    if (r3 >= r4) {
5040      ulong const t = r3;
5041      r3 = r4;
5042      r4 = t;
5043    };
5044    if (r5 >= r6) {
5045      ulong const t = r5;
5046      r5 = r6;
5047      r6 = t;
5048    };
5049    if (r7 >= r8) {
5050      ulong const t = r7;
5051      r7 = r8;
5052      r8 = t;
5053    };
5054    if (r9 >= r10) {
5055      ulong const t = r9;
5056      r9 = r10;
5057      r10 = t;
5058    };
5059    if (r11 >= r12) {
5060      ulong const t = r11;
5061      r11 = r12;
5062      r12 = t;
5063    };
5064    if (r13 >= r14) {
5065      ulong const t = r13;
5066      r13 = r14;
5067      r14 = t;
5068    };
5069    if (r15 >= r16) {
5070      ulong const t = r15;
5071      r15 = r16;
5072      r16 = t;
5073    };
5074  }
5075  vout[gmem_idx + (1 << 3) * 0] = r1;
5076  vout[gmem_idx + (1 << 3) * 1] = r2;
5077  vout[gmem_idx + (1 << 3) * 2] = r3;
5078  vout[gmem_idx + (1 << 3) * 3] = r4;
5079  vout[gmem_idx + (1 << 3) * 4] = r5;
5080  vout[gmem_idx + (1 << 3) * 5] = r6;
5081  vout[gmem_idx + (1 << 3) * 6] = r7;
5082  vout[gmem_idx + (1 << 3) * 7] = r8;
5083  vout[gmem_idx + (1 << 3) * 8] = r9;
5084  vout[gmem_idx + (1 << 3) * 9] = r10;
5085  vout[gmem_idx + (1 << 3) * 10] = r11;
5086  vout[gmem_idx + (1 << 3) * 11] = r12;
5087  vout[gmem_idx + (1 << 3) * 12] = r13;
5088  vout[gmem_idx + (1 << 3) * 13] = r14;
5089  vout[gmem_idx + (1 << 3) * 14] = r15;
5090  vout[gmem_idx + (1 << 3) * 15] = r16;
5091}
5092
5093__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
5094__attribute__((reqd_work_group_size((1 << 3) * 8, 1, 1))) void
5095hs_kernel_bs_3(__global ulong const* const restrict vin,
5096               __global ulong* const restrict vout)
5097{
5098  __local struct
5099  {
5100    ulong m[64 * 16];
5101  } shared;
5102
5103  uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 +
5104                        (get_local_id(0) & ((1 << 3) - 1));
5105  ulong r1 = vin[gmem_idx + (1 << 3) * 0];
5106  ulong r2 = vin[gmem_idx + (1 << 3) * 1];
5107  ulong r3 = vin[gmem_idx + (1 << 3) * 2];
5108  ulong r4 = vin[gmem_idx + (1 << 3) * 3];
5109  ulong r5 = vin[gmem_idx + (1 << 3) * 4];
5110  ulong r6 = vin[gmem_idx + (1 << 3) * 5];
5111  ulong r7 = vin[gmem_idx + (1 << 3) * 6];
5112  ulong r8 = vin[gmem_idx + (1 << 3) * 7];
5113  ulong r9 = vin[gmem_idx + (1 << 3) * 8];
5114  ulong r10 = vin[gmem_idx + (1 << 3) * 9];
5115  ulong r11 = vin[gmem_idx + (1 << 3) * 10];
5116  ulong r12 = vin[gmem_idx + (1 << 3) * 11];
5117  ulong r13 = vin[gmem_idx + (1 << 3) * 12];
5118  ulong r14 = vin[gmem_idx + (1 << 3) * 13];
5119  ulong r15 = vin[gmem_idx + (1 << 3) * 14];
5120  ulong r16 = vin[gmem_idx + (1 << 3) * 15];
5121  if (r1 >= r2) {
5122    ulong const t = r1;
5123    r1 = r2;
5124    r2 = t;
5125  };
5126  if (r3 >= r4) {
5127    ulong const t = r3;
5128    r3 = r4;
5129    r4 = t;
5130  };
5131  if (r5 >= r6) {
5132    ulong const t = r5;
5133    r5 = r6;
5134    r6 = t;
5135  };
5136  if (r7 >= r8) {
5137    ulong const t = r7;
5138    r7 = r8;
5139    r8 = t;
5140  };
5141  if (r9 >= r10) {
5142    ulong const t = r9;
5143    r9 = r10;
5144    r10 = t;
5145  };
5146  if (r11 >= r12) {
5147    ulong const t = r11;
5148    r11 = r12;
5149    r12 = t;
5150  };
5151  if (r13 >= r14) {
5152    ulong const t = r13;
5153    r13 = r14;
5154    r14 = t;
5155  };
5156  if (r15 >= r16) {
5157    ulong const t = r15;
5158    r15 = r16;
5159    r16 = t;
5160  };
5161  if (r1 >= r3) {
5162    ulong const t = r1;
5163    r1 = r3;
5164    r3 = t;
5165  };
5166  if (r5 >= r7) {
5167    ulong const t = r5;
5168    r5 = r7;
5169    r7 = t;
5170  };
5171  if (r9 >= r11) {
5172    ulong const t = r9;
5173    r9 = r11;
5174    r11 = t;
5175  };
5176  if (r13 >= r15) {
5177    ulong const t = r13;
5178    r13 = r15;
5179    r15 = t;
5180  };
5181  if (r2 >= r4) {
5182    ulong const t = r2;
5183    r2 = r4;
5184    r4 = t;
5185  };
5186  if (r6 >= r8) {
5187    ulong const t = r6;
5188    r6 = r8;
5189    r8 = t;
5190  };
5191  if (r10 >= r12) {
5192    ulong const t = r10;
5193    r10 = r12;
5194    r12 = t;
5195  };
5196  if (r14 >= r16) {
5197    ulong const t = r14;
5198    r14 = r16;
5199    r16 = t;
5200  };
5201  if (r1 >= r5) {
5202    ulong const t = r1;
5203    r1 = r5;
5204    r5 = t;
5205  };
5206  if (r9 >= r13) {
5207    ulong const t = r9;
5208    r9 = r13;
5209    r13 = t;
5210  };
5211  if (r2 >= r6) {
5212    ulong const t = r2;
5213    r2 = r6;
5214    r6 = t;
5215  };
5216  if (r10 >= r14) {
5217    ulong const t = r10;
5218    r10 = r14;
5219    r14 = t;
5220  };
5221  if (r3 >= r7) {
5222    ulong const t = r3;
5223    r3 = r7;
5224    r7 = t;
5225  };
5226  if (r11 >= r15) {
5227    ulong const t = r11;
5228    r11 = r15;
5229    r15 = t;
5230  };
5231  if (r4 >= r8) {
5232    ulong const t = r4;
5233    r4 = r8;
5234    r8 = t;
5235  };
5236  if (r12 >= r16) {
5237    ulong const t = r12;
5238    r12 = r16;
5239    r16 = t;
5240  };
5241  if (r1 >= r9) {
5242    ulong const t = r1;
5243    r1 = r9;
5244    r9 = t;
5245  };
5246  if (r2 >= r10) {
5247    ulong const t = r2;
5248    r2 = r10;
5249    r10 = t;
5250  };
5251  if (r3 >= r11) {
5252    ulong const t = r3;
5253    r3 = r11;
5254    r11 = t;
5255  };
5256  if (r4 >= r12) {
5257    ulong const t = r4;
5258    r4 = r12;
5259    r12 = t;
5260  };
5261  if (r5 >= r13) {
5262    ulong const t = r5;
5263    r5 = r13;
5264    r13 = t;
5265  };
5266  if (r6 >= r14) {
5267    ulong const t = r6;
5268    r6 = r14;
5269    r14 = t;
5270  };
5271  if (r7 >= r15) {
5272    ulong const t = r7;
5273    r7 = r15;
5274    r15 = t;
5275  };
5276  if (r8 >= r16) {
5277    ulong const t = r8;
5278    r8 = r16;
5279    r16 = t;
5280  };
5281  if (r6 >= r11) {
5282    ulong const t = r6;
5283    r6 = r11;
5284    r11 = t;
5285  };
5286  if (r7 >= r10) {
5287    ulong const t = r7;
5288    r7 = r10;
5289    r10 = t;
5290  };
5291  if (r4 >= r13) {
5292    ulong const t = r4;
5293    r4 = r13;
5294    r13 = t;
5295  };
5296  if (r14 >= r15) {
5297    ulong const t = r14;
5298    r14 = r15;
5299    r15 = t;
5300  };
5301  if (r8 >= r12) {
5302    ulong const t = r8;
5303    r8 = r12;
5304    r12 = t;
5305  };
5306  if (r2 >= r3) {
5307    ulong const t = r2;
5308    r2 = r3;
5309    r3 = t;
5310  };
5311  if (r5 >= r9) {
5312    ulong const t = r5;
5313    r5 = r9;
5314    r9 = t;
5315  };
5316  if (r2 >= r5) {
5317    ulong const t = r2;
5318    r2 = r5;
5319    r5 = t;
5320  };
5321  if (r8 >= r14) {
5322    ulong const t = r8;
5323    r8 = r14;
5324    r14 = t;
5325  };
5326  if (r3 >= r9) {
5327    ulong const t = r3;
5328    r3 = r9;
5329    r9 = t;
5330  };
5331  if (r12 >= r15) {
5332    ulong const t = r12;
5333    r12 = r15;
5334    r15 = t;
5335  };
5336  if (r3 >= r5) {
5337    ulong const t = r3;
5338    r3 = r5;
5339    r5 = t;
5340  };
5341  if (r6 >= r7) {
5342    ulong const t = r6;
5343    r6 = r7;
5344    r7 = t;
5345  };
5346  if (r10 >= r11) {
5347    ulong const t = r10;
5348    r10 = r11;
5349    r11 = t;
5350  };
5351  if (r12 >= r14) {
5352    ulong const t = r12;
5353    r12 = r14;
5354    r14 = t;
5355  };
5356  if (r4 >= r9) {
5357    ulong const t = r4;
5358    r4 = r9;
5359    r9 = t;
5360  };
5361  if (r8 >= r13) {
5362    ulong const t = r8;
5363    r8 = r13;
5364    r13 = t;
5365  };
5366  if (r7 >= r9) {
5367    ulong const t = r7;
5368    r7 = r9;
5369    r9 = t;
5370  };
5371  if (r11 >= r13) {
5372    ulong const t = r11;
5373    r11 = r13;
5374    r13 = t;
5375  };
5376  if (r4 >= r6) {
5377    ulong const t = r4;
5378    r4 = r6;
5379    r6 = t;
5380  };
5381  if (r8 >= r10) {
5382    ulong const t = r8;
5383    r8 = r10;
5384    r10 = t;
5385  };
5386  if (r4 >= r5) {
5387    ulong const t = r4;
5388    r4 = r5;
5389    r5 = t;
5390  };
5391  if (r6 >= r7) {
5392    ulong const t = r6;
5393    r6 = r7;
5394    r7 = t;
5395  };
5396  if (r8 >= r9) {
5397    ulong const t = r8;
5398    r8 = r9;
5399    r9 = t;
5400  };
5401  if (r10 >= r11) {
5402    ulong const t = r10;
5403    r10 = r11;
5404    r11 = t;
5405  };
5406  if (r12 >= r13) {
5407    ulong const t = r12;
5408    r12 = r13;
5409    r13 = t;
5410  };
5411  if (r7 >= r8) {
5412    ulong const t = r7;
5413    r7 = r8;
5414    r8 = t;
5415  };
5416  if (r9 >= r10) {
5417    ulong const t = r9;
5418    r9 = r10;
5419    r10 = t;
5420  };
5421  {
5422    uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
5423    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
5424    ;
5425    {
5426      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
5427      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
5428      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
5429      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
5430    };
5431    {
5432      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
5433      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
5434      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
5435      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
5436    };
5437    {
5438      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
5439      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
5440      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
5441      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
5442    };
5443    {
5444      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
5445      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
5446      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
5447      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
5448    };
5449    {
5450      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
5451      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
5452      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
5453      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
5454    };
5455    {
5456      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
5457      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
5458      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
5459      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
5460    };
5461    {
5462      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
5463      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
5464      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
5465      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
5466    };
5467    {
5468      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
5469      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
5470      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
5471      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
5472    };
5473  }
5474  if (r1 >= r9) {
5475    ulong const t = r1;
5476    r1 = r9;
5477    r9 = t;
5478  };
5479  if (r5 >= r13) {
5480    ulong const t = r5;
5481    r5 = r13;
5482    r13 = t;
5483  };
5484  if (r1 >= r5) {
5485    ulong const t = r1;
5486    r1 = r5;
5487    r5 = t;
5488  };
5489  if (r9 >= r13) {
5490    ulong const t = r9;
5491    r9 = r13;
5492    r13 = t;
5493  };
5494  if (r3 >= r11) {
5495    ulong const t = r3;
5496    r3 = r11;
5497    r11 = t;
5498  };
5499  if (r7 >= r15) {
5500    ulong const t = r7;
5501    r7 = r15;
5502    r15 = t;
5503  };
5504  if (r3 >= r7) {
5505    ulong const t = r3;
5506    r3 = r7;
5507    r7 = t;
5508  };
5509  if (r11 >= r15) {
5510    ulong const t = r11;
5511    r11 = r15;
5512    r15 = t;
5513  };
5514  if (r1 >= r3) {
5515    ulong const t = r1;
5516    r1 = r3;
5517    r3 = t;
5518  };
5519  if (r5 >= r7) {
5520    ulong const t = r5;
5521    r5 = r7;
5522    r7 = t;
5523  };
5524  if (r9 >= r11) {
5525    ulong const t = r9;
5526    r9 = r11;
5527    r11 = t;
5528  };
5529  if (r13 >= r15) {
5530    ulong const t = r13;
5531    r13 = r15;
5532    r15 = t;
5533  };
5534  if (r2 >= r10) {
5535    ulong const t = r2;
5536    r2 = r10;
5537    r10 = t;
5538  };
5539  if (r6 >= r14) {
5540    ulong const t = r6;
5541    r6 = r14;
5542    r14 = t;
5543  };
5544  if (r2 >= r6) {
5545    ulong const t = r2;
5546    r2 = r6;
5547    r6 = t;
5548  };
5549  if (r10 >= r14) {
5550    ulong const t = r10;
5551    r10 = r14;
5552    r14 = t;
5553  };
5554  if (r4 >= r12) {
5555    ulong const t = r4;
5556    r4 = r12;
5557    r12 = t;
5558  };
5559  if (r8 >= r16) {
5560    ulong const t = r8;
5561    r8 = r16;
5562    r16 = t;
5563  };
5564  if (r4 >= r8) {
5565    ulong const t = r4;
5566    r4 = r8;
5567    r8 = t;
5568  };
5569  if (r12 >= r16) {
5570    ulong const t = r12;
5571    r12 = r16;
5572    r16 = t;
5573  };
5574  if (r2 >= r4) {
5575    ulong const t = r2;
5576    r2 = r4;
5577    r4 = t;
5578  };
5579  if (r6 >= r8) {
5580    ulong const t = r6;
5581    r6 = r8;
5582    r8 = t;
5583  };
5584  if (r10 >= r12) {
5585    ulong const t = r10;
5586    r10 = r12;
5587    r12 = t;
5588  };
5589  if (r14 >= r16) {
5590    ulong const t = r14;
5591    r14 = r16;
5592    r16 = t;
5593  };
5594  if (r1 >= r2) {
5595    ulong const t = r1;
5596    r1 = r2;
5597    r2 = t;
5598  };
5599  if (r3 >= r4) {
5600    ulong const t = r3;
5601    r3 = r4;
5602    r4 = t;
5603  };
5604  if (r5 >= r6) {
5605    ulong const t = r5;
5606    r5 = r6;
5607    r6 = t;
5608  };
5609  if (r7 >= r8) {
5610    ulong const t = r7;
5611    r7 = r8;
5612    r8 = t;
5613  };
5614  if (r9 >= r10) {
5615    ulong const t = r9;
5616    r9 = r10;
5617    r10 = t;
5618  };
5619  if (r11 >= r12) {
5620    ulong const t = r11;
5621    r11 = r12;
5622    r12 = t;
5623  };
5624  if (r13 >= r14) {
5625    ulong const t = r13;
5626    r13 = r14;
5627    r14 = t;
5628  };
5629  if (r15 >= r16) {
5630    ulong const t = r15;
5631    r15 = r16;
5632    r16 = t;
5633  };
5634  {
5635    uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
5636    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
5637    ;
5638    {
5639      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
5640      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
5641      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
5642      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
5643    };
5644    {
5645      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
5646      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
5647      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
5648      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
5649    };
5650    {
5651      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
5652      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
5653      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
5654      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
5655    };
5656    {
5657      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
5658      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
5659      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
5660      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
5661    };
5662    {
5663      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
5664      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
5665      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
5666      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
5667    };
5668    {
5669      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
5670      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
5671      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
5672      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
5673    };
5674    {
5675      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
5676      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
5677      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
5678      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
5679    };
5680    {
5681      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
5682      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
5683      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
5684      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
5685    };
5686  }
5687  {
5688    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
5689    int const t_lt = get_sub_group_local_id() < half_lane_idx;
5690    ;
5691    {
5692      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5693      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5694    };
5695    {
5696      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5697      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5698    };
5699    {
5700      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5701      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5702    };
5703    {
5704      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5705      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5706    };
5707    {
5708      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5709      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5710    };
5711    {
5712      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5713      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5714    };
5715    {
5716      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5717      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5718    };
5719    {
5720      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5721      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5722    };
5723    {
5724      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
5725      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
5726    };
5727    {
5728      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
5729      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
5730    };
5731    {
5732      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
5733      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
5734    };
5735    {
5736      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
5737      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
5738    };
5739    {
5740      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
5741      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
5742    };
5743    {
5744      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
5745      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
5746    };
5747    {
5748      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
5749      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
5750    };
5751    {
5752      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
5753      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
5754    };
5755  }
5756  if (r1 >= r9) {
5757    ulong const t = r1;
5758    r1 = r9;
5759    r9 = t;
5760  };
5761  if (r5 >= r13) {
5762    ulong const t = r5;
5763    r5 = r13;
5764    r13 = t;
5765  };
5766  if (r1 >= r5) {
5767    ulong const t = r1;
5768    r1 = r5;
5769    r5 = t;
5770  };
5771  if (r9 >= r13) {
5772    ulong const t = r9;
5773    r9 = r13;
5774    r13 = t;
5775  };
5776  if (r3 >= r11) {
5777    ulong const t = r3;
5778    r3 = r11;
5779    r11 = t;
5780  };
5781  if (r7 >= r15) {
5782    ulong const t = r7;
5783    r7 = r15;
5784    r15 = t;
5785  };
5786  if (r3 >= r7) {
5787    ulong const t = r3;
5788    r3 = r7;
5789    r7 = t;
5790  };
5791  if (r11 >= r15) {
5792    ulong const t = r11;
5793    r11 = r15;
5794    r15 = t;
5795  };
5796  if (r1 >= r3) {
5797    ulong const t = r1;
5798    r1 = r3;
5799    r3 = t;
5800  };
5801  if (r5 >= r7) {
5802    ulong const t = r5;
5803    r5 = r7;
5804    r7 = t;
5805  };
5806  if (r9 >= r11) {
5807    ulong const t = r9;
5808    r9 = r11;
5809    r11 = t;
5810  };
5811  if (r13 >= r15) {
5812    ulong const t = r13;
5813    r13 = r15;
5814    r15 = t;
5815  };
5816  if (r2 >= r10) {
5817    ulong const t = r2;
5818    r2 = r10;
5819    r10 = t;
5820  };
5821  if (r6 >= r14) {
5822    ulong const t = r6;
5823    r6 = r14;
5824    r14 = t;
5825  };
5826  if (r2 >= r6) {
5827    ulong const t = r2;
5828    r2 = r6;
5829    r6 = t;
5830  };
5831  if (r10 >= r14) {
5832    ulong const t = r10;
5833    r10 = r14;
5834    r14 = t;
5835  };
5836  if (r4 >= r12) {
5837    ulong const t = r4;
5838    r4 = r12;
5839    r12 = t;
5840  };
5841  if (r8 >= r16) {
5842    ulong const t = r8;
5843    r8 = r16;
5844    r16 = t;
5845  };
5846  if (r4 >= r8) {
5847    ulong const t = r4;
5848    r4 = r8;
5849    r8 = t;
5850  };
5851  if (r12 >= r16) {
5852    ulong const t = r12;
5853    r12 = r16;
5854    r16 = t;
5855  };
5856  if (r2 >= r4) {
5857    ulong const t = r2;
5858    r2 = r4;
5859    r4 = t;
5860  };
5861  if (r6 >= r8) {
5862    ulong const t = r6;
5863    r6 = r8;
5864    r8 = t;
5865  };
5866  if (r10 >= r12) {
5867    ulong const t = r10;
5868    r10 = r12;
5869    r12 = t;
5870  };
5871  if (r14 >= r16) {
5872    ulong const t = r14;
5873    r14 = r16;
5874    r16 = t;
5875  };
5876  if (r1 >= r2) {
5877    ulong const t = r1;
5878    r1 = r2;
5879    r2 = t;
5880  };
5881  if (r3 >= r4) {
5882    ulong const t = r3;
5883    r3 = r4;
5884    r4 = t;
5885  };
5886  if (r5 >= r6) {
5887    ulong const t = r5;
5888    r5 = r6;
5889    r6 = t;
5890  };
5891  if (r7 >= r8) {
5892    ulong const t = r7;
5893    r7 = r8;
5894    r8 = t;
5895  };
5896  if (r9 >= r10) {
5897    ulong const t = r9;
5898    r9 = r10;
5899    r10 = t;
5900  };
5901  if (r11 >= r12) {
5902    ulong const t = r11;
5903    r11 = r12;
5904    r12 = t;
5905  };
5906  if (r13 >= r14) {
5907    ulong const t = r13;
5908    r13 = r14;
5909    r14 = t;
5910  };
5911  if (r15 >= r16) {
5912    ulong const t = r15;
5913    r15 = r16;
5914    r16 = t;
5915  };
5916  {
5917    uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
5918    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
5919    ;
5920    {
5921      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
5922      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
5923      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
5924      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
5925    };
5926    {
5927      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
5928      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
5929      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
5930      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
5931    };
5932    {
5933      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
5934      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
5935      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
5936      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
5937    };
5938    {
5939      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
5940      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
5941      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
5942      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
5943    };
5944    {
5945      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
5946      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
5947      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
5948      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
5949    };
5950    {
5951      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
5952      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
5953      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
5954      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
5955    };
5956    {
5957      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
5958      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
5959      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
5960      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
5961    };
5962    {
5963      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
5964      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
5965      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
5966      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
5967    };
5968  }
5969  {
5970    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
5971    int const t_lt = get_sub_group_local_id() < half_lane_idx;
5972    ;
5973    {
5974      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5975      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5976    };
5977    {
5978      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5979      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5980    };
5981    {
5982      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5983      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5984    };
5985    {
5986      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5987      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5988    };
5989    {
5990      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5991      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5992    };
5993    {
5994      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5995      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5996    };
5997    {
5998      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5999      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6000    };
6001    {
6002      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6003      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6004    };
6005    {
6006      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
6007      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
6008    };
6009    {
6010      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
6011      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
6012    };
6013    {
6014      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
6015      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
6016    };
6017    {
6018      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
6019      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
6020    };
6021    {
6022      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
6023      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
6024    };
6025    {
6026      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
6027      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
6028    };
6029    {
6030      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
6031      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
6032    };
6033    {
6034      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
6035      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
6036    };
6037  }
6038  {
6039    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
6040    int const t_lt = get_sub_group_local_id() < half_lane_idx;
6041    ;
6042    {
6043      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6044      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6045    };
6046    {
6047      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6048      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6049    };
6050    {
6051      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6052      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6053    };
6054    {
6055      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6056      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6057    };
6058    {
6059      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6060      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6061    };
6062    {
6063      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6064      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6065    };
6066    {
6067      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6068      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6069    };
6070    {
6071      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6072      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6073    };
6074    {
6075      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
6076      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
6077    };
6078    {
6079      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
6080      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
6081    };
6082    {
6083      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
6084      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
6085    };
6086    {
6087      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
6088      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
6089    };
6090    {
6091      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
6092      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
6093    };
6094    {
6095      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
6096      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
6097    };
6098    {
6099      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
6100      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
6101    };
6102    {
6103      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
6104      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
6105    };
6106  }
6107  if (r1 >= r9) {
6108    ulong const t = r1;
6109    r1 = r9;
6110    r9 = t;
6111  };
6112  if (r5 >= r13) {
6113    ulong const t = r5;
6114    r5 = r13;
6115    r13 = t;
6116  };
6117  if (r1 >= r5) {
6118    ulong const t = r1;
6119    r1 = r5;
6120    r5 = t;
6121  };
6122  if (r9 >= r13) {
6123    ulong const t = r9;
6124    r9 = r13;
6125    r13 = t;
6126  };
6127  if (r3 >= r11) {
6128    ulong const t = r3;
6129    r3 = r11;
6130    r11 = t;
6131  };
6132  if (r7 >= r15) {
6133    ulong const t = r7;
6134    r7 = r15;
6135    r15 = t;
6136  };
6137  if (r3 >= r7) {
6138    ulong const t = r3;
6139    r3 = r7;
6140    r7 = t;
6141  };
6142  if (r11 >= r15) {
6143    ulong const t = r11;
6144    r11 = r15;
6145    r15 = t;
6146  };
6147  if (r1 >= r3) {
6148    ulong const t = r1;
6149    r1 = r3;
6150    r3 = t;
6151  };
6152  if (r5 >= r7) {
6153    ulong const t = r5;
6154    r5 = r7;
6155    r7 = t;
6156  };
6157  if (r9 >= r11) {
6158    ulong const t = r9;
6159    r9 = r11;
6160    r11 = t;
6161  };
6162  if (r13 >= r15) {
6163    ulong const t = r13;
6164    r13 = r15;
6165    r15 = t;
6166  };
6167  if (r2 >= r10) {
6168    ulong const t = r2;
6169    r2 = r10;
6170    r10 = t;
6171  };
6172  if (r6 >= r14) {
6173    ulong const t = r6;
6174    r6 = r14;
6175    r14 = t;
6176  };
6177  if (r2 >= r6) {
6178    ulong const t = r2;
6179    r2 = r6;
6180    r6 = t;
6181  };
6182  if (r10 >= r14) {
6183    ulong const t = r10;
6184    r10 = r14;
6185    r14 = t;
6186  };
6187  if (r4 >= r12) {
6188    ulong const t = r4;
6189    r4 = r12;
6190    r12 = t;
6191  };
6192  if (r8 >= r16) {
6193    ulong const t = r8;
6194    r8 = r16;
6195    r16 = t;
6196  };
6197  if (r4 >= r8) {
6198    ulong const t = r4;
6199    r4 = r8;
6200    r8 = t;
6201  };
6202  if (r12 >= r16) {
6203    ulong const t = r12;
6204    r12 = r16;
6205    r16 = t;
6206  };
6207  if (r2 >= r4) {
6208    ulong const t = r2;
6209    r2 = r4;
6210    r4 = t;
6211  };
6212  if (r6 >= r8) {
6213    ulong const t = r6;
6214    r6 = r8;
6215    r8 = t;
6216  };
6217  if (r10 >= r12) {
6218    ulong const t = r10;
6219    r10 = r12;
6220    r12 = t;
6221  };
6222  if (r14 >= r16) {
6223    ulong const t = r14;
6224    r14 = r16;
6225    r16 = t;
6226  };
6227  if (r1 >= r2) {
6228    ulong const t = r1;
6229    r1 = r2;
6230    r2 = t;
6231  };
6232  if (r3 >= r4) {
6233    ulong const t = r3;
6234    r3 = r4;
6235    r4 = t;
6236  };
6237  if (r5 >= r6) {
6238    ulong const t = r5;
6239    r5 = r6;
6240    r6 = t;
6241  };
6242  if (r7 >= r8) {
6243    ulong const t = r7;
6244    r7 = r8;
6245    r8 = t;
6246  };
6247  if (r9 >= r10) {
6248    ulong const t = r9;
6249    r9 = r10;
6250    r10 = t;
6251  };
6252  if (r11 >= r12) {
6253    ulong const t = r11;
6254    r11 = r12;
6255    r12 = t;
6256  };
6257  if (r13 >= r14) {
6258    ulong const t = r13;
6259    r13 = r14;
6260    r14 = t;
6261  };
6262  if (r15 >= r16) {
6263    ulong const t = r15;
6264    r15 = r16;
6265    r16 = t;
6266  };
6267  uint const smem_l_idx =
6268    get_sub_group_id() * ((1 << 3) * 8) + get_sub_group_local_id();
6269  uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 3) * 8) +
6270                          (get_sub_group_local_id() ^ ((1 << 3) - 1));
6271  shared.m[get_local_id(0) + (8 * (1 << 3) * 0)] = r1;
6272  shared.m[get_local_id(0) + (8 * (1 << 3) * 1)] = r16;
6273  shared.m[get_local_id(0) + (8 * (1 << 3) * 2)] = r2;
6274  shared.m[get_local_id(0) + (8 * (1 << 3) * 3)] = r15;
6275  shared.m[get_local_id(0) + (8 * (1 << 3) * 4)] = r3;
6276  shared.m[get_local_id(0) + (8 * (1 << 3) * 5)] = r14;
6277  shared.m[get_local_id(0) + (8 * (1 << 3) * 6)] = r4;
6278  shared.m[get_local_id(0) + (8 * (1 << 3) * 7)] = r13;
6279  shared.m[get_local_id(0) + (8 * (1 << 3) * 8)] = r5;
6280  shared.m[get_local_id(0) + (8 * (1 << 3) * 9)] = r12;
6281  shared.m[get_local_id(0) + (8 * (1 << 3) * 10)] = r6;
6282  shared.m[get_local_id(0) + (8 * (1 << 3) * 11)] = r11;
6283  shared.m[get_local_id(0) + (8 * (1 << 3) * 12)] = r7;
6284  shared.m[get_local_id(0) + (8 * (1 << 3) * 13)] = r10;
6285  shared.m[get_local_id(0) + (8 * (1 << 3) * 14)] = r8;
6286  shared.m[get_local_id(0) + (8 * (1 << 3) * 15)] = r9;
6287  barrier(CLK_LOCAL_MEM_FENCE);
6288  {
6289    {
6290      ulong r0_1 = shared.m[smem_l_idx + (0)];
6291      ulong r0_2 = shared.m[smem_r_idx + (8)];
6292      if (r0_1 >= r0_2) {
6293        ulong const t = r0_1;
6294        r0_1 = r0_2;
6295        r0_2 = t;
6296      };
6297      shared.m[smem_l_idx + (0)] = r0_1;
6298      shared.m[smem_r_idx + (8)] = r0_2;
6299    }
6300    {
6301      ulong r1_1 = shared.m[smem_l_idx + (16)];
6302      ulong r1_2 = shared.m[smem_r_idx + (24)];
6303      if (r1_1 >= r1_2) {
6304        ulong const t = r1_1;
6305        r1_1 = r1_2;
6306        r1_2 = t;
6307      };
6308      shared.m[smem_l_idx + (16)] = r1_1;
6309      shared.m[smem_r_idx + (24)] = r1_2;
6310    }
6311    {
6312      ulong r2_1 = shared.m[smem_l_idx + (32)];
6313      ulong r2_2 = shared.m[smem_r_idx + (40)];
6314      if (r2_1 >= r2_2) {
6315        ulong const t = r2_1;
6316        r2_1 = r2_2;
6317        r2_2 = t;
6318      };
6319      shared.m[smem_l_idx + (32)] = r2_1;
6320      shared.m[smem_r_idx + (40)] = r2_2;
6321    }
6322    {
6323      ulong r3_1 = shared.m[smem_l_idx + (48)];
6324      ulong r3_2 = shared.m[smem_r_idx + (56)];
6325      if (r3_1 >= r3_2) {
6326        ulong const t = r3_1;
6327        r3_1 = r3_2;
6328        r3_2 = t;
6329      };
6330      shared.m[smem_l_idx + (48)] = r3_1;
6331      shared.m[smem_r_idx + (56)] = r3_2;
6332    }
6333    {
6334      ulong r0_1 = shared.m[smem_l_idx + (512)];
6335      ulong r0_2 = shared.m[smem_r_idx + (520)];
6336      if (r0_1 >= r0_2) {
6337        ulong const t = r0_1;
6338        r0_1 = r0_2;
6339        r0_2 = t;
6340      };
6341      shared.m[smem_l_idx + (512)] = r0_1;
6342      shared.m[smem_r_idx + (520)] = r0_2;
6343    }
6344    {
6345      ulong r1_1 = shared.m[smem_l_idx + (528)];
6346      ulong r1_2 = shared.m[smem_r_idx + (536)];
6347      if (r1_1 >= r1_2) {
6348        ulong const t = r1_1;
6349        r1_1 = r1_2;
6350        r1_2 = t;
6351      };
6352      shared.m[smem_l_idx + (528)] = r1_1;
6353      shared.m[smem_r_idx + (536)] = r1_2;
6354    }
6355    {
6356      ulong r2_1 = shared.m[smem_l_idx + (544)];
6357      ulong r2_2 = shared.m[smem_r_idx + (552)];
6358      if (r2_1 >= r2_2) {
6359        ulong const t = r2_1;
6360        r2_1 = r2_2;
6361        r2_2 = t;
6362      };
6363      shared.m[smem_l_idx + (544)] = r2_1;
6364      shared.m[smem_r_idx + (552)] = r2_2;
6365    }
6366    {
6367      ulong r3_1 = shared.m[smem_l_idx + (560)];
6368      ulong r3_2 = shared.m[smem_r_idx + (568)];
6369      if (r3_1 >= r3_2) {
6370        ulong const t = r3_1;
6371        r3_1 = r3_2;
6372        r3_2 = t;
6373      };
6374      shared.m[smem_l_idx + (560)] = r3_1;
6375      shared.m[smem_r_idx + (568)] = r3_2;
6376    }
6377  }
6378  barrier(CLK_LOCAL_MEM_FENCE);
6379  r1 = shared.m[get_local_id(0) + (8 * (1 << 3) * 0)];
6380  r16 = shared.m[get_local_id(0) + (8 * (1 << 3) * 1)];
6381  r2 = shared.m[get_local_id(0) + (8 * (1 << 3) * 2)];
6382  r15 = shared.m[get_local_id(0) + (8 * (1 << 3) * 3)];
6383  r3 = shared.m[get_local_id(0) + (8 * (1 << 3) * 4)];
6384  r14 = shared.m[get_local_id(0) + (8 * (1 << 3) * 5)];
6385  r4 = shared.m[get_local_id(0) + (8 * (1 << 3) * 6)];
6386  r13 = shared.m[get_local_id(0) + (8 * (1 << 3) * 7)];
6387  r5 = shared.m[get_local_id(0) + (8 * (1 << 3) * 8)];
6388  r12 = shared.m[get_local_id(0) + (8 * (1 << 3) * 9)];
6389  r6 = shared.m[get_local_id(0) + (8 * (1 << 3) * 10)];
6390  r11 = shared.m[get_local_id(0) + (8 * (1 << 3) * 11)];
6391  r7 = shared.m[get_local_id(0) + (8 * (1 << 3) * 12)];
6392  r10 = shared.m[get_local_id(0) + (8 * (1 << 3) * 13)];
6393  r8 = shared.m[get_local_id(0) + (8 * (1 << 3) * 14)];
6394  r9 = shared.m[get_local_id(0) + (8 * (1 << 3) * 15)];
6395  {
6396    {
6397      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
6398      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6399      ;
6400      {
6401        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6402        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6403      };
6404      {
6405        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6406        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6407      };
6408      {
6409        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6410        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6411      };
6412      {
6413        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6414        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6415      };
6416      {
6417        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6418        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6419      };
6420      {
6421        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6422        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6423      };
6424      {
6425        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6426        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6427      };
6428      {
6429        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6430        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6431      };
6432      {
6433        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
6434        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
6435      };
6436      {
6437        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
6438        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
6439      };
6440      {
6441        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
6442        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
6443      };
6444      {
6445        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
6446        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
6447      };
6448      {
6449        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
6450        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
6451      };
6452      {
6453        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
6454        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
6455      };
6456      {
6457        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
6458        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
6459      };
6460      {
6461        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
6462        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
6463      };
6464    }
6465    {
6466      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
6467      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6468      ;
6469      {
6470        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6471        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6472      };
6473      {
6474        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6475        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6476      };
6477      {
6478        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6479        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6480      };
6481      {
6482        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6483        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6484      };
6485      {
6486        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6487        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6488      };
6489      {
6490        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6491        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6492      };
6493      {
6494        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6495        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6496      };
6497      {
6498        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6499        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6500      };
6501      {
6502        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
6503        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
6504      };
6505      {
6506        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
6507        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
6508      };
6509      {
6510        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
6511        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
6512      };
6513      {
6514        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
6515        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
6516      };
6517      {
6518        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
6519        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
6520      };
6521      {
6522        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
6523        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
6524      };
6525      {
6526        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
6527        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
6528      };
6529      {
6530        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
6531        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
6532      };
6533    }
6534    {
6535      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
6536      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6537      ;
6538      {
6539        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6540        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6541      };
6542      {
6543        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6544        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6545      };
6546      {
6547        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6548        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6549      };
6550      {
6551        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6552        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6553      };
6554      {
6555        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6556        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6557      };
6558      {
6559        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6560        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6561      };
6562      {
6563        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6564        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6565      };
6566      {
6567        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6568        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6569      };
6570      {
6571        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
6572        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
6573      };
6574      {
6575        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
6576        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
6577      };
6578      {
6579        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
6580        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
6581      };
6582      {
6583        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
6584        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
6585      };
6586      {
6587        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
6588        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
6589      };
6590      {
6591        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
6592        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
6593      };
6594      {
6595        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
6596        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
6597      };
6598      {
6599        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
6600        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
6601      };
6602    }
6603    if (r1 >= r9) {
6604      ulong const t = r1;
6605      r1 = r9;
6606      r9 = t;
6607    };
6608    if (r5 >= r13) {
6609      ulong const t = r5;
6610      r5 = r13;
6611      r13 = t;
6612    };
6613    if (r1 >= r5) {
6614      ulong const t = r1;
6615      r1 = r5;
6616      r5 = t;
6617    };
6618    if (r9 >= r13) {
6619      ulong const t = r9;
6620      r9 = r13;
6621      r13 = t;
6622    };
6623    if (r3 >= r11) {
6624      ulong const t = r3;
6625      r3 = r11;
6626      r11 = t;
6627    };
6628    if (r7 >= r15) {
6629      ulong const t = r7;
6630      r7 = r15;
6631      r15 = t;
6632    };
6633    if (r3 >= r7) {
6634      ulong const t = r3;
6635      r3 = r7;
6636      r7 = t;
6637    };
6638    if (r11 >= r15) {
6639      ulong const t = r11;
6640      r11 = r15;
6641      r15 = t;
6642    };
6643    if (r1 >= r3) {
6644      ulong const t = r1;
6645      r1 = r3;
6646      r3 = t;
6647    };
6648    if (r5 >= r7) {
6649      ulong const t = r5;
6650      r5 = r7;
6651      r7 = t;
6652    };
6653    if (r9 >= r11) {
6654      ulong const t = r9;
6655      r9 = r11;
6656      r11 = t;
6657    };
6658    if (r13 >= r15) {
6659      ulong const t = r13;
6660      r13 = r15;
6661      r15 = t;
6662    };
6663    if (r2 >= r10) {
6664      ulong const t = r2;
6665      r2 = r10;
6666      r10 = t;
6667    };
6668    if (r6 >= r14) {
6669      ulong const t = r6;
6670      r6 = r14;
6671      r14 = t;
6672    };
6673    if (r2 >= r6) {
6674      ulong const t = r2;
6675      r2 = r6;
6676      r6 = t;
6677    };
6678    if (r10 >= r14) {
6679      ulong const t = r10;
6680      r10 = r14;
6681      r14 = t;
6682    };
6683    if (r4 >= r12) {
6684      ulong const t = r4;
6685      r4 = r12;
6686      r12 = t;
6687    };
6688    if (r8 >= r16) {
6689      ulong const t = r8;
6690      r8 = r16;
6691      r16 = t;
6692    };
6693    if (r4 >= r8) {
6694      ulong const t = r4;
6695      r4 = r8;
6696      r8 = t;
6697    };
6698    if (r12 >= r16) {
6699      ulong const t = r12;
6700      r12 = r16;
6701      r16 = t;
6702    };
6703    if (r2 >= r4) {
6704      ulong const t = r2;
6705      r2 = r4;
6706      r4 = t;
6707    };
6708    if (r6 >= r8) {
6709      ulong const t = r6;
6710      r6 = r8;
6711      r8 = t;
6712    };
6713    if (r10 >= r12) {
6714      ulong const t = r10;
6715      r10 = r12;
6716      r12 = t;
6717    };
6718    if (r14 >= r16) {
6719      ulong const t = r14;
6720      r14 = r16;
6721      r16 = t;
6722    };
6723    if (r1 >= r2) {
6724      ulong const t = r1;
6725      r1 = r2;
6726      r2 = t;
6727    };
6728    if (r3 >= r4) {
6729      ulong const t = r3;
6730      r3 = r4;
6731      r4 = t;
6732    };
6733    if (r5 >= r6) {
6734      ulong const t = r5;
6735      r5 = r6;
6736      r6 = t;
6737    };
6738    if (r7 >= r8) {
6739      ulong const t = r7;
6740      r7 = r8;
6741      r8 = t;
6742    };
6743    if (r9 >= r10) {
6744      ulong const t = r9;
6745      r9 = r10;
6746      r10 = t;
6747    };
6748    if (r11 >= r12) {
6749      ulong const t = r11;
6750      r11 = r12;
6751      r12 = t;
6752    };
6753    if (r13 >= r14) {
6754      ulong const t = r13;
6755      r13 = r14;
6756      r14 = t;
6757    };
6758    if (r15 >= r16) {
6759      ulong const t = r15;
6760      r15 = r16;
6761      r16 = t;
6762    };
6763  }
6764  shared.m[get_local_id(0) + (8 * (1 << 3) * 0)] = r1;
6765  shared.m[get_local_id(0) + (8 * (1 << 3) * 1)] = r16;
6766  shared.m[get_local_id(0) + (8 * (1 << 3) * 2)] = r2;
6767  shared.m[get_local_id(0) + (8 * (1 << 3) * 3)] = r15;
6768  shared.m[get_local_id(0) + (8 * (1 << 3) * 4)] = r3;
6769  shared.m[get_local_id(0) + (8 * (1 << 3) * 5)] = r14;
6770  shared.m[get_local_id(0) + (8 * (1 << 3) * 6)] = r4;
6771  shared.m[get_local_id(0) + (8 * (1 << 3) * 7)] = r13;
6772  shared.m[get_local_id(0) + (8 * (1 << 3) * 8)] = r5;
6773  shared.m[get_local_id(0) + (8 * (1 << 3) * 9)] = r12;
6774  shared.m[get_local_id(0) + (8 * (1 << 3) * 10)] = r6;
6775  shared.m[get_local_id(0) + (8 * (1 << 3) * 11)] = r11;
6776  shared.m[get_local_id(0) + (8 * (1 << 3) * 12)] = r7;
6777  shared.m[get_local_id(0) + (8 * (1 << 3) * 13)] = r10;
6778  shared.m[get_local_id(0) + (8 * (1 << 3) * 14)] = r8;
6779  shared.m[get_local_id(0) + (8 * (1 << 3) * 15)] = r9;
6780  barrier(CLK_LOCAL_MEM_FENCE);
6781  {
6782    {
6783      ulong r0_1 = shared.m[smem_l_idx + (0)];
6784      ulong r0_2 = shared.m[smem_l_idx + (8)];
6785      ulong r0_3 = shared.m[smem_r_idx + (16)];
6786      ulong r0_4 = shared.m[smem_r_idx + (24)];
6787      if (r0_2 >= r0_3) {
6788        ulong const t = r0_2;
6789        r0_2 = r0_3;
6790        r0_3 = t;
6791      };
6792      if (r0_1 >= r0_4) {
6793        ulong const t = r0_1;
6794        r0_1 = r0_4;
6795        r0_4 = t;
6796      };
6797      if (r0_3 >= r0_4) {
6798        ulong const t = r0_3;
6799        r0_3 = r0_4;
6800        r0_4 = t;
6801      };
6802      if (r0_1 >= r0_2) {
6803        ulong const t = r0_1;
6804        r0_1 = r0_2;
6805        r0_2 = t;
6806      };
6807      shared.m[smem_l_idx + (0)] = r0_1;
6808      shared.m[smem_l_idx + (8)] = r0_2;
6809      shared.m[smem_r_idx + (16)] = r0_3;
6810      shared.m[smem_r_idx + (24)] = r0_4;
6811    }
6812    {
6813      ulong r1_1 = shared.m[smem_l_idx + (32)];
6814      ulong r1_2 = shared.m[smem_l_idx + (40)];
6815      ulong r1_3 = shared.m[smem_r_idx + (48)];
6816      ulong r1_4 = shared.m[smem_r_idx + (56)];
6817      if (r1_2 >= r1_3) {
6818        ulong const t = r1_2;
6819        r1_2 = r1_3;
6820        r1_3 = t;
6821      };
6822      if (r1_1 >= r1_4) {
6823        ulong const t = r1_1;
6824        r1_1 = r1_4;
6825        r1_4 = t;
6826      };
6827      if (r1_3 >= r1_4) {
6828        ulong const t = r1_3;
6829        r1_3 = r1_4;
6830        r1_4 = t;
6831      };
6832      if (r1_1 >= r1_2) {
6833        ulong const t = r1_1;
6834        r1_1 = r1_2;
6835        r1_2 = t;
6836      };
6837      shared.m[smem_l_idx + (32)] = r1_1;
6838      shared.m[smem_l_idx + (40)] = r1_2;
6839      shared.m[smem_r_idx + (48)] = r1_3;
6840      shared.m[smem_r_idx + (56)] = r1_4;
6841    }
6842    {
6843      ulong r0_1 = shared.m[smem_l_idx + (512)];
6844      ulong r0_2 = shared.m[smem_l_idx + (520)];
6845      ulong r0_3 = shared.m[smem_r_idx + (528)];
6846      ulong r0_4 = shared.m[smem_r_idx + (536)];
6847      if (r0_2 >= r0_3) {
6848        ulong const t = r0_2;
6849        r0_2 = r0_3;
6850        r0_3 = t;
6851      };
6852      if (r0_1 >= r0_4) {
6853        ulong const t = r0_1;
6854        r0_1 = r0_4;
6855        r0_4 = t;
6856      };
6857      if (r0_3 >= r0_4) {
6858        ulong const t = r0_3;
6859        r0_3 = r0_4;
6860        r0_4 = t;
6861      };
6862      if (r0_1 >= r0_2) {
6863        ulong const t = r0_1;
6864        r0_1 = r0_2;
6865        r0_2 = t;
6866      };
6867      shared.m[smem_l_idx + (512)] = r0_1;
6868      shared.m[smem_l_idx + (520)] = r0_2;
6869      shared.m[smem_r_idx + (528)] = r0_3;
6870      shared.m[smem_r_idx + (536)] = r0_4;
6871    }
6872    {
6873      ulong r1_1 = shared.m[smem_l_idx + (544)];
6874      ulong r1_2 = shared.m[smem_l_idx + (552)];
6875      ulong r1_3 = shared.m[smem_r_idx + (560)];
6876      ulong r1_4 = shared.m[smem_r_idx + (568)];
6877      if (r1_2 >= r1_3) {
6878        ulong const t = r1_2;
6879        r1_2 = r1_3;
6880        r1_3 = t;
6881      };
6882      if (r1_1 >= r1_4) {
6883        ulong const t = r1_1;
6884        r1_1 = r1_4;
6885        r1_4 = t;
6886      };
6887      if (r1_3 >= r1_4) {
6888        ulong const t = r1_3;
6889        r1_3 = r1_4;
6890        r1_4 = t;
6891      };
6892      if (r1_1 >= r1_2) {
6893        ulong const t = r1_1;
6894        r1_1 = r1_2;
6895        r1_2 = t;
6896      };
6897      shared.m[smem_l_idx + (544)] = r1_1;
6898      shared.m[smem_l_idx + (552)] = r1_2;
6899      shared.m[smem_r_idx + (560)] = r1_3;
6900      shared.m[smem_r_idx + (568)] = r1_4;
6901    }
6902  }
6903  barrier(CLK_LOCAL_MEM_FENCE);
6904  r1 = shared.m[get_local_id(0) + (8 * (1 << 3) * 0)];
6905  r16 = shared.m[get_local_id(0) + (8 * (1 << 3) * 1)];
6906  r2 = shared.m[get_local_id(0) + (8 * (1 << 3) * 2)];
6907  r15 = shared.m[get_local_id(0) + (8 * (1 << 3) * 3)];
6908  r3 = shared.m[get_local_id(0) + (8 * (1 << 3) * 4)];
6909  r14 = shared.m[get_local_id(0) + (8 * (1 << 3) * 5)];
6910  r4 = shared.m[get_local_id(0) + (8 * (1 << 3) * 6)];
6911  r13 = shared.m[get_local_id(0) + (8 * (1 << 3) * 7)];
6912  r5 = shared.m[get_local_id(0) + (8 * (1 << 3) * 8)];
6913  r12 = shared.m[get_local_id(0) + (8 * (1 << 3) * 9)];
6914  r6 = shared.m[get_local_id(0) + (8 * (1 << 3) * 10)];
6915  r11 = shared.m[get_local_id(0) + (8 * (1 << 3) * 11)];
6916  r7 = shared.m[get_local_id(0) + (8 * (1 << 3) * 12)];
6917  r10 = shared.m[get_local_id(0) + (8 * (1 << 3) * 13)];
6918  r8 = shared.m[get_local_id(0) + (8 * (1 << 3) * 14)];
6919  r9 = shared.m[get_local_id(0) + (8 * (1 << 3) * 15)];
6920  {
6921    {
6922      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
6923      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6924      ;
6925      {
6926        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6927        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6928      };
6929      {
6930        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6931        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6932      };
6933      {
6934        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6935        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6936      };
6937      {
6938        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6939        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6940      };
6941      {
6942        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6943        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6944      };
6945      {
6946        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6947        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6948      };
6949      {
6950        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6951        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6952      };
6953      {
6954        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6955        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6956      };
6957      {
6958        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
6959        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
6960      };
6961      {
6962        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
6963        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
6964      };
6965      {
6966        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
6967        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
6968      };
6969      {
6970        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
6971        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
6972      };
6973      {
6974        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
6975        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
6976      };
6977      {
6978        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
6979        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
6980      };
6981      {
6982        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
6983        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
6984      };
6985      {
6986        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
6987        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
6988      };
6989    }
6990    {
6991      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
6992      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6993      ;
6994      {
6995        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6996        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6997      };
6998      {
6999        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
7000        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
7001      };
7002      {
7003        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
7004        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
7005      };
7006      {
7007        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
7008        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
7009      };
7010      {
7011        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
7012        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
7013      };
7014      {
7015        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
7016        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
7017      };
7018      {
7019        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
7020        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
7021      };
7022      {
7023        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
7024        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
7025      };
7026      {
7027        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
7028        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
7029      };
7030      {
7031        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
7032        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
7033      };
7034      {
7035        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
7036        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
7037      };
7038      {
7039        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
7040        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
7041      };
7042      {
7043        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
7044        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
7045      };
7046      {
7047        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
7048        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
7049      };
7050      {
7051        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
7052        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
7053      };
7054      {
7055        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
7056        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
7057      };
7058    }
7059    {
7060      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
7061      int const t_lt = get_sub_group_local_id() < half_lane_idx;
7062      ;
7063      {
7064        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
7065        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
7066      };
7067      {
7068        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
7069        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
7070      };
7071      {
7072        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
7073        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
7074      };
7075      {
7076        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
7077        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
7078      };
7079      {
7080        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
7081        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
7082      };
7083      {
7084        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
7085        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
7086      };
7087      {
7088        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
7089        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
7090      };
7091      {
7092        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
7093        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
7094      };
7095      {
7096        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
7097        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
7098      };
7099      {
7100        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
7101        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
7102      };
7103      {
7104        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
7105        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
7106      };
7107      {
7108        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
7109        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
7110      };
7111      {
7112        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
7113        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
7114      };
7115      {
7116        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
7117        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
7118      };
7119      {
7120        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
7121        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
7122      };
7123      {
7124        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
7125        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
7126      };
7127    }
7128    if (r1 >= r9) {
7129      ulong const t = r1;
7130      r1 = r9;
7131      r9 = t;
7132    };
7133    if (r5 >= r13) {
7134      ulong const t = r5;
7135      r5 = r13;
7136      r13 = t;
7137    };
7138    if (r1 >= r5) {
7139      ulong const t = r1;
7140      r1 = r5;
7141      r5 = t;
7142    };
7143    if (r9 >= r13) {
7144      ulong const t = r9;
7145      r9 = r13;
7146      r13 = t;
7147    };
7148    if (r3 >= r11) {
7149      ulong const t = r3;
7150      r3 = r11;
7151      r11 = t;
7152    };
7153    if (r7 >= r15) {
7154      ulong const t = r7;
7155      r7 = r15;
7156      r15 = t;
7157    };
7158    if (r3 >= r7) {
7159      ulong const t = r3;
7160      r3 = r7;
7161      r7 = t;
7162    };
7163    if (r11 >= r15) {
7164      ulong const t = r11;
7165      r11 = r15;
7166      r15 = t;
7167    };
7168    if (r1 >= r3) {
7169      ulong const t = r1;
7170      r1 = r3;
7171      r3 = t;
7172    };
7173    if (r5 >= r7) {
7174      ulong const t = r5;
7175      r5 = r7;
7176      r7 = t;
7177    };
7178    if (r9 >= r11) {
7179      ulong const t = r9;
7180      r9 = r11;
7181      r11 = t;
7182    };
7183    if (r13 >= r15) {
7184      ulong const t = r13;
7185      r13 = r15;
7186      r15 = t;
7187    };
7188    if (r2 >= r10) {
7189      ulong const t = r2;
7190      r2 = r10;
7191      r10 = t;
7192    };
7193    if (r6 >= r14) {
7194      ulong const t = r6;
7195      r6 = r14;
7196      r14 = t;
7197    };
7198    if (r2 >= r6) {
7199      ulong const t = r2;
7200      r2 = r6;
7201      r6 = t;
7202    };
7203    if (r10 >= r14) {
7204      ulong const t = r10;
7205      r10 = r14;
7206      r14 = t;
7207    };
7208    if (r4 >= r12) {
7209      ulong const t = r4;
7210      r4 = r12;
7211      r12 = t;
7212    };
7213    if (r8 >= r16) {
7214      ulong const t = r8;
7215      r8 = r16;
7216      r16 = t;
7217    };
7218    if (r4 >= r8) {
7219      ulong const t = r4;
7220      r4 = r8;
7221      r8 = t;
7222    };
7223    if (r12 >= r16) {
7224      ulong const t = r12;
7225      r12 = r16;
7226      r16 = t;
7227    };
7228    if (r2 >= r4) {
7229      ulong const t = r2;
7230      r2 = r4;
7231      r4 = t;
7232    };
7233    if (r6 >= r8) {
7234      ulong const t = r6;
7235      r6 = r8;
7236      r8 = t;
7237    };
7238    if (r10 >= r12) {
7239      ulong const t = r10;
7240      r10 = r12;
7241      r12 = t;
7242    };
7243    if (r14 >= r16) {
7244      ulong const t = r14;
7245      r14 = r16;
7246      r16 = t;
7247    };
7248    if (r1 >= r2) {
7249      ulong const t = r1;
7250      r1 = r2;
7251      r2 = t;
7252    };
7253    if (r3 >= r4) {
7254      ulong const t = r3;
7255      r3 = r4;
7256      r4 = t;
7257    };
7258    if (r5 >= r6) {
7259      ulong const t = r5;
7260      r5 = r6;
7261      r6 = t;
7262    };
7263    if (r7 >= r8) {
7264      ulong const t = r7;
7265      r7 = r8;
7266      r8 = t;
7267    };
7268    if (r9 >= r10) {
7269      ulong const t = r9;
7270      r9 = r10;
7271      r10 = t;
7272    };
7273    if (r11 >= r12) {
7274      ulong const t = r11;
7275      r11 = r12;
7276      r12 = t;
7277    };
7278    if (r13 >= r14) {
7279      ulong const t = r13;
7280      r13 = r14;
7281      r14 = t;
7282    };
7283    if (r15 >= r16) {
7284      ulong const t = r15;
7285      r15 = r16;
7286      r16 = t;
7287    };
7288  }
7289  shared.m[get_local_id(0) + (8 * (1 << 3) * 0)] = r1;
7290  shared.m[get_local_id(0) + (8 * (1 << 3) * 1)] = r16;
7291  shared.m[get_local_id(0) + (8 * (1 << 3) * 2)] = r2;
7292  shared.m[get_local_id(0) + (8 * (1 << 3) * 3)] = r15;
7293  shared.m[get_local_id(0) + (8 * (1 << 3) * 4)] = r3;
7294  shared.m[get_local_id(0) + (8 * (1 << 3) * 5)] = r14;
7295  shared.m[get_local_id(0) + (8 * (1 << 3) * 6)] = r4;
7296  shared.m[get_local_id(0) + (8 * (1 << 3) * 7)] = r13;
7297  shared.m[get_local_id(0) + (8 * (1 << 3) * 8)] = r5;
7298  shared.m[get_local_id(0) + (8 * (1 << 3) * 9)] = r12;
7299  shared.m[get_local_id(0) + (8 * (1 << 3) * 10)] = r6;
7300  shared.m[get_local_id(0) + (8 * (1 << 3) * 11)] = r11;
7301  shared.m[get_local_id(0) + (8 * (1 << 3) * 12)] = r7;
7302  shared.m[get_local_id(0) + (8 * (1 << 3) * 13)] = r10;
7303  shared.m[get_local_id(0) + (8 * (1 << 3) * 14)] = r8;
7304  shared.m[get_local_id(0) + (8 * (1 << 3) * 15)] = r9;
7305  barrier(CLK_LOCAL_MEM_FENCE);
7306  {
7307    {
7308      ulong r0_1 = shared.m[smem_l_idx + (0)];
7309      ulong r0_2 = shared.m[smem_l_idx + (8)];
7310      ulong r0_3 = shared.m[smem_l_idx + (16)];
7311      ulong r0_4 = shared.m[smem_l_idx + (24)];
7312      ulong r0_5 = shared.m[smem_r_idx + (32)];
7313      ulong r0_6 = shared.m[smem_r_idx + (40)];
7314      ulong r0_7 = shared.m[smem_r_idx + (48)];
7315      ulong r0_8 = shared.m[smem_r_idx + (56)];
7316      if (r0_4 >= r0_5) {
7317        ulong const t = r0_4;
7318        r0_4 = r0_5;
7319        r0_5 = t;
7320      };
7321      if (r0_3 >= r0_6) {
7322        ulong const t = r0_3;
7323        r0_3 = r0_6;
7324        r0_6 = t;
7325      };
7326      if (r0_2 >= r0_7) {
7327        ulong const t = r0_2;
7328        r0_2 = r0_7;
7329        r0_7 = t;
7330      };
7331      if (r0_1 >= r0_8) {
7332        ulong const t = r0_1;
7333        r0_1 = r0_8;
7334        r0_8 = t;
7335      };
7336      if (r0_5 >= r0_7) {
7337        ulong const t = r0_5;
7338        r0_5 = r0_7;
7339        r0_7 = t;
7340      };
7341      if (r0_6 >= r0_8) {
7342        ulong const t = r0_6;
7343        r0_6 = r0_8;
7344        r0_8 = t;
7345      };
7346      if (r0_5 >= r0_6) {
7347        ulong const t = r0_5;
7348        r0_5 = r0_6;
7349        r0_6 = t;
7350      };
7351      if (r0_7 >= r0_8) {
7352        ulong const t = r0_7;
7353        r0_7 = r0_8;
7354        r0_8 = t;
7355      };
7356      if (r0_1 >= r0_3) {
7357        ulong const t = r0_1;
7358        r0_1 = r0_3;
7359        r0_3 = t;
7360      };
7361      if (r0_2 >= r0_4) {
7362        ulong const t = r0_2;
7363        r0_2 = r0_4;
7364        r0_4 = t;
7365      };
7366      if (r0_1 >= r0_2) {
7367        ulong const t = r0_1;
7368        r0_1 = r0_2;
7369        r0_2 = t;
7370      };
7371      if (r0_3 >= r0_4) {
7372        ulong const t = r0_3;
7373        r0_3 = r0_4;
7374        r0_4 = t;
7375      };
7376      shared.m[smem_l_idx + (0)] = r0_1;
7377      shared.m[smem_l_idx + (8)] = r0_2;
7378      shared.m[smem_l_idx + (16)] = r0_3;
7379      shared.m[smem_l_idx + (24)] = r0_4;
7380      shared.m[smem_r_idx + (32)] = r0_5;
7381      shared.m[smem_r_idx + (40)] = r0_6;
7382      shared.m[smem_r_idx + (48)] = r0_7;
7383      shared.m[smem_r_idx + (56)] = r0_8;
7384    }
7385    {
7386      ulong r0_1 = shared.m[smem_l_idx + (512)];
7387      ulong r0_2 = shared.m[smem_l_idx + (520)];
7388      ulong r0_3 = shared.m[smem_l_idx + (528)];
7389      ulong r0_4 = shared.m[smem_l_idx + (536)];
7390      ulong r0_5 = shared.m[smem_r_idx + (544)];
7391      ulong r0_6 = shared.m[smem_r_idx + (552)];
7392      ulong r0_7 = shared.m[smem_r_idx + (560)];
7393      ulong r0_8 = shared.m[smem_r_idx + (568)];
7394      if (r0_4 >= r0_5) {
7395        ulong const t = r0_4;
7396        r0_4 = r0_5;
7397        r0_5 = t;
7398      };
7399      if (r0_3 >= r0_6) {
7400        ulong const t = r0_3;
7401        r0_3 = r0_6;
7402        r0_6 = t;
7403      };
7404      if (r0_2 >= r0_7) {
7405        ulong const t = r0_2;
7406        r0_2 = r0_7;
7407        r0_7 = t;
7408      };
7409      if (r0_1 >= r0_8) {
7410        ulong const t = r0_1;
7411        r0_1 = r0_8;
7412        r0_8 = t;
7413      };
7414      if (r0_5 >= r0_7) {
7415        ulong const t = r0_5;
7416        r0_5 = r0_7;
7417        r0_7 = t;
7418      };
7419      if (r0_6 >= r0_8) {
7420        ulong const t = r0_6;
7421        r0_6 = r0_8;
7422        r0_8 = t;
7423      };
7424      if (r0_5 >= r0_6) {
7425        ulong const t = r0_5;
7426        r0_5 = r0_6;
7427        r0_6 = t;
7428      };
7429      if (r0_7 >= r0_8) {
7430        ulong const t = r0_7;
7431        r0_7 = r0_8;
7432        r0_8 = t;
7433      };
7434      if (r0_1 >= r0_3) {
7435        ulong const t = r0_1;
7436        r0_1 = r0_3;
7437        r0_3 = t;
7438      };
7439      if (r0_2 >= r0_4) {
7440        ulong const t = r0_2;
7441        r0_2 = r0_4;
7442        r0_4 = t;
7443      };
7444      if (r0_1 >= r0_2) {
7445        ulong const t = r0_1;
7446        r0_1 = r0_2;
7447        r0_2 = t;
7448      };
7449      if (r0_3 >= r0_4) {
7450        ulong const t = r0_3;
7451        r0_3 = r0_4;
7452        r0_4 = t;
7453      };
7454      shared.m[smem_l_idx + (512)] = r0_1;
7455      shared.m[smem_l_idx + (520)] = r0_2;
7456      shared.m[smem_l_idx + (528)] = r0_3;
7457      shared.m[smem_l_idx + (536)] = r0_4;
7458      shared.m[smem_r_idx + (544)] = r0_5;
7459      shared.m[smem_r_idx + (552)] = r0_6;
7460      shared.m[smem_r_idx + (560)] = r0_7;
7461      shared.m[smem_r_idx + (568)] = r0_8;
7462    }
7463  }
7464  barrier(CLK_LOCAL_MEM_FENCE);
7465  r1 = shared.m[get_local_id(0) + (8 * (1 << 3) * 0)];
7466  r16 = shared.m[get_local_id(0) + (8 * (1 << 3) * 1)];
7467  r2 = shared.m[get_local_id(0) + (8 * (1 << 3) * 2)];
7468  r15 = shared.m[get_local_id(0) + (8 * (1 << 3) * 3)];
7469  r3 = shared.m[get_local_id(0) + (8 * (1 << 3) * 4)];
7470  r14 = shared.m[get_local_id(0) + (8 * (1 << 3) * 5)];
7471  r4 = shared.m[get_local_id(0) + (8 * (1 << 3) * 6)];
7472  r13 = shared.m[get_local_id(0) + (8 * (1 << 3) * 7)];
7473  r5 = shared.m[get_local_id(0) + (8 * (1 << 3) * 8)];
7474  r12 = shared.m[get_local_id(0) + (8 * (1 << 3) * 9)];
7475  r6 = shared.m[get_local_id(0) + (8 * (1 << 3) * 10)];
7476  r11 = shared.m[get_local_id(0) + (8 * (1 << 3) * 11)];
7477  r7 = shared.m[get_local_id(0) + (8 * (1 << 3) * 12)];
7478  r10 = shared.m[get_local_id(0) + (8 * (1 << 3) * 13)];
7479  r8 = shared.m[get_local_id(0) + (8 * (1 << 3) * 14)];
7480  r9 = shared.m[get_local_id(0) + (8 * (1 << 3) * 15)];
7481  {
7482    {
7483      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
7484      int const t_lt = get_sub_group_local_id() < half_lane_idx;
7485      ;
7486      {
7487        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
7488        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
7489      };
7490      {
7491        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
7492        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
7493      };
7494      {
7495        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
7496        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
7497      };
7498      {
7499        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
7500        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
7501      };
7502      {
7503        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
7504        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
7505      };
7506      {
7507        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
7508        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
7509      };
7510      {
7511        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
7512        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
7513      };
7514      {
7515        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
7516        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
7517      };
7518      {
7519        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
7520        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
7521      };
7522      {
7523        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
7524        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
7525      };
7526      {
7527        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
7528        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
7529      };
7530      {
7531        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
7532        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
7533      };
7534      {
7535        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
7536        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
7537      };
7538      {
7539        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
7540        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
7541      };
7542      {
7543        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
7544        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
7545      };
7546      {
7547        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
7548        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
7549      };
7550    }
7551    {
7552      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
7553      int const t_lt = get_sub_group_local_id() < half_lane_idx;
7554      ;
7555      {
7556        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
7557        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
7558      };
7559      {
7560        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
7561        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
7562      };
7563      {
7564        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
7565        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
7566      };
7567      {
7568        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
7569        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
7570      };
7571      {
7572        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
7573        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
7574      };
7575      {
7576        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
7577        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
7578      };
7579      {
7580        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
7581        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
7582      };
7583      {
7584        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
7585        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
7586      };
7587      {
7588        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
7589        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
7590      };
7591      {
7592        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
7593        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
7594      };
7595      {
7596        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
7597        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
7598      };
7599      {
7600        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
7601        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
7602      };
7603      {
7604        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
7605        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
7606      };
7607      {
7608        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
7609        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
7610      };
7611      {
7612        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
7613        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
7614      };
7615      {
7616        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
7617        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
7618      };
7619    }
7620    {
7621      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
7622      int const t_lt = get_sub_group_local_id() < half_lane_idx;
7623      ;
7624      {
7625        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
7626        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
7627      };
7628      {
7629        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
7630        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
7631      };
7632      {
7633        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
7634        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
7635      };
7636      {
7637        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
7638        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
7639      };
7640      {
7641        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
7642        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
7643      };
7644      {
7645        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
7646        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
7647      };
7648      {
7649        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
7650        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
7651      };
7652      {
7653        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
7654        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
7655      };
7656      {
7657        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
7658        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
7659      };
7660      {
7661        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
7662        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
7663      };
7664      {
7665        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
7666        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
7667      };
7668      {
7669        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
7670        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
7671      };
7672      {
7673        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
7674        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
7675      };
7676      {
7677        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
7678        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
7679      };
7680      {
7681        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
7682        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
7683      };
7684      {
7685        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
7686        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
7687      };
7688    }
7689    if (r1 >= r9) {
7690      ulong const t = r1;
7691      r1 = r9;
7692      r9 = t;
7693    };
7694    if (r5 >= r13) {
7695      ulong const t = r5;
7696      r5 = r13;
7697      r13 = t;
7698    };
7699    if (r1 >= r5) {
7700      ulong const t = r1;
7701      r1 = r5;
7702      r5 = t;
7703    };
7704    if (r9 >= r13) {
7705      ulong const t = r9;
7706      r9 = r13;
7707      r13 = t;
7708    };
7709    if (r3 >= r11) {
7710      ulong const t = r3;
7711      r3 = r11;
7712      r11 = t;
7713    };
7714    if (r7 >= r15) {
7715      ulong const t = r7;
7716      r7 = r15;
7717      r15 = t;
7718    };
7719    if (r3 >= r7) {
7720      ulong const t = r3;
7721      r3 = r7;
7722      r7 = t;
7723    };
7724    if (r11 >= r15) {
7725      ulong const t = r11;
7726      r11 = r15;
7727      r15 = t;
7728    };
7729    if (r1 >= r3) {
7730      ulong const t = r1;
7731      r1 = r3;
7732      r3 = t;
7733    };
7734    if (r5 >= r7) {
7735      ulong const t = r5;
7736      r5 = r7;
7737      r7 = t;
7738    };
7739    if (r9 >= r11) {
7740      ulong const t = r9;
7741      r9 = r11;
7742      r11 = t;
7743    };
7744    if (r13 >= r15) {
7745      ulong const t = r13;
7746      r13 = r15;
7747      r15 = t;
7748    };
7749    if (r2 >= r10) {
7750      ulong const t = r2;
7751      r2 = r10;
7752      r10 = t;
7753    };
7754    if (r6 >= r14) {
7755      ulong const t = r6;
7756      r6 = r14;
7757      r14 = t;
7758    };
7759    if (r2 >= r6) {
7760      ulong const t = r2;
7761      r2 = r6;
7762      r6 = t;
7763    };
7764    if (r10 >= r14) {
7765      ulong const t = r10;
7766      r10 = r14;
7767      r14 = t;
7768    };
7769    if (r4 >= r12) {
7770      ulong const t = r4;
7771      r4 = r12;
7772      r12 = t;
7773    };
7774    if (r8 >= r16) {
7775      ulong const t = r8;
7776      r8 = r16;
7777      r16 = t;
7778    };
7779    if (r4 >= r8) {
7780      ulong const t = r4;
7781      r4 = r8;
7782      r8 = t;
7783    };
7784    if (r12 >= r16) {
7785      ulong const t = r12;
7786      r12 = r16;
7787      r16 = t;
7788    };
7789    if (r2 >= r4) {
7790      ulong const t = r2;
7791      r2 = r4;
7792      r4 = t;
7793    };
7794    if (r6 >= r8) {
7795      ulong const t = r6;
7796      r6 = r8;
7797      r8 = t;
7798    };
7799    if (r10 >= r12) {
7800      ulong const t = r10;
7801      r10 = r12;
7802      r12 = t;
7803    };
7804    if (r14 >= r16) {
7805      ulong const t = r14;
7806      r14 = r16;
7807      r16 = t;
7808    };
7809    if (r1 >= r2) {
7810      ulong const t = r1;
7811      r1 = r2;
7812      r2 = t;
7813    };
7814    if (r3 >= r4) {
7815      ulong const t = r3;
7816      r3 = r4;
7817      r4 = t;
7818    };
7819    if (r5 >= r6) {
7820      ulong const t = r5;
7821      r5 = r6;
7822      r6 = t;
7823    };
7824    if (r7 >= r8) {
7825      ulong const t = r7;
7826      r7 = r8;
7827      r8 = t;
7828    };
7829    if (r9 >= r10) {
7830      ulong const t = r9;
7831      r9 = r10;
7832      r10 = t;
7833    };
7834    if (r11 >= r12) {
7835      ulong const t = r11;
7836      r11 = r12;
7837      r12 = t;
7838    };
7839    if (r13 >= r14) {
7840      ulong const t = r13;
7841      r13 = r14;
7842      r14 = t;
7843    };
7844    if (r15 >= r16) {
7845      ulong const t = r15;
7846      r15 = r16;
7847      r16 = t;
7848    };
7849  }
7850  vout[gmem_idx + (1 << 3) * 0] = r1;
7851  vout[gmem_idx + (1 << 3) * 1] = r2;
7852  vout[gmem_idx + (1 << 3) * 2] = r3;
7853  vout[gmem_idx + (1 << 3) * 3] = r4;
7854  vout[gmem_idx + (1 << 3) * 4] = r5;
7855  vout[gmem_idx + (1 << 3) * 5] = r6;
7856  vout[gmem_idx + (1 << 3) * 6] = r7;
7857  vout[gmem_idx + (1 << 3) * 7] = r8;
7858  vout[gmem_idx + (1 << 3) * 8] = r9;
7859  vout[gmem_idx + (1 << 3) * 9] = r10;
7860  vout[gmem_idx + (1 << 3) * 10] = r11;
7861  vout[gmem_idx + (1 << 3) * 11] = r12;
7862  vout[gmem_idx + (1 << 3) * 12] = r13;
7863  vout[gmem_idx + (1 << 3) * 13] = r14;
7864  vout[gmem_idx + (1 << 3) * 14] = r15;
7865  vout[gmem_idx + (1 << 3) * 15] = r16;
7866}
7867
7868__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
7869__attribute__((reqd_work_group_size((1 << 3) * 16, 1, 1))) void
7870hs_kernel_bs_4(__global ulong const* const restrict vin,
7871               __global ulong* const restrict vout)
7872{
7873  __local struct
7874  {
7875    ulong m[128 * 16];
7876  } shared;
7877
7878  uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 +
7879                        (get_local_id(0) & ((1 << 3) - 1));
7880  ulong r1 = vin[gmem_idx + (1 << 3) * 0];
7881  ulong r2 = vin[gmem_idx + (1 << 3) * 1];
7882  ulong r3 = vin[gmem_idx + (1 << 3) * 2];
7883  ulong r4 = vin[gmem_idx + (1 << 3) * 3];
7884  ulong r5 = vin[gmem_idx + (1 << 3) * 4];
7885  ulong r6 = vin[gmem_idx + (1 << 3) * 5];
7886  ulong r7 = vin[gmem_idx + (1 << 3) * 6];
7887  ulong r8 = vin[gmem_idx + (1 << 3) * 7];
7888  ulong r9 = vin[gmem_idx + (1 << 3) * 8];
7889  ulong r10 = vin[gmem_idx + (1 << 3) * 9];
7890  ulong r11 = vin[gmem_idx + (1 << 3) * 10];
7891  ulong r12 = vin[gmem_idx + (1 << 3) * 11];
7892  ulong r13 = vin[gmem_idx + (1 << 3) * 12];
7893  ulong r14 = vin[gmem_idx + (1 << 3) * 13];
7894  ulong r15 = vin[gmem_idx + (1 << 3) * 14];
7895  ulong r16 = vin[gmem_idx + (1 << 3) * 15];
7896  if (r1 >= r2) {
7897    ulong const t = r1;
7898    r1 = r2;
7899    r2 = t;
7900  };
7901  if (r3 >= r4) {
7902    ulong const t = r3;
7903    r3 = r4;
7904    r4 = t;
7905  };
7906  if (r5 >= r6) {
7907    ulong const t = r5;
7908    r5 = r6;
7909    r6 = t;
7910  };
7911  if (r7 >= r8) {
7912    ulong const t = r7;
7913    r7 = r8;
7914    r8 = t;
7915  };
7916  if (r9 >= r10) {
7917    ulong const t = r9;
7918    r9 = r10;
7919    r10 = t;
7920  };
7921  if (r11 >= r12) {
7922    ulong const t = r11;
7923    r11 = r12;
7924    r12 = t;
7925  };
7926  if (r13 >= r14) {
7927    ulong const t = r13;
7928    r13 = r14;
7929    r14 = t;
7930  };
7931  if (r15 >= r16) {
7932    ulong const t = r15;
7933    r15 = r16;
7934    r16 = t;
7935  };
7936  if (r1 >= r3) {
7937    ulong const t = r1;
7938    r1 = r3;
7939    r3 = t;
7940  };
7941  if (r5 >= r7) {
7942    ulong const t = r5;
7943    r5 = r7;
7944    r7 = t;
7945  };
7946  if (r9 >= r11) {
7947    ulong const t = r9;
7948    r9 = r11;
7949    r11 = t;
7950  };
7951  if (r13 >= r15) {
7952    ulong const t = r13;
7953    r13 = r15;
7954    r15 = t;
7955  };
7956  if (r2 >= r4) {
7957    ulong const t = r2;
7958    r2 = r4;
7959    r4 = t;
7960  };
7961  if (r6 >= r8) {
7962    ulong const t = r6;
7963    r6 = r8;
7964    r8 = t;
7965  };
7966  if (r10 >= r12) {
7967    ulong const t = r10;
7968    r10 = r12;
7969    r12 = t;
7970  };
7971  if (r14 >= r16) {
7972    ulong const t = r14;
7973    r14 = r16;
7974    r16 = t;
7975  };
7976  if (r1 >= r5) {
7977    ulong const t = r1;
7978    r1 = r5;
7979    r5 = t;
7980  };
7981  if (r9 >= r13) {
7982    ulong const t = r9;
7983    r9 = r13;
7984    r13 = t;
7985  };
7986  if (r2 >= r6) {
7987    ulong const t = r2;
7988    r2 = r6;
7989    r6 = t;
7990  };
7991  if (r10 >= r14) {
7992    ulong const t = r10;
7993    r10 = r14;
7994    r14 = t;
7995  };
7996  if (r3 >= r7) {
7997    ulong const t = r3;
7998    r3 = r7;
7999    r7 = t;
8000  };
8001  if (r11 >= r15) {
8002    ulong const t = r11;
8003    r11 = r15;
8004    r15 = t;
8005  };
8006  if (r4 >= r8) {
8007    ulong const t = r4;
8008    r4 = r8;
8009    r8 = t;
8010  };
8011  if (r12 >= r16) {
8012    ulong const t = r12;
8013    r12 = r16;
8014    r16 = t;
8015  };
8016  if (r1 >= r9) {
8017    ulong const t = r1;
8018    r1 = r9;
8019    r9 = t;
8020  };
8021  if (r2 >= r10) {
8022    ulong const t = r2;
8023    r2 = r10;
8024    r10 = t;
8025  };
8026  if (r3 >= r11) {
8027    ulong const t = r3;
8028    r3 = r11;
8029    r11 = t;
8030  };
8031  if (r4 >= r12) {
8032    ulong const t = r4;
8033    r4 = r12;
8034    r12 = t;
8035  };
8036  if (r5 >= r13) {
8037    ulong const t = r5;
8038    r5 = r13;
8039    r13 = t;
8040  };
8041  if (r6 >= r14) {
8042    ulong const t = r6;
8043    r6 = r14;
8044    r14 = t;
8045  };
8046  if (r7 >= r15) {
8047    ulong const t = r7;
8048    r7 = r15;
8049    r15 = t;
8050  };
8051  if (r8 >= r16) {
8052    ulong const t = r8;
8053    r8 = r16;
8054    r16 = t;
8055  };
8056  if (r6 >= r11) {
8057    ulong const t = r6;
8058    r6 = r11;
8059    r11 = t;
8060  };
8061  if (r7 >= r10) {
8062    ulong const t = r7;
8063    r7 = r10;
8064    r10 = t;
8065  };
8066  if (r4 >= r13) {
8067    ulong const t = r4;
8068    r4 = r13;
8069    r13 = t;
8070  };
8071  if (r14 >= r15) {
8072    ulong const t = r14;
8073    r14 = r15;
8074    r15 = t;
8075  };
8076  if (r8 >= r12) {
8077    ulong const t = r8;
8078    r8 = r12;
8079    r12 = t;
8080  };
8081  if (r2 >= r3) {
8082    ulong const t = r2;
8083    r2 = r3;
8084    r3 = t;
8085  };
8086  if (r5 >= r9) {
8087    ulong const t = r5;
8088    r5 = r9;
8089    r9 = t;
8090  };
8091  if (r2 >= r5) {
8092    ulong const t = r2;
8093    r2 = r5;
8094    r5 = t;
8095  };
8096  if (r8 >= r14) {
8097    ulong const t = r8;
8098    r8 = r14;
8099    r14 = t;
8100  };
8101  if (r3 >= r9) {
8102    ulong const t = r3;
8103    r3 = r9;
8104    r9 = t;
8105  };
8106  if (r12 >= r15) {
8107    ulong const t = r12;
8108    r12 = r15;
8109    r15 = t;
8110  };
8111  if (r3 >= r5) {
8112    ulong const t = r3;
8113    r3 = r5;
8114    r5 = t;
8115  };
8116  if (r6 >= r7) {
8117    ulong const t = r6;
8118    r6 = r7;
8119    r7 = t;
8120  };
8121  if (r10 >= r11) {
8122    ulong const t = r10;
8123    r10 = r11;
8124    r11 = t;
8125  };
8126  if (r12 >= r14) {
8127    ulong const t = r12;
8128    r12 = r14;
8129    r14 = t;
8130  };
8131  if (r4 >= r9) {
8132    ulong const t = r4;
8133    r4 = r9;
8134    r9 = t;
8135  };
8136  if (r8 >= r13) {
8137    ulong const t = r8;
8138    r8 = r13;
8139    r13 = t;
8140  };
8141  if (r7 >= r9) {
8142    ulong const t = r7;
8143    r7 = r9;
8144    r9 = t;
8145  };
8146  if (r11 >= r13) {
8147    ulong const t = r11;
8148    r11 = r13;
8149    r13 = t;
8150  };
8151  if (r4 >= r6) {
8152    ulong const t = r4;
8153    r4 = r6;
8154    r6 = t;
8155  };
8156  if (r8 >= r10) {
8157    ulong const t = r8;
8158    r8 = r10;
8159    r10 = t;
8160  };
8161  if (r4 >= r5) {
8162    ulong const t = r4;
8163    r4 = r5;
8164    r5 = t;
8165  };
8166  if (r6 >= r7) {
8167    ulong const t = r6;
8168    r6 = r7;
8169    r7 = t;
8170  };
8171  if (r8 >= r9) {
8172    ulong const t = r8;
8173    r8 = r9;
8174    r9 = t;
8175  };
8176  if (r10 >= r11) {
8177    ulong const t = r10;
8178    r10 = r11;
8179    r11 = t;
8180  };
8181  if (r12 >= r13) {
8182    ulong const t = r12;
8183    r12 = r13;
8184    r13 = t;
8185  };
8186  if (r7 >= r8) {
8187    ulong const t = r7;
8188    r7 = r8;
8189    r8 = t;
8190  };
8191  if (r9 >= r10) {
8192    ulong const t = r9;
8193    r9 = r10;
8194    r10 = t;
8195  };
8196  {
8197    uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
8198    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
8199    ;
8200    {
8201      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
8202      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
8203      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
8204      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
8205    };
8206    {
8207      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
8208      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
8209      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
8210      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
8211    };
8212    {
8213      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
8214      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
8215      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
8216      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
8217    };
8218    {
8219      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
8220      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
8221      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
8222      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
8223    };
8224    {
8225      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
8226      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
8227      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
8228      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
8229    };
8230    {
8231      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
8232      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
8233      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
8234      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
8235    };
8236    {
8237      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
8238      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
8239      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
8240      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
8241    };
8242    {
8243      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
8244      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
8245      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
8246      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
8247    };
8248  }
8249  if (r1 >= r9) {
8250    ulong const t = r1;
8251    r1 = r9;
8252    r9 = t;
8253  };
8254  if (r5 >= r13) {
8255    ulong const t = r5;
8256    r5 = r13;
8257    r13 = t;
8258  };
8259  if (r1 >= r5) {
8260    ulong const t = r1;
8261    r1 = r5;
8262    r5 = t;
8263  };
8264  if (r9 >= r13) {
8265    ulong const t = r9;
8266    r9 = r13;
8267    r13 = t;
8268  };
8269  if (r3 >= r11) {
8270    ulong const t = r3;
8271    r3 = r11;
8272    r11 = t;
8273  };
8274  if (r7 >= r15) {
8275    ulong const t = r7;
8276    r7 = r15;
8277    r15 = t;
8278  };
8279  if (r3 >= r7) {
8280    ulong const t = r3;
8281    r3 = r7;
8282    r7 = t;
8283  };
8284  if (r11 >= r15) {
8285    ulong const t = r11;
8286    r11 = r15;
8287    r15 = t;
8288  };
8289  if (r1 >= r3) {
8290    ulong const t = r1;
8291    r1 = r3;
8292    r3 = t;
8293  };
8294  if (r5 >= r7) {
8295    ulong const t = r5;
8296    r5 = r7;
8297    r7 = t;
8298  };
8299  if (r9 >= r11) {
8300    ulong const t = r9;
8301    r9 = r11;
8302    r11 = t;
8303  };
8304  if (r13 >= r15) {
8305    ulong const t = r13;
8306    r13 = r15;
8307    r15 = t;
8308  };
8309  if (r2 >= r10) {
8310    ulong const t = r2;
8311    r2 = r10;
8312    r10 = t;
8313  };
8314  if (r6 >= r14) {
8315    ulong const t = r6;
8316    r6 = r14;
8317    r14 = t;
8318  };
8319  if (r2 >= r6) {
8320    ulong const t = r2;
8321    r2 = r6;
8322    r6 = t;
8323  };
8324  if (r10 >= r14) {
8325    ulong const t = r10;
8326    r10 = r14;
8327    r14 = t;
8328  };
8329  if (r4 >= r12) {
8330    ulong const t = r4;
8331    r4 = r12;
8332    r12 = t;
8333  };
8334  if (r8 >= r16) {
8335    ulong const t = r8;
8336    r8 = r16;
8337    r16 = t;
8338  };
8339  if (r4 >= r8) {
8340    ulong const t = r4;
8341    r4 = r8;
8342    r8 = t;
8343  };
8344  if (r12 >= r16) {
8345    ulong const t = r12;
8346    r12 = r16;
8347    r16 = t;
8348  };
8349  if (r2 >= r4) {
8350    ulong const t = r2;
8351    r2 = r4;
8352    r4 = t;
8353  };
8354  if (r6 >= r8) {
8355    ulong const t = r6;
8356    r6 = r8;
8357    r8 = t;
8358  };
8359  if (r10 >= r12) {
8360    ulong const t = r10;
8361    r10 = r12;
8362    r12 = t;
8363  };
8364  if (r14 >= r16) {
8365    ulong const t = r14;
8366    r14 = r16;
8367    r16 = t;
8368  };
8369  if (r1 >= r2) {
8370    ulong const t = r1;
8371    r1 = r2;
8372    r2 = t;
8373  };
8374  if (r3 >= r4) {
8375    ulong const t = r3;
8376    r3 = r4;
8377    r4 = t;
8378  };
8379  if (r5 >= r6) {
8380    ulong const t = r5;
8381    r5 = r6;
8382    r6 = t;
8383  };
8384  if (r7 >= r8) {
8385    ulong const t = r7;
8386    r7 = r8;
8387    r8 = t;
8388  };
8389  if (r9 >= r10) {
8390    ulong const t = r9;
8391    r9 = r10;
8392    r10 = t;
8393  };
8394  if (r11 >= r12) {
8395    ulong const t = r11;
8396    r11 = r12;
8397    r12 = t;
8398  };
8399  if (r13 >= r14) {
8400    ulong const t = r13;
8401    r13 = r14;
8402    r14 = t;
8403  };
8404  if (r15 >= r16) {
8405    ulong const t = r15;
8406    r15 = r16;
8407    r16 = t;
8408  };
8409  {
8410    uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
8411    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
8412    ;
8413    {
8414      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
8415      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
8416      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
8417      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
8418    };
8419    {
8420      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
8421      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
8422      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
8423      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
8424    };
8425    {
8426      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
8427      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
8428      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
8429      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
8430    };
8431    {
8432      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
8433      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
8434      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
8435      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
8436    };
8437    {
8438      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
8439      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
8440      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
8441      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
8442    };
8443    {
8444      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
8445      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
8446      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
8447      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
8448    };
8449    {
8450      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
8451      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
8452      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
8453      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
8454    };
8455    {
8456      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
8457      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
8458      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
8459      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
8460    };
8461  }
8462  {
8463    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
8464    int const t_lt = get_sub_group_local_id() < half_lane_idx;
8465    ;
8466    {
8467      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
8468      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
8469    };
8470    {
8471      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
8472      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
8473    };
8474    {
8475      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
8476      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
8477    };
8478    {
8479      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
8480      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
8481    };
8482    {
8483      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
8484      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
8485    };
8486    {
8487      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
8488      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
8489    };
8490    {
8491      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
8492      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
8493    };
8494    {
8495      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
8496      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
8497    };
8498    {
8499      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
8500      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
8501    };
8502    {
8503      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
8504      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
8505    };
8506    {
8507      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
8508      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
8509    };
8510    {
8511      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
8512      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
8513    };
8514    {
8515      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
8516      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
8517    };
8518    {
8519      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
8520      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
8521    };
8522    {
8523      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
8524      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
8525    };
8526    {
8527      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
8528      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
8529    };
8530  }
8531  if (r1 >= r9) {
8532    ulong const t = r1;
8533    r1 = r9;
8534    r9 = t;
8535  };
8536  if (r5 >= r13) {
8537    ulong const t = r5;
8538    r5 = r13;
8539    r13 = t;
8540  };
8541  if (r1 >= r5) {
8542    ulong const t = r1;
8543    r1 = r5;
8544    r5 = t;
8545  };
8546  if (r9 >= r13) {
8547    ulong const t = r9;
8548    r9 = r13;
8549    r13 = t;
8550  };
8551  if (r3 >= r11) {
8552    ulong const t = r3;
8553    r3 = r11;
8554    r11 = t;
8555  };
8556  if (r7 >= r15) {
8557    ulong const t = r7;
8558    r7 = r15;
8559    r15 = t;
8560  };
8561  if (r3 >= r7) {
8562    ulong const t = r3;
8563    r3 = r7;
8564    r7 = t;
8565  };
8566  if (r11 >= r15) {
8567    ulong const t = r11;
8568    r11 = r15;
8569    r15 = t;
8570  };
8571  if (r1 >= r3) {
8572    ulong const t = r1;
8573    r1 = r3;
8574    r3 = t;
8575  };
8576  if (r5 >= r7) {
8577    ulong const t = r5;
8578    r5 = r7;
8579    r7 = t;
8580  };
8581  if (r9 >= r11) {
8582    ulong const t = r9;
8583    r9 = r11;
8584    r11 = t;
8585  };
8586  if (r13 >= r15) {
8587    ulong const t = r13;
8588    r13 = r15;
8589    r15 = t;
8590  };
8591  if (r2 >= r10) {
8592    ulong const t = r2;
8593    r2 = r10;
8594    r10 = t;
8595  };
8596  if (r6 >= r14) {
8597    ulong const t = r6;
8598    r6 = r14;
8599    r14 = t;
8600  };
8601  if (r2 >= r6) {
8602    ulong const t = r2;
8603    r2 = r6;
8604    r6 = t;
8605  };
8606  if (r10 >= r14) {
8607    ulong const t = r10;
8608    r10 = r14;
8609    r14 = t;
8610  };
8611  if (r4 >= r12) {
8612    ulong const t = r4;
8613    r4 = r12;
8614    r12 = t;
8615  };
8616  if (r8 >= r16) {
8617    ulong const t = r8;
8618    r8 = r16;
8619    r16 = t;
8620  };
8621  if (r4 >= r8) {
8622    ulong const t = r4;
8623    r4 = r8;
8624    r8 = t;
8625  };
8626  if (r12 >= r16) {
8627    ulong const t = r12;
8628    r12 = r16;
8629    r16 = t;
8630  };
8631  if (r2 >= r4) {
8632    ulong const t = r2;
8633    r2 = r4;
8634    r4 = t;
8635  };
8636  if (r6 >= r8) {
8637    ulong const t = r6;
8638    r6 = r8;
8639    r8 = t;
8640  };
8641  if (r10 >= r12) {
8642    ulong const t = r10;
8643    r10 = r12;
8644    r12 = t;
8645  };
8646  if (r14 >= r16) {
8647    ulong const t = r14;
8648    r14 = r16;
8649    r16 = t;
8650  };
8651  if (r1 >= r2) {
8652    ulong const t = r1;
8653    r1 = r2;
8654    r2 = t;
8655  };
8656  if (r3 >= r4) {
8657    ulong const t = r3;
8658    r3 = r4;
8659    r4 = t;
8660  };
8661  if (r5 >= r6) {
8662    ulong const t = r5;
8663    r5 = r6;
8664    r6 = t;
8665  };
8666  if (r7 >= r8) {
8667    ulong const t = r7;
8668    r7 = r8;
8669    r8 = t;
8670  };
8671  if (r9 >= r10) {
8672    ulong const t = r9;
8673    r9 = r10;
8674    r10 = t;
8675  };
8676  if (r11 >= r12) {
8677    ulong const t = r11;
8678    r11 = r12;
8679    r12 = t;
8680  };
8681  if (r13 >= r14) {
8682    ulong const t = r13;
8683    r13 = r14;
8684    r14 = t;
8685  };
8686  if (r15 >= r16) {
8687    ulong const t = r15;
8688    r15 = r16;
8689    r16 = t;
8690  };
8691  {
8692    uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
8693    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
8694    ;
8695    {
8696      ulong const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
8697      ulong const tb = intel_sub_group_shuffle(r16, flip_lane_idx);
8698      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
8699      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
8700    };
8701    {
8702      ulong const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
8703      ulong const tb = intel_sub_group_shuffle(r15, flip_lane_idx);
8704      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
8705      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
8706    };
8707    {
8708      ulong const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
8709      ulong const tb = intel_sub_group_shuffle(r14, flip_lane_idx);
8710      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
8711      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
8712    };
8713    {
8714      ulong const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
8715      ulong const tb = intel_sub_group_shuffle(r13, flip_lane_idx);
8716      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
8717      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
8718    };
8719    {
8720      ulong const ta = intel_sub_group_shuffle(r5, flip_lane_idx);
8721      ulong const tb = intel_sub_group_shuffle(r12, flip_lane_idx);
8722      r5 = ((r5 <= tb) ^ t_lt) ? tb : r5;
8723      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
8724    };
8725    {
8726      ulong const ta = intel_sub_group_shuffle(r6, flip_lane_idx);
8727      ulong const tb = intel_sub_group_shuffle(r11, flip_lane_idx);
8728      r6 = ((r6 <= tb) ^ t_lt) ? tb : r6;
8729      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
8730    };
8731    {
8732      ulong const ta = intel_sub_group_shuffle(r7, flip_lane_idx);
8733      ulong const tb = intel_sub_group_shuffle(r10, flip_lane_idx);
8734      r7 = ((r7 <= tb) ^ t_lt) ? tb : r7;
8735      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
8736    };
8737    {
8738      ulong const ta = intel_sub_group_shuffle(r8, flip_lane_idx);
8739      ulong const tb = intel_sub_group_shuffle(r9, flip_lane_idx);
8740      r8 = ((r8 <= tb) ^ t_lt) ? tb : r8;
8741      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
8742    };
8743  }
8744  {
8745    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
8746    int const t_lt = get_sub_group_local_id() < half_lane_idx;
8747    ;
8748    {
8749      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
8750      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
8751    };
8752    {
8753      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
8754      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
8755    };
8756    {
8757      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
8758      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
8759    };
8760    {
8761      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
8762      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
8763    };
8764    {
8765      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
8766      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
8767    };
8768    {
8769      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
8770      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
8771    };
8772    {
8773      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
8774      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
8775    };
8776    {
8777      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
8778      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
8779    };
8780    {
8781      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
8782      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
8783    };
8784    {
8785      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
8786      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
8787    };
8788    {
8789      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
8790      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
8791    };
8792    {
8793      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
8794      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
8795    };
8796    {
8797      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
8798      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
8799    };
8800    {
8801      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
8802      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
8803    };
8804    {
8805      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
8806      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
8807    };
8808    {
8809      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
8810      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
8811    };
8812  }
8813  {
8814    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
8815    int const t_lt = get_sub_group_local_id() < half_lane_idx;
8816    ;
8817    {
8818      ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
8819      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
8820    };
8821    {
8822      ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
8823      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
8824    };
8825    {
8826      ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
8827      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
8828    };
8829    {
8830      ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
8831      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
8832    };
8833    {
8834      ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
8835      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
8836    };
8837    {
8838      ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
8839      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
8840    };
8841    {
8842      ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
8843      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
8844    };
8845    {
8846      ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
8847      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
8848    };
8849    {
8850      ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
8851      r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
8852    };
8853    {
8854      ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
8855      r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
8856    };
8857    {
8858      ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
8859      r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
8860    };
8861    {
8862      ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
8863      r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
8864    };
8865    {
8866      ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
8867      r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
8868    };
8869    {
8870      ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
8871      r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
8872    };
8873    {
8874      ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
8875      r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
8876    };
8877    {
8878      ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
8879      r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
8880    };
8881  }
8882  if (r1 >= r9) {
8883    ulong const t = r1;
8884    r1 = r9;
8885    r9 = t;
8886  };
8887  if (r5 >= r13) {
8888    ulong const t = r5;
8889    r5 = r13;
8890    r13 = t;
8891  };
8892  if (r1 >= r5) {
8893    ulong const t = r1;
8894    r1 = r5;
8895    r5 = t;
8896  };
8897  if (r9 >= r13) {
8898    ulong const t = r9;
8899    r9 = r13;
8900    r13 = t;
8901  };
8902  if (r3 >= r11) {
8903    ulong const t = r3;
8904    r3 = r11;
8905    r11 = t;
8906  };
8907  if (r7 >= r15) {
8908    ulong const t = r7;
8909    r7 = r15;
8910    r15 = t;
8911  };
8912  if (r3 >= r7) {
8913    ulong const t = r3;
8914    r3 = r7;
8915    r7 = t;
8916  };
8917  if (r11 >= r15) {
8918    ulong const t = r11;
8919    r11 = r15;
8920    r15 = t;
8921  };
8922  if (r1 >= r3) {
8923    ulong const t = r1;
8924    r1 = r3;
8925    r3 = t;
8926  };
8927  if (r5 >= r7) {
8928    ulong const t = r5;
8929    r5 = r7;
8930    r7 = t;
8931  };
8932  if (r9 >= r11) {
8933    ulong const t = r9;
8934    r9 = r11;
8935    r11 = t;
8936  };
8937  if (r13 >= r15) {
8938    ulong const t = r13;
8939    r13 = r15;
8940    r15 = t;
8941  };
8942  if (r2 >= r10) {
8943    ulong const t = r2;
8944    r2 = r10;
8945    r10 = t;
8946  };
8947  if (r6 >= r14) {
8948    ulong const t = r6;
8949    r6 = r14;
8950    r14 = t;
8951  };
8952  if (r2 >= r6) {
8953    ulong const t = r2;
8954    r2 = r6;
8955    r6 = t;
8956  };
8957  if (r10 >= r14) {
8958    ulong const t = r10;
8959    r10 = r14;
8960    r14 = t;
8961  };
8962  if (r4 >= r12) {
8963    ulong const t = r4;
8964    r4 = r12;
8965    r12 = t;
8966  };
8967  if (r8 >= r16) {
8968    ulong const t = r8;
8969    r8 = r16;
8970    r16 = t;
8971  };
8972  if (r4 >= r8) {
8973    ulong const t = r4;
8974    r4 = r8;
8975    r8 = t;
8976  };
8977  if (r12 >= r16) {
8978    ulong const t = r12;
8979    r12 = r16;
8980    r16 = t;
8981  };
8982  if (r2 >= r4) {
8983    ulong const t = r2;
8984    r2 = r4;
8985    r4 = t;
8986  };
8987  if (r6 >= r8) {
8988    ulong const t = r6;
8989    r6 = r8;
8990    r8 = t;
8991  };
8992  if (r10 >= r12) {
8993    ulong const t = r10;
8994    r10 = r12;
8995    r12 = t;
8996  };
8997  if (r14 >= r16) {
8998    ulong const t = r14;
8999    r14 = r16;
9000    r16 = t;
9001  };
9002  if (r1 >= r2) {
9003    ulong const t = r1;
9004    r1 = r2;
9005    r2 = t;
9006  };
9007  if (r3 >= r4) {
9008    ulong const t = r3;
9009    r3 = r4;
9010    r4 = t;
9011  };
9012  if (r5 >= r6) {
9013    ulong const t = r5;
9014    r5 = r6;
9015    r6 = t;
9016  };
9017  if (r7 >= r8) {
9018    ulong const t = r7;
9019    r7 = r8;
9020    r8 = t;
9021  };
9022  if (r9 >= r10) {
9023    ulong const t = r9;
9024    r9 = r10;
9025    r10 = t;
9026  };
9027  if (r11 >= r12) {
9028    ulong const t = r11;
9029    r11 = r12;
9030    r12 = t;
9031  };
9032  if (r13 >= r14) {
9033    ulong const t = r13;
9034    r13 = r14;
9035    r14 = t;
9036  };
9037  if (r15 >= r16) {
9038    ulong const t = r15;
9039    r15 = r16;
9040    r16 = t;
9041  };
9042  uint const smem_l_idx =
9043    get_sub_group_id() * ((1 << 3) * 16) + get_sub_group_local_id();
9044  uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 3) * 16) +
9045                          (get_sub_group_local_id() ^ ((1 << 3) - 1));
9046  shared.m[get_local_id(0) + (16 * (1 << 3) * 0)] = r1;
9047  shared.m[get_local_id(0) + (16 * (1 << 3) * 1)] = r16;
9048  shared.m[get_local_id(0) + (16 * (1 << 3) * 2)] = r2;
9049  shared.m[get_local_id(0) + (16 * (1 << 3) * 3)] = r15;
9050  shared.m[get_local_id(0) + (16 * (1 << 3) * 4)] = r3;
9051  shared.m[get_local_id(0) + (16 * (1 << 3) * 5)] = r14;
9052  shared.m[get_local_id(0) + (16 * (1 << 3) * 6)] = r4;
9053  shared.m[get_local_id(0) + (16 * (1 << 3) * 7)] = r13;
9054  shared.m[get_local_id(0) + (16 * (1 << 3) * 8)] = r5;
9055  shared.m[get_local_id(0) + (16 * (1 << 3) * 9)] = r12;
9056  shared.m[get_local_id(0) + (16 * (1 << 3) * 10)] = r6;
9057  shared.m[get_local_id(0) + (16 * (1 << 3) * 11)] = r11;
9058  shared.m[get_local_id(0) + (16 * (1 << 3) * 12)] = r7;
9059  shared.m[get_local_id(0) + (16 * (1 << 3) * 13)] = r10;
9060  shared.m[get_local_id(0) + (16 * (1 << 3) * 14)] = r8;
9061  shared.m[get_local_id(0) + (16 * (1 << 3) * 15)] = r9;
9062  barrier(CLK_LOCAL_MEM_FENCE);
9063  {
9064    {
9065      ulong r0_1 = shared.m[smem_l_idx + (0)];
9066      ulong r0_2 = shared.m[smem_r_idx + (8)];
9067      if (r0_1 >= r0_2) {
9068        ulong const t = r0_1;
9069        r0_1 = r0_2;
9070        r0_2 = t;
9071      };
9072      shared.m[smem_l_idx + (0)] = r0_1;
9073      shared.m[smem_r_idx + (8)] = r0_2;
9074    }
9075    {
9076      ulong r1_1 = shared.m[smem_l_idx + (16)];
9077      ulong r1_2 = shared.m[smem_r_idx + (24)];
9078      if (r1_1 >= r1_2) {
9079        ulong const t = r1_1;
9080        r1_1 = r1_2;
9081        r1_2 = t;
9082      };
9083      shared.m[smem_l_idx + (16)] = r1_1;
9084      shared.m[smem_r_idx + (24)] = r1_2;
9085    }
9086    {
9087      ulong r2_1 = shared.m[smem_l_idx + (32)];
9088      ulong r2_2 = shared.m[smem_r_idx + (40)];
9089      if (r2_1 >= r2_2) {
9090        ulong const t = r2_1;
9091        r2_1 = r2_2;
9092        r2_2 = t;
9093      };
9094      shared.m[smem_l_idx + (32)] = r2_1;
9095      shared.m[smem_r_idx + (40)] = r2_2;
9096    }
9097    {
9098      ulong r3_1 = shared.m[smem_l_idx + (48)];
9099      ulong r3_2 = shared.m[smem_r_idx + (56)];
9100      if (r3_1 >= r3_2) {
9101        ulong const t = r3_1;
9102        r3_1 = r3_2;
9103        r3_2 = t;
9104      };
9105      shared.m[smem_l_idx + (48)] = r3_1;
9106      shared.m[smem_r_idx + (56)] = r3_2;
9107    }
9108    {
9109      ulong r4_1 = shared.m[smem_l_idx + (64)];
9110      ulong r4_2 = shared.m[smem_r_idx + (72)];
9111      if (r4_1 >= r4_2) {
9112        ulong const t = r4_1;
9113        r4_1 = r4_2;
9114        r4_2 = t;
9115      };
9116      shared.m[smem_l_idx + (64)] = r4_1;
9117      shared.m[smem_r_idx + (72)] = r4_2;
9118    }
9119    {
9120      ulong r5_1 = shared.m[smem_l_idx + (80)];
9121      ulong r5_2 = shared.m[smem_r_idx + (88)];
9122      if (r5_1 >= r5_2) {
9123        ulong const t = r5_1;
9124        r5_1 = r5_2;
9125        r5_2 = t;
9126      };
9127      shared.m[smem_l_idx + (80)] = r5_1;
9128      shared.m[smem_r_idx + (88)] = r5_2;
9129    }
9130    {
9131      ulong r6_1 = shared.m[smem_l_idx + (96)];
9132      ulong r6_2 = shared.m[smem_r_idx + (104)];
9133      if (r6_1 >= r6_2) {
9134        ulong const t = r6_1;
9135        r6_1 = r6_2;
9136        r6_2 = t;
9137      };
9138      shared.m[smem_l_idx + (96)] = r6_1;
9139      shared.m[smem_r_idx + (104)] = r6_2;
9140    }
9141    {
9142      ulong r7_1 = shared.m[smem_l_idx + (112)];
9143      ulong r7_2 = shared.m[smem_r_idx + (120)];
9144      if (r7_1 >= r7_2) {
9145        ulong const t = r7_1;
9146        r7_1 = r7_2;
9147        r7_2 = t;
9148      };
9149      shared.m[smem_l_idx + (112)] = r7_1;
9150      shared.m[smem_r_idx + (120)] = r7_2;
9151    }
9152  }
9153  barrier(CLK_LOCAL_MEM_FENCE);
9154  r1 = shared.m[get_local_id(0) + (16 * (1 << 3) * 0)];
9155  r16 = shared.m[get_local_id(0) + (16 * (1 << 3) * 1)];
9156  r2 = shared.m[get_local_id(0) + (16 * (1 << 3) * 2)];
9157  r15 = shared.m[get_local_id(0) + (16 * (1 << 3) * 3)];
9158  r3 = shared.m[get_local_id(0) + (16 * (1 << 3) * 4)];
9159  r14 = shared.m[get_local_id(0) + (16 * (1 << 3) * 5)];
9160  r4 = shared.m[get_local_id(0) + (16 * (1 << 3) * 6)];
9161  r13 = shared.m[get_local_id(0) + (16 * (1 << 3) * 7)];
9162  r5 = shared.m[get_local_id(0) + (16 * (1 << 3) * 8)];
9163  r12 = shared.m[get_local_id(0) + (16 * (1 << 3) * 9)];
9164  r6 = shared.m[get_local_id(0) + (16 * (1 << 3) * 10)];
9165  r11 = shared.m[get_local_id(0) + (16 * (1 << 3) * 11)];
9166  r7 = shared.m[get_local_id(0) + (16 * (1 << 3) * 12)];
9167  r10 = shared.m[get_local_id(0) + (16 * (1 << 3) * 13)];
9168  r8 = shared.m[get_local_id(0) + (16 * (1 << 3) * 14)];
9169  r9 = shared.m[get_local_id(0) + (16 * (1 << 3) * 15)];
9170  {
9171    {
9172      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
9173      int const t_lt = get_sub_group_local_id() < half_lane_idx;
9174      ;
9175      {
9176        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
9177        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
9178      };
9179      {
9180        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
9181        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
9182      };
9183      {
9184        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
9185        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
9186      };
9187      {
9188        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
9189        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
9190      };
9191      {
9192        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
9193        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
9194      };
9195      {
9196        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
9197        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
9198      };
9199      {
9200        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
9201        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
9202      };
9203      {
9204        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
9205        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
9206      };
9207      {
9208        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
9209        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
9210      };
9211      {
9212        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
9213        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
9214      };
9215      {
9216        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
9217        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
9218      };
9219      {
9220        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
9221        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
9222      };
9223      {
9224        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
9225        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
9226      };
9227      {
9228        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
9229        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
9230      };
9231      {
9232        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
9233        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
9234      };
9235      {
9236        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
9237        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
9238      };
9239    }
9240    {
9241      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
9242      int const t_lt = get_sub_group_local_id() < half_lane_idx;
9243      ;
9244      {
9245        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
9246        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
9247      };
9248      {
9249        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
9250        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
9251      };
9252      {
9253        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
9254        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
9255      };
9256      {
9257        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
9258        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
9259      };
9260      {
9261        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
9262        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
9263      };
9264      {
9265        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
9266        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
9267      };
9268      {
9269        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
9270        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
9271      };
9272      {
9273        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
9274        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
9275      };
9276      {
9277        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
9278        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
9279      };
9280      {
9281        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
9282        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
9283      };
9284      {
9285        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
9286        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
9287      };
9288      {
9289        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
9290        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
9291      };
9292      {
9293        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
9294        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
9295      };
9296      {
9297        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
9298        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
9299      };
9300      {
9301        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
9302        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
9303      };
9304      {
9305        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
9306        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
9307      };
9308    }
9309    {
9310      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
9311      int const t_lt = get_sub_group_local_id() < half_lane_idx;
9312      ;
9313      {
9314        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
9315        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
9316      };
9317      {
9318        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
9319        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
9320      };
9321      {
9322        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
9323        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
9324      };
9325      {
9326        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
9327        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
9328      };
9329      {
9330        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
9331        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
9332      };
9333      {
9334        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
9335        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
9336      };
9337      {
9338        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
9339        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
9340      };
9341      {
9342        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
9343        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
9344      };
9345      {
9346        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
9347        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
9348      };
9349      {
9350        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
9351        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
9352      };
9353      {
9354        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
9355        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
9356      };
9357      {
9358        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
9359        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
9360      };
9361      {
9362        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
9363        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
9364      };
9365      {
9366        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
9367        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
9368      };
9369      {
9370        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
9371        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
9372      };
9373      {
9374        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
9375        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
9376      };
9377    }
9378    if (r1 >= r9) {
9379      ulong const t = r1;
9380      r1 = r9;
9381      r9 = t;
9382    };
9383    if (r5 >= r13) {
9384      ulong const t = r5;
9385      r5 = r13;
9386      r13 = t;
9387    };
9388    if (r1 >= r5) {
9389      ulong const t = r1;
9390      r1 = r5;
9391      r5 = t;
9392    };
9393    if (r9 >= r13) {
9394      ulong const t = r9;
9395      r9 = r13;
9396      r13 = t;
9397    };
9398    if (r3 >= r11) {
9399      ulong const t = r3;
9400      r3 = r11;
9401      r11 = t;
9402    };
9403    if (r7 >= r15) {
9404      ulong const t = r7;
9405      r7 = r15;
9406      r15 = t;
9407    };
9408    if (r3 >= r7) {
9409      ulong const t = r3;
9410      r3 = r7;
9411      r7 = t;
9412    };
9413    if (r11 >= r15) {
9414      ulong const t = r11;
9415      r11 = r15;
9416      r15 = t;
9417    };
9418    if (r1 >= r3) {
9419      ulong const t = r1;
9420      r1 = r3;
9421      r3 = t;
9422    };
9423    if (r5 >= r7) {
9424      ulong const t = r5;
9425      r5 = r7;
9426      r7 = t;
9427    };
9428    if (r9 >= r11) {
9429      ulong const t = r9;
9430      r9 = r11;
9431      r11 = t;
9432    };
9433    if (r13 >= r15) {
9434      ulong const t = r13;
9435      r13 = r15;
9436      r15 = t;
9437    };
9438    if (r2 >= r10) {
9439      ulong const t = r2;
9440      r2 = r10;
9441      r10 = t;
9442    };
9443    if (r6 >= r14) {
9444      ulong const t = r6;
9445      r6 = r14;
9446      r14 = t;
9447    };
9448    if (r2 >= r6) {
9449      ulong const t = r2;
9450      r2 = r6;
9451      r6 = t;
9452    };
9453    if (r10 >= r14) {
9454      ulong const t = r10;
9455      r10 = r14;
9456      r14 = t;
9457    };
9458    if (r4 >= r12) {
9459      ulong const t = r4;
9460      r4 = r12;
9461      r12 = t;
9462    };
9463    if (r8 >= r16) {
9464      ulong const t = r8;
9465      r8 = r16;
9466      r16 = t;
9467    };
9468    if (r4 >= r8) {
9469      ulong const t = r4;
9470      r4 = r8;
9471      r8 = t;
9472    };
9473    if (r12 >= r16) {
9474      ulong const t = r12;
9475      r12 = r16;
9476      r16 = t;
9477    };
9478    if (r2 >= r4) {
9479      ulong const t = r2;
9480      r2 = r4;
9481      r4 = t;
9482    };
9483    if (r6 >= r8) {
9484      ulong const t = r6;
9485      r6 = r8;
9486      r8 = t;
9487    };
9488    if (r10 >= r12) {
9489      ulong const t = r10;
9490      r10 = r12;
9491      r12 = t;
9492    };
9493    if (r14 >= r16) {
9494      ulong const t = r14;
9495      r14 = r16;
9496      r16 = t;
9497    };
9498    if (r1 >= r2) {
9499      ulong const t = r1;
9500      r1 = r2;
9501      r2 = t;
9502    };
9503    if (r3 >= r4) {
9504      ulong const t = r3;
9505      r3 = r4;
9506      r4 = t;
9507    };
9508    if (r5 >= r6) {
9509      ulong const t = r5;
9510      r5 = r6;
9511      r6 = t;
9512    };
9513    if (r7 >= r8) {
9514      ulong const t = r7;
9515      r7 = r8;
9516      r8 = t;
9517    };
9518    if (r9 >= r10) {
9519      ulong const t = r9;
9520      r9 = r10;
9521      r10 = t;
9522    };
9523    if (r11 >= r12) {
9524      ulong const t = r11;
9525      r11 = r12;
9526      r12 = t;
9527    };
9528    if (r13 >= r14) {
9529      ulong const t = r13;
9530      r13 = r14;
9531      r14 = t;
9532    };
9533    if (r15 >= r16) {
9534      ulong const t = r15;
9535      r15 = r16;
9536      r16 = t;
9537    };
9538  }
9539  shared.m[get_local_id(0) + (16 * (1 << 3) * 0)] = r1;
9540  shared.m[get_local_id(0) + (16 * (1 << 3) * 1)] = r16;
9541  shared.m[get_local_id(0) + (16 * (1 << 3) * 2)] = r2;
9542  shared.m[get_local_id(0) + (16 * (1 << 3) * 3)] = r15;
9543  shared.m[get_local_id(0) + (16 * (1 << 3) * 4)] = r3;
9544  shared.m[get_local_id(0) + (16 * (1 << 3) * 5)] = r14;
9545  shared.m[get_local_id(0) + (16 * (1 << 3) * 6)] = r4;
9546  shared.m[get_local_id(0) + (16 * (1 << 3) * 7)] = r13;
9547  shared.m[get_local_id(0) + (16 * (1 << 3) * 8)] = r5;
9548  shared.m[get_local_id(0) + (16 * (1 << 3) * 9)] = r12;
9549  shared.m[get_local_id(0) + (16 * (1 << 3) * 10)] = r6;
9550  shared.m[get_local_id(0) + (16 * (1 << 3) * 11)] = r11;
9551  shared.m[get_local_id(0) + (16 * (1 << 3) * 12)] = r7;
9552  shared.m[get_local_id(0) + (16 * (1 << 3) * 13)] = r10;
9553  shared.m[get_local_id(0) + (16 * (1 << 3) * 14)] = r8;
9554  shared.m[get_local_id(0) + (16 * (1 << 3) * 15)] = r9;
9555  barrier(CLK_LOCAL_MEM_FENCE);
9556  {
9557    {
9558      ulong r0_1 = shared.m[smem_l_idx + (0)];
9559      ulong r0_2 = shared.m[smem_l_idx + (8)];
9560      ulong r0_3 = shared.m[smem_r_idx + (16)];
9561      ulong r0_4 = shared.m[smem_r_idx + (24)];
9562      if (r0_2 >= r0_3) {
9563        ulong const t = r0_2;
9564        r0_2 = r0_3;
9565        r0_3 = t;
9566      };
9567      if (r0_1 >= r0_4) {
9568        ulong const t = r0_1;
9569        r0_1 = r0_4;
9570        r0_4 = t;
9571      };
9572      if (r0_3 >= r0_4) {
9573        ulong const t = r0_3;
9574        r0_3 = r0_4;
9575        r0_4 = t;
9576      };
9577      if (r0_1 >= r0_2) {
9578        ulong const t = r0_1;
9579        r0_1 = r0_2;
9580        r0_2 = t;
9581      };
9582      shared.m[smem_l_idx + (0)] = r0_1;
9583      shared.m[smem_l_idx + (8)] = r0_2;
9584      shared.m[smem_r_idx + (16)] = r0_3;
9585      shared.m[smem_r_idx + (24)] = r0_4;
9586    }
9587    {
9588      ulong r1_1 = shared.m[smem_l_idx + (32)];
9589      ulong r1_2 = shared.m[smem_l_idx + (40)];
9590      ulong r1_3 = shared.m[smem_r_idx + (48)];
9591      ulong r1_4 = shared.m[smem_r_idx + (56)];
9592      if (r1_2 >= r1_3) {
9593        ulong const t = r1_2;
9594        r1_2 = r1_3;
9595        r1_3 = t;
9596      };
9597      if (r1_1 >= r1_4) {
9598        ulong const t = r1_1;
9599        r1_1 = r1_4;
9600        r1_4 = t;
9601      };
9602      if (r1_3 >= r1_4) {
9603        ulong const t = r1_3;
9604        r1_3 = r1_4;
9605        r1_4 = t;
9606      };
9607      if (r1_1 >= r1_2) {
9608        ulong const t = r1_1;
9609        r1_1 = r1_2;
9610        r1_2 = t;
9611      };
9612      shared.m[smem_l_idx + (32)] = r1_1;
9613      shared.m[smem_l_idx + (40)] = r1_2;
9614      shared.m[smem_r_idx + (48)] = r1_3;
9615      shared.m[smem_r_idx + (56)] = r1_4;
9616    }
9617    {
9618      ulong r2_1 = shared.m[smem_l_idx + (64)];
9619      ulong r2_2 = shared.m[smem_l_idx + (72)];
9620      ulong r2_3 = shared.m[smem_r_idx + (80)];
9621      ulong r2_4 = shared.m[smem_r_idx + (88)];
9622      if (r2_2 >= r2_3) {
9623        ulong const t = r2_2;
9624        r2_2 = r2_3;
9625        r2_3 = t;
9626      };
9627      if (r2_1 >= r2_4) {
9628        ulong const t = r2_1;
9629        r2_1 = r2_4;
9630        r2_4 = t;
9631      };
9632      if (r2_3 >= r2_4) {
9633        ulong const t = r2_3;
9634        r2_3 = r2_4;
9635        r2_4 = t;
9636      };
9637      if (r2_1 >= r2_2) {
9638        ulong const t = r2_1;
9639        r2_1 = r2_2;
9640        r2_2 = t;
9641      };
9642      shared.m[smem_l_idx + (64)] = r2_1;
9643      shared.m[smem_l_idx + (72)] = r2_2;
9644      shared.m[smem_r_idx + (80)] = r2_3;
9645      shared.m[smem_r_idx + (88)] = r2_4;
9646    }
9647    {
9648      ulong r3_1 = shared.m[smem_l_idx + (96)];
9649      ulong r3_2 = shared.m[smem_l_idx + (104)];
9650      ulong r3_3 = shared.m[smem_r_idx + (112)];
9651      ulong r3_4 = shared.m[smem_r_idx + (120)];
9652      if (r3_2 >= r3_3) {
9653        ulong const t = r3_2;
9654        r3_2 = r3_3;
9655        r3_3 = t;
9656      };
9657      if (r3_1 >= r3_4) {
9658        ulong const t = r3_1;
9659        r3_1 = r3_4;
9660        r3_4 = t;
9661      };
9662      if (r3_3 >= r3_4) {
9663        ulong const t = r3_3;
9664        r3_3 = r3_4;
9665        r3_4 = t;
9666      };
9667      if (r3_1 >= r3_2) {
9668        ulong const t = r3_1;
9669        r3_1 = r3_2;
9670        r3_2 = t;
9671      };
9672      shared.m[smem_l_idx + (96)] = r3_1;
9673      shared.m[smem_l_idx + (104)] = r3_2;
9674      shared.m[smem_r_idx + (112)] = r3_3;
9675      shared.m[smem_r_idx + (120)] = r3_4;
9676    }
9677  }
9678  barrier(CLK_LOCAL_MEM_FENCE);
9679  r1 = shared.m[get_local_id(0) + (16 * (1 << 3) * 0)];
9680  r16 = shared.m[get_local_id(0) + (16 * (1 << 3) * 1)];
9681  r2 = shared.m[get_local_id(0) + (16 * (1 << 3) * 2)];
9682  r15 = shared.m[get_local_id(0) + (16 * (1 << 3) * 3)];
9683  r3 = shared.m[get_local_id(0) + (16 * (1 << 3) * 4)];
9684  r14 = shared.m[get_local_id(0) + (16 * (1 << 3) * 5)];
9685  r4 = shared.m[get_local_id(0) + (16 * (1 << 3) * 6)];
9686  r13 = shared.m[get_local_id(0) + (16 * (1 << 3) * 7)];
9687  r5 = shared.m[get_local_id(0) + (16 * (1 << 3) * 8)];
9688  r12 = shared.m[get_local_id(0) + (16 * (1 << 3) * 9)];
9689  r6 = shared.m[get_local_id(0) + (16 * (1 << 3) * 10)];
9690  r11 = shared.m[get_local_id(0) + (16 * (1 << 3) * 11)];
9691  r7 = shared.m[get_local_id(0) + (16 * (1 << 3) * 12)];
9692  r10 = shared.m[get_local_id(0) + (16 * (1 << 3) * 13)];
9693  r8 = shared.m[get_local_id(0) + (16 * (1 << 3) * 14)];
9694  r9 = shared.m[get_local_id(0) + (16 * (1 << 3) * 15)];
9695  {
9696    {
9697      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
9698      int const t_lt = get_sub_group_local_id() < half_lane_idx;
9699      ;
9700      {
9701        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
9702        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
9703      };
9704      {
9705        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
9706        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
9707      };
9708      {
9709        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
9710        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
9711      };
9712      {
9713        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
9714        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
9715      };
9716      {
9717        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
9718        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
9719      };
9720      {
9721        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
9722        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
9723      };
9724      {
9725        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
9726        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
9727      };
9728      {
9729        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
9730        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
9731      };
9732      {
9733        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
9734        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
9735      };
9736      {
9737        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
9738        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
9739      };
9740      {
9741        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
9742        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
9743      };
9744      {
9745        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
9746        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
9747      };
9748      {
9749        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
9750        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
9751      };
9752      {
9753        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
9754        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
9755      };
9756      {
9757        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
9758        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
9759      };
9760      {
9761        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
9762        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
9763      };
9764    }
9765    {
9766      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
9767      int const t_lt = get_sub_group_local_id() < half_lane_idx;
9768      ;
9769      {
9770        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
9771        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
9772      };
9773      {
9774        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
9775        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
9776      };
9777      {
9778        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
9779        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
9780      };
9781      {
9782        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
9783        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
9784      };
9785      {
9786        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
9787        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
9788      };
9789      {
9790        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
9791        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
9792      };
9793      {
9794        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
9795        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
9796      };
9797      {
9798        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
9799        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
9800      };
9801      {
9802        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
9803        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
9804      };
9805      {
9806        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
9807        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
9808      };
9809      {
9810        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
9811        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
9812      };
9813      {
9814        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
9815        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
9816      };
9817      {
9818        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
9819        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
9820      };
9821      {
9822        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
9823        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
9824      };
9825      {
9826        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
9827        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
9828      };
9829      {
9830        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
9831        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
9832      };
9833    }
9834    {
9835      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
9836      int const t_lt = get_sub_group_local_id() < half_lane_idx;
9837      ;
9838      {
9839        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
9840        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
9841      };
9842      {
9843        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
9844        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
9845      };
9846      {
9847        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
9848        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
9849      };
9850      {
9851        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
9852        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
9853      };
9854      {
9855        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
9856        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
9857      };
9858      {
9859        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
9860        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
9861      };
9862      {
9863        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
9864        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
9865      };
9866      {
9867        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
9868        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
9869      };
9870      {
9871        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
9872        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
9873      };
9874      {
9875        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
9876        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
9877      };
9878      {
9879        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
9880        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
9881      };
9882      {
9883        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
9884        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
9885      };
9886      {
9887        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
9888        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
9889      };
9890      {
9891        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
9892        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
9893      };
9894      {
9895        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
9896        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
9897      };
9898      {
9899        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
9900        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
9901      };
9902    }
9903    if (r1 >= r9) {
9904      ulong const t = r1;
9905      r1 = r9;
9906      r9 = t;
9907    };
9908    if (r5 >= r13) {
9909      ulong const t = r5;
9910      r5 = r13;
9911      r13 = t;
9912    };
9913    if (r1 >= r5) {
9914      ulong const t = r1;
9915      r1 = r5;
9916      r5 = t;
9917    };
9918    if (r9 >= r13) {
9919      ulong const t = r9;
9920      r9 = r13;
9921      r13 = t;
9922    };
9923    if (r3 >= r11) {
9924      ulong const t = r3;
9925      r3 = r11;
9926      r11 = t;
9927    };
9928    if (r7 >= r15) {
9929      ulong const t = r7;
9930      r7 = r15;
9931      r15 = t;
9932    };
9933    if (r3 >= r7) {
9934      ulong const t = r3;
9935      r3 = r7;
9936      r7 = t;
9937    };
9938    if (r11 >= r15) {
9939      ulong const t = r11;
9940      r11 = r15;
9941      r15 = t;
9942    };
9943    if (r1 >= r3) {
9944      ulong const t = r1;
9945      r1 = r3;
9946      r3 = t;
9947    };
9948    if (r5 >= r7) {
9949      ulong const t = r5;
9950      r5 = r7;
9951      r7 = t;
9952    };
9953    if (r9 >= r11) {
9954      ulong const t = r9;
9955      r9 = r11;
9956      r11 = t;
9957    };
9958    if (r13 >= r15) {
9959      ulong const t = r13;
9960      r13 = r15;
9961      r15 = t;
9962    };
9963    if (r2 >= r10) {
9964      ulong const t = r2;
9965      r2 = r10;
9966      r10 = t;
9967    };
9968    if (r6 >= r14) {
9969      ulong const t = r6;
9970      r6 = r14;
9971      r14 = t;
9972    };
9973    if (r2 >= r6) {
9974      ulong const t = r2;
9975      r2 = r6;
9976      r6 = t;
9977    };
9978    if (r10 >= r14) {
9979      ulong const t = r10;
9980      r10 = r14;
9981      r14 = t;
9982    };
9983    if (r4 >= r12) {
9984      ulong const t = r4;
9985      r4 = r12;
9986      r12 = t;
9987    };
9988    if (r8 >= r16) {
9989      ulong const t = r8;
9990      r8 = r16;
9991      r16 = t;
9992    };
9993    if (r4 >= r8) {
9994      ulong const t = r4;
9995      r4 = r8;
9996      r8 = t;
9997    };
9998    if (r12 >= r16) {
9999      ulong const t = r12;
10000      r12 = r16;
10001      r16 = t;
10002    };
10003    if (r2 >= r4) {
10004      ulong const t = r2;
10005      r2 = r4;
10006      r4 = t;
10007    };
10008    if (r6 >= r8) {
10009      ulong const t = r6;
10010      r6 = r8;
10011      r8 = t;
10012    };
10013    if (r10 >= r12) {
10014      ulong const t = r10;
10015      r10 = r12;
10016      r12 = t;
10017    };
10018    if (r14 >= r16) {
10019      ulong const t = r14;
10020      r14 = r16;
10021      r16 = t;
10022    };
10023    if (r1 >= r2) {
10024      ulong const t = r1;
10025      r1 = r2;
10026      r2 = t;
10027    };
10028    if (r3 >= r4) {
10029      ulong const t = r3;
10030      r3 = r4;
10031      r4 = t;
10032    };
10033    if (r5 >= r6) {
10034      ulong const t = r5;
10035      r5 = r6;
10036      r6 = t;
10037    };
10038    if (r7 >= r8) {
10039      ulong const t = r7;
10040      r7 = r8;
10041      r8 = t;
10042    };
10043    if (r9 >= r10) {
10044      ulong const t = r9;
10045      r9 = r10;
10046      r10 = t;
10047    };
10048    if (r11 >= r12) {
10049      ulong const t = r11;
10050      r11 = r12;
10051      r12 = t;
10052    };
10053    if (r13 >= r14) {
10054      ulong const t = r13;
10055      r13 = r14;
10056      r14 = t;
10057    };
10058    if (r15 >= r16) {
10059      ulong const t = r15;
10060      r15 = r16;
10061      r16 = t;
10062    };
10063  }
10064  shared.m[get_local_id(0) + (16 * (1 << 3) * 0)] = r1;
10065  shared.m[get_local_id(0) + (16 * (1 << 3) * 1)] = r16;
10066  shared.m[get_local_id(0) + (16 * (1 << 3) * 2)] = r2;
10067  shared.m[get_local_id(0) + (16 * (1 << 3) * 3)] = r15;
10068  shared.m[get_local_id(0) + (16 * (1 << 3) * 4)] = r3;
10069  shared.m[get_local_id(0) + (16 * (1 << 3) * 5)] = r14;
10070  shared.m[get_local_id(0) + (16 * (1 << 3) * 6)] = r4;
10071  shared.m[get_local_id(0) + (16 * (1 << 3) * 7)] = r13;
10072  shared.m[get_local_id(0) + (16 * (1 << 3) * 8)] = r5;
10073  shared.m[get_local_id(0) + (16 * (1 << 3) * 9)] = r12;
10074  shared.m[get_local_id(0) + (16 * (1 << 3) * 10)] = r6;
10075  shared.m[get_local_id(0) + (16 * (1 << 3) * 11)] = r11;
10076  shared.m[get_local_id(0) + (16 * (1 << 3) * 12)] = r7;
10077  shared.m[get_local_id(0) + (16 * (1 << 3) * 13)] = r10;
10078  shared.m[get_local_id(0) + (16 * (1 << 3) * 14)] = r8;
10079  shared.m[get_local_id(0) + (16 * (1 << 3) * 15)] = r9;
10080  barrier(CLK_LOCAL_MEM_FENCE);
10081  {
10082    {
10083      ulong r0_1 = shared.m[smem_l_idx + (0)];
10084      ulong r0_2 = shared.m[smem_l_idx + (8)];
10085      ulong r0_3 = shared.m[smem_l_idx + (16)];
10086      ulong r0_4 = shared.m[smem_l_idx + (24)];
10087      ulong r0_5 = shared.m[smem_r_idx + (32)];
10088      ulong r0_6 = shared.m[smem_r_idx + (40)];
10089      ulong r0_7 = shared.m[smem_r_idx + (48)];
10090      ulong r0_8 = shared.m[smem_r_idx + (56)];
10091      if (r0_4 >= r0_5) {
10092        ulong const t = r0_4;
10093        r0_4 = r0_5;
10094        r0_5 = t;
10095      };
10096      if (r0_3 >= r0_6) {
10097        ulong const t = r0_3;
10098        r0_3 = r0_6;
10099        r0_6 = t;
10100      };
10101      if (r0_2 >= r0_7) {
10102        ulong const t = r0_2;
10103        r0_2 = r0_7;
10104        r0_7 = t;
10105      };
10106      if (r0_1 >= r0_8) {
10107        ulong const t = r0_1;
10108        r0_1 = r0_8;
10109        r0_8 = t;
10110      };
10111      if (r0_5 >= r0_7) {
10112        ulong const t = r0_5;
10113        r0_5 = r0_7;
10114        r0_7 = t;
10115      };
10116      if (r0_6 >= r0_8) {
10117        ulong const t = r0_6;
10118        r0_6 = r0_8;
10119        r0_8 = t;
10120      };
10121      if (r0_5 >= r0_6) {
10122        ulong const t = r0_5;
10123        r0_5 = r0_6;
10124        r0_6 = t;
10125      };
10126      if (r0_7 >= r0_8) {
10127        ulong const t = r0_7;
10128        r0_7 = r0_8;
10129        r0_8 = t;
10130      };
10131      if (r0_1 >= r0_3) {
10132        ulong const t = r0_1;
10133        r0_1 = r0_3;
10134        r0_3 = t;
10135      };
10136      if (r0_2 >= r0_4) {
10137        ulong const t = r0_2;
10138        r0_2 = r0_4;
10139        r0_4 = t;
10140      };
10141      if (r0_1 >= r0_2) {
10142        ulong const t = r0_1;
10143        r0_1 = r0_2;
10144        r0_2 = t;
10145      };
10146      if (r0_3 >= r0_4) {
10147        ulong const t = r0_3;
10148        r0_3 = r0_4;
10149        r0_4 = t;
10150      };
10151      shared.m[smem_l_idx + (0)] = r0_1;
10152      shared.m[smem_l_idx + (8)] = r0_2;
10153      shared.m[smem_l_idx + (16)] = r0_3;
10154      shared.m[smem_l_idx + (24)] = r0_4;
10155      shared.m[smem_r_idx + (32)] = r0_5;
10156      shared.m[smem_r_idx + (40)] = r0_6;
10157      shared.m[smem_r_idx + (48)] = r0_7;
10158      shared.m[smem_r_idx + (56)] = r0_8;
10159    }
10160    {
10161      ulong r1_1 = shared.m[smem_l_idx + (64)];
10162      ulong r1_2 = shared.m[smem_l_idx + (72)];
10163      ulong r1_3 = shared.m[smem_l_idx + (80)];
10164      ulong r1_4 = shared.m[smem_l_idx + (88)];
10165      ulong r1_5 = shared.m[smem_r_idx + (96)];
10166      ulong r1_6 = shared.m[smem_r_idx + (104)];
10167      ulong r1_7 = shared.m[smem_r_idx + (112)];
10168      ulong r1_8 = shared.m[smem_r_idx + (120)];
10169      if (r1_4 >= r1_5) {
10170        ulong const t = r1_4;
10171        r1_4 = r1_5;
10172        r1_5 = t;
10173      };
10174      if (r1_3 >= r1_6) {
10175        ulong const t = r1_3;
10176        r1_3 = r1_6;
10177        r1_6 = t;
10178      };
10179      if (r1_2 >= r1_7) {
10180        ulong const t = r1_2;
10181        r1_2 = r1_7;
10182        r1_7 = t;
10183      };
10184      if (r1_1 >= r1_8) {
10185        ulong const t = r1_1;
10186        r1_1 = r1_8;
10187        r1_8 = t;
10188      };
10189      if (r1_5 >= r1_7) {
10190        ulong const t = r1_5;
10191        r1_5 = r1_7;
10192        r1_7 = t;
10193      };
10194      if (r1_6 >= r1_8) {
10195        ulong const t = r1_6;
10196        r1_6 = r1_8;
10197        r1_8 = t;
10198      };
10199      if (r1_5 >= r1_6) {
10200        ulong const t = r1_5;
10201        r1_5 = r1_6;
10202        r1_6 = t;
10203      };
10204      if (r1_7 >= r1_8) {
10205        ulong const t = r1_7;
10206        r1_7 = r1_8;
10207        r1_8 = t;
10208      };
10209      if (r1_1 >= r1_3) {
10210        ulong const t = r1_1;
10211        r1_1 = r1_3;
10212        r1_3 = t;
10213      };
10214      if (r1_2 >= r1_4) {
10215        ulong const t = r1_2;
10216        r1_2 = r1_4;
10217        r1_4 = t;
10218      };
10219      if (r1_1 >= r1_2) {
10220        ulong const t = r1_1;
10221        r1_1 = r1_2;
10222        r1_2 = t;
10223      };
10224      if (r1_3 >= r1_4) {
10225        ulong const t = r1_3;
10226        r1_3 = r1_4;
10227        r1_4 = t;
10228      };
10229      shared.m[smem_l_idx + (64)] = r1_1;
10230      shared.m[smem_l_idx + (72)] = r1_2;
10231      shared.m[smem_l_idx + (80)] = r1_3;
10232      shared.m[smem_l_idx + (88)] = r1_4;
10233      shared.m[smem_r_idx + (96)] = r1_5;
10234      shared.m[smem_r_idx + (104)] = r1_6;
10235      shared.m[smem_r_idx + (112)] = r1_7;
10236      shared.m[smem_r_idx + (120)] = r1_8;
10237    }
10238  }
10239  barrier(CLK_LOCAL_MEM_FENCE);
10240  r1 = shared.m[get_local_id(0) + (16 * (1 << 3) * 0)];
10241  r16 = shared.m[get_local_id(0) + (16 * (1 << 3) * 1)];
10242  r2 = shared.m[get_local_id(0) + (16 * (1 << 3) * 2)];
10243  r15 = shared.m[get_local_id(0) + (16 * (1 << 3) * 3)];
10244  r3 = shared.m[get_local_id(0) + (16 * (1 << 3) * 4)];
10245  r14 = shared.m[get_local_id(0) + (16 * (1 << 3) * 5)];
10246  r4 = shared.m[get_local_id(0) + (16 * (1 << 3) * 6)];
10247  r13 = shared.m[get_local_id(0) + (16 * (1 << 3) * 7)];
10248  r5 = shared.m[get_local_id(0) + (16 * (1 << 3) * 8)];
10249  r12 = shared.m[get_local_id(0) + (16 * (1 << 3) * 9)];
10250  r6 = shared.m[get_local_id(0) + (16 * (1 << 3) * 10)];
10251  r11 = shared.m[get_local_id(0) + (16 * (1 << 3) * 11)];
10252  r7 = shared.m[get_local_id(0) + (16 * (1 << 3) * 12)];
10253  r10 = shared.m[get_local_id(0) + (16 * (1 << 3) * 13)];
10254  r8 = shared.m[get_local_id(0) + (16 * (1 << 3) * 14)];
10255  r9 = shared.m[get_local_id(0) + (16 * (1 << 3) * 15)];
10256  {
10257    {
10258      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
10259      int const t_lt = get_sub_group_local_id() < half_lane_idx;
10260      ;
10261      {
10262        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
10263        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
10264      };
10265      {
10266        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
10267        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
10268      };
10269      {
10270        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
10271        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
10272      };
10273      {
10274        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
10275        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
10276      };
10277      {
10278        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
10279        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
10280      };
10281      {
10282        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
10283        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
10284      };
10285      {
10286        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
10287        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
10288      };
10289      {
10290        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
10291        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
10292      };
10293      {
10294        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
10295        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
10296      };
10297      {
10298        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
10299        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
10300      };
10301      {
10302        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
10303        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
10304      };
10305      {
10306        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
10307        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
10308      };
10309      {
10310        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
10311        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
10312      };
10313      {
10314        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
10315        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
10316      };
10317      {
10318        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
10319        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
10320      };
10321      {
10322        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
10323        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
10324      };
10325    }
10326    {
10327      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
10328      int const t_lt = get_sub_group_local_id() < half_lane_idx;
10329      ;
10330      {
10331        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
10332        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
10333      };
10334      {
10335        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
10336        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
10337      };
10338      {
10339        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
10340        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
10341      };
10342      {
10343        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
10344        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
10345      };
10346      {
10347        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
10348        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
10349      };
10350      {
10351        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
10352        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
10353      };
10354      {
10355        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
10356        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
10357      };
10358      {
10359        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
10360        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
10361      };
10362      {
10363        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
10364        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
10365      };
10366      {
10367        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
10368        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
10369      };
10370      {
10371        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
10372        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
10373      };
10374      {
10375        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
10376        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
10377      };
10378      {
10379        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
10380        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
10381      };
10382      {
10383        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
10384        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
10385      };
10386      {
10387        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
10388        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
10389      };
10390      {
10391        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
10392        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
10393      };
10394    }
10395    {
10396      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
10397      int const t_lt = get_sub_group_local_id() < half_lane_idx;
10398      ;
10399      {
10400        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
10401        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
10402      };
10403      {
10404        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
10405        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
10406      };
10407      {
10408        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
10409        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
10410      };
10411      {
10412        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
10413        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
10414      };
10415      {
10416        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
10417        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
10418      };
10419      {
10420        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
10421        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
10422      };
10423      {
10424        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
10425        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
10426      };
10427      {
10428        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
10429        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
10430      };
10431      {
10432        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
10433        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
10434      };
10435      {
10436        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
10437        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
10438      };
10439      {
10440        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
10441        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
10442      };
10443      {
10444        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
10445        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
10446      };
10447      {
10448        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
10449        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
10450      };
10451      {
10452        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
10453        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
10454      };
10455      {
10456        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
10457        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
10458      };
10459      {
10460        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
10461        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
10462      };
10463    }
10464    if (r1 >= r9) {
10465      ulong const t = r1;
10466      r1 = r9;
10467      r9 = t;
10468    };
10469    if (r5 >= r13) {
10470      ulong const t = r5;
10471      r5 = r13;
10472      r13 = t;
10473    };
10474    if (r1 >= r5) {
10475      ulong const t = r1;
10476      r1 = r5;
10477      r5 = t;
10478    };
10479    if (r9 >= r13) {
10480      ulong const t = r9;
10481      r9 = r13;
10482      r13 = t;
10483    };
10484    if (r3 >= r11) {
10485      ulong const t = r3;
10486      r3 = r11;
10487      r11 = t;
10488    };
10489    if (r7 >= r15) {
10490      ulong const t = r7;
10491      r7 = r15;
10492      r15 = t;
10493    };
10494    if (r3 >= r7) {
10495      ulong const t = r3;
10496      r3 = r7;
10497      r7 = t;
10498    };
10499    if (r11 >= r15) {
10500      ulong const t = r11;
10501      r11 = r15;
10502      r15 = t;
10503    };
10504    if (r1 >= r3) {
10505      ulong const t = r1;
10506      r1 = r3;
10507      r3 = t;
10508    };
10509    if (r5 >= r7) {
10510      ulong const t = r5;
10511      r5 = r7;
10512      r7 = t;
10513    };
10514    if (r9 >= r11) {
10515      ulong const t = r9;
10516      r9 = r11;
10517      r11 = t;
10518    };
10519    if (r13 >= r15) {
10520      ulong const t = r13;
10521      r13 = r15;
10522      r15 = t;
10523    };
10524    if (r2 >= r10) {
10525      ulong const t = r2;
10526      r2 = r10;
10527      r10 = t;
10528    };
10529    if (r6 >= r14) {
10530      ulong const t = r6;
10531      r6 = r14;
10532      r14 = t;
10533    };
10534    if (r2 >= r6) {
10535      ulong const t = r2;
10536      r2 = r6;
10537      r6 = t;
10538    };
10539    if (r10 >= r14) {
10540      ulong const t = r10;
10541      r10 = r14;
10542      r14 = t;
10543    };
10544    if (r4 >= r12) {
10545      ulong const t = r4;
10546      r4 = r12;
10547      r12 = t;
10548    };
10549    if (r8 >= r16) {
10550      ulong const t = r8;
10551      r8 = r16;
10552      r16 = t;
10553    };
10554    if (r4 >= r8) {
10555      ulong const t = r4;
10556      r4 = r8;
10557      r8 = t;
10558    };
10559    if (r12 >= r16) {
10560      ulong const t = r12;
10561      r12 = r16;
10562      r16 = t;
10563    };
10564    if (r2 >= r4) {
10565      ulong const t = r2;
10566      r2 = r4;
10567      r4 = t;
10568    };
10569    if (r6 >= r8) {
10570      ulong const t = r6;
10571      r6 = r8;
10572      r8 = t;
10573    };
10574    if (r10 >= r12) {
10575      ulong const t = r10;
10576      r10 = r12;
10577      r12 = t;
10578    };
10579    if (r14 >= r16) {
10580      ulong const t = r14;
10581      r14 = r16;
10582      r16 = t;
10583    };
10584    if (r1 >= r2) {
10585      ulong const t = r1;
10586      r1 = r2;
10587      r2 = t;
10588    };
10589    if (r3 >= r4) {
10590      ulong const t = r3;
10591      r3 = r4;
10592      r4 = t;
10593    };
10594    if (r5 >= r6) {
10595      ulong const t = r5;
10596      r5 = r6;
10597      r6 = t;
10598    };
10599    if (r7 >= r8) {
10600      ulong const t = r7;
10601      r7 = r8;
10602      r8 = t;
10603    };
10604    if (r9 >= r10) {
10605      ulong const t = r9;
10606      r9 = r10;
10607      r10 = t;
10608    };
10609    if (r11 >= r12) {
10610      ulong const t = r11;
10611      r11 = r12;
10612      r12 = t;
10613    };
10614    if (r13 >= r14) {
10615      ulong const t = r13;
10616      r13 = r14;
10617      r14 = t;
10618    };
10619    if (r15 >= r16) {
10620      ulong const t = r15;
10621      r15 = r16;
10622      r16 = t;
10623    };
10624  }
10625  shared.m[get_local_id(0) + (16 * (1 << 3) * 0)] = r1;
10626  shared.m[get_local_id(0) + (16 * (1 << 3) * 1)] = r16;
10627  shared.m[get_local_id(0) + (16 * (1 << 3) * 2)] = r2;
10628  shared.m[get_local_id(0) + (16 * (1 << 3) * 3)] = r15;
10629  shared.m[get_local_id(0) + (16 * (1 << 3) * 4)] = r3;
10630  shared.m[get_local_id(0) + (16 * (1 << 3) * 5)] = r14;
10631  shared.m[get_local_id(0) + (16 * (1 << 3) * 6)] = r4;
10632  shared.m[get_local_id(0) + (16 * (1 << 3) * 7)] = r13;
10633  shared.m[get_local_id(0) + (16 * (1 << 3) * 8)] = r5;
10634  shared.m[get_local_id(0) + (16 * (1 << 3) * 9)] = r12;
10635  shared.m[get_local_id(0) + (16 * (1 << 3) * 10)] = r6;
10636  shared.m[get_local_id(0) + (16 * (1 << 3) * 11)] = r11;
10637  shared.m[get_local_id(0) + (16 * (1 << 3) * 12)] = r7;
10638  shared.m[get_local_id(0) + (16 * (1 << 3) * 13)] = r10;
10639  shared.m[get_local_id(0) + (16 * (1 << 3) * 14)] = r8;
10640  shared.m[get_local_id(0) + (16 * (1 << 3) * 15)] = r9;
10641  barrier(CLK_LOCAL_MEM_FENCE);
10642  {
10643    {
10644      ulong r0_1 = shared.m[smem_l_idx + (0)];
10645      ulong r0_2 = shared.m[smem_l_idx + (8)];
10646      ulong r0_3 = shared.m[smem_l_idx + (16)];
10647      ulong r0_4 = shared.m[smem_l_idx + (24)];
10648      ulong r0_5 = shared.m[smem_l_idx + (32)];
10649      ulong r0_6 = shared.m[smem_l_idx + (40)];
10650      ulong r0_7 = shared.m[smem_l_idx + (48)];
10651      ulong r0_8 = shared.m[smem_l_idx + (56)];
10652      ulong r0_9 = shared.m[smem_r_idx + (64)];
10653      ulong r0_10 = shared.m[smem_r_idx + (72)];
10654      ulong r0_11 = shared.m[smem_r_idx + (80)];
10655      ulong r0_12 = shared.m[smem_r_idx + (88)];
10656      ulong r0_13 = shared.m[smem_r_idx + (96)];
10657      ulong r0_14 = shared.m[smem_r_idx + (104)];
10658      ulong r0_15 = shared.m[smem_r_idx + (112)];
10659      ulong r0_16 = shared.m[smem_r_idx + (120)];
10660      if (r0_8 >= r0_9) {
10661        ulong const t = r0_8;
10662        r0_8 = r0_9;
10663        r0_9 = t;
10664      };
10665      if (r0_7 >= r0_10) {
10666        ulong const t = r0_7;
10667        r0_7 = r0_10;
10668        r0_10 = t;
10669      };
10670      if (r0_6 >= r0_11) {
10671        ulong const t = r0_6;
10672        r0_6 = r0_11;
10673        r0_11 = t;
10674      };
10675      if (r0_5 >= r0_12) {
10676        ulong const t = r0_5;
10677        r0_5 = r0_12;
10678        r0_12 = t;
10679      };
10680      if (r0_4 >= r0_13) {
10681        ulong const t = r0_4;
10682        r0_4 = r0_13;
10683        r0_13 = t;
10684      };
10685      if (r0_3 >= r0_14) {
10686        ulong const t = r0_3;
10687        r0_3 = r0_14;
10688        r0_14 = t;
10689      };
10690      if (r0_2 >= r0_15) {
10691        ulong const t = r0_2;
10692        r0_2 = r0_15;
10693        r0_15 = t;
10694      };
10695      if (r0_1 >= r0_16) {
10696        ulong const t = r0_1;
10697        r0_1 = r0_16;
10698        r0_16 = t;
10699      };
10700      if (r0_9 >= r0_13) {
10701        ulong const t = r0_9;
10702        r0_9 = r0_13;
10703        r0_13 = t;
10704      };
10705      if (r0_11 >= r0_15) {
10706        ulong const t = r0_11;
10707        r0_11 = r0_15;
10708        r0_15 = t;
10709      };
10710      if (r0_9 >= r0_11) {
10711        ulong const t = r0_9;
10712        r0_9 = r0_11;
10713        r0_11 = t;
10714      };
10715      if (r0_13 >= r0_15) {
10716        ulong const t = r0_13;
10717        r0_13 = r0_15;
10718        r0_15 = t;
10719      };
10720      if (r0_10 >= r0_14) {
10721        ulong const t = r0_10;
10722        r0_10 = r0_14;
10723        r0_14 = t;
10724      };
10725      if (r0_12 >= r0_16) {
10726        ulong const t = r0_12;
10727        r0_12 = r0_16;
10728        r0_16 = t;
10729      };
10730      if (r0_10 >= r0_12) {
10731        ulong const t = r0_10;
10732        r0_10 = r0_12;
10733        r0_12 = t;
10734      };
10735      if (r0_14 >= r0_16) {
10736        ulong const t = r0_14;
10737        r0_14 = r0_16;
10738        r0_16 = t;
10739      };
10740      if (r0_9 >= r0_10) {
10741        ulong const t = r0_9;
10742        r0_9 = r0_10;
10743        r0_10 = t;
10744      };
10745      if (r0_11 >= r0_12) {
10746        ulong const t = r0_11;
10747        r0_11 = r0_12;
10748        r0_12 = t;
10749      };
10750      if (r0_13 >= r0_14) {
10751        ulong const t = r0_13;
10752        r0_13 = r0_14;
10753        r0_14 = t;
10754      };
10755      if (r0_15 >= r0_16) {
10756        ulong const t = r0_15;
10757        r0_15 = r0_16;
10758        r0_16 = t;
10759      };
10760      if (r0_1 >= r0_5) {
10761        ulong const t = r0_1;
10762        r0_1 = r0_5;
10763        r0_5 = t;
10764      };
10765      if (r0_3 >= r0_7) {
10766        ulong const t = r0_3;
10767        r0_3 = r0_7;
10768        r0_7 = t;
10769      };
10770      if (r0_1 >= r0_3) {
10771        ulong const t = r0_1;
10772        r0_1 = r0_3;
10773        r0_3 = t;
10774      };
10775      if (r0_5 >= r0_7) {
10776        ulong const t = r0_5;
10777        r0_5 = r0_7;
10778        r0_7 = t;
10779      };
10780      if (r0_2 >= r0_6) {
10781        ulong const t = r0_2;
10782        r0_2 = r0_6;
10783        r0_6 = t;
10784      };
10785      if (r0_4 >= r0_8) {
10786        ulong const t = r0_4;
10787        r0_4 = r0_8;
10788        r0_8 = t;
10789      };
10790      if (r0_2 >= r0_4) {
10791        ulong const t = r0_2;
10792        r0_2 = r0_4;
10793        r0_4 = t;
10794      };
10795      if (r0_6 >= r0_8) {
10796        ulong const t = r0_6;
10797        r0_6 = r0_8;
10798        r0_8 = t;
10799      };
10800      if (r0_1 >= r0_2) {
10801        ulong const t = r0_1;
10802        r0_1 = r0_2;
10803        r0_2 = t;
10804      };
10805      if (r0_3 >= r0_4) {
10806        ulong const t = r0_3;
10807        r0_3 = r0_4;
10808        r0_4 = t;
10809      };
10810      if (r0_5 >= r0_6) {
10811        ulong const t = r0_5;
10812        r0_5 = r0_6;
10813        r0_6 = t;
10814      };
10815      if (r0_7 >= r0_8) {
10816        ulong const t = r0_7;
10817        r0_7 = r0_8;
10818        r0_8 = t;
10819      };
10820      shared.m[smem_l_idx + (0)] = r0_1;
10821      shared.m[smem_l_idx + (8)] = r0_2;
10822      shared.m[smem_l_idx + (16)] = r0_3;
10823      shared.m[smem_l_idx + (24)] = r0_4;
10824      shared.m[smem_l_idx + (32)] = r0_5;
10825      shared.m[smem_l_idx + (40)] = r0_6;
10826      shared.m[smem_l_idx + (48)] = r0_7;
10827      shared.m[smem_l_idx + (56)] = r0_8;
10828      shared.m[smem_r_idx + (64)] = r0_9;
10829      shared.m[smem_r_idx + (72)] = r0_10;
10830      shared.m[smem_r_idx + (80)] = r0_11;
10831      shared.m[smem_r_idx + (88)] = r0_12;
10832      shared.m[smem_r_idx + (96)] = r0_13;
10833      shared.m[smem_r_idx + (104)] = r0_14;
10834      shared.m[smem_r_idx + (112)] = r0_15;
10835      shared.m[smem_r_idx + (120)] = r0_16;
10836    }
10837  }
10838  barrier(CLK_LOCAL_MEM_FENCE);
10839  r1 = shared.m[get_local_id(0) + (16 * (1 << 3) * 0)];
10840  r16 = shared.m[get_local_id(0) + (16 * (1 << 3) * 1)];
10841  r2 = shared.m[get_local_id(0) + (16 * (1 << 3) * 2)];
10842  r15 = shared.m[get_local_id(0) + (16 * (1 << 3) * 3)];
10843  r3 = shared.m[get_local_id(0) + (16 * (1 << 3) * 4)];
10844  r14 = shared.m[get_local_id(0) + (16 * (1 << 3) * 5)];
10845  r4 = shared.m[get_local_id(0) + (16 * (1 << 3) * 6)];
10846  r13 = shared.m[get_local_id(0) + (16 * (1 << 3) * 7)];
10847  r5 = shared.m[get_local_id(0) + (16 * (1 << 3) * 8)];
10848  r12 = shared.m[get_local_id(0) + (16 * (1 << 3) * 9)];
10849  r6 = shared.m[get_local_id(0) + (16 * (1 << 3) * 10)];
10850  r11 = shared.m[get_local_id(0) + (16 * (1 << 3) * 11)];
10851  r7 = shared.m[get_local_id(0) + (16 * (1 << 3) * 12)];
10852  r10 = shared.m[get_local_id(0) + (16 * (1 << 3) * 13)];
10853  r8 = shared.m[get_local_id(0) + (16 * (1 << 3) * 14)];
10854  r9 = shared.m[get_local_id(0) + (16 * (1 << 3) * 15)];
10855  {
10856    {
10857      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
10858      int const t_lt = get_sub_group_local_id() < half_lane_idx;
10859      ;
10860      {
10861        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
10862        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
10863      };
10864      {
10865        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
10866        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
10867      };
10868      {
10869        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
10870        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
10871      };
10872      {
10873        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
10874        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
10875      };
10876      {
10877        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
10878        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
10879      };
10880      {
10881        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
10882        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
10883      };
10884      {
10885        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
10886        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
10887      };
10888      {
10889        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
10890        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
10891      };
10892      {
10893        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
10894        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
10895      };
10896      {
10897        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
10898        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
10899      };
10900      {
10901        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
10902        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
10903      };
10904      {
10905        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
10906        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
10907      };
10908      {
10909        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
10910        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
10911      };
10912      {
10913        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
10914        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
10915      };
10916      {
10917        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
10918        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
10919      };
10920      {
10921        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
10922        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
10923      };
10924    }
10925    {
10926      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
10927      int const t_lt = get_sub_group_local_id() < half_lane_idx;
10928      ;
10929      {
10930        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
10931        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
10932      };
10933      {
10934        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
10935        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
10936      };
10937      {
10938        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
10939        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
10940      };
10941      {
10942        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
10943        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
10944      };
10945      {
10946        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
10947        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
10948      };
10949      {
10950        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
10951        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
10952      };
10953      {
10954        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
10955        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
10956      };
10957      {
10958        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
10959        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
10960      };
10961      {
10962        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
10963        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
10964      };
10965      {
10966        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
10967        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
10968      };
10969      {
10970        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
10971        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
10972      };
10973      {
10974        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
10975        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
10976      };
10977      {
10978        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
10979        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
10980      };
10981      {
10982        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
10983        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
10984      };
10985      {
10986        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
10987        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
10988      };
10989      {
10990        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
10991        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
10992      };
10993    }
10994    {
10995      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
10996      int const t_lt = get_sub_group_local_id() < half_lane_idx;
10997      ;
10998      {
10999        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
11000        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
11001      };
11002      {
11003        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
11004        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
11005      };
11006      {
11007        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
11008        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
11009      };
11010      {
11011        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
11012        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
11013      };
11014      {
11015        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
11016        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
11017      };
11018      {
11019        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
11020        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
11021      };
11022      {
11023        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
11024        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
11025      };
11026      {
11027        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
11028        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
11029      };
11030      {
11031        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
11032        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
11033      };
11034      {
11035        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
11036        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
11037      };
11038      {
11039        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
11040        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
11041      };
11042      {
11043        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
11044        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
11045      };
11046      {
11047        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
11048        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
11049      };
11050      {
11051        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
11052        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
11053      };
11054      {
11055        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
11056        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
11057      };
11058      {
11059        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
11060        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
11061      };
11062    }
11063    if (r1 >= r9) {
11064      ulong const t = r1;
11065      r1 = r9;
11066      r9 = t;
11067    };
11068    if (r5 >= r13) {
11069      ulong const t = r5;
11070      r5 = r13;
11071      r13 = t;
11072    };
11073    if (r1 >= r5) {
11074      ulong const t = r1;
11075      r1 = r5;
11076      r5 = t;
11077    };
11078    if (r9 >= r13) {
11079      ulong const t = r9;
11080      r9 = r13;
11081      r13 = t;
11082    };
11083    if (r3 >= r11) {
11084      ulong const t = r3;
11085      r3 = r11;
11086      r11 = t;
11087    };
11088    if (r7 >= r15) {
11089      ulong const t = r7;
11090      r7 = r15;
11091      r15 = t;
11092    };
11093    if (r3 >= r7) {
11094      ulong const t = r3;
11095      r3 = r7;
11096      r7 = t;
11097    };
11098    if (r11 >= r15) {
11099      ulong const t = r11;
11100      r11 = r15;
11101      r15 = t;
11102    };
11103    if (r1 >= r3) {
11104      ulong const t = r1;
11105      r1 = r3;
11106      r3 = t;
11107    };
11108    if (r5 >= r7) {
11109      ulong const t = r5;
11110      r5 = r7;
11111      r7 = t;
11112    };
11113    if (r9 >= r11) {
11114      ulong const t = r9;
11115      r9 = r11;
11116      r11 = t;
11117    };
11118    if (r13 >= r15) {
11119      ulong const t = r13;
11120      r13 = r15;
11121      r15 = t;
11122    };
11123    if (r2 >= r10) {
11124      ulong const t = r2;
11125      r2 = r10;
11126      r10 = t;
11127    };
11128    if (r6 >= r14) {
11129      ulong const t = r6;
11130      r6 = r14;
11131      r14 = t;
11132    };
11133    if (r2 >= r6) {
11134      ulong const t = r2;
11135      r2 = r6;
11136      r6 = t;
11137    };
11138    if (r10 >= r14) {
11139      ulong const t = r10;
11140      r10 = r14;
11141      r14 = t;
11142    };
11143    if (r4 >= r12) {
11144      ulong const t = r4;
11145      r4 = r12;
11146      r12 = t;
11147    };
11148    if (r8 >= r16) {
11149      ulong const t = r8;
11150      r8 = r16;
11151      r16 = t;
11152    };
11153    if (r4 >= r8) {
11154      ulong const t = r4;
11155      r4 = r8;
11156      r8 = t;
11157    };
11158    if (r12 >= r16) {
11159      ulong const t = r12;
11160      r12 = r16;
11161      r16 = t;
11162    };
11163    if (r2 >= r4) {
11164      ulong const t = r2;
11165      r2 = r4;
11166      r4 = t;
11167    };
11168    if (r6 >= r8) {
11169      ulong const t = r6;
11170      r6 = r8;
11171      r8 = t;
11172    };
11173    if (r10 >= r12) {
11174      ulong const t = r10;
11175      r10 = r12;
11176      r12 = t;
11177    };
11178    if (r14 >= r16) {
11179      ulong const t = r14;
11180      r14 = r16;
11181      r16 = t;
11182    };
11183    if (r1 >= r2) {
11184      ulong const t = r1;
11185      r1 = r2;
11186      r2 = t;
11187    };
11188    if (r3 >= r4) {
11189      ulong const t = r3;
11190      r3 = r4;
11191      r4 = t;
11192    };
11193    if (r5 >= r6) {
11194      ulong const t = r5;
11195      r5 = r6;
11196      r6 = t;
11197    };
11198    if (r7 >= r8) {
11199      ulong const t = r7;
11200      r7 = r8;
11201      r8 = t;
11202    };
11203    if (r9 >= r10) {
11204      ulong const t = r9;
11205      r9 = r10;
11206      r10 = t;
11207    };
11208    if (r11 >= r12) {
11209      ulong const t = r11;
11210      r11 = r12;
11211      r12 = t;
11212    };
11213    if (r13 >= r14) {
11214      ulong const t = r13;
11215      r13 = r14;
11216      r14 = t;
11217    };
11218    if (r15 >= r16) {
11219      ulong const t = r15;
11220      r15 = r16;
11221      r16 = t;
11222    };
11223  }
11224  vout[gmem_idx + (1 << 3) * 0] = r1;
11225  vout[gmem_idx + (1 << 3) * 1] = r2;
11226  vout[gmem_idx + (1 << 3) * 2] = r3;
11227  vout[gmem_idx + (1 << 3) * 3] = r4;
11228  vout[gmem_idx + (1 << 3) * 4] = r5;
11229  vout[gmem_idx + (1 << 3) * 5] = r6;
11230  vout[gmem_idx + (1 << 3) * 6] = r7;
11231  vout[gmem_idx + (1 << 3) * 7] = r8;
11232  vout[gmem_idx + (1 << 3) * 8] = r9;
11233  vout[gmem_idx + (1 << 3) * 9] = r10;
11234  vout[gmem_idx + (1 << 3) * 10] = r11;
11235  vout[gmem_idx + (1 << 3) * 11] = r12;
11236  vout[gmem_idx + (1 << 3) * 12] = r13;
11237  vout[gmem_idx + (1 << 3) * 13] = r14;
11238  vout[gmem_idx + (1 << 3) * 14] = r15;
11239  vout[gmem_idx + (1 << 3) * 15] = r16;
11240}
11241
/*
 * hs_kernel_bc_0
 *
 * In-place kernel over `vout`, built with a required work-group size of 8
 * and a required Intel sub-group size of 8 (see the attributes below).
 *
 * Every group of 8 consecutive work-items owns 8 * 16 = 128 consecutive
 * ulong keys.  The work-item in lane `l` holds the keys at offsets
 * l + 8 * k (k = 0..15) from the start of that range in registers r1..r16.
 *
 * Processing order:
 *   1. Three cross-lane stages pairing each lane with lane ^ 4, lane ^ 2 and
 *      lane ^ 1.  Per register, the lower lane of a pair keeps the smaller
 *      key and the higher lane keeps the larger key.
 *   2. A fixed sequence of compare-exchanges between the registers of one
 *      work-item; each one leaves the smaller key in the lower-numbered
 *      register.
 *   3. All 16 registers are written back to the locations they came from.
 *
 * NOTE(review): this looks like the last step of a bitonic merge and
 * presumably relies on an earlier kernel having pre-arranged the keys; that
 * contract is not visible in this block -- confirm against the caller.
 *
 * The helper macros below are private to this kernel and are #undef'd right
 * after it.
 */

/* Index into vout of this work-item's k-th key (k = 0..15). */
#define HS_BC0_ROW(k) (gmem_idx + (1 << 3) * (k))

/* Leave the smaller of (a, b) in a and the larger in b. */
#define HS_BC0_CMP_XCHG(a, b)                                                  \
  do {                                                                         \
    if ((a) >= (b)) {                                                          \
      ulong const hs_swap_tmp = (a);                                           \
      (a) = (b);                                                               \
      (b) = hs_swap_tmp;                                                       \
    }                                                                          \
  } while (0)

/*
 * Exchange `reg` with the same register of lane `peer_lane`: the lower lane
 * ends up with the smaller key, the higher lane with the larger key.
 */
#define HS_BC0_LANE_XCHG(reg, peer_lane, is_lower_lane)                        \
  do {                                                                         \
    ulong const peer_key = intel_sub_group_shuffle(reg, peer_lane);            \
    reg = (((reg) <= peer_key) ^ (is_lower_lane)) ? peer_key : (reg);          \
  } while (0)

/* Run HS_BC0_LANE_XCHG on r1..r16 against lane (own lane ^ lane_mask). */
#define HS_BC0_LANE_STAGE(lane_mask)                                           \
  do {                                                                         \
    uint const peer_lane     = get_sub_group_local_id() ^ (lane_mask);         \
    int const  is_lower_lane = get_sub_group_local_id() < peer_lane;           \
    HS_BC0_LANE_XCHG(r1, peer_lane, is_lower_lane);                            \
    HS_BC0_LANE_XCHG(r2, peer_lane, is_lower_lane);                            \
    HS_BC0_LANE_XCHG(r3, peer_lane, is_lower_lane);                            \
    HS_BC0_LANE_XCHG(r4, peer_lane, is_lower_lane);                            \
    HS_BC0_LANE_XCHG(r5, peer_lane, is_lower_lane);                            \
    HS_BC0_LANE_XCHG(r6, peer_lane, is_lower_lane);                            \
    HS_BC0_LANE_XCHG(r7, peer_lane, is_lower_lane);                            \
    HS_BC0_LANE_XCHG(r8, peer_lane, is_lower_lane);                            \
    HS_BC0_LANE_XCHG(r9, peer_lane, is_lower_lane);                            \
    HS_BC0_LANE_XCHG(r10, peer_lane, is_lower_lane);                           \
    HS_BC0_LANE_XCHG(r11, peer_lane, is_lower_lane);                           \
    HS_BC0_LANE_XCHG(r12, peer_lane, is_lower_lane);                           \
    HS_BC0_LANE_XCHG(r13, peer_lane, is_lower_lane);                           \
    HS_BC0_LANE_XCHG(r14, peer_lane, is_lower_lane);                           \
    HS_BC0_LANE_XCHG(r15, peer_lane, is_lower_lane);                           \
    HS_BC0_LANE_XCHG(r16, peer_lane, is_lower_lane);                           \
  } while (0)

__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
__attribute__((reqd_work_group_size((1 << 3) * 1, 1, 1))) void
hs_kernel_bc_0(__global ulong* const restrict vout)
{
  /* Start of the 128-key range shared by this group of 8 work-items. */
  size_t const block_base = (get_global_id(0) & ~((1 << 3) - 1)) * 16;
  /* Position of this work-item within that group (0..7). */
  size_t const lane_in_block = get_local_id(0) & ((1 << 3) - 1);
  uint const   gmem_idx = block_base + lane_in_block;

  /* Load this work-item's 16 keys (stride 8 between consecutive registers). */
  ulong r1  = vout[HS_BC0_ROW(0)];
  ulong r2  = vout[HS_BC0_ROW(1)];
  ulong r3  = vout[HS_BC0_ROW(2)];
  ulong r4  = vout[HS_BC0_ROW(3)];
  ulong r5  = vout[HS_BC0_ROW(4)];
  ulong r6  = vout[HS_BC0_ROW(5)];
  ulong r7  = vout[HS_BC0_ROW(6)];
  ulong r8  = vout[HS_BC0_ROW(7)];
  ulong r9  = vout[HS_BC0_ROW(8)];
  ulong r10 = vout[HS_BC0_ROW(9)];
  ulong r11 = vout[HS_BC0_ROW(10)];
  ulong r12 = vout[HS_BC0_ROW(11)];
  ulong r13 = vout[HS_BC0_ROW(12)];
  ulong r14 = vout[HS_BC0_ROW(13)];
  ulong r15 = vout[HS_BC0_ROW(14)];
  ulong r16 = vout[HS_BC0_ROW(15)];

  /* Cross-lane exchanges at lane distances 4, 2 and 1. */
  HS_BC0_LANE_STAGE(4);
  HS_BC0_LANE_STAGE(2);
  HS_BC0_LANE_STAGE(1);

  /* In-register compare-exchanges; the order below must be preserved. */

  /* Odd registers r1, r5, r9, r13. */
  HS_BC0_CMP_XCHG(r1, r9);
  HS_BC0_CMP_XCHG(r5, r13);
  HS_BC0_CMP_XCHG(r1, r5);
  HS_BC0_CMP_XCHG(r9, r13);
  /* Odd registers r3, r7, r11, r15. */
  HS_BC0_CMP_XCHG(r3, r11);
  HS_BC0_CMP_XCHG(r7, r15);
  HS_BC0_CMP_XCHG(r3, r7);
  HS_BC0_CMP_XCHG(r11, r15);
  /* Odd registers two apart. */
  HS_BC0_CMP_XCHG(r1, r3);
  HS_BC0_CMP_XCHG(r5, r7);
  HS_BC0_CMP_XCHG(r9, r11);
  HS_BC0_CMP_XCHG(r13, r15);

  /* Even registers r2, r6, r10, r14. */
  HS_BC0_CMP_XCHG(r2, r10);
  HS_BC0_CMP_XCHG(r6, r14);
  HS_BC0_CMP_XCHG(r2, r6);
  HS_BC0_CMP_XCHG(r10, r14);
  /* Even registers r4, r8, r12, r16. */
  HS_BC0_CMP_XCHG(r4, r12);
  HS_BC0_CMP_XCHG(r8, r16);
  HS_BC0_CMP_XCHG(r4, r8);
  HS_BC0_CMP_XCHG(r12, r16);
  /* Even registers two apart. */
  HS_BC0_CMP_XCHG(r2, r4);
  HS_BC0_CMP_XCHG(r6, r8);
  HS_BC0_CMP_XCHG(r10, r12);
  HS_BC0_CMP_XCHG(r14, r16);

  /* Adjacent register pairs. */
  HS_BC0_CMP_XCHG(r1, r2);
  HS_BC0_CMP_XCHG(r3, r4);
  HS_BC0_CMP_XCHG(r5, r6);
  HS_BC0_CMP_XCHG(r7, r8);
  HS_BC0_CMP_XCHG(r9, r10);
  HS_BC0_CMP_XCHG(r11, r12);
  HS_BC0_CMP_XCHG(r13, r14);
  HS_BC0_CMP_XCHG(r15, r16);

  /* Write the keys back to the locations they were loaded from. */
  vout[HS_BC0_ROW(0)]  = r1;
  vout[HS_BC0_ROW(1)]  = r2;
  vout[HS_BC0_ROW(2)]  = r3;
  vout[HS_BC0_ROW(3)]  = r4;
  vout[HS_BC0_ROW(4)]  = r5;
  vout[HS_BC0_ROW(5)]  = r6;
  vout[HS_BC0_ROW(6)]  = r7;
  vout[HS_BC0_ROW(7)]  = r8;
  vout[HS_BC0_ROW(8)]  = r9;
  vout[HS_BC0_ROW(9)]  = r10;
  vout[HS_BC0_ROW(10)] = r11;
  vout[HS_BC0_ROW(11)] = r12;
  vout[HS_BC0_ROW(12)] = r13;
  vout[HS_BC0_ROW(13)] = r14;
  vout[HS_BC0_ROW(14)] = r15;
  vout[HS_BC0_ROW(15)] = r16;
}

#undef HS_BC0_LANE_STAGE
#undef HS_BC0_LANE_XCHG
#undef HS_BC0_CMP_XCHG
#undef HS_BC0_ROW
11650
/*
 * hs_kernel_bc_1
 *
 * In-place kernel over `vout`, built with a required work-group size of 16
 * (two Intel sub-groups of 8, see the attributes below).  Each work-group
 * covers 2 * 128 = 256 consecutive ulong keys of `vout`.
 *
 * Phase 1 (global -> local memory):
 *   For every even row 0, 2, ..., 14 each work-item loads two keys that lie
 *   8 * 16 = 128 elements apart in `vout`, compare-exchanges them (smaller
 *   key first) and stores the pair into the work-group's local buffer.
 *
 * Phase 2 (after a local-memory barrier):
 *   Each work-item reads 16 keys from the local buffer at stride 16 starting
 *   at its local id into r1..r16, then runs
 *     - three cross-lane stages pairing each lane with lane ^ 4, lane ^ 2
 *       and lane ^ 1 (lower lane keeps the smaller key), and
 *     - a fixed sequence of in-register compare-exchanges (smaller key ends
 *       up in the lower-numbered register),
 *   and finally writes r1..r16 to `vout` at stride 8 starting at gmem_idx.
 *
 * NOTE(review): this looks like one level of a bitonic merge that relies on
 * an earlier kernel having pre-arranged the keys; that contract is not
 * visible in this block -- confirm against the caller.
 *
 * The helper macros below are private to this kernel and are #undef'd right
 * after it.
 */

/* Index into vout of this work-item's k-th output key (k = 0..15). */
#define HS_BC1_GMEM_ROW(k) (gmem_idx + (1 << 3) * (k))

/* Index into shared.m of this work-item's k-th input key (k = 0..15). */
#define HS_BC1_SMEM_ROW(k) (get_local_id(0) + (2 * (1 << 3) * (k)))

/* Leave the smaller of (a, b) in a and the larger in b. */
#define HS_BC1_CMP_XCHG(a, b)                                                  \
  do {                                                                         \
    if ((a) >= (b)) {                                                          \
      ulong const hs_swap_tmp = (a);                                           \
      (a) = (b);                                                               \
      (b) = hs_swap_tmp;                                                       \
    }                                                                          \
  } while (0)

/*
 * Phase 1 step for one row: load the keys at rows `row` and `row + 16`,
 * order them, and park them 8 slots apart in local memory at offset
 * row * 16 from smem_l_idx.
 */
#define HS_BC1_STAGE_ROW(row)                                                  \
  do {                                                                         \
    ulong key_near = vout[gmem_l_idx + ((1 << 3) * (row))];                    \
    ulong key_far  = vout[gmem_l_idx + ((1 << 3) * ((row) + 16))];             \
    HS_BC1_CMP_XCHG(key_near, key_far);                                        \
    shared.m[smem_l_idx + ((row) * 16)]     = key_near;                        \
    shared.m[smem_l_idx + ((row) * 16) + 8] = key_far;                         \
  } while (0)

/*
 * Exchange `reg` with the same register of lane `peer_lane`: the lower lane
 * ends up with the smaller key, the higher lane with the larger key.
 */
#define HS_BC1_LANE_XCHG(reg, peer_lane, is_lower_lane)                        \
  do {                                                                         \
    ulong const peer_key = intel_sub_group_shuffle(reg, peer_lane);            \
    reg = (((reg) <= peer_key) ^ (is_lower_lane)) ? peer_key : (reg);          \
  } while (0)

/* Run HS_BC1_LANE_XCHG on r1..r16 against lane (own lane ^ lane_mask). */
#define HS_BC1_LANE_STAGE(lane_mask)                                           \
  do {                                                                         \
    uint const peer_lane     = get_sub_group_local_id() ^ (lane_mask);         \
    int const  is_lower_lane = get_sub_group_local_id() < peer_lane;           \
    HS_BC1_LANE_XCHG(r1, peer_lane, is_lower_lane);                            \
    HS_BC1_LANE_XCHG(r2, peer_lane, is_lower_lane);                            \
    HS_BC1_LANE_XCHG(r3, peer_lane, is_lower_lane);                            \
    HS_BC1_LANE_XCHG(r4, peer_lane, is_lower_lane);                            \
    HS_BC1_LANE_XCHG(r5, peer_lane, is_lower_lane);                            \
    HS_BC1_LANE_XCHG(r6, peer_lane, is_lower_lane);                            \
    HS_BC1_LANE_XCHG(r7, peer_lane, is_lower_lane);                            \
    HS_BC1_LANE_XCHG(r8, peer_lane, is_lower_lane);                            \
    HS_BC1_LANE_XCHG(r9, peer_lane, is_lower_lane);                            \
    HS_BC1_LANE_XCHG(r10, peer_lane, is_lower_lane);                           \
    HS_BC1_LANE_XCHG(r11, peer_lane, is_lower_lane);                           \
    HS_BC1_LANE_XCHG(r12, peer_lane, is_lower_lane);                           \
    HS_BC1_LANE_XCHG(r13, peer_lane, is_lower_lane);                           \
    HS_BC1_LANE_XCHG(r14, peer_lane, is_lower_lane);                           \
    HS_BC1_LANE_XCHG(r15, peer_lane, is_lower_lane);                           \
    HS_BC1_LANE_XCHG(r16, peer_lane, is_lower_lane);                           \
  } while (0)

__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
__attribute__((reqd_work_group_size((1 << 3) * 2, 1, 1))) void
hs_kernel_bc_1(__global ulong* const restrict vout)
{
  /* Work-group staging buffer: 16 work-items x 16 keys. */
  __local struct
  {
    ulong m[16 * 16];
  } shared;

  /* Output addressing: start of this sub-group's 128-key range plus lane. */
  size_t const sg_block_base = (get_global_id(0) & ~((1 << 3) - 1)) * 16;
  size_t const lane_in_sg    = get_local_id(0) & ((1 << 3) - 1);
  uint const   gmem_idx      = sg_block_base + lane_in_sg;

  /* Phase 1 input addressing: start of the work-group's 256-key range plus
   * the full local id (0..15). */
  uint const gmem_l_idx =
    (get_global_id(0) & ~((1 << 3) * 2 - 1)) * 16 + get_local_id(0);

  /* Phase 1 output addressing: 16 slots per sub-group plus lane. */
  uint const smem_l_idx =
    get_sub_group_id() * ((1 << 3) * 2) + get_sub_group_local_id();

  /* Phase 1: order row pairs that are 16 rows apart and stage them. */
  HS_BC1_STAGE_ROW(0);
  HS_BC1_STAGE_ROW(2);
  HS_BC1_STAGE_ROW(4);
  HS_BC1_STAGE_ROW(6);
  HS_BC1_STAGE_ROW(8);
  HS_BC1_STAGE_ROW(10);
  HS_BC1_STAGE_ROW(12);
  HS_BC1_STAGE_ROW(14);

  /* Every staged key must be visible before any work-item reads it back. */
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Phase 2: pull this work-item's 16 keys out of local memory. */
  ulong r1  = shared.m[HS_BC1_SMEM_ROW(0)];
  ulong r2  = shared.m[HS_BC1_SMEM_ROW(1)];
  ulong r3  = shared.m[HS_BC1_SMEM_ROW(2)];
  ulong r4  = shared.m[HS_BC1_SMEM_ROW(3)];
  ulong r5  = shared.m[HS_BC1_SMEM_ROW(4)];
  ulong r6  = shared.m[HS_BC1_SMEM_ROW(5)];
  ulong r7  = shared.m[HS_BC1_SMEM_ROW(6)];
  ulong r8  = shared.m[HS_BC1_SMEM_ROW(7)];
  ulong r9  = shared.m[HS_BC1_SMEM_ROW(8)];
  ulong r10 = shared.m[HS_BC1_SMEM_ROW(9)];
  ulong r11 = shared.m[HS_BC1_SMEM_ROW(10)];
  ulong r12 = shared.m[HS_BC1_SMEM_ROW(11)];
  ulong r13 = shared.m[HS_BC1_SMEM_ROW(12)];
  ulong r14 = shared.m[HS_BC1_SMEM_ROW(13)];
  ulong r15 = shared.m[HS_BC1_SMEM_ROW(14)];
  ulong r16 = shared.m[HS_BC1_SMEM_ROW(15)];

  /* Cross-lane exchanges at lane distances 4, 2 and 1. */
  HS_BC1_LANE_STAGE(4);
  HS_BC1_LANE_STAGE(2);
  HS_BC1_LANE_STAGE(1);

  /* In-register compare-exchanges; the order below must be preserved. */

  /* Odd registers r1, r5, r9, r13. */
  HS_BC1_CMP_XCHG(r1, r9);
  HS_BC1_CMP_XCHG(r5, r13);
  HS_BC1_CMP_XCHG(r1, r5);
  HS_BC1_CMP_XCHG(r9, r13);
  /* Odd registers r3, r7, r11, r15. */
  HS_BC1_CMP_XCHG(r3, r11);
  HS_BC1_CMP_XCHG(r7, r15);
  HS_BC1_CMP_XCHG(r3, r7);
  HS_BC1_CMP_XCHG(r11, r15);
  /* Odd registers two apart. */
  HS_BC1_CMP_XCHG(r1, r3);
  HS_BC1_CMP_XCHG(r5, r7);
  HS_BC1_CMP_XCHG(r9, r11);
  HS_BC1_CMP_XCHG(r13, r15);

  /* Even registers r2, r6, r10, r14. */
  HS_BC1_CMP_XCHG(r2, r10);
  HS_BC1_CMP_XCHG(r6, r14);
  HS_BC1_CMP_XCHG(r2, r6);
  HS_BC1_CMP_XCHG(r10, r14);
  /* Even registers r4, r8, r12, r16. */
  HS_BC1_CMP_XCHG(r4, r12);
  HS_BC1_CMP_XCHG(r8, r16);
  HS_BC1_CMP_XCHG(r4, r8);
  HS_BC1_CMP_XCHG(r12, r16);
  /* Even registers two apart. */
  HS_BC1_CMP_XCHG(r2, r4);
  HS_BC1_CMP_XCHG(r6, r8);
  HS_BC1_CMP_XCHG(r10, r12);
  HS_BC1_CMP_XCHG(r14, r16);

  /* Adjacent register pairs. */
  HS_BC1_CMP_XCHG(r1, r2);
  HS_BC1_CMP_XCHG(r3, r4);
  HS_BC1_CMP_XCHG(r5, r6);
  HS_BC1_CMP_XCHG(r7, r8);
  HS_BC1_CMP_XCHG(r9, r10);
  HS_BC1_CMP_XCHG(r11, r12);
  HS_BC1_CMP_XCHG(r13, r14);
  HS_BC1_CMP_XCHG(r15, r16);

  /* Write the 16 keys out to this sub-group's range of vout. */
  vout[HS_BC1_GMEM_ROW(0)]  = r1;
  vout[HS_BC1_GMEM_ROW(1)]  = r2;
  vout[HS_BC1_GMEM_ROW(2)]  = r3;
  vout[HS_BC1_GMEM_ROW(3)]  = r4;
  vout[HS_BC1_GMEM_ROW(4)]  = r5;
  vout[HS_BC1_GMEM_ROW(5)]  = r6;
  vout[HS_BC1_GMEM_ROW(6)]  = r7;
  vout[HS_BC1_GMEM_ROW(7)]  = r8;
  vout[HS_BC1_GMEM_ROW(8)]  = r9;
  vout[HS_BC1_GMEM_ROW(9)]  = r10;
  vout[HS_BC1_GMEM_ROW(10)] = r11;
  vout[HS_BC1_GMEM_ROW(11)] = r12;
  vout[HS_BC1_GMEM_ROW(12)] = r13;
  vout[HS_BC1_GMEM_ROW(13)] = r14;
  vout[HS_BC1_GMEM_ROW(14)] = r15;
  vout[HS_BC1_GMEM_ROW(15)] = r16;
}

#undef HS_BC1_LANE_STAGE
#undef HS_BC1_LANE_XCHG
#undef HS_BC1_STAGE_ROW
#undef HS_BC1_CMP_XCHG
#undef HS_BC1_SMEM_ROW
#undef HS_BC1_GMEM_ROW
12159
12160__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
12161__attribute__((reqd_work_group_size((1 << 3) * 4, 1, 1))) void
12162hs_kernel_bc_2(__global ulong* const restrict vout)
12163{
12164  __local struct
12165  {
12166    ulong m[32 * 16];
12167  } shared;
12168
12169  uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 +
12170                        (get_local_id(0) & ((1 << 3) - 1));
12171  uint const gmem_l_idx =
12172    (get_global_id(0) & ~((1 << 3) * 4 - 1)) * 16 + get_local_id(0);
12173  uint const smem_l_idx =
12174    get_sub_group_id() * ((1 << 3) * 4) + get_sub_group_local_id();
12175  {
12176    {
12177      ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 0)];
12178      ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 16)];
12179      ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 32)];
12180      ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 48)];
12181      if (r0_1 >= r0_3) {
12182        ulong const t = r0_1;
12183        r0_1 = r0_3;
12184        r0_3 = t;
12185      };
12186      if (r0_2 >= r0_4) {
12187        ulong const t = r0_2;
12188        r0_2 = r0_4;
12189        r0_4 = t;
12190      };
12191      if (r0_1 >= r0_2) {
12192        ulong const t = r0_1;
12193        r0_1 = r0_2;
12194        r0_2 = t;
12195      };
12196      if (r0_3 >= r0_4) {
12197        ulong const t = r0_3;
12198        r0_3 = r0_4;
12199        r0_4 = t;
12200      };
12201      shared.m[smem_l_idx + (0)] = r0_1;
12202      shared.m[smem_l_idx + (8)] = r0_2;
12203      shared.m[smem_l_idx + (16)] = r0_3;
12204      shared.m[smem_l_idx + (24)] = r0_4;
12205    }
12206    {
12207      ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 4)];
12208      ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 20)];
12209      ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 36)];
12210      ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 52)];
12211      if (r0_1 >= r0_3) {
12212        ulong const t = r0_1;
12213        r0_1 = r0_3;
12214        r0_3 = t;
12215      };
12216      if (r0_2 >= r0_4) {
12217        ulong const t = r0_2;
12218        r0_2 = r0_4;
12219        r0_4 = t;
12220      };
12221      if (r0_1 >= r0_2) {
12222        ulong const t = r0_1;
12223        r0_1 = r0_2;
12224        r0_2 = t;
12225      };
12226      if (r0_3 >= r0_4) {
12227        ulong const t = r0_3;
12228        r0_3 = r0_4;
12229        r0_4 = t;
12230      };
12231      shared.m[smem_l_idx + (128)] = r0_1;
12232      shared.m[smem_l_idx + (136)] = r0_2;
12233      shared.m[smem_l_idx + (144)] = r0_3;
12234      shared.m[smem_l_idx + (152)] = r0_4;
12235    }
12236    {
12237      ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 8)];
12238      ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 24)];
12239      ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 40)];
12240      ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 56)];
12241      if (r0_1 >= r0_3) {
12242        ulong const t = r0_1;
12243        r0_1 = r0_3;
12244        r0_3 = t;
12245      };
12246      if (r0_2 >= r0_4) {
12247        ulong const t = r0_2;
12248        r0_2 = r0_4;
12249        r0_4 = t;
12250      };
12251      if (r0_1 >= r0_2) {
12252        ulong const t = r0_1;
12253        r0_1 = r0_2;
12254        r0_2 = t;
12255      };
12256      if (r0_3 >= r0_4) {
12257        ulong const t = r0_3;
12258        r0_3 = r0_4;
12259        r0_4 = t;
12260      };
12261      shared.m[smem_l_idx + (256)] = r0_1;
12262      shared.m[smem_l_idx + (264)] = r0_2;
12263      shared.m[smem_l_idx + (272)] = r0_3;
12264      shared.m[smem_l_idx + (280)] = r0_4;
12265    }
12266    {
12267      ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 12)];
12268      ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 28)];
12269      ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 44)];
12270      ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 60)];
12271      if (r0_1 >= r0_3) {
12272        ulong const t = r0_1;
12273        r0_1 = r0_3;
12274        r0_3 = t;
12275      };
12276      if (r0_2 >= r0_4) {
12277        ulong const t = r0_2;
12278        r0_2 = r0_4;
12279        r0_4 = t;
12280      };
12281      if (r0_1 >= r0_2) {
12282        ulong const t = r0_1;
12283        r0_1 = r0_2;
12284        r0_2 = t;
12285      };
12286      if (r0_3 >= r0_4) {
12287        ulong const t = r0_3;
12288        r0_3 = r0_4;
12289        r0_4 = t;
12290      };
12291      shared.m[smem_l_idx + (384)] = r0_1;
12292      shared.m[smem_l_idx + (392)] = r0_2;
12293      shared.m[smem_l_idx + (400)] = r0_3;
12294      shared.m[smem_l_idx + (408)] = r0_4;
12295    }
12296  }
12297  barrier(CLK_LOCAL_MEM_FENCE);
12298  ulong r1 = shared.m[get_local_id(0) + (4 * (1 << 3) * 0)];
12299  ulong r2 = shared.m[get_local_id(0) + (4 * (1 << 3) * 1)];
12300  ulong r3 = shared.m[get_local_id(0) + (4 * (1 << 3) * 2)];
12301  ulong r4 = shared.m[get_local_id(0) + (4 * (1 << 3) * 3)];
12302  ulong r5 = shared.m[get_local_id(0) + (4 * (1 << 3) * 4)];
12303  ulong r6 = shared.m[get_local_id(0) + (4 * (1 << 3) * 5)];
12304  ulong r7 = shared.m[get_local_id(0) + (4 * (1 << 3) * 6)];
12305  ulong r8 = shared.m[get_local_id(0) + (4 * (1 << 3) * 7)];
12306  ulong r9 = shared.m[get_local_id(0) + (4 * (1 << 3) * 8)];
12307  ulong r10 = shared.m[get_local_id(0) + (4 * (1 << 3) * 9)];
12308  ulong r11 = shared.m[get_local_id(0) + (4 * (1 << 3) * 10)];
12309  ulong r12 = shared.m[get_local_id(0) + (4 * (1 << 3) * 11)];
12310  ulong r13 = shared.m[get_local_id(0) + (4 * (1 << 3) * 12)];
12311  ulong r14 = shared.m[get_local_id(0) + (4 * (1 << 3) * 13)];
12312  ulong r15 = shared.m[get_local_id(0) + (4 * (1 << 3) * 14)];
12313  ulong r16 = shared.m[get_local_id(0) + (4 * (1 << 3) * 15)];
12314  {
12315    {
12316      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
12317      int const t_lt = get_sub_group_local_id() < half_lane_idx;
12318      ;
12319      {
12320        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
12321        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
12322      };
12323      {
12324        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
12325        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
12326      };
12327      {
12328        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
12329        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
12330      };
12331      {
12332        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
12333        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
12334      };
12335      {
12336        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
12337        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
12338      };
12339      {
12340        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
12341        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
12342      };
12343      {
12344        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
12345        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
12346      };
12347      {
12348        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
12349        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
12350      };
12351      {
12352        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
12353        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
12354      };
12355      {
12356        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
12357        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
12358      };
12359      {
12360        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
12361        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
12362      };
12363      {
12364        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
12365        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
12366      };
12367      {
12368        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
12369        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
12370      };
12371      {
12372        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
12373        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
12374      };
12375      {
12376        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
12377        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
12378      };
12379      {
12380        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
12381        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
12382      };
12383    }
12384    {
12385      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
12386      int const t_lt = get_sub_group_local_id() < half_lane_idx;
12387      ;
12388      {
12389        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
12390        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
12391      };
12392      {
12393        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
12394        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
12395      };
12396      {
12397        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
12398        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
12399      };
12400      {
12401        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
12402        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
12403      };
12404      {
12405        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
12406        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
12407      };
12408      {
12409        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
12410        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
12411      };
12412      {
12413        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
12414        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
12415      };
12416      {
12417        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
12418        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
12419      };
12420      {
12421        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
12422        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
12423      };
12424      {
12425        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
12426        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
12427      };
12428      {
12429        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
12430        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
12431      };
12432      {
12433        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
12434        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
12435      };
12436      {
12437        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
12438        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
12439      };
12440      {
12441        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
12442        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
12443      };
12444      {
12445        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
12446        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
12447      };
12448      {
12449        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
12450        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
12451      };
12452    }
12453    {
12454      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
12455      int const t_lt = get_sub_group_local_id() < half_lane_idx;
12456      ;
12457      {
12458        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
12459        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
12460      };
12461      {
12462        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
12463        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
12464      };
12465      {
12466        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
12467        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
12468      };
12469      {
12470        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
12471        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
12472      };
12473      {
12474        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
12475        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
12476      };
12477      {
12478        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
12479        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
12480      };
12481      {
12482        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
12483        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
12484      };
12485      {
12486        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
12487        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
12488      };
12489      {
12490        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
12491        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
12492      };
12493      {
12494        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
12495        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
12496      };
12497      {
12498        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
12499        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
12500      };
12501      {
12502        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
12503        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
12504      };
12505      {
12506        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
12507        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
12508      };
12509      {
12510        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
12511        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
12512      };
12513      {
12514        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
12515        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
12516      };
12517      {
12518        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
12519        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
12520      };
12521    }
12522    if (r1 >= r9) {
12523      ulong const t = r1;
12524      r1 = r9;
12525      r9 = t;
12526    };
12527    if (r5 >= r13) {
12528      ulong const t = r5;
12529      r5 = r13;
12530      r13 = t;
12531    };
12532    if (r1 >= r5) {
12533      ulong const t = r1;
12534      r1 = r5;
12535      r5 = t;
12536    };
12537    if (r9 >= r13) {
12538      ulong const t = r9;
12539      r9 = r13;
12540      r13 = t;
12541    };
12542    if (r3 >= r11) {
12543      ulong const t = r3;
12544      r3 = r11;
12545      r11 = t;
12546    };
12547    if (r7 >= r15) {
12548      ulong const t = r7;
12549      r7 = r15;
12550      r15 = t;
12551    };
12552    if (r3 >= r7) {
12553      ulong const t = r3;
12554      r3 = r7;
12555      r7 = t;
12556    };
12557    if (r11 >= r15) {
12558      ulong const t = r11;
12559      r11 = r15;
12560      r15 = t;
12561    };
12562    if (r1 >= r3) {
12563      ulong const t = r1;
12564      r1 = r3;
12565      r3 = t;
12566    };
12567    if (r5 >= r7) {
12568      ulong const t = r5;
12569      r5 = r7;
12570      r7 = t;
12571    };
12572    if (r9 >= r11) {
12573      ulong const t = r9;
12574      r9 = r11;
12575      r11 = t;
12576    };
12577    if (r13 >= r15) {
12578      ulong const t = r13;
12579      r13 = r15;
12580      r15 = t;
12581    };
12582    if (r2 >= r10) {
12583      ulong const t = r2;
12584      r2 = r10;
12585      r10 = t;
12586    };
12587    if (r6 >= r14) {
12588      ulong const t = r6;
12589      r6 = r14;
12590      r14 = t;
12591    };
12592    if (r2 >= r6) {
12593      ulong const t = r2;
12594      r2 = r6;
12595      r6 = t;
12596    };
12597    if (r10 >= r14) {
12598      ulong const t = r10;
12599      r10 = r14;
12600      r14 = t;
12601    };
12602    if (r4 >= r12) {
12603      ulong const t = r4;
12604      r4 = r12;
12605      r12 = t;
12606    };
12607    if (r8 >= r16) {
12608      ulong const t = r8;
12609      r8 = r16;
12610      r16 = t;
12611    };
12612    if (r4 >= r8) {
12613      ulong const t = r4;
12614      r4 = r8;
12615      r8 = t;
12616    };
12617    if (r12 >= r16) {
12618      ulong const t = r12;
12619      r12 = r16;
12620      r16 = t;
12621    };
12622    if (r2 >= r4) {
12623      ulong const t = r2;
12624      r2 = r4;
12625      r4 = t;
12626    };
12627    if (r6 >= r8) {
12628      ulong const t = r6;
12629      r6 = r8;
12630      r8 = t;
12631    };
12632    if (r10 >= r12) {
12633      ulong const t = r10;
12634      r10 = r12;
12635      r12 = t;
12636    };
12637    if (r14 >= r16) {
12638      ulong const t = r14;
12639      r14 = r16;
12640      r16 = t;
12641    };
12642    if (r1 >= r2) {
12643      ulong const t = r1;
12644      r1 = r2;
12645      r2 = t;
12646    };
12647    if (r3 >= r4) {
12648      ulong const t = r3;
12649      r3 = r4;
12650      r4 = t;
12651    };
12652    if (r5 >= r6) {
12653      ulong const t = r5;
12654      r5 = r6;
12655      r6 = t;
12656    };
12657    if (r7 >= r8) {
12658      ulong const t = r7;
12659      r7 = r8;
12660      r8 = t;
12661    };
12662    if (r9 >= r10) {
12663      ulong const t = r9;
12664      r9 = r10;
12665      r10 = t;
12666    };
12667    if (r11 >= r12) {
12668      ulong const t = r11;
12669      r11 = r12;
12670      r12 = t;
12671    };
12672    if (r13 >= r14) {
12673      ulong const t = r13;
12674      r13 = r14;
12675      r14 = t;
12676    };
12677    if (r15 >= r16) {
12678      ulong const t = r15;
12679      r15 = r16;
12680      r16 = t;
12681    };
12682  }
12683  vout[gmem_idx + (1 << 3) * 0] = r1;
12684  vout[gmem_idx + (1 << 3) * 1] = r2;
12685  vout[gmem_idx + (1 << 3) * 2] = r3;
12686  vout[gmem_idx + (1 << 3) * 3] = r4;
12687  vout[gmem_idx + (1 << 3) * 4] = r5;
12688  vout[gmem_idx + (1 << 3) * 5] = r6;
12689  vout[gmem_idx + (1 << 3) * 6] = r7;
12690  vout[gmem_idx + (1 << 3) * 7] = r8;
12691  vout[gmem_idx + (1 << 3) * 8] = r9;
12692  vout[gmem_idx + (1 << 3) * 9] = r10;
12693  vout[gmem_idx + (1 << 3) * 10] = r11;
12694  vout[gmem_idx + (1 << 3) * 11] = r12;
12695  vout[gmem_idx + (1 << 3) * 12] = r13;
12696  vout[gmem_idx + (1 << 3) * 13] = r14;
12697  vout[gmem_idx + (1 << 3) * 14] = r15;
12698  vout[gmem_idx + (1 << 3) * 15] = r16;
12699}
12700
12701__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
12702__attribute__((reqd_work_group_size((1 << 3) * 8, 1, 1))) void
12703hs_kernel_bc_3(__global ulong* const restrict vout)
12704{
12705  __local struct
12706  {
12707    ulong m[64 * 16];
12708  } shared;
12709
12710  uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 +
12711                        (get_local_id(0) & ((1 << 3) - 1));
12712  uint const gmem_l_idx =
12713    (get_global_id(0) & ~((1 << 3) * 8 - 1)) * 16 + get_local_id(0);
12714  uint const smem_l_idx =
12715    get_sub_group_id() * ((1 << 3) * 8) + get_sub_group_local_id();
12716  {
12717    {
12718      ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 0)];
12719      ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 16)];
12720      ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 32)];
12721      ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 48)];
12722      ulong r0_5 = vout[gmem_l_idx + ((1 << 3) * 64)];
12723      ulong r0_6 = vout[gmem_l_idx + ((1 << 3) * 80)];
12724      ulong r0_7 = vout[gmem_l_idx + ((1 << 3) * 96)];
12725      ulong r0_8 = vout[gmem_l_idx + ((1 << 3) * 112)];
12726      if (r0_1 >= r0_5) {
12727        ulong const t = r0_1;
12728        r0_1 = r0_5;
12729        r0_5 = t;
12730      };
12731      if (r0_3 >= r0_7) {
12732        ulong const t = r0_3;
12733        r0_3 = r0_7;
12734        r0_7 = t;
12735      };
12736      if (r0_1 >= r0_3) {
12737        ulong const t = r0_1;
12738        r0_1 = r0_3;
12739        r0_3 = t;
12740      };
12741      if (r0_5 >= r0_7) {
12742        ulong const t = r0_5;
12743        r0_5 = r0_7;
12744        r0_7 = t;
12745      };
12746      if (r0_2 >= r0_6) {
12747        ulong const t = r0_2;
12748        r0_2 = r0_6;
12749        r0_6 = t;
12750      };
12751      if (r0_4 >= r0_8) {
12752        ulong const t = r0_4;
12753        r0_4 = r0_8;
12754        r0_8 = t;
12755      };
12756      if (r0_2 >= r0_4) {
12757        ulong const t = r0_2;
12758        r0_2 = r0_4;
12759        r0_4 = t;
12760      };
12761      if (r0_6 >= r0_8) {
12762        ulong const t = r0_6;
12763        r0_6 = r0_8;
12764        r0_8 = t;
12765      };
12766      if (r0_1 >= r0_2) {
12767        ulong const t = r0_1;
12768        r0_1 = r0_2;
12769        r0_2 = t;
12770      };
12771      if (r0_3 >= r0_4) {
12772        ulong const t = r0_3;
12773        r0_3 = r0_4;
12774        r0_4 = t;
12775      };
12776      if (r0_5 >= r0_6) {
12777        ulong const t = r0_5;
12778        r0_5 = r0_6;
12779        r0_6 = t;
12780      };
12781      if (r0_7 >= r0_8) {
12782        ulong const t = r0_7;
12783        r0_7 = r0_8;
12784        r0_8 = t;
12785      };
12786      shared.m[smem_l_idx + (0)] = r0_1;
12787      shared.m[smem_l_idx + (8)] = r0_2;
12788      shared.m[smem_l_idx + (16)] = r0_3;
12789      shared.m[smem_l_idx + (24)] = r0_4;
12790      shared.m[smem_l_idx + (32)] = r0_5;
12791      shared.m[smem_l_idx + (40)] = r0_6;
12792      shared.m[smem_l_idx + (48)] = r0_7;
12793      shared.m[smem_l_idx + (56)] = r0_8;
12794    }
12795    {
12796      ulong r0_1 = vout[gmem_l_idx + ((1 << 3) * 8)];
12797      ulong r0_2 = vout[gmem_l_idx + ((1 << 3) * 24)];
12798      ulong r0_3 = vout[gmem_l_idx + ((1 << 3) * 40)];
12799      ulong r0_4 = vout[gmem_l_idx + ((1 << 3) * 56)];
12800      ulong r0_5 = vout[gmem_l_idx + ((1 << 3) * 72)];
12801      ulong r0_6 = vout[gmem_l_idx + ((1 << 3) * 88)];
12802      ulong r0_7 = vout[gmem_l_idx + ((1 << 3) * 104)];
12803      ulong r0_8 = vout[gmem_l_idx + ((1 << 3) * 120)];
12804      if (r0_1 >= r0_5) {
12805        ulong const t = r0_1;
12806        r0_1 = r0_5;
12807        r0_5 = t;
12808      };
12809      if (r0_3 >= r0_7) {
12810        ulong const t = r0_3;
12811        r0_3 = r0_7;
12812        r0_7 = t;
12813      };
12814      if (r0_1 >= r0_3) {
12815        ulong const t = r0_1;
12816        r0_1 = r0_3;
12817        r0_3 = t;
12818      };
12819      if (r0_5 >= r0_7) {
12820        ulong const t = r0_5;
12821        r0_5 = r0_7;
12822        r0_7 = t;
12823      };
12824      if (r0_2 >= r0_6) {
12825        ulong const t = r0_2;
12826        r0_2 = r0_6;
12827        r0_6 = t;
12828      };
12829      if (r0_4 >= r0_8) {
12830        ulong const t = r0_4;
12831        r0_4 = r0_8;
12832        r0_8 = t;
12833      };
12834      if (r0_2 >= r0_4) {
12835        ulong const t = r0_2;
12836        r0_2 = r0_4;
12837        r0_4 = t;
12838      };
12839      if (r0_6 >= r0_8) {
12840        ulong const t = r0_6;
12841        r0_6 = r0_8;
12842        r0_8 = t;
12843      };
12844      if (r0_1 >= r0_2) {
12845        ulong const t = r0_1;
12846        r0_1 = r0_2;
12847        r0_2 = t;
12848      };
12849      if (r0_3 >= r0_4) {
12850        ulong const t = r0_3;
12851        r0_3 = r0_4;
12852        r0_4 = t;
12853      };
12854      if (r0_5 >= r0_6) {
12855        ulong const t = r0_5;
12856        r0_5 = r0_6;
12857        r0_6 = t;
12858      };
12859      if (r0_7 >= r0_8) {
12860        ulong const t = r0_7;
12861        r0_7 = r0_8;
12862        r0_8 = t;
12863      };
12864      shared.m[smem_l_idx + (512)] = r0_1;
12865      shared.m[smem_l_idx + (520)] = r0_2;
12866      shared.m[smem_l_idx + (528)] = r0_3;
12867      shared.m[smem_l_idx + (536)] = r0_4;
12868      shared.m[smem_l_idx + (544)] = r0_5;
12869      shared.m[smem_l_idx + (552)] = r0_6;
12870      shared.m[smem_l_idx + (560)] = r0_7;
12871      shared.m[smem_l_idx + (568)] = r0_8;
12872    }
12873  }
12874  barrier(CLK_LOCAL_MEM_FENCE);
12875  ulong r1 = shared.m[get_local_id(0) + (8 * (1 << 3) * 0)];
12876  ulong r2 = shared.m[get_local_id(0) + (8 * (1 << 3) * 1)];
12877  ulong r3 = shared.m[get_local_id(0) + (8 * (1 << 3) * 2)];
12878  ulong r4 = shared.m[get_local_id(0) + (8 * (1 << 3) * 3)];
12879  ulong r5 = shared.m[get_local_id(0) + (8 * (1 << 3) * 4)];
12880  ulong r6 = shared.m[get_local_id(0) + (8 * (1 << 3) * 5)];
12881  ulong r7 = shared.m[get_local_id(0) + (8 * (1 << 3) * 6)];
12882  ulong r8 = shared.m[get_local_id(0) + (8 * (1 << 3) * 7)];
12883  ulong r9 = shared.m[get_local_id(0) + (8 * (1 << 3) * 8)];
12884  ulong r10 = shared.m[get_local_id(0) + (8 * (1 << 3) * 9)];
12885  ulong r11 = shared.m[get_local_id(0) + (8 * (1 << 3) * 10)];
12886  ulong r12 = shared.m[get_local_id(0) + (8 * (1 << 3) * 11)];
12887  ulong r13 = shared.m[get_local_id(0) + (8 * (1 << 3) * 12)];
12888  ulong r14 = shared.m[get_local_id(0) + (8 * (1 << 3) * 13)];
12889  ulong r15 = shared.m[get_local_id(0) + (8 * (1 << 3) * 14)];
12890  ulong r16 = shared.m[get_local_id(0) + (8 * (1 << 3) * 15)];
12891  {
12892    {
12893      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
12894      int const t_lt = get_sub_group_local_id() < half_lane_idx;
12895      ;
12896      {
12897        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
12898        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
12899      };
12900      {
12901        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
12902        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
12903      };
12904      {
12905        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
12906        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
12907      };
12908      {
12909        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
12910        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
12911      };
12912      {
12913        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
12914        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
12915      };
12916      {
12917        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
12918        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
12919      };
12920      {
12921        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
12922        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
12923      };
12924      {
12925        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
12926        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
12927      };
12928      {
12929        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
12930        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
12931      };
12932      {
12933        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
12934        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
12935      };
12936      {
12937        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
12938        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
12939      };
12940      {
12941        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
12942        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
12943      };
12944      {
12945        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
12946        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
12947      };
12948      {
12949        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
12950        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
12951      };
12952      {
12953        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
12954        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
12955      };
12956      {
12957        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
12958        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
12959      };
12960    }
12961    {
12962      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
12963      int const t_lt = get_sub_group_local_id() < half_lane_idx;
12964      ;
12965      {
12966        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
12967        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
12968      };
12969      {
12970        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
12971        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
12972      };
12973      {
12974        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
12975        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
12976      };
12977      {
12978        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
12979        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
12980      };
12981      {
12982        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
12983        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
12984      };
12985      {
12986        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
12987        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
12988      };
12989      {
12990        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
12991        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
12992      };
12993      {
12994        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
12995        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
12996      };
12997      {
12998        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
12999        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
13000      };
13001      {
13002        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
13003        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
13004      };
13005      {
13006        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
13007        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
13008      };
13009      {
13010        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
13011        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
13012      };
13013      {
13014        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
13015        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
13016      };
13017      {
13018        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
13019        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
13020      };
13021      {
13022        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
13023        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
13024      };
13025      {
13026        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
13027        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
13028      };
13029    }
13030    {
13031      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
13032      int const t_lt = get_sub_group_local_id() < half_lane_idx;
13033      ;
13034      {
13035        ulong const ta = intel_sub_group_shuffle(r1, half_lane_idx);
13036        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
13037      };
13038      {
13039        ulong const ta = intel_sub_group_shuffle(r2, half_lane_idx);
13040        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
13041      };
13042      {
13043        ulong const ta = intel_sub_group_shuffle(r3, half_lane_idx);
13044        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
13045      };
13046      {
13047        ulong const ta = intel_sub_group_shuffle(r4, half_lane_idx);
13048        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
13049      };
13050      {
13051        ulong const ta = intel_sub_group_shuffle(r5, half_lane_idx);
13052        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
13053      };
13054      {
13055        ulong const ta = intel_sub_group_shuffle(r6, half_lane_idx);
13056        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
13057      };
13058      {
13059        ulong const ta = intel_sub_group_shuffle(r7, half_lane_idx);
13060        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
13061      };
13062      {
13063        ulong const ta = intel_sub_group_shuffle(r8, half_lane_idx);
13064        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
13065      };
13066      {
13067        ulong const ta = intel_sub_group_shuffle(r9, half_lane_idx);
13068        r9 = ((r9 <= ta) ^ t_lt) ? ta : r9;
13069      };
13070      {
13071        ulong const ta = intel_sub_group_shuffle(r10, half_lane_idx);
13072        r10 = ((r10 <= ta) ^ t_lt) ? ta : r10;
13073      };
13074      {
13075        ulong const ta = intel_sub_group_shuffle(r11, half_lane_idx);
13076        r11 = ((r11 <= ta) ^ t_lt) ? ta : r11;
13077      };
13078      {
13079        ulong const ta = intel_sub_group_shuffle(r12, half_lane_idx);
13080        r12 = ((r12 <= ta) ^ t_lt) ? ta : r12;
13081      };
13082      {
13083        ulong const ta = intel_sub_group_shuffle(r13, half_lane_idx);
13084        r13 = ((r13 <= ta) ^ t_lt) ? ta : r13;
13085      };
13086      {
13087        ulong const ta = intel_sub_group_shuffle(r14, half_lane_idx);
13088        r14 = ((r14 <= ta) ^ t_lt) ? ta : r14;
13089      };
13090      {
13091        ulong const ta = intel_sub_group_shuffle(r15, half_lane_idx);
13092        r15 = ((r15 <= ta) ^ t_lt) ? ta : r15;
13093      };
13094      {
13095        ulong const ta = intel_sub_group_shuffle(r16, half_lane_idx);
13096        r16 = ((r16 <= ta) ^ t_lt) ? ta : r16;
13097      };
13098    }
13099    if (r1 >= r9) {
13100      ulong const t = r1;
13101      r1 = r9;
13102      r9 = t;
13103    };
13104    if (r5 >= r13) {
13105      ulong const t = r5;
13106      r5 = r13;
13107      r13 = t;
13108    };
13109    if (r1 >= r5) {
13110      ulong const t = r1;
13111      r1 = r5;
13112      r5 = t;
13113    };
13114    if (r9 >= r13) {
13115      ulong const t = r9;
13116      r9 = r13;
13117      r13 = t;
13118    };
13119    if (r3 >= r11) {
13120      ulong const t = r3;
13121      r3 = r11;
13122      r11 = t;
13123    };
13124    if (r7 >= r15) {
13125      ulong const t = r7;
13126      r7 = r15;
13127      r15 = t;
13128    };
13129    if (r3 >= r7) {
13130      ulong const t = r3;
13131      r3 = r7;
13132      r7 = t;
13133    };
13134    if (r11 >= r15) {
13135      ulong const t = r11;
13136      r11 = r15;
13137      r15 = t;
13138    };
13139    if (r1 >= r3) {
13140      ulong const t = r1;
13141      r1 = r3;
13142      r3 = t;
13143    };
13144    if (r5 >= r7) {
13145      ulong const t = r5;
13146      r5 = r7;
13147      r7 = t;
13148    };
13149    if (r9 >= r11) {
13150      ulong const t = r9;
13151      r9 = r11;
13152      r11 = t;
13153    };
13154    if (r13 >= r15) {
13155      ulong const t = r13;
13156      r13 = r15;
13157      r15 = t;
13158    };
13159    if (r2 >= r10) {
13160      ulong const t = r2;
13161      r2 = r10;
13162      r10 = t;
13163    };
13164    if (r6 >= r14) {
13165      ulong const t = r6;
13166      r6 = r14;
13167      r14 = t;
13168    };
13169    if (r2 >= r6) {
13170      ulong const t = r2;
13171      r2 = r6;
13172      r6 = t;
13173    };
13174    if (r10 >= r14) {
13175      ulong const t = r10;
13176      r10 = r14;
13177      r14 = t;
13178    };
13179    if (r4 >= r12) {
13180      ulong const t = r4;
13181      r4 = r12;
13182      r12 = t;
13183    };
13184    if (r8 >= r16) {
13185      ulong const t = r8;
13186      r8 = r16;
13187      r16 = t;
13188    };
13189    if (r4 >= r8) {
13190      ulong const t = r4;
13191      r4 = r8;
13192      r8 = t;
13193    };
13194    if (r12 >= r16) {
13195      ulong const t = r12;
13196      r12 = r16;
13197      r16 = t;
13198    };
13199    if (r2 >= r4) {
13200      ulong const t = r2;
13201      r2 = r4;
13202      r4 = t;
13203    };
13204    if (r6 >= r8) {
13205      ulong const t = r6;
13206      r6 = r8;
13207      r8 = t;
13208    };
13209    if (r10 >= r12) {
13210      ulong const t = r10;
13211      r10 = r12;
13212      r12 = t;
13213    };
13214    if (r14 >= r16) {
13215      ulong const t = r14;
13216      r14 = r16;
13217      r16 = t;
13218    };
13219    if (r1 >= r2) {
13220      ulong const t = r1;
13221      r1 = r2;
13222      r2 = t;
13223    };
13224    if (r3 >= r4) {
13225      ulong const t = r3;
13226      r3 = r4;
13227      r4 = t;
13228    };
13229    if (r5 >= r6) {
13230      ulong const t = r5;
13231      r5 = r6;
13232      r6 = t;
13233    };
13234    if (r7 >= r8) {
13235      ulong const t = r7;
13236      r7 = r8;
13237      r8 = t;
13238    };
13239    if (r9 >= r10) {
13240      ulong const t = r9;
13241      r9 = r10;
13242      r10 = t;
13243    };
13244    if (r11 >= r12) {
13245      ulong const t = r11;
13246      r11 = r12;
13247      r12 = t;
13248    };
13249    if (r13 >= r14) {
13250      ulong const t = r13;
13251      r13 = r14;
13252      r14 = t;
13253    };
13254    if (r15 >= r16) {
13255      ulong const t = r15;
13256      r15 = r16;
13257      r16 = t;
13258    };
13259  }
13260  vout[gmem_idx + (1 << 3) * 0] = r1;
13261  vout[gmem_idx + (1 << 3) * 1] = r2;
13262  vout[gmem_idx + (1 << 3) * 2] = r3;
13263  vout[gmem_idx + (1 << 3) * 3] = r4;
13264  vout[gmem_idx + (1 << 3) * 4] = r5;
13265  vout[gmem_idx + (1 << 3) * 5] = r6;
13266  vout[gmem_idx + (1 << 3) * 6] = r7;
13267  vout[gmem_idx + (1 << 3) * 7] = r8;
13268  vout[gmem_idx + (1 << 3) * 8] = r9;
13269  vout[gmem_idx + (1 << 3) * 9] = r10;
13270  vout[gmem_idx + (1 << 3) * 10] = r11;
13271  vout[gmem_idx + (1 << 3) * 11] = r12;
13272  vout[gmem_idx + (1 << 3) * 12] = r13;
13273  vout[gmem_idx + (1 << 3) * 13] = r14;
13274  vout[gmem_idx + (1 << 3) * 14] = r15;
13275  vout[gmem_idx + (1 << 3) * 15] = r16;
13276}
13277
/*
 * Helper macros private to hs_kernel_bc_4.  They are #undef'd right after
 * the kernel so they cannot leak into the rest of the file.
 */
#define HS_BC4_SG_SIZE (1 << 3)        /* required sub-group size          */
#define HS_BC4_WG_SIZE ((1 << 3) * 16) /* required work-group size         */
#define HS_BC4_KEYS 16                 /* values processed per work-item   */

/*
 * Compare-exchange two registers so that afterwards lo <= hi.
 * The swap is also taken when both values are equal.
 */
#define HS_BC4_XCHG(lo, hi)                                                    \
  do {                                                                         \
    if ((lo) >= (hi)) {                                                        \
      ulong const hs_swap = (lo);                                              \
      (lo) = (hi);                                                             \
      (hi) = hs_swap;                                                          \
    }                                                                          \
  } while (0)

/* 4-input network: (p0,p2) (p1,p3) (p0,p1) (p2,p3). */
#define HS_BC4_NET_4(p0, p1, p2, p3)                                           \
  do {                                                                         \
    HS_BC4_XCHG(p0, p2);                                                       \
    HS_BC4_XCHG(p1, p3);                                                       \
    HS_BC4_XCHG(p0, p1);                                                       \
    HS_BC4_XCHG(p2, p3);                                                       \
  } while (0)

/*
 * 8-input network: the 4-input network on the operands at positions
 * 0,2,4,6, then on those at 1,3,5,7, then adjacent pairs.
 */
#define HS_BC4_NET_8(q0, q1, q2, q3, q4, q5, q6, q7)                           \
  do {                                                                         \
    HS_BC4_NET_4(q0, q2, q4, q6);                                              \
    HS_BC4_NET_4(q1, q3, q5, q7);                                              \
    HS_BC4_XCHG(q0, q1);                                                       \
    HS_BC4_XCHG(q2, q3);                                                       \
    HS_BC4_XCHG(q4, q5);                                                       \
    HS_BC4_XCHG(q6, q7);                                                       \
  } while (0)

/*
 * 16-input network: the 8-input network on the odd-numbered operands, then
 * on the even-numbered ones, then adjacent pairs.  This expands to 32
 * compare-exchanges, starting (x1,x9) (x5,x13) (x1,x5) (x9,x13) ... and
 * ending ... (x13,x14) (x15,x16).
 */
#define HS_BC4_NET_16(x1, x2, x3, x4, x5, x6, x7, x8,                          \
                      x9, x10, x11, x12, x13, x14, x15, x16)                   \
  do {                                                                         \
    HS_BC4_NET_8(x1, x3, x5, x7, x9, x11, x13, x15);                           \
    HS_BC4_NET_8(x2, x4, x6, x8, x10, x12, x14, x16);                          \
    HS_BC4_XCHG(x1, x2);                                                       \
    HS_BC4_XCHG(x3, x4);                                                       \
    HS_BC4_XCHG(x5, x6);                                                       \
    HS_BC4_XCHG(x7, x8);                                                       \
    HS_BC4_XCHG(x9, x10);                                                      \
    HS_BC4_XCHG(x11, x12);                                                     \
    HS_BC4_XCHG(x13, x14);                                                     \
    HS_BC4_XCHG(x15, x16);                                                     \
  } while (0)

/*
 * Exchange one register with the same register of another sub-group lane.
 * When keep_min is 1 the lane ends up with the smaller of the two values,
 * when it is 0 the lane ends up with the larger one.
 */
#define HS_BC4_LANE_XCHG(reg, partner, keep_min)                               \
  do {                                                                         \
    ulong const hs_peer = intel_sub_group_shuffle(reg, partner);               \
    reg = (((reg) <= hs_peer) ^ (keep_min)) ? hs_peer : (reg);                 \
  } while (0)

/*
 * One cross-lane stage: every lane pairs up with lane (id ^ mask); the lane
 * with the lower id keeps the minimum of each register pair, the other lane
 * keeps the maximum.
 */
#define HS_BC4_LANE_STAGE(mask, x1, x2, x3, x4, x5, x6, x7, x8,                \
                          x9, x10, x11, x12, x13, x14, x15, x16)               \
  do {                                                                         \
    uint const hs_partner = get_sub_group_local_id() ^ (mask);                 \
    int const hs_is_lower = get_sub_group_local_id() < hs_partner;             \
    HS_BC4_LANE_XCHG(x1, hs_partner, hs_is_lower);                             \
    HS_BC4_LANE_XCHG(x2, hs_partner, hs_is_lower);                             \
    HS_BC4_LANE_XCHG(x3, hs_partner, hs_is_lower);                             \
    HS_BC4_LANE_XCHG(x4, hs_partner, hs_is_lower);                             \
    HS_BC4_LANE_XCHG(x5, hs_partner, hs_is_lower);                             \
    HS_BC4_LANE_XCHG(x6, hs_partner, hs_is_lower);                             \
    HS_BC4_LANE_XCHG(x7, hs_partner, hs_is_lower);                             \
    HS_BC4_LANE_XCHG(x8, hs_partner, hs_is_lower);                             \
    HS_BC4_LANE_XCHG(x9, hs_partner, hs_is_lower);                             \
    HS_BC4_LANE_XCHG(x10, hs_partner, hs_is_lower);                            \
    HS_BC4_LANE_XCHG(x11, hs_partner, hs_is_lower);                            \
    HS_BC4_LANE_XCHG(x12, hs_partner, hs_is_lower);                            \
    HS_BC4_LANE_XCHG(x13, hs_partner, hs_is_lower);                            \
    HS_BC4_LANE_XCHG(x14, hs_partner, hs_is_lower);                            \
    HS_BC4_LANE_XCHG(x15, hs_partner, hs_is_lower);                            \
    HS_BC4_LANE_XCHG(x16, hs_partner, hs_is_lower);                            \
  } while (0)

/*
 * hs_kernel_bc_4 -- in-place pass over `vout`, 16 values per work-item.
 * (NOTE(review): "bc" presumably stands for the "block clean" step of the
 * generated merge sort -- confirm against the generator.)
 *
 * Steps performed by every work-item:
 *   1. load 16 values from `vout`, one work-group width (128) apart;
 *   2. run the 16-input compare-exchange network on them;
 *   3. park the result in work-group local memory and wait at a barrier;
 *   4. read 16 values back from local memory in a different layout;
 *   5. exchange with sub-group lanes at XOR distance 4, 2 and 1;
 *   6. run the 16-input network again;
 *   7. store the 16 values to `vout`, one sub-group width (8) apart.
 *
 * vout: global buffer of 64-bit keys, read and written in place.
 */
__kernel __attribute__((intel_reqd_sub_group_size((1 << 3))))
__attribute__((reqd_work_group_size((1 << 3) * 16, 1, 1))) void
hs_kernel_bc_4(__global ulong* const restrict vout)
{
  /* Scratch space: 16 values for each of the 128 work-items. */
  __local struct
  {
    ulong m[HS_BC4_WG_SIZE * HS_BC4_KEYS];
  } shared;

  size_t const gid = get_global_id(0);
  size_t const lid = get_local_id(0);

  /* Base index of the final stores (sub-group granularity). */
  uint const store_base =
    (gid & ~(HS_BC4_SG_SIZE - 1)) * HS_BC4_KEYS + (lid & (HS_BC4_SG_SIZE - 1));
  /* Base index of the initial loads (work-group granularity). */
  uint const load_base = (gid & ~(HS_BC4_WG_SIZE - 1)) * HS_BC4_KEYS + lid;
  /* Base index of this work-item's writes into local memory. */
  uint const stash_base =
    get_sub_group_id() * HS_BC4_WG_SIZE + get_sub_group_local_id();

  {
    /* Steps 1-3: strided global load, network, park in local memory. */
    ulong c1 = vout[load_base + HS_BC4_WG_SIZE * 0];
    ulong c2 = vout[load_base + HS_BC4_WG_SIZE * 1];
    ulong c3 = vout[load_base + HS_BC4_WG_SIZE * 2];
    ulong c4 = vout[load_base + HS_BC4_WG_SIZE * 3];
    ulong c5 = vout[load_base + HS_BC4_WG_SIZE * 4];
    ulong c6 = vout[load_base + HS_BC4_WG_SIZE * 5];
    ulong c7 = vout[load_base + HS_BC4_WG_SIZE * 6];
    ulong c8 = vout[load_base + HS_BC4_WG_SIZE * 7];
    ulong c9 = vout[load_base + HS_BC4_WG_SIZE * 8];
    ulong c10 = vout[load_base + HS_BC4_WG_SIZE * 9];
    ulong c11 = vout[load_base + HS_BC4_WG_SIZE * 10];
    ulong c12 = vout[load_base + HS_BC4_WG_SIZE * 11];
    ulong c13 = vout[load_base + HS_BC4_WG_SIZE * 12];
    ulong c14 = vout[load_base + HS_BC4_WG_SIZE * 13];
    ulong c15 = vout[load_base + HS_BC4_WG_SIZE * 14];
    ulong c16 = vout[load_base + HS_BC4_WG_SIZE * 15];

    HS_BC4_NET_16(c1, c2, c3, c4, c5, c6, c7, c8,
                  c9, c10, c11, c12, c13, c14, c15, c16);

    shared.m[stash_base + HS_BC4_SG_SIZE * 0] = c1;
    shared.m[stash_base + HS_BC4_SG_SIZE * 1] = c2;
    shared.m[stash_base + HS_BC4_SG_SIZE * 2] = c3;
    shared.m[stash_base + HS_BC4_SG_SIZE * 3] = c4;
    shared.m[stash_base + HS_BC4_SG_SIZE * 4] = c5;
    shared.m[stash_base + HS_BC4_SG_SIZE * 5] = c6;
    shared.m[stash_base + HS_BC4_SG_SIZE * 6] = c7;
    shared.m[stash_base + HS_BC4_SG_SIZE * 7] = c8;
    shared.m[stash_base + HS_BC4_SG_SIZE * 8] = c9;
    shared.m[stash_base + HS_BC4_SG_SIZE * 9] = c10;
    shared.m[stash_base + HS_BC4_SG_SIZE * 10] = c11;
    shared.m[stash_base + HS_BC4_SG_SIZE * 11] = c12;
    shared.m[stash_base + HS_BC4_SG_SIZE * 12] = c13;
    shared.m[stash_base + HS_BC4_SG_SIZE * 13] = c14;
    shared.m[stash_base + HS_BC4_SG_SIZE * 14] = c15;
    shared.m[stash_base + HS_BC4_SG_SIZE * 15] = c16;
  }

  /* All local-memory writes must be visible before anyone reads them back. */
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Step 4: re-read 16 values, one work-group width apart in local memory. */
  ulong v1 = shared.m[lid + HS_BC4_WG_SIZE * 0];
  ulong v2 = shared.m[lid + HS_BC4_WG_SIZE * 1];
  ulong v3 = shared.m[lid + HS_BC4_WG_SIZE * 2];
  ulong v4 = shared.m[lid + HS_BC4_WG_SIZE * 3];
  ulong v5 = shared.m[lid + HS_BC4_WG_SIZE * 4];
  ulong v6 = shared.m[lid + HS_BC4_WG_SIZE * 5];
  ulong v7 = shared.m[lid + HS_BC4_WG_SIZE * 6];
  ulong v8 = shared.m[lid + HS_BC4_WG_SIZE * 7];
  ulong v9 = shared.m[lid + HS_BC4_WG_SIZE * 8];
  ulong v10 = shared.m[lid + HS_BC4_WG_SIZE * 9];
  ulong v11 = shared.m[lid + HS_BC4_WG_SIZE * 10];
  ulong v12 = shared.m[lid + HS_BC4_WG_SIZE * 11];
  ulong v13 = shared.m[lid + HS_BC4_WG_SIZE * 12];
  ulong v14 = shared.m[lid + HS_BC4_WG_SIZE * 13];
  ulong v15 = shared.m[lid + HS_BC4_WG_SIZE * 14];
  ulong v16 = shared.m[lid + HS_BC4_WG_SIZE * 15];

  /* Step 5: cross-lane exchanges at XOR distance 4, then 2, then 1. */
  HS_BC4_LANE_STAGE(4, v1, v2, v3, v4, v5, v6, v7, v8,
                    v9, v10, v11, v12, v13, v14, v15, v16);
  HS_BC4_LANE_STAGE(2, v1, v2, v3, v4, v5, v6, v7, v8,
                    v9, v10, v11, v12, v13, v14, v15, v16);
  HS_BC4_LANE_STAGE(1, v1, v2, v3, v4, v5, v6, v7, v8,
                    v9, v10, v11, v12, v13, v14, v15, v16);

  /* Step 6: same 16-input network, now on this lane's own registers. */
  HS_BC4_NET_16(v1, v2, v3, v4, v5, v6, v7, v8,
                v9, v10, v11, v12, v13, v14, v15, v16);

  /* Step 7: write back, one sub-group width between consecutive values. */
  vout[store_base + HS_BC4_SG_SIZE * 0] = v1;
  vout[store_base + HS_BC4_SG_SIZE * 1] = v2;
  vout[store_base + HS_BC4_SG_SIZE * 2] = v3;
  vout[store_base + HS_BC4_SG_SIZE * 3] = v4;
  vout[store_base + HS_BC4_SG_SIZE * 4] = v5;
  vout[store_base + HS_BC4_SG_SIZE * 5] = v6;
  vout[store_base + HS_BC4_SG_SIZE * 6] = v7;
  vout[store_base + HS_BC4_SG_SIZE * 7] = v8;
  vout[store_base + HS_BC4_SG_SIZE * 8] = v9;
  vout[store_base + HS_BC4_SG_SIZE * 9] = v10;
  vout[store_base + HS_BC4_SG_SIZE * 10] = v11;
  vout[store_base + HS_BC4_SG_SIZE * 11] = v12;
  vout[store_base + HS_BC4_SG_SIZE * 12] = v13;
  vout[store_base + HS_BC4_SG_SIZE * 13] = v14;
  vout[store_base + HS_BC4_SG_SIZE * 14] = v15;
  vout[store_base + HS_BC4_SG_SIZE * 15] = v16;
}

#undef HS_BC4_LANE_STAGE
#undef HS_BC4_LANE_XCHG
#undef HS_BC4_NET_16
#undef HS_BC4_NET_8
#undef HS_BC4_NET_4
#undef HS_BC4_XCHG
#undef HS_BC4_KEYS
#undef HS_BC4_WG_SIZE
#undef HS_BC4_SG_SIZE
13892
/*
 * Helper macros private to hs_kernel_fm_1_0.  They are #undef'd right after
 * the kernel so they cannot leak into the rest of the file.
 */

/*
 * Compare-exchange two registers so that afterwards lo <= hi.
 * The swap is also taken when both values are equal.
 */
#define HS_FM10_XCHG(lo, hi)                                                   \
  do {                                                                         \
    if ((lo) >= (hi)) {                                                        \
      ulong const hs_swap = (lo);                                              \
      (lo) = (hi);                                                             \
      (hi) = hs_swap;                                                          \
    }                                                                          \
  } while (0)

/* 4-input network: (p0,p2) (p1,p3) (p0,p1) (p2,p3). */
#define HS_FM10_NET_4(p0, p1, p2, p3)                                          \
  do {                                                                         \
    HS_FM10_XCHG(p0, p2);                                                      \
    HS_FM10_XCHG(p1, p3);                                                      \
    HS_FM10_XCHG(p0, p1);                                                      \
    HS_FM10_XCHG(p2, p3);                                                      \
  } while (0)

/*
 * 8-input network: the 4-input network on the operands at positions
 * 0,2,4,6, then on those at 1,3,5,7, then adjacent pairs.
 */
#define HS_FM10_NET_8(q0, q1, q2, q3, q4, q5, q6, q7)                          \
  do {                                                                         \
    HS_FM10_NET_4(q0, q2, q4, q6);                                             \
    HS_FM10_NET_4(q1, q3, q5, q7);                                             \
    HS_FM10_XCHG(q0, q1);                                                      \
    HS_FM10_XCHG(q2, q3);                                                      \
    HS_FM10_XCHG(q4, q5);                                                      \
    HS_FM10_XCHG(q6, q7);                                                      \
  } while (0)

/*
 * 16-input network: the 8-input network on the odd-numbered operands, then
 * on the even-numbered ones, then adjacent pairs (32 compare-exchanges).
 */
#define HS_FM10_NET_16(x1, x2, x3, x4, x5, x6, x7, x8,                         \
                       x9, x10, x11, x12, x13, x14, x15, x16)                  \
  do {                                                                         \
    HS_FM10_NET_8(x1, x3, x5, x7, x9, x11, x13, x15);                          \
    HS_FM10_NET_8(x2, x4, x6, x8, x10, x12, x14, x16);                         \
    HS_FM10_XCHG(x1, x2);                                                      \
    HS_FM10_XCHG(x3, x4);                                                      \
    HS_FM10_XCHG(x5, x6);                                                      \
    HS_FM10_XCHG(x7, x8);                                                      \
    HS_FM10_XCHG(x9, x10);                                                     \
    HS_FM10_XCHG(x11, x12);                                                    \
    HS_FM10_XCHG(x13, x14);                                                    \
    HS_FM10_XCHG(x15, x16);                                                    \
  } while (0)

/*
 * hs_kernel_fm_1_0 -- in-place pass over `vout`, 17 values per work-item.
 * (NOTE(review): "fm" presumably stands for the "flip merge" step of the
 * generated merge sort -- confirm against the generator.)
 *
 * Global dimension 1 selects a span of 2 * 16 * get_global_size(0) elements;
 * global dimension 0 selects the column inside that span.  Each work-item
 *   - loads 16 values from its column, get_global_size(0) elements apart,
 *   - loads one more value from the mirrored column of the following row,
 *   - compare-exchanges the 16th value with that extra value,
 *   - runs the 16-input compare-exchange network on the first 16 values,
 *   - stores all 17 values back to where they were loaded from.
 *
 * vout: global buffer of 64-bit keys, read and written in place.
 */
__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void
hs_kernel_fm_1_0(__global ulong* const restrict vout)
{
  uint const span = get_global_id(1);
  uint const stride = get_global_size(0);
  uint const span_len = stride * 16 * 2;
  uint const base = span * span_len;
  uint const col = get_global_id(0);

  /* First element of this work-item's column. */
  uint const left = base + col;
  /* Column (stride - 1 - col) of the row that follows the 16 left rows. */
  uint const right = base + stride * (16 + 1) - col - 1;

  ulong k1 = vout[left + stride * 0];
  ulong k2 = vout[left + stride * 1];
  ulong k3 = vout[left + stride * 2];
  ulong k4 = vout[left + stride * 3];
  ulong k5 = vout[left + stride * 4];
  ulong k6 = vout[left + stride * 5];
  ulong k7 = vout[left + stride * 6];
  ulong k8 = vout[left + stride * 7];
  ulong k9 = vout[left + stride * 8];
  ulong k10 = vout[left + stride * 9];
  ulong k11 = vout[left + stride * 10];
  ulong k12 = vout[left + stride * 11];
  ulong k13 = vout[left + stride * 12];
  ulong k14 = vout[left + stride * 13];
  ulong k15 = vout[left + stride * 14];
  ulong k16 = vout[left + stride * 15];
  ulong k17 = vout[right];

  /* Exchange across the left/right boundary first ... */
  HS_FM10_XCHG(k16, k17);

  /* ... then run the fixed network over the 16 left values. */
  HS_FM10_NET_16(k1, k2, k3, k4, k5, k6, k7, k8,
                 k9, k10, k11, k12, k13, k14, k15, k16);

  vout[left + stride * 0] = k1;
  vout[left + stride * 1] = k2;
  vout[left + stride * 2] = k3;
  vout[left + stride * 3] = k4;
  vout[left + stride * 4] = k5;
  vout[left + stride * 5] = k6;
  vout[left + stride * 6] = k7;
  vout[left + stride * 7] = k8;
  vout[left + stride * 8] = k9;
  vout[left + stride * 9] = k10;
  vout[left + stride * 10] = k11;
  vout[left + stride * 11] = k12;
  vout[left + stride * 12] = k13;
  vout[left + stride * 13] = k14;
  vout[left + stride * 14] = k15;
  vout[left + stride * 15] = k16;
  vout[right] = k17;
}

#undef HS_FM10_NET_16
#undef HS_FM10_NET_8
#undef HS_FM10_NET_4
#undef HS_FM10_XCHG
14103
/*
 * Helper macros private to hs_kernel_fm_1_1.  They are #undef'd right after
 * the kernel so they cannot leak into the rest of the file.
 */

/*
 * Compare-exchange two registers so that afterwards lo <= hi.
 * The swap is also taken when both values are equal.
 */
#define HS_FM11_XCHG(lo, hi)                                                   \
  do {                                                                         \
    if ((lo) >= (hi)) {                                                        \
      ulong const hs_swap = (lo);                                              \
      (lo) = (hi);                                                             \
      (hi) = hs_swap;                                                          \
    }                                                                          \
  } while (0)

/* 4-input network: (p0,p2) (p1,p3) (p0,p1) (p2,p3). */
#define HS_FM11_NET_4(p0, p1, p2, p3)                                          \
  do {                                                                         \
    HS_FM11_XCHG(p0, p2);                                                      \
    HS_FM11_XCHG(p1, p3);                                                      \
    HS_FM11_XCHG(p0, p1);                                                      \
    HS_FM11_XCHG(p2, p3);                                                      \
  } while (0)

/*
 * 8-input network: the 4-input network on the operands at positions
 * 0,2,4,6, then on those at 1,3,5,7, then adjacent pairs.
 */
#define HS_FM11_NET_8(q0, q1, q2, q3, q4, q5, q6, q7)                          \
  do {                                                                         \
    HS_FM11_NET_4(q0, q2, q4, q6);                                             \
    HS_FM11_NET_4(q1, q3, q5, q7);                                             \
    HS_FM11_XCHG(q0, q1);                                                      \
    HS_FM11_XCHG(q2, q3);                                                      \
    HS_FM11_XCHG(q4, q5);                                                      \
    HS_FM11_XCHG(q6, q7);                                                      \
  } while (0)

/*
 * 16-input network: the 8-input network on the odd-numbered operands, then
 * on the even-numbered ones, then adjacent pairs (32 compare-exchanges).
 */
#define HS_FM11_NET_16(x1, x2, x3, x4, x5, x6, x7, x8,                         \
                       x9, x10, x11, x12, x13, x14, x15, x16)                  \
  do {                                                                         \
    HS_FM11_NET_8(x1, x3, x5, x7, x9, x11, x13, x15);                          \
    HS_FM11_NET_8(x2, x4, x6, x8, x10, x12, x14, x16);                         \
    HS_FM11_XCHG(x1, x2);                                                      \
    HS_FM11_XCHG(x3, x4);                                                      \
    HS_FM11_XCHG(x5, x6);                                                      \
    HS_FM11_XCHG(x7, x8);                                                      \
    HS_FM11_XCHG(x9, x10);                                                     \
    HS_FM11_XCHG(x11, x12);                                                    \
    HS_FM11_XCHG(x13, x14);                                                    \
    HS_FM11_XCHG(x15, x16);                                                    \
  } while (0)

/*
 * hs_kernel_fm_1_1 -- in-place pass over `vout`, 18 values per work-item.
 * (NOTE(review): "fm" presumably stands for the "flip merge" step of the
 * generated merge sort -- confirm against the generator.)
 *
 * Global dimension 1 selects a span of 2 * 16 * get_global_size(0) elements;
 * global dimension 0 selects the column inside that span.  Each work-item
 *   - loads 16 "left" values from its column, get_global_size(0) apart,
 *   - loads 2 "right" values from the mirrored column of the next two rows,
 *   - compare-exchanges left 16 with right 1 and left 15 with right 2,
 *   - runs the 16-input compare-exchange network on the left values,
 *   - compare-exchanges the two right values with each other,
 *   - stores all 18 values back to where they were loaded from.
 *
 * vout: global buffer of 64-bit keys, read and written in place.
 */
__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void
hs_kernel_fm_1_1(__global ulong* const restrict vout)
{
  uint const span = get_global_id(1);
  uint const stride = get_global_size(0);
  uint const span_len = stride * 16 * 2;
  uint const base = span * span_len;
  uint const col = get_global_id(0);

  /* First element of this work-item's column. */
  uint const left = base + col;
  /* Column (stride - 1 - col) of the row that follows the 16 left rows. */
  uint const right = base + stride * (16 + 1) - col - 1;

  ulong k1 = vout[left + stride * 0];
  ulong k2 = vout[left + stride * 1];
  ulong k3 = vout[left + stride * 2];
  ulong k4 = vout[left + stride * 3];
  ulong k5 = vout[left + stride * 4];
  ulong k6 = vout[left + stride * 5];
  ulong k7 = vout[left + stride * 6];
  ulong k8 = vout[left + stride * 7];
  ulong k9 = vout[left + stride * 8];
  ulong k10 = vout[left + stride * 9];
  ulong k11 = vout[left + stride * 10];
  ulong k12 = vout[left + stride * 11];
  ulong k13 = vout[left + stride * 12];
  ulong k14 = vout[left + stride * 13];
  ulong k15 = vout[left + stride * 14];
  ulong k16 = vout[left + stride * 15];
  ulong k17 = vout[right + stride * 0];
  ulong k18 = vout[right + stride * 1];

  /* Exchanges across the left/right boundary, innermost pair first. */
  HS_FM11_XCHG(k16, k17);
  HS_FM11_XCHG(k15, k18);

  /* Fixed network over the 16 left values. */
  HS_FM11_NET_16(k1, k2, k3, k4, k5, k6, k7, k8,
                 k9, k10, k11, k12, k13, k14, k15, k16);

  /* The two right values only need a single compare-exchange. */
  HS_FM11_XCHG(k17, k18);

  vout[left + stride * 0] = k1;
  vout[left + stride * 1] = k2;
  vout[left + stride * 2] = k3;
  vout[left + stride * 3] = k4;
  vout[left + stride * 4] = k5;
  vout[left + stride * 5] = k6;
  vout[left + stride * 6] = k7;
  vout[left + stride * 7] = k8;
  vout[left + stride * 8] = k9;
  vout[left + stride * 9] = k10;
  vout[left + stride * 10] = k11;
  vout[left + stride * 11] = k12;
  vout[left + stride * 12] = k13;
  vout[left + stride * 13] = k14;
  vout[left + stride * 14] = k15;
  vout[left + stride * 15] = k16;
  vout[right + stride * 0] = k17;
  vout[right + stride * 1] = k18;
}

#undef HS_FM11_NET_16
#undef HS_FM11_NET_8
#undef HS_FM11_NET_4
#undef HS_FM11_XCHG
14326
14327__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void
14328hs_kernel_fm_1_2(__global ulong* const restrict vout)
14329{
14330  uint const span_idx = get_global_id(1);
14331  uint const span_stride = get_global_size(0);
14332  uint const span_size = span_stride * 16 * 2;
14333  uint const span_base = span_idx * span_size;
14334  uint const span_off = get_global_id(0);
14335  uint const span_l = span_base + span_off;
14336  uint const span_r = span_base + span_stride * (16 + 1) - span_off - 1;
14337  ulong r1 = vout[span_l + span_stride * 0];
14338  ulong r2 = vout[span_l + span_stride * 1];
14339  ulong r3 = vout[span_l + span_stride * 2];
14340  ulong r4 = vout[span_l + span_stride * 3];
14341  ulong r5 = vout[span_l + span_stride * 4];
14342  ulong r6 = vout[span_l + span_stride * 5];
14343  ulong r7 = vout[span_l + span_stride * 6];
14344  ulong r8 = vout[span_l + span_stride * 7];
14345  ulong r9 = vout[span_l + span_stride * 8];
14346  ulong r10 = vout[span_l + span_stride * 9];
14347  ulong r11 = vout[span_l + span_stride * 10];
14348  ulong r12 = vout[span_l + span_stride * 11];
14349  ulong r13 = vout[span_l + span_stride * 12];
14350  ulong r14 = vout[span_l + span_stride * 13];
14351  ulong r15 = vout[span_l + span_stride * 14];
14352  ulong r16 = vout[span_l + span_stride * 15];
14353  ulong r17 = vout[span_r + span_stride * 0];
14354  ulong r18 = vout[span_r + span_stride * 1];
14355  ulong r19 = vout[span_r + span_stride * 2];
14356  ulong r20 = vout[span_r + span_stride * 3];
14357  if (r16 >= r17) {
14358    ulong const t = r16;
14359    r16 = r17;
14360    r17 = t;
14361  };
14362  if (r15 >= r18) {
14363    ulong const t = r15;
14364    r15 = r18;
14365    r18 = t;
14366  };
14367  if (r14 >= r19) {
14368    ulong const t = r14;
14369    r14 = r19;
14370    r19 = t;
14371  };
14372  if (r13 >= r20) {
14373    ulong const t = r13;
14374    r13 = r20;
14375    r20 = t;
14376  };
14377  if (r1 >= r9) {
14378    ulong const t = r1;
14379    r1 = r9;
14380    r9 = t;
14381  };
14382  if (r5 >= r13) {
14383    ulong const t = r5;
14384    r5 = r13;
14385    r13 = t;
14386  };
14387  if (r1 >= r5) {
14388    ulong const t = r1;
14389    r1 = r5;
14390    r5 = t;
14391  };
14392  if (r9 >= r13) {
14393    ulong const t = r9;
14394    r9 = r13;
14395    r13 = t;
14396  };
14397  if (r3 >= r11) {
14398    ulong const t = r3;
14399    r3 = r11;
14400    r11 = t;
14401  };
14402  if (r7 >= r15) {
14403    ulong const t = r7;
14404    r7 = r15;
14405    r15 = t;
14406  };
14407  if (r3 >= r7) {
14408    ulong const t = r3;
14409    r3 = r7;
14410    r7 = t;
14411  };
14412  if (r11 >= r15) {
14413    ulong const t = r11;
14414    r11 = r15;
14415    r15 = t;
14416  };
14417  if (r1 >= r3) {
14418    ulong const t = r1;
14419    r1 = r3;
14420    r3 = t;
14421  };
14422  if (r5 >= r7) {
14423    ulong const t = r5;
14424    r5 = r7;
14425    r7 = t;
14426  };
14427  if (r9 >= r11) {
14428    ulong const t = r9;
14429    r9 = r11;
14430    r11 = t;
14431  };
14432  if (r13 >= r15) {
14433    ulong const t = r13;
14434    r13 = r15;
14435    r15 = t;
14436  };
14437  if (r2 >= r10) {
14438    ulong const t = r2;
14439    r2 = r10;
14440    r10 = t;
14441  };
14442  if (r6 >= r14) {
14443    ulong const t = r6;
14444    r6 = r14;
14445    r14 = t;
14446  };
14447  if (r2 >= r6) {
14448    ulong const t = r2;
14449    r2 = r6;
14450    r6 = t;
14451  };
14452  if (r10 >= r14) {
14453    ulong const t = r10;
14454    r10 = r14;
14455    r14 = t;
14456  };
14457  if (r4 >= r12) {
14458    ulong const t = r4;
14459    r4 = r12;
14460    r12 = t;
14461  };
14462  if (r8 >= r16) {
14463    ulong const t = r8;
14464    r8 = r16;
14465    r16 = t;
14466  };
14467  if (r4 >= r8) {
14468    ulong const t = r4;
14469    r4 = r8;
14470    r8 = t;
14471  };
14472  if (r12 >= r16) {
14473    ulong const t = r12;
14474    r12 = r16;
14475    r16 = t;
14476  };
14477  if (r2 >= r4) {
14478    ulong const t = r2;
14479    r2 = r4;
14480    r4 = t;
14481  };
14482  if (r6 >= r8) {
14483    ulong const t = r6;
14484    r6 = r8;
14485    r8 = t;
14486  };
14487  if (r10 >= r12) {
14488    ulong const t = r10;
14489    r10 = r12;
14490    r12 = t;
14491  };
14492  if (r14 >= r16) {
14493    ulong const t = r14;
14494    r14 = r16;
14495    r16 = t;
14496  };
14497  if (r1 >= r2) {
14498    ulong const t = r1;
14499    r1 = r2;
14500    r2 = t;
14501  };
14502  if (r3 >= r4) {
14503    ulong const t = r3;
14504    r3 = r4;
14505    r4 = t;
14506  };
14507  if (r5 >= r6) {
14508    ulong const t = r5;
14509    r5 = r6;
14510    r6 = t;
14511  };
14512  if (r7 >= r8) {
14513    ulong const t = r7;
14514    r7 = r8;
14515    r8 = t;
14516  };
14517  if (r9 >= r10) {
14518    ulong const t = r9;
14519    r9 = r10;
14520    r10 = t;
14521  };
14522  if (r11 >= r12) {
14523    ulong const t = r11;
14524    r11 = r12;
14525    r12 = t;
14526  };
14527  if (r13 >= r14) {
14528    ulong const t = r13;
14529    r13 = r14;
14530    r14 = t;
14531  };
14532  if (r15 >= r16) {
14533    ulong const t = r15;
14534    r15 = r16;
14535    r16 = t;
14536  };
14537  if (r17 >= r19) {
14538    ulong const t = r17;
14539    r17 = r19;
14540    r19 = t;
14541  };
14542  if (r18 >= r20) {
14543    ulong const t = r18;
14544    r18 = r20;
14545    r20 = t;
14546  };
14547  if (r17 >= r18) {
14548    ulong const t = r17;
14549    r17 = r18;
14550    r18 = t;
14551  };
14552  if (r19 >= r20) {
14553    ulong const t = r19;
14554    r19 = r20;
14555    r20 = t;
14556  };
14557  vout[span_l + span_stride * 0] = r1;
14558  vout[span_l + span_stride * 1] = r2;
14559  vout[span_l + span_stride * 2] = r3;
14560  vout[span_l + span_stride * 3] = r4;
14561  vout[span_l + span_stride * 4] = r5;
14562  vout[span_l + span_stride * 5] = r6;
14563  vout[span_l + span_stride * 6] = r7;
14564  vout[span_l + span_stride * 7] = r8;
14565  vout[span_l + span_stride * 8] = r9;
14566  vout[span_l + span_stride * 9] = r10;
14567  vout[span_l + span_stride * 10] = r11;
14568  vout[span_l + span_stride * 11] = r12;
14569  vout[span_l + span_stride * 12] = r13;
14570  vout[span_l + span_stride * 13] = r14;
14571  vout[span_l + span_stride * 14] = r15;
14572  vout[span_l + span_stride * 15] = r16;
14573  vout[span_r + span_stride * 0] = r17;
14574  vout[span_r + span_stride * 1] = r18;
14575  vout[span_r + span_stride * 2] = r19;
14576  vout[span_r + span_stride * 3] = r20;
14577}
14578
14579__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void
14580hs_kernel_fm_1_3(__global ulong* const restrict vout)
14581{
14582  uint const span_idx = get_global_id(1);
14583  uint const span_stride = get_global_size(0);
14584  uint const span_size = span_stride * 16 * 2;
14585  uint const span_base = span_idx * span_size;
14586  uint const span_off = get_global_id(0);
14587  uint const span_l = span_base + span_off;
14588  uint const span_r = span_base + span_stride * (16 + 1) - span_off - 1;
14589  ulong r1 = vout[span_l + span_stride * 0];
14590  ulong r2 = vout[span_l + span_stride * 1];
14591  ulong r3 = vout[span_l + span_stride * 2];
14592  ulong r4 = vout[span_l + span_stride * 3];
14593  ulong r5 = vout[span_l + span_stride * 4];
14594  ulong r6 = vout[span_l + span_stride * 5];
14595  ulong r7 = vout[span_l + span_stride * 6];
14596  ulong r8 = vout[span_l + span_stride * 7];
14597  ulong r9 = vout[span_l + span_stride * 8];
14598  ulong r10 = vout[span_l + span_stride * 9];
14599  ulong r11 = vout[span_l + span_stride * 10];
14600  ulong r12 = vout[span_l + span_stride * 11];
14601  ulong r13 = vout[span_l + span_stride * 12];
14602  ulong r14 = vout[span_l + span_stride * 13];
14603  ulong r15 = vout[span_l + span_stride * 14];
14604  ulong r16 = vout[span_l + span_stride * 15];
14605  ulong r17 = vout[span_r + span_stride * 0];
14606  ulong r18 = vout[span_r + span_stride * 1];
14607  ulong r19 = vout[span_r + span_stride * 2];
14608  ulong r20 = vout[span_r + span_stride * 3];
14609  ulong r21 = vout[span_r + span_stride * 4];
14610  ulong r22 = vout[span_r + span_stride * 5];
14611  ulong r23 = vout[span_r + span_stride * 6];
14612  ulong r24 = vout[span_r + span_stride * 7];
14613  if (r16 >= r17) {
14614    ulong const t = r16;
14615    r16 = r17;
14616    r17 = t;
14617  };
14618  if (r15 >= r18) {
14619    ulong const t = r15;
14620    r15 = r18;
14621    r18 = t;
14622  };
14623  if (r14 >= r19) {
14624    ulong const t = r14;
14625    r14 = r19;
14626    r19 = t;
14627  };
14628  if (r13 >= r20) {
14629    ulong const t = r13;
14630    r13 = r20;
14631    r20 = t;
14632  };
14633  if (r12 >= r21) {
14634    ulong const t = r12;
14635    r12 = r21;
14636    r21 = t;
14637  };
14638  if (r11 >= r22) {
14639    ulong const t = r11;
14640    r11 = r22;
14641    r22 = t;
14642  };
14643  if (r10 >= r23) {
14644    ulong const t = r10;
14645    r10 = r23;
14646    r23 = t;
14647  };
14648  if (r9 >= r24) {
14649    ulong const t = r9;
14650    r9 = r24;
14651    r24 = t;
14652  };
14653  if (r1 >= r9) {
14654    ulong const t = r1;
14655    r1 = r9;
14656    r9 = t;
14657  };
14658  if (r5 >= r13) {
14659    ulong const t = r5;
14660    r5 = r13;
14661    r13 = t;
14662  };
14663  if (r1 >= r5) {
14664    ulong const t = r1;
14665    r1 = r5;
14666    r5 = t;
14667  };
14668  if (r9 >= r13) {
14669    ulong const t = r9;
14670    r9 = r13;
14671    r13 = t;
14672  };
14673  if (r3 >= r11) {
14674    ulong const t = r3;
14675    r3 = r11;
14676    r11 = t;
14677  };
14678  if (r7 >= r15) {
14679    ulong const t = r7;
14680    r7 = r15;
14681    r15 = t;
14682  };
14683  if (r3 >= r7) {
14684    ulong const t = r3;
14685    r3 = r7;
14686    r7 = t;
14687  };
14688  if (r11 >= r15) {
14689    ulong const t = r11;
14690    r11 = r15;
14691    r15 = t;
14692  };
14693  if (r1 >= r3) {
14694    ulong const t = r1;
14695    r1 = r3;
14696    r3 = t;
14697  };
14698  if (r5 >= r7) {
14699    ulong const t = r5;
14700    r5 = r7;
14701    r7 = t;
14702  };
14703  if (r9 >= r11) {
14704    ulong const t = r9;
14705    r9 = r11;
14706    r11 = t;
14707  };
14708  if (r13 >= r15) {
14709    ulong const t = r13;
14710    r13 = r15;
14711    r15 = t;
14712  };
14713  if (r2 >= r10) {
14714    ulong const t = r2;
14715    r2 = r10;
14716    r10 = t;
14717  };
14718  if (r6 >= r14) {
14719    ulong const t = r6;
14720    r6 = r14;
14721    r14 = t;
14722  };
14723  if (r2 >= r6) {
14724    ulong const t = r2;
14725    r2 = r6;
14726    r6 = t;
14727  };
14728  if (r10 >= r14) {
14729    ulong const t = r10;
14730    r10 = r14;
14731    r14 = t;
14732  };
14733  if (r4 >= r12) {
14734    ulong const t = r4;
14735    r4 = r12;
14736    r12 = t;
14737  };
14738  if (r8 >= r16) {
14739    ulong const t = r8;
14740    r8 = r16;
14741    r16 = t;
14742  };
14743  if (r4 >= r8) {
14744    ulong const t = r4;
14745    r4 = r8;
14746    r8 = t;
14747  };
14748  if (r12 >= r16) {
14749    ulong const t = r12;
14750    r12 = r16;
14751    r16 = t;
14752  };
14753  if (r2 >= r4) {
14754    ulong const t = r2;
14755    r2 = r4;
14756    r4 = t;
14757  };
14758  if (r6 >= r8) {
14759    ulong const t = r6;
14760    r6 = r8;
14761    r8 = t;
14762  };
14763  if (r10 >= r12) {
14764    ulong const t = r10;
14765    r10 = r12;
14766    r12 = t;
14767  };
14768  if (r14 >= r16) {
14769    ulong const t = r14;
14770    r14 = r16;
14771    r16 = t;
14772  };
14773  if (r1 >= r2) {
14774    ulong const t = r1;
14775    r1 = r2;
14776    r2 = t;
14777  };
14778  if (r3 >= r4) {
14779    ulong const t = r3;
14780    r3 = r4;
14781    r4 = t;
14782  };
14783  if (r5 >= r6) {
14784    ulong const t = r5;
14785    r5 = r6;
14786    r6 = t;
14787  };
14788  if (r7 >= r8) {
14789    ulong const t = r7;
14790    r7 = r8;
14791    r8 = t;
14792  };
14793  if (r9 >= r10) {
14794    ulong const t = r9;
14795    r9 = r10;
14796    r10 = t;
14797  };
14798  if (r11 >= r12) {
14799    ulong const t = r11;
14800    r11 = r12;
14801    r12 = t;
14802  };
14803  if (r13 >= r14) {
14804    ulong const t = r13;
14805    r13 = r14;
14806    r14 = t;
14807  };
14808  if (r15 >= r16) {
14809    ulong const t = r15;
14810    r15 = r16;
14811    r16 = t;
14812  };
14813  if (r17 >= r21) {
14814    ulong const t = r17;
14815    r17 = r21;
14816    r21 = t;
14817  };
14818  if (r19 >= r23) {
14819    ulong const t = r19;
14820    r19 = r23;
14821    r23 = t;
14822  };
14823  if (r17 >= r19) {
14824    ulong const t = r17;
14825    r17 = r19;
14826    r19 = t;
14827  };
14828  if (r21 >= r23) {
14829    ulong const t = r21;
14830    r21 = r23;
14831    r23 = t;
14832  };
14833  if (r18 >= r22) {
14834    ulong const t = r18;
14835    r18 = r22;
14836    r22 = t;
14837  };
14838  if (r20 >= r24) {
14839    ulong const t = r20;
14840    r20 = r24;
14841    r24 = t;
14842  };
14843  if (r18 >= r20) {
14844    ulong const t = r18;
14845    r18 = r20;
14846    r20 = t;
14847  };
14848  if (r22 >= r24) {
14849    ulong const t = r22;
14850    r22 = r24;
14851    r24 = t;
14852  };
14853  if (r17 >= r18) {
14854    ulong const t = r17;
14855    r17 = r18;
14856    r18 = t;
14857  };
14858  if (r19 >= r20) {
14859    ulong const t = r19;
14860    r19 = r20;
14861    r20 = t;
14862  };
14863  if (r21 >= r22) {
14864    ulong const t = r21;
14865    r21 = r22;
14866    r22 = t;
14867  };
14868  if (r23 >= r24) {
14869    ulong const t = r23;
14870    r23 = r24;
14871    r24 = t;
14872  };
14873  vout[span_l + span_stride * 0] = r1;
14874  vout[span_l + span_stride * 1] = r2;
14875  vout[span_l + span_stride * 2] = r3;
14876  vout[span_l + span_stride * 3] = r4;
14877  vout[span_l + span_stride * 4] = r5;
14878  vout[span_l + span_stride * 5] = r6;
14879  vout[span_l + span_stride * 6] = r7;
14880  vout[span_l + span_stride * 7] = r8;
14881  vout[span_l + span_stride * 8] = r9;
14882  vout[span_l + span_stride * 9] = r10;
14883  vout[span_l + span_stride * 10] = r11;
14884  vout[span_l + span_stride * 11] = r12;
14885  vout[span_l + span_stride * 12] = r13;
14886  vout[span_l + span_stride * 13] = r14;
14887  vout[span_l + span_stride * 14] = r15;
14888  vout[span_l + span_stride * 15] = r16;
14889  vout[span_r + span_stride * 0] = r17;
14890  vout[span_r + span_stride * 1] = r18;
14891  vout[span_r + span_stride * 2] = r19;
14892  vout[span_r + span_stride * 3] = r20;
14893  vout[span_r + span_stride * 4] = r21;
14894  vout[span_r + span_stride * 5] = r22;
14895  vout[span_r + span_stride * 6] = r23;
14896  vout[span_r + span_stride * 7] = r24;
14897}
14898
14899__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void
14900hs_kernel_fm_1_4(__global ulong* const restrict vout)
14901{
14902  uint const span_idx = get_global_id(1);
14903  uint const span_stride = get_global_size(0);
14904  uint const span_size = span_stride * 16 * 2;
14905  uint const span_base = span_idx * span_size;
14906  uint const span_off = get_global_id(0);
14907  uint const span_l = span_base + span_off;
14908  uint const span_r = span_base + span_stride * (16 + 1) - span_off - 1;
14909  ulong r1 = vout[span_l + span_stride * 0];
14910  ulong r2 = vout[span_l + span_stride * 1];
14911  ulong r3 = vout[span_l + span_stride * 2];
14912  ulong r4 = vout[span_l + span_stride * 3];
14913  ulong r5 = vout[span_l + span_stride * 4];
14914  ulong r6 = vout[span_l + span_stride * 5];
14915  ulong r7 = vout[span_l + span_stride * 6];
14916  ulong r8 = vout[span_l + span_stride * 7];
14917  ulong r9 = vout[span_l + span_stride * 8];
14918  ulong r10 = vout[span_l + span_stride * 9];
14919  ulong r11 = vout[span_l + span_stride * 10];
14920  ulong r12 = vout[span_l + span_stride * 11];
14921  ulong r13 = vout[span_l + span_stride * 12];
14922  ulong r14 = vout[span_l + span_stride * 13];
14923  ulong r15 = vout[span_l + span_stride * 14];
14924  ulong r16 = vout[span_l + span_stride * 15];
14925  ulong r17 = vout[span_r + span_stride * 0];
14926  ulong r18 = vout[span_r + span_stride * 1];
14927  ulong r19 = vout[span_r + span_stride * 2];
14928  ulong r20 = vout[span_r + span_stride * 3];
14929  ulong r21 = vout[span_r + span_stride * 4];
14930  ulong r22 = vout[span_r + span_stride * 5];
14931  ulong r23 = vout[span_r + span_stride * 6];
14932  ulong r24 = vout[span_r + span_stride * 7];
14933  ulong r25 = vout[span_r + span_stride * 8];
14934  ulong r26 = vout[span_r + span_stride * 9];
14935  ulong r27 = vout[span_r + span_stride * 10];
14936  ulong r28 = vout[span_r + span_stride * 11];
14937  ulong r29 = vout[span_r + span_stride * 12];
14938  ulong r30 = vout[span_r + span_stride * 13];
14939  ulong r31 = vout[span_r + span_stride * 14];
14940  ulong r32 = vout[span_r + span_stride * 15];
14941  if (r16 >= r17) {
14942    ulong const t = r16;
14943    r16 = r17;
14944    r17 = t;
14945  };
14946  if (r15 >= r18) {
14947    ulong const t = r15;
14948    r15 = r18;
14949    r18 = t;
14950  };
14951  if (r14 >= r19) {
14952    ulong const t = r14;
14953    r14 = r19;
14954    r19 = t;
14955  };
14956  if (r13 >= r20) {
14957    ulong const t = r13;
14958    r13 = r20;
14959    r20 = t;
14960  };
14961  if (r12 >= r21) {
14962    ulong const t = r12;
14963    r12 = r21;
14964    r21 = t;
14965  };
14966  if (r11 >= r22) {
14967    ulong const t = r11;
14968    r11 = r22;
14969    r22 = t;
14970  };
14971  if (r10 >= r23) {
14972    ulong const t = r10;
14973    r10 = r23;
14974    r23 = t;
14975  };
14976  if (r9 >= r24) {
14977    ulong const t = r9;
14978    r9 = r24;
14979    r24 = t;
14980  };
14981  if (r8 >= r25) {
14982    ulong const t = r8;
14983    r8 = r25;
14984    r25 = t;
14985  };
14986  if (r7 >= r26) {
14987    ulong const t = r7;
14988    r7 = r26;
14989    r26 = t;
14990  };
14991  if (r6 >= r27) {
14992    ulong const t = r6;
14993    r6 = r27;
14994    r27 = t;
14995  };
14996  if (r5 >= r28) {
14997    ulong const t = r5;
14998    r5 = r28;
14999    r28 = t;
15000  };
15001  if (r4 >= r29) {
15002    ulong const t = r4;
15003    r4 = r29;
15004    r29 = t;
15005  };
15006  if (r3 >= r30) {
15007    ulong const t = r3;
15008    r3 = r30;
15009    r30 = t;
15010  };
15011  if (r2 >= r31) {
15012    ulong const t = r2;
15013    r2 = r31;
15014    r31 = t;
15015  };
15016  if (r1 >= r32) {
15017    ulong const t = r1;
15018    r1 = r32;
15019    r32 = t;
15020  };
15021  if (r1 >= r9) {
15022    ulong const t = r1;
15023    r1 = r9;
15024    r9 = t;
15025  };
15026  if (r5 >= r13) {
15027    ulong const t = r5;
15028    r5 = r13;
15029    r13 = t;
15030  };
15031  if (r1 >= r5) {
15032    ulong const t = r1;
15033    r1 = r5;
15034    r5 = t;
15035  };
15036  if (r9 >= r13) {
15037    ulong const t = r9;
15038    r9 = r13;
15039    r13 = t;
15040  };
15041  if (r3 >= r11) {
15042    ulong const t = r3;
15043    r3 = r11;
15044    r11 = t;
15045  };
15046  if (r7 >= r15) {
15047    ulong const t = r7;
15048    r7 = r15;
15049    r15 = t;
15050  };
15051  if (r3 >= r7) {
15052    ulong const t = r3;
15053    r3 = r7;
15054    r7 = t;
15055  };
15056  if (r11 >= r15) {
15057    ulong const t = r11;
15058    r11 = r15;
15059    r15 = t;
15060  };
15061  if (r1 >= r3) {
15062    ulong const t = r1;
15063    r1 = r3;
15064    r3 = t;
15065  };
15066  if (r5 >= r7) {
15067    ulong const t = r5;
15068    r5 = r7;
15069    r7 = t;
15070  };
15071  if (r9 >= r11) {
15072    ulong const t = r9;
15073    r9 = r11;
15074    r11 = t;
15075  };
15076  if (r13 >= r15) {
15077    ulong const t = r13;
15078    r13 = r15;
15079    r15 = t;
15080  };
15081  if (r2 >= r10) {
15082    ulong const t = r2;
15083    r2 = r10;
15084    r10 = t;
15085  };
15086  if (r6 >= r14) {
15087    ulong const t = r6;
15088    r6 = r14;
15089    r14 = t;
15090  };
15091  if (r2 >= r6) {
15092    ulong const t = r2;
15093    r2 = r6;
15094    r6 = t;
15095  };
15096  if (r10 >= r14) {
15097    ulong const t = r10;
15098    r10 = r14;
15099    r14 = t;
15100  };
15101  if (r4 >= r12) {
15102    ulong const t = r4;
15103    r4 = r12;
15104    r12 = t;
15105  };
15106  if (r8 >= r16) {
15107    ulong const t = r8;
15108    r8 = r16;
15109    r16 = t;
15110  };
15111  if (r4 >= r8) {
15112    ulong const t = r4;
15113    r4 = r8;
15114    r8 = t;
15115  };
15116  if (r12 >= r16) {
15117    ulong const t = r12;
15118    r12 = r16;
15119    r16 = t;
15120  };
15121  if (r2 >= r4) {
15122    ulong const t = r2;
15123    r2 = r4;
15124    r4 = t;
15125  };
15126  if (r6 >= r8) {
15127    ulong const t = r6;
15128    r6 = r8;
15129    r8 = t;
15130  };
15131  if (r10 >= r12) {
15132    ulong const t = r10;
15133    r10 = r12;
15134    r12 = t;
15135  };
15136  if (r14 >= r16) {
15137    ulong const t = r14;
15138    r14 = r16;
15139    r16 = t;
15140  };
15141  if (r1 >= r2) {
15142    ulong const t = r1;
15143    r1 = r2;
15144    r2 = t;
15145  };
15146  if (r3 >= r4) {
15147    ulong const t = r3;
15148    r3 = r4;
15149    r4 = t;
15150  };
15151  if (r5 >= r6) {
15152    ulong const t = r5;
15153    r5 = r6;
15154    r6 = t;
15155  };
15156  if (r7 >= r8) {
15157    ulong const t = r7;
15158    r7 = r8;
15159    r8 = t;
15160  };
15161  if (r9 >= r10) {
15162    ulong const t = r9;
15163    r9 = r10;
15164    r10 = t;
15165  };
15166  if (r11 >= r12) {
15167    ulong const t = r11;
15168    r11 = r12;
15169    r12 = t;
15170  };
15171  if (r13 >= r14) {
15172    ulong const t = r13;
15173    r13 = r14;
15174    r14 = t;
15175  };
15176  if (r15 >= r16) {
15177    ulong const t = r15;
15178    r15 = r16;
15179    r16 = t;
15180  };
15181  if (r17 >= r25) {
15182    ulong const t = r17;
15183    r17 = r25;
15184    r25 = t;
15185  };
15186  if (r21 >= r29) {
15187    ulong const t = r21;
15188    r21 = r29;
15189    r29 = t;
15190  };
15191  if (r17 >= r21) {
15192    ulong const t = r17;
15193    r17 = r21;
15194    r21 = t;
15195  };
15196  if (r25 >= r29) {
15197    ulong const t = r25;
15198    r25 = r29;
15199    r29 = t;
15200  };
15201  if (r19 >= r27) {
15202    ulong const t = r19;
15203    r19 = r27;
15204    r27 = t;
15205  };
15206  if (r23 >= r31) {
15207    ulong const t = r23;
15208    r23 = r31;
15209    r31 = t;
15210  };
15211  if (r19 >= r23) {
15212    ulong const t = r19;
15213    r19 = r23;
15214    r23 = t;
15215  };
15216  if (r27 >= r31) {
15217    ulong const t = r27;
15218    r27 = r31;
15219    r31 = t;
15220  };
15221  if (r17 >= r19) {
15222    ulong const t = r17;
15223    r17 = r19;
15224    r19 = t;
15225  };
15226  if (r21 >= r23) {
15227    ulong const t = r21;
15228    r21 = r23;
15229    r23 = t;
15230  };
15231  if (r25 >= r27) {
15232    ulong const t = r25;
15233    r25 = r27;
15234    r27 = t;
15235  };
15236  if (r29 >= r31) {
15237    ulong const t = r29;
15238    r29 = r31;
15239    r31 = t;
15240  };
15241  if (r18 >= r26) {
15242    ulong const t = r18;
15243    r18 = r26;
15244    r26 = t;
15245  };
15246  if (r22 >= r30) {
15247    ulong const t = r22;
15248    r22 = r30;
15249    r30 = t;
15250  };
15251  if (r18 >= r22) {
15252    ulong const t = r18;
15253    r18 = r22;
15254    r22 = t;
15255  };
15256  if (r26 >= r30) {
15257    ulong const t = r26;
15258    r26 = r30;
15259    r30 = t;
15260  };
15261  if (r20 >= r28) {
15262    ulong const t = r20;
15263    r20 = r28;
15264    r28 = t;
15265  };
15266  if (r24 >= r32) {
15267    ulong const t = r24;
15268    r24 = r32;
15269    r32 = t;
15270  };
15271  if (r20 >= r24) {
15272    ulong const t = r20;
15273    r20 = r24;
15274    r24 = t;
15275  };
15276  if (r28 >= r32) {
15277    ulong const t = r28;
15278    r28 = r32;
15279    r32 = t;
15280  };
15281  if (r18 >= r20) {
15282    ulong const t = r18;
15283    r18 = r20;
15284    r20 = t;
15285  };
15286  if (r22 >= r24) {
15287    ulong const t = r22;
15288    r22 = r24;
15289    r24 = t;
15290  };
15291  if (r26 >= r28) {
15292    ulong const t = r26;
15293    r26 = r28;
15294    r28 = t;
15295  };
15296  if (r30 >= r32) {
15297    ulong const t = r30;
15298    r30 = r32;
15299    r32 = t;
15300  };
15301  if (r17 >= r18) {
15302    ulong const t = r17;
15303    r17 = r18;
15304    r18 = t;
15305  };
15306  if (r19 >= r20) {
15307    ulong const t = r19;
15308    r19 = r20;
15309    r20 = t;
15310  };
15311  if (r21 >= r22) {
15312    ulong const t = r21;
15313    r21 = r22;
15314    r22 = t;
15315  };
15316  if (r23 >= r24) {
15317    ulong const t = r23;
15318    r23 = r24;
15319    r24 = t;
15320  };
15321  if (r25 >= r26) {
15322    ulong const t = r25;
15323    r25 = r26;
15324    r26 = t;
15325  };
15326  if (r27 >= r28) {
15327    ulong const t = r27;
15328    r27 = r28;
15329    r28 = t;
15330  };
15331  if (r29 >= r30) {
15332    ulong const t = r29;
15333    r29 = r30;
15334    r30 = t;
15335  };
15336  if (r31 >= r32) {
15337    ulong const t = r31;
15338    r31 = r32;
15339    r32 = t;
15340  };
15341  vout[span_l + span_stride * 0] = r1;
15342  vout[span_l + span_stride * 1] = r2;
15343  vout[span_l + span_stride * 2] = r3;
15344  vout[span_l + span_stride * 3] = r4;
15345  vout[span_l + span_stride * 4] = r5;
15346  vout[span_l + span_stride * 5] = r6;
15347  vout[span_l + span_stride * 6] = r7;
15348  vout[span_l + span_stride * 7] = r8;
15349  vout[span_l + span_stride * 8] = r9;
15350  vout[span_l + span_stride * 9] = r10;
15351  vout[span_l + span_stride * 10] = r11;
15352  vout[span_l + span_stride * 11] = r12;
15353  vout[span_l + span_stride * 12] = r13;
15354  vout[span_l + span_stride * 13] = r14;
15355  vout[span_l + span_stride * 14] = r15;
15356  vout[span_l + span_stride * 15] = r16;
15357  vout[span_r + span_stride * 0] = r17;
15358  vout[span_r + span_stride * 1] = r18;
15359  vout[span_r + span_stride * 2] = r19;
15360  vout[span_r + span_stride * 3] = r20;
15361  vout[span_r + span_stride * 4] = r21;
15362  vout[span_r + span_stride * 5] = r22;
15363  vout[span_r + span_stride * 6] = r23;
15364  vout[span_r + span_stride * 7] = r24;
15365  vout[span_r + span_stride * 8] = r25;
15366  vout[span_r + span_stride * 9] = r26;
15367  vout[span_r + span_stride * 10] = r27;
15368  vout[span_r + span_stride * 11] = r28;
15369  vout[span_r + span_stride * 12] = r29;
15370  vout[span_r + span_stride * 13] = r30;
15371  vout[span_r + span_stride * 14] = r31;
15372  vout[span_r + span_stride * 15] = r32;
15373}
15374
15375__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void
15376hs_kernel_hm_1(__global ulong* const restrict vout)
15377{
15378  uint const span_idx = get_global_id(1);
15379  uint const span_stride = get_global_size(0);
15380  uint const span_size = span_stride * 16 * 2;
15381  uint const span_base = span_idx * span_size;
15382  uint const span_off = get_global_id(0);
15383  uint const span_l = span_base + span_off;
15384  ulong r1 = vout[span_l + span_stride * 0];
15385  ulong r2 = vout[span_l + span_stride * 1];
15386  ulong r3 = vout[span_l + span_stride * 2];
15387  ulong r4 = vout[span_l + span_stride * 3];
15388  ulong r5 = vout[span_l + span_stride * 4];
15389  ulong r6 = vout[span_l + span_stride * 5];
15390  ulong r7 = vout[span_l + span_stride * 6];
15391  ulong r8 = vout[span_l + span_stride * 7];
15392  ulong r9 = vout[span_l + span_stride * 8];
15393  ulong r10 = vout[span_l + span_stride * 9];
15394  ulong r11 = vout[span_l + span_stride * 10];
15395  ulong r12 = vout[span_l + span_stride * 11];
15396  ulong r13 = vout[span_l + span_stride * 12];
15397  ulong r14 = vout[span_l + span_stride * 13];
15398  ulong r15 = vout[span_l + span_stride * 14];
15399  ulong r16 = vout[span_l + span_stride * 15];
15400  ulong r17 = vout[span_l + span_stride * 16];
15401  ulong r18 = vout[span_l + span_stride * 17];
15402  ulong r19 = vout[span_l + span_stride * 18];
15403  ulong r20 = vout[span_l + span_stride * 19];
15404  ulong r21 = vout[span_l + span_stride * 20];
15405  ulong r22 = vout[span_l + span_stride * 21];
15406  ulong r23 = vout[span_l + span_stride * 22];
15407  ulong r24 = vout[span_l + span_stride * 23];
15408  ulong r25 = vout[span_l + span_stride * 24];
15409  ulong r26 = vout[span_l + span_stride * 25];
15410  ulong r27 = vout[span_l + span_stride * 26];
15411  ulong r28 = vout[span_l + span_stride * 27];
15412  ulong r29 = vout[span_l + span_stride * 28];
15413  ulong r30 = vout[span_l + span_stride * 29];
15414  ulong r31 = vout[span_l + span_stride * 30];
15415  ulong r32 = vout[span_l + span_stride * 31];
15416  if (r1 >= r17) {
15417    ulong const t = r1;
15418    r1 = r17;
15419    r17 = t;
15420  };
15421  if (r9 >= r25) {
15422    ulong const t = r9;
15423    r9 = r25;
15424    r25 = t;
15425  };
15426  if (r1 >= r9) {
15427    ulong const t = r1;
15428    r1 = r9;
15429    r9 = t;
15430  };
15431  if (r17 >= r25) {
15432    ulong const t = r17;
15433    r17 = r25;
15434    r25 = t;
15435  };
15436  if (r5 >= r21) {
15437    ulong const t = r5;
15438    r5 = r21;
15439    r21 = t;
15440  };
15441  if (r13 >= r29) {
15442    ulong const t = r13;
15443    r13 = r29;
15444    r29 = t;
15445  };
15446  if (r5 >= r13) {
15447    ulong const t = r5;
15448    r5 = r13;
15449    r13 = t;
15450  };
15451  if (r21 >= r29) {
15452    ulong const t = r21;
15453    r21 = r29;
15454    r29 = t;
15455  };
15456  if (r1 >= r5) {
15457    ulong const t = r1;
15458    r1 = r5;
15459    r5 = t;
15460  };
15461  if (r9 >= r13) {
15462    ulong const t = r9;
15463    r9 = r13;
15464    r13 = t;
15465  };
15466  if (r17 >= r21) {
15467    ulong const t = r17;
15468    r17 = r21;
15469    r21 = t;
15470  };
15471  if (r25 >= r29) {
15472    ulong const t = r25;
15473    r25 = r29;
15474    r29 = t;
15475  };
15476  if (r3 >= r19) {
15477    ulong const t = r3;
15478    r3 = r19;
15479    r19 = t;
15480  };
15481  if (r11 >= r27) {
15482    ulong const t = r11;
15483    r11 = r27;
15484    r27 = t;
15485  };
15486  if (r3 >= r11) {
15487    ulong const t = r3;
15488    r3 = r11;
15489    r11 = t;
15490  };
15491  if (r19 >= r27) {
15492    ulong const t = r19;
15493    r19 = r27;
15494    r27 = t;
15495  };
15496  if (r7 >= r23) {
15497    ulong const t = r7;
15498    r7 = r23;
15499    r23 = t;
15500  };
15501  if (r15 >= r31) {
15502    ulong const t = r15;
15503    r15 = r31;
15504    r31 = t;
15505  };
15506  if (r7 >= r15) {
15507    ulong const t = r7;
15508    r7 = r15;
15509    r15 = t;
15510  };
15511  if (r23 >= r31) {
15512    ulong const t = r23;
15513    r23 = r31;
15514    r31 = t;
15515  };
15516  if (r3 >= r7) {
15517    ulong const t = r3;
15518    r3 = r7;
15519    r7 = t;
15520  };
15521  if (r11 >= r15) {
15522    ulong const t = r11;
15523    r11 = r15;
15524    r15 = t;
15525  };
15526  if (r19 >= r23) {
15527    ulong const t = r19;
15528    r19 = r23;
15529    r23 = t;
15530  };
15531  if (r27 >= r31) {
15532    ulong const t = r27;
15533    r27 = r31;
15534    r31 = t;
15535  };
15536  if (r1 >= r3) {
15537    ulong const t = r1;
15538    r1 = r3;
15539    r3 = t;
15540  };
15541  if (r5 >= r7) {
15542    ulong const t = r5;
15543    r5 = r7;
15544    r7 = t;
15545  };
15546  if (r9 >= r11) {
15547    ulong const t = r9;
15548    r9 = r11;
15549    r11 = t;
15550  };
15551  if (r13 >= r15) {
15552    ulong const t = r13;
15553    r13 = r15;
15554    r15 = t;
15555  };
15556  if (r17 >= r19) {
15557    ulong const t = r17;
15558    r17 = r19;
15559    r19 = t;
15560  };
15561  if (r21 >= r23) {
15562    ulong const t = r21;
15563    r21 = r23;
15564    r23 = t;
15565  };
15566  if (r25 >= r27) {
15567    ulong const t = r25;
15568    r25 = r27;
15569    r27 = t;
15570  };
15571  if (r29 >= r31) {
15572    ulong const t = r29;
15573    r29 = r31;
15574    r31 = t;
15575  };
15576  if (r2 >= r18) {
15577    ulong const t = r2;
15578    r2 = r18;
15579    r18 = t;
15580  };
15581  if (r10 >= r26) {
15582    ulong const t = r10;
15583    r10 = r26;
15584    r26 = t;
15585  };
15586  if (r2 >= r10) {
15587    ulong const t = r2;
15588    r2 = r10;
15589    r10 = t;
15590  };
15591  if (r18 >= r26) {
15592    ulong const t = r18;
15593    r18 = r26;
15594    r26 = t;
15595  };
15596  if (r6 >= r22) {
15597    ulong const t = r6;
15598    r6 = r22;
15599    r22 = t;
15600  };
15601  if (r14 >= r30) {
15602    ulong const t = r14;
15603    r14 = r30;
15604    r30 = t;
15605  };
15606  if (r6 >= r14) {
15607    ulong const t = r6;
15608    r6 = r14;
15609    r14 = t;
15610  };
15611  if (r22 >= r30) {
15612    ulong const t = r22;
15613    r22 = r30;
15614    r30 = t;
15615  };
15616  if (r2 >= r6) {
15617    ulong const t = r2;
15618    r2 = r6;
15619    r6 = t;
15620  };
15621  if (r10 >= r14) {
15622    ulong const t = r10;
15623    r10 = r14;
15624    r14 = t;
15625  };
15626  if (r18 >= r22) {
15627    ulong const t = r18;
15628    r18 = r22;
15629    r22 = t;
15630  };
15631  if (r26 >= r30) {
15632    ulong const t = r26;
15633    r26 = r30;
15634    r30 = t;
15635  };
15636  if (r4 >= r20) {
15637    ulong const t = r4;
15638    r4 = r20;
15639    r20 = t;
15640  };
15641  if (r12 >= r28) {
15642    ulong const t = r12;
15643    r12 = r28;
15644    r28 = t;
15645  };
15646  if (r4 >= r12) {
15647    ulong const t = r4;
15648    r4 = r12;
15649    r12 = t;
15650  };
15651  if (r20 >= r28) {
15652    ulong const t = r20;
15653    r20 = r28;
15654    r28 = t;
15655  };
15656  if (r8 >= r24) {
15657    ulong const t = r8;
15658    r8 = r24;
15659    r24 = t;
15660  };
15661  if (r16 >= r32) {
15662    ulong const t = r16;
15663    r16 = r32;
15664    r32 = t;
15665  };
15666  if (r8 >= r16) {
15667    ulong const t = r8;
15668    r8 = r16;
15669    r16 = t;
15670  };
15671  if (r24 >= r32) {
15672    ulong const t = r24;
15673    r24 = r32;
15674    r32 = t;
15675  };
15676  if (r4 >= r8) {
15677    ulong const t = r4;
15678    r4 = r8;
15679    r8 = t;
15680  };
15681  if (r12 >= r16) {
15682    ulong const t = r12;
15683    r12 = r16;
15684    r16 = t;
15685  };
15686  if (r20 >= r24) {
15687    ulong const t = r20;
15688    r20 = r24;
15689    r24 = t;
15690  };
15691  if (r28 >= r32) {
15692    ulong const t = r28;
15693    r28 = r32;
15694    r32 = t;
15695  };
15696  if (r2 >= r4) {
15697    ulong const t = r2;
15698    r2 = r4;
15699    r4 = t;
15700  };
15701  if (r6 >= r8) {
15702    ulong const t = r6;
15703    r6 = r8;
15704    r8 = t;
15705  };
15706  if (r10 >= r12) {
15707    ulong const t = r10;
15708    r10 = r12;
15709    r12 = t;
15710  };
15711  if (r14 >= r16) {
15712    ulong const t = r14;
15713    r14 = r16;
15714    r16 = t;
15715  };
15716  if (r18 >= r20) {
15717    ulong const t = r18;
15718    r18 = r20;
15719    r20 = t;
15720  };
15721  if (r22 >= r24) {
15722    ulong const t = r22;
15723    r22 = r24;
15724    r24 = t;
15725  };
15726  if (r26 >= r28) {
15727    ulong const t = r26;
15728    r26 = r28;
15729    r28 = t;
15730  };
15731  if (r30 >= r32) {
15732    ulong const t = r30;
15733    r30 = r32;
15734    r32 = t;
15735  };
15736  if (r1 >= r2) {
15737    ulong const t = r1;
15738    r1 = r2;
15739    r2 = t;
15740  };
15741  if (r3 >= r4) {
15742    ulong const t = r3;
15743    r3 = r4;
15744    r4 = t;
15745  };
15746  if (r5 >= r6) {
15747    ulong const t = r5;
15748    r5 = r6;
15749    r6 = t;
15750  };
15751  if (r7 >= r8) {
15752    ulong const t = r7;
15753    r7 = r8;
15754    r8 = t;
15755  };
15756  if (r9 >= r10) {
15757    ulong const t = r9;
15758    r9 = r10;
15759    r10 = t;
15760  };
15761  if (r11 >= r12) {
15762    ulong const t = r11;
15763    r11 = r12;
15764    r12 = t;
15765  };
15766  if (r13 >= r14) {
15767    ulong const t = r13;
15768    r13 = r14;
15769    r14 = t;
15770  };
15771  if (r15 >= r16) {
15772    ulong const t = r15;
15773    r15 = r16;
15774    r16 = t;
15775  };
15776  if (r17 >= r18) {
15777    ulong const t = r17;
15778    r17 = r18;
15779    r18 = t;
15780  };
15781  if (r19 >= r20) {
15782    ulong const t = r19;
15783    r19 = r20;
15784    r20 = t;
15785  };
15786  if (r21 >= r22) {
15787    ulong const t = r21;
15788    r21 = r22;
15789    r22 = t;
15790  };
15791  if (r23 >= r24) {
15792    ulong const t = r23;
15793    r23 = r24;
15794    r24 = t;
15795  };
15796  if (r25 >= r26) {
15797    ulong const t = r25;
15798    r25 = r26;
15799    r26 = t;
15800  };
15801  if (r27 >= r28) {
15802    ulong const t = r27;
15803    r27 = r28;
15804    r28 = t;
15805  };
15806  if (r29 >= r30) {
15807    ulong const t = r29;
15808    r29 = r30;
15809    r30 = t;
15810  };
15811  if (r31 >= r32) {
15812    ulong const t = r31;
15813    r31 = r32;
15814    r32 = t;
15815  };
15816  vout[span_l + span_stride * 0] = r1;
15817  vout[span_l + span_stride * 1] = r2;
15818  vout[span_l + span_stride * 2] = r3;
15819  vout[span_l + span_stride * 3] = r4;
15820  vout[span_l + span_stride * 4] = r5;
15821  vout[span_l + span_stride * 5] = r6;
15822  vout[span_l + span_stride * 6] = r7;
15823  vout[span_l + span_stride * 7] = r8;
15824  vout[span_l + span_stride * 8] = r9;
15825  vout[span_l + span_stride * 9] = r10;
15826  vout[span_l + span_stride * 10] = r11;
15827  vout[span_l + span_stride * 11] = r12;
15828  vout[span_l + span_stride * 12] = r13;
15829  vout[span_l + span_stride * 13] = r14;
15830  vout[span_l + span_stride * 14] = r15;
15831  vout[span_l + span_stride * 15] = r16;
15832  vout[span_l + span_stride * 16] = r17;
15833  vout[span_l + span_stride * 17] = r18;
15834  vout[span_l + span_stride * 18] = r19;
15835  vout[span_l + span_stride * 19] = r20;
15836  vout[span_l + span_stride * 20] = r21;
15837  vout[span_l + span_stride * 21] = r22;
15838  vout[span_l + span_stride * 22] = r23;
15839  vout[span_l + span_stride * 23] = r24;
15840  vout[span_l + span_stride * 24] = r25;
15841  vout[span_l + span_stride * 25] = r26;
15842  vout[span_l + span_stride * 26] = r27;
15843  vout[span_l + span_stride * 27] = r28;
15844  vout[span_l + span_stride * 28] = r29;
15845  vout[span_l + span_stride * 29] = r30;
15846  vout[span_l + span_stride * 30] = r31;
15847  vout[span_l + span_stride * 31] = r32;
15848}
15849
15850__kernel __attribute__((intel_reqd_sub_group_size((1 << 3)))) void
15851hs_kernel_transpose(__global ulong* const restrict vout)
15852{
15853  uint const gmem_idx = (get_global_id(0) & ~((1 << 3) - 1)) * 16 +
15854                        (get_local_id(0) & ((1 << 3) - 1));
15855  ulong r1 = vout[gmem_idx + (1 << 3) * 0];
15856  ulong r2 = vout[gmem_idx + (1 << 3) * 1];
15857  ulong r3 = vout[gmem_idx + (1 << 3) * 2];
15858  ulong r4 = vout[gmem_idx + (1 << 3) * 3];
15859  ulong r5 = vout[gmem_idx + (1 << 3) * 4];
15860  ulong r6 = vout[gmem_idx + (1 << 3) * 5];
15861  ulong r7 = vout[gmem_idx + (1 << 3) * 6];
15862  ulong r8 = vout[gmem_idx + (1 << 3) * 7];
15863  ulong r9 = vout[gmem_idx + (1 << 3) * 8];
15864  ulong r10 = vout[gmem_idx + (1 << 3) * 9];
15865  ulong r11 = vout[gmem_idx + (1 << 3) * 10];
15866  ulong r12 = vout[gmem_idx + (1 << 3) * 11];
15867  ulong r13 = vout[gmem_idx + (1 << 3) * 12];
15868  ulong r14 = vout[gmem_idx + (1 << 3) * 13];
15869  ulong r15 = vout[gmem_idx + (1 << 3) * 14];
15870  ulong r16 = vout[gmem_idx + (1 << 3) * 15];
15871  bool const is_lo_1 = (get_sub_group_local_id() & (1 << (1 - 1))) == 0;
15872  bool const is_lo_2 = (get_sub_group_local_id() & (1 << (2 - 1))) == 0;
15873  bool const is_lo_3 = (get_sub_group_local_id() & (1 << (3 - 1))) == 0;
15874  ulong const s2_1 =
15875    intel_sub_group_shuffle_xor(is_lo_1 ? r2 : r1, 1 << (1 - 1));
15876  ulong const s2 = is_lo_1 ? s2_1 : r2;
15877  ulong const s1 = is_lo_1 ? r1 : s2_1;
15878  ulong const s4_3 =
15879    intel_sub_group_shuffle_xor(is_lo_1 ? r4 : r3, 1 << (1 - 1));
15880  ulong const s4 = is_lo_1 ? s4_3 : r4;
15881  ulong const s3 = is_lo_1 ? r3 : s4_3;
15882  ulong const s6_5 =
15883    intel_sub_group_shuffle_xor(is_lo_1 ? r6 : r5, 1 << (1 - 1));
15884  ulong const s6 = is_lo_1 ? s6_5 : r6;
15885  ulong const s5 = is_lo_1 ? r5 : s6_5;
15886  ulong const s8_7 =
15887    intel_sub_group_shuffle_xor(is_lo_1 ? r8 : r7, 1 << (1 - 1));
15888  ulong const s8 = is_lo_1 ? s8_7 : r8;
15889  ulong const s7 = is_lo_1 ? r7 : s8_7;
15890  ulong const s10_9 =
15891    intel_sub_group_shuffle_xor(is_lo_1 ? r10 : r9, 1 << (1 - 1));
15892  ulong const s10 = is_lo_1 ? s10_9 : r10;
15893  ulong const s9 = is_lo_1 ? r9 : s10_9;
15894  ulong const s12_11 =
15895    intel_sub_group_shuffle_xor(is_lo_1 ? r12 : r11, 1 << (1 - 1));
15896  ulong const s12 = is_lo_1 ? s12_11 : r12;
15897  ulong const s11 = is_lo_1 ? r11 : s12_11;
15898  ulong const s14_13 =
15899    intel_sub_group_shuffle_xor(is_lo_1 ? r14 : r13, 1 << (1 - 1));
15900  ulong const s14 = is_lo_1 ? s14_13 : r14;
15901  ulong const s13 = is_lo_1 ? r13 : s14_13;
15902  ulong const s16_15 =
15903    intel_sub_group_shuffle_xor(is_lo_1 ? r16 : r15, 1 << (1 - 1));
15904  ulong const s16 = is_lo_1 ? s16_15 : r16;
15905  ulong const s15 = is_lo_1 ? r15 : s16_15;
15906  ulong const t3_1 =
15907    intel_sub_group_shuffle_xor(is_lo_2 ? s3 : s1, 1 << (2 - 1));
15908  ulong const t3 = is_lo_2 ? t3_1 : s3;
15909  ulong const t1 = is_lo_2 ? s1 : t3_1;
15910  ulong const t4_2 =
15911    intel_sub_group_shuffle_xor(is_lo_2 ? s4 : s2, 1 << (2 - 1));
15912  ulong const t4 = is_lo_2 ? t4_2 : s4;
15913  ulong const t2 = is_lo_2 ? s2 : t4_2;
15914  ulong const t7_5 =
15915    intel_sub_group_shuffle_xor(is_lo_2 ? s7 : s5, 1 << (2 - 1));
15916  ulong const t7 = is_lo_2 ? t7_5 : s7;
15917  ulong const t5 = is_lo_2 ? s5 : t7_5;
15918  ulong const t8_6 =
15919    intel_sub_group_shuffle_xor(is_lo_2 ? s8 : s6, 1 << (2 - 1));
15920  ulong const t8 = is_lo_2 ? t8_6 : s8;
15921  ulong const t6 = is_lo_2 ? s6 : t8_6;
15922  ulong const t11_9 =
15923    intel_sub_group_shuffle_xor(is_lo_2 ? s11 : s9, 1 << (2 - 1));
15924  ulong const t11 = is_lo_2 ? t11_9 : s11;
15925  ulong const t9 = is_lo_2 ? s9 : t11_9;
15926  ulong const t12_10 =
15927    intel_sub_group_shuffle_xor(is_lo_2 ? s12 : s10, 1 << (2 - 1));
15928  ulong const t12 = is_lo_2 ? t12_10 : s12;
15929  ulong const t10 = is_lo_2 ? s10 : t12_10;
15930  ulong const t15_13 =
15931    intel_sub_group_shuffle_xor(is_lo_2 ? s15 : s13, 1 << (2 - 1));
15932  ulong const t15 = is_lo_2 ? t15_13 : s15;
15933  ulong const t13 = is_lo_2 ? s13 : t15_13;
15934  ulong const t16_14 =
15935    intel_sub_group_shuffle_xor(is_lo_2 ? s16 : s14, 1 << (2 - 1));
15936  ulong const t16 = is_lo_2 ? t16_14 : s16;
15937  ulong const t14 = is_lo_2 ? s14 : t16_14;
15938  ulong const u5_1 =
15939    intel_sub_group_shuffle_xor(is_lo_3 ? t5 : t1, 1 << (3 - 1));
15940  ulong const u5 = is_lo_3 ? u5_1 : t5;
15941  ulong const u1 = is_lo_3 ? t1 : u5_1;
15942  ulong const u6_2 =
15943    intel_sub_group_shuffle_xor(is_lo_3 ? t6 : t2, 1 << (3 - 1));
15944  ulong const u6 = is_lo_3 ? u6_2 : t6;
15945  ulong const u2 = is_lo_3 ? t2 : u6_2;
15946  ulong const u7_3 =
15947    intel_sub_group_shuffle_xor(is_lo_3 ? t7 : t3, 1 << (3 - 1));
15948  ulong const u7 = is_lo_3 ? u7_3 : t7;
15949  ulong const u3 = is_lo_3 ? t3 : u7_3;
15950  ulong const u8_4 =
15951    intel_sub_group_shuffle_xor(is_lo_3 ? t8 : t4, 1 << (3 - 1));
15952  ulong const u8 = is_lo_3 ? u8_4 : t8;
15953  ulong const u4 = is_lo_3 ? t4 : u8_4;
15954  ulong const u13_9 =
15955    intel_sub_group_shuffle_xor(is_lo_3 ? t13 : t9, 1 << (3 - 1));
15956  ulong const u13 = is_lo_3 ? u13_9 : t13;
15957  ulong const u9 = is_lo_3 ? t9 : u13_9;
15958  ulong const u14_10 =
15959    intel_sub_group_shuffle_xor(is_lo_3 ? t14 : t10, 1 << (3 - 1));
15960  ulong const u14 = is_lo_3 ? u14_10 : t14;
15961  ulong const u10 = is_lo_3 ? t10 : u14_10;
15962  ulong const u15_11 =
15963    intel_sub_group_shuffle_xor(is_lo_3 ? t15 : t11, 1 << (3 - 1));
15964  ulong const u15 = is_lo_3 ? u15_11 : t15;
15965  ulong const u11 = is_lo_3 ? t11 : u15_11;
15966  ulong const u16_12 =
15967    intel_sub_group_shuffle_xor(is_lo_3 ? t16 : t12, 1 << (3 - 1));
15968  ulong const u16 = is_lo_3 ? u16_12 : t16;
15969  ulong const u12 = is_lo_3 ? t12 : u16_12;
15970  vout[gmem_idx + ((1 - 1) << 3)] = u1;
15971  vout[gmem_idx + ((3 - 1) << 3)] = u2;
15972  vout[gmem_idx + ((5 - 1) << 3)] = u3;
15973  vout[gmem_idx + ((7 - 1) << 3)] = u4;
15974  vout[gmem_idx + ((9 - 1) << 3)] = u5;
15975  vout[gmem_idx + ((11 - 1) << 3)] = u6;
15976  vout[gmem_idx + ((13 - 1) << 3)] = u7;
15977  vout[gmem_idx + ((15 - 1) << 3)] = u8;
15978  vout[gmem_idx + ((2 - 1) << 3)] = u9;
15979  vout[gmem_idx + ((4 - 1) << 3)] = u10;
15980  vout[gmem_idx + ((6 - 1) << 3)] = u11;
15981  vout[gmem_idx + ((8 - 1) << 3)] = u12;
15982  vout[gmem_idx + ((10 - 1) << 3)] = u13;
15983  vout[gmem_idx + ((12 - 1) << 3)] = u14;
15984  vout[gmem_idx + ((14 - 1) << 3)] = u15;
15985  vout[gmem_idx + ((16 - 1) << 3)] = u16;
15986}
15987