• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1
2
3__kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
4__attribute__((reqd_work_group_size((1 << 4) * 1, 1, 1))) void
5hs_kernel_bs_0(__global uint const* const restrict vin,
6               __global uint* const restrict vout)
7{
8  uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
9                        (get_local_id(0) & ((1 << 4) - 1));
10  uint r1 = vin[gmem_idx + (1 << 4) * 0];
11  uint r2 = vin[gmem_idx + (1 << 4) * 1];
12  uint r3 = vin[gmem_idx + (1 << 4) * 2];
13  uint r4 = vin[gmem_idx + (1 << 4) * 3];
14  uint r5 = vin[gmem_idx + (1 << 4) * 4];
15  uint r6 = vin[gmem_idx + (1 << 4) * 5];
16  uint r7 = vin[gmem_idx + (1 << 4) * 6];
17  uint r8 = vin[gmem_idx + (1 << 4) * 7];
18  {
19    uint const t = min(r1, r5);
20    r5 = max(r1, r5);
21    r1 = t;
22  };
23  {
24    uint const t = min(r2, r6);
25    r6 = max(r2, r6);
26    r2 = t;
27  };
28  {
29    uint const t = min(r3, r7);
30    r7 = max(r3, r7);
31    r3 = t;
32  };
33  {
34    uint const t = min(r4, r8);
35    r8 = max(r4, r8);
36    r4 = t;
37  };
38  {
39    uint const t = min(r1, r3);
40    r3 = max(r1, r3);
41    r1 = t;
42  };
43  {
44    uint const t = min(r2, r4);
45    r4 = max(r2, r4);
46    r2 = t;
47  };
48  {
49    uint const t = min(r5, r7);
50    r7 = max(r5, r7);
51    r5 = t;
52  };
53  {
54    uint const t = min(r6, r8);
55    r8 = max(r6, r8);
56    r6 = t;
57  };
58  {
59    uint const t = min(r3, r5);
60    r5 = max(r3, r5);
61    r3 = t;
62  };
63  {
64    uint const t = min(r4, r6);
65    r6 = max(r4, r6);
66    r4 = t;
67  };
68  {
69    uint const t = min(r1, r2);
70    r2 = max(r1, r2);
71    r1 = t;
72  };
73  {
74    uint const t = min(r3, r4);
75    r4 = max(r3, r4);
76    r3 = t;
77  };
78  {
79    uint const t = min(r5, r6);
80    r6 = max(r5, r6);
81    r5 = t;
82  };
83  {
84    uint const t = min(r7, r8);
85    r8 = max(r7, r8);
86    r7 = t;
87  };
88  {
89    uint const t = min(r2, r5);
90    r5 = max(r2, r5);
91    r2 = t;
92  };
93  {
94    uint const t = min(r4, r7);
95    r7 = max(r4, r7);
96    r4 = t;
97  };
98  {
99    uint const t = min(r2, r3);
100    r3 = max(r2, r3);
101    r2 = t;
102  };
103  {
104    uint const t = min(r4, r5);
105    r5 = max(r4, r5);
106    r4 = t;
107  };
108  {
109    uint const t = min(r6, r7);
110    r7 = max(r6, r7);
111    r6 = t;
112  };
113  {
114    uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
115    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
116    ;
117    {
118      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
119      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
120      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
121      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
122    };
123    {
124      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
125      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
126      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
127      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
128    };
129    {
130      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
131      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
132      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
133      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
134    };
135    {
136      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
137      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
138      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
139      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
140    };
141  }
142  {
143    uint const t = min(r1, r5);
144    r5 = max(r1, r5);
145    r1 = t;
146  };
147  {
148    uint const t = min(r3, r7);
149    r7 = max(r3, r7);
150    r3 = t;
151  };
152  {
153    uint const t = min(r1, r3);
154    r3 = max(r1, r3);
155    r1 = t;
156  };
157  {
158    uint const t = min(r5, r7);
159    r7 = max(r5, r7);
160    r5 = t;
161  };
162  {
163    uint const t = min(r2, r6);
164    r6 = max(r2, r6);
165    r2 = t;
166  };
167  {
168    uint const t = min(r4, r8);
169    r8 = max(r4, r8);
170    r4 = t;
171  };
172  {
173    uint const t = min(r2, r4);
174    r4 = max(r2, r4);
175    r2 = t;
176  };
177  {
178    uint const t = min(r6, r8);
179    r8 = max(r6, r8);
180    r6 = t;
181  };
182  {
183    uint const t = min(r1, r2);
184    r2 = max(r1, r2);
185    r1 = t;
186  };
187  {
188    uint const t = min(r3, r4);
189    r4 = max(r3, r4);
190    r3 = t;
191  };
192  {
193    uint const t = min(r5, r6);
194    r6 = max(r5, r6);
195    r5 = t;
196  };
197  {
198    uint const t = min(r7, r8);
199    r8 = max(r7, r8);
200    r7 = t;
201  };
202  {
203    uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
204    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
205    ;
206    {
207      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
208      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
209      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
210      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
211    };
212    {
213      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
214      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
215      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
216      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
217    };
218    {
219      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
220      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
221      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
222      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
223    };
224    {
225      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
226      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
227      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
228      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
229    };
230  }
231  {
232    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
233    int const t_lt = get_sub_group_local_id() < half_lane_idx;
234    ;
235    {
236      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
237      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
238    };
239    {
240      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
241      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
242    };
243    {
244      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
245      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
246    };
247    {
248      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
249      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
250    };
251    {
252      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
253      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
254    };
255    {
256      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
257      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
258    };
259    {
260      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
261      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
262    };
263    {
264      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
265      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
266    };
267  }
268  {
269    uint const t = min(r1, r5);
270    r5 = max(r1, r5);
271    r1 = t;
272  };
273  {
274    uint const t = min(r3, r7);
275    r7 = max(r3, r7);
276    r3 = t;
277  };
278  {
279    uint const t = min(r1, r3);
280    r3 = max(r1, r3);
281    r1 = t;
282  };
283  {
284    uint const t = min(r5, r7);
285    r7 = max(r5, r7);
286    r5 = t;
287  };
288  {
289    uint const t = min(r2, r6);
290    r6 = max(r2, r6);
291    r2 = t;
292  };
293  {
294    uint const t = min(r4, r8);
295    r8 = max(r4, r8);
296    r4 = t;
297  };
298  {
299    uint const t = min(r2, r4);
300    r4 = max(r2, r4);
301    r2 = t;
302  };
303  {
304    uint const t = min(r6, r8);
305    r8 = max(r6, r8);
306    r6 = t;
307  };
308  {
309    uint const t = min(r1, r2);
310    r2 = max(r1, r2);
311    r1 = t;
312  };
313  {
314    uint const t = min(r3, r4);
315    r4 = max(r3, r4);
316    r3 = t;
317  };
318  {
319    uint const t = min(r5, r6);
320    r6 = max(r5, r6);
321    r5 = t;
322  };
323  {
324    uint const t = min(r7, r8);
325    r8 = max(r7, r8);
326    r7 = t;
327  };
328  {
329    uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
330    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
331    ;
332    {
333      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
334      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
335      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
336      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
337    };
338    {
339      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
340      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
341      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
342      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
343    };
344    {
345      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
346      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
347      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
348      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
349    };
350    {
351      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
352      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
353      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
354      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
355    };
356  }
357  {
358    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
359    int const t_lt = get_sub_group_local_id() < half_lane_idx;
360    ;
361    {
362      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
363      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
364    };
365    {
366      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
367      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
368    };
369    {
370      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
371      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
372    };
373    {
374      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
375      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
376    };
377    {
378      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
379      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
380    };
381    {
382      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
383      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
384    };
385    {
386      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
387      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
388    };
389    {
390      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
391      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
392    };
393  }
394  {
395    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
396    int const t_lt = get_sub_group_local_id() < half_lane_idx;
397    ;
398    {
399      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
400      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
401    };
402    {
403      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
404      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
405    };
406    {
407      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
408      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
409    };
410    {
411      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
412      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
413    };
414    {
415      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
416      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
417    };
418    {
419      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
420      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
421    };
422    {
423      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
424      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
425    };
426    {
427      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
428      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
429    };
430  }
431  {
432    uint const t = min(r1, r5);
433    r5 = max(r1, r5);
434    r1 = t;
435  };
436  {
437    uint const t = min(r3, r7);
438    r7 = max(r3, r7);
439    r3 = t;
440  };
441  {
442    uint const t = min(r1, r3);
443    r3 = max(r1, r3);
444    r1 = t;
445  };
446  {
447    uint const t = min(r5, r7);
448    r7 = max(r5, r7);
449    r5 = t;
450  };
451  {
452    uint const t = min(r2, r6);
453    r6 = max(r2, r6);
454    r2 = t;
455  };
456  {
457    uint const t = min(r4, r8);
458    r8 = max(r4, r8);
459    r4 = t;
460  };
461  {
462    uint const t = min(r2, r4);
463    r4 = max(r2, r4);
464    r2 = t;
465  };
466  {
467    uint const t = min(r6, r8);
468    r8 = max(r6, r8);
469    r6 = t;
470  };
471  {
472    uint const t = min(r1, r2);
473    r2 = max(r1, r2);
474    r1 = t;
475  };
476  {
477    uint const t = min(r3, r4);
478    r4 = max(r3, r4);
479    r3 = t;
480  };
481  {
482    uint const t = min(r5, r6);
483    r6 = max(r5, r6);
484    r5 = t;
485  };
486  {
487    uint const t = min(r7, r8);
488    r8 = max(r7, r8);
489    r7 = t;
490  };
491  {
492    uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
493    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
494    ;
495    {
496      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
497      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
498      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
499      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
500    };
501    {
502      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
503      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
504      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
505      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
506    };
507    {
508      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
509      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
510      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
511      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
512    };
513    {
514      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
515      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
516      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
517      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
518    };
519  }
520  {
521    uint const half_lane_idx = get_sub_group_local_id() ^ 4;
522    int const t_lt = get_sub_group_local_id() < half_lane_idx;
523    ;
524    {
525      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
526      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
527    };
528    {
529      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
530      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
531    };
532    {
533      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
534      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
535    };
536    {
537      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
538      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
539    };
540    {
541      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
542      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
543    };
544    {
545      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
546      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
547    };
548    {
549      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
550      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
551    };
552    {
553      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
554      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
555    };
556  }
557  {
558    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
559    int const t_lt = get_sub_group_local_id() < half_lane_idx;
560    ;
561    {
562      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
563      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
564    };
565    {
566      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
567      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
568    };
569    {
570      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
571      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
572    };
573    {
574      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
575      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
576    };
577    {
578      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
579      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
580    };
581    {
582      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
583      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
584    };
585    {
586      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
587      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
588    };
589    {
590      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
591      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
592    };
593  }
594  {
595    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
596    int const t_lt = get_sub_group_local_id() < half_lane_idx;
597    ;
598    {
599      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
600      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
601    };
602    {
603      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
604      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
605    };
606    {
607      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
608      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
609    };
610    {
611      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
612      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
613    };
614    {
615      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
616      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
617    };
618    {
619      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
620      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
621    };
622    {
623      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
624      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
625    };
626    {
627      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
628      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
629    };
630  }
631  {
632    uint const t = min(r1, r5);
633    r5 = max(r1, r5);
634    r1 = t;
635  };
636  {
637    uint const t = min(r3, r7);
638    r7 = max(r3, r7);
639    r3 = t;
640  };
641  {
642    uint const t = min(r1, r3);
643    r3 = max(r1, r3);
644    r1 = t;
645  };
646  {
647    uint const t = min(r5, r7);
648    r7 = max(r5, r7);
649    r5 = t;
650  };
651  {
652    uint const t = min(r2, r6);
653    r6 = max(r2, r6);
654    r2 = t;
655  };
656  {
657    uint const t = min(r4, r8);
658    r8 = max(r4, r8);
659    r4 = t;
660  };
661  {
662    uint const t = min(r2, r4);
663    r4 = max(r2, r4);
664    r2 = t;
665  };
666  {
667    uint const t = min(r6, r8);
668    r8 = max(r6, r8);
669    r6 = t;
670  };
671  {
672    uint const t = min(r1, r2);
673    r2 = max(r1, r2);
674    r1 = t;
675  };
676  {
677    uint const t = min(r3, r4);
678    r4 = max(r3, r4);
679    r3 = t;
680  };
681  {
682    uint const t = min(r5, r6);
683    r6 = max(r5, r6);
684    r5 = t;
685  };
686  {
687    uint const t = min(r7, r8);
688    r8 = max(r7, r8);
689    r7 = t;
690  };
691  vout[gmem_idx + (1 << 4) * 0] = r1;
692  vout[gmem_idx + (1 << 4) * 1] = r2;
693  vout[gmem_idx + (1 << 4) * 2] = r3;
694  vout[gmem_idx + (1 << 4) * 3] = r4;
695  vout[gmem_idx + (1 << 4) * 4] = r5;
696  vout[gmem_idx + (1 << 4) * 5] = r6;
697  vout[gmem_idx + (1 << 4) * 6] = r7;
698  vout[gmem_idx + (1 << 4) * 7] = r8;
699}
700
701__kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
702__attribute__((reqd_work_group_size((1 << 4) * 2, 1, 1))) void
703hs_kernel_bs_1(__global uint const* const restrict vin,
704               __global uint* const restrict vout)
705{
706  __local struct
707  {
708    uint m[32 * 8];
709  } shared;
710
711  uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
712                        (get_local_id(0) & ((1 << 4) - 1));
713  uint r1 = vin[gmem_idx + (1 << 4) * 0];
714  uint r2 = vin[gmem_idx + (1 << 4) * 1];
715  uint r3 = vin[gmem_idx + (1 << 4) * 2];
716  uint r4 = vin[gmem_idx + (1 << 4) * 3];
717  uint r5 = vin[gmem_idx + (1 << 4) * 4];
718  uint r6 = vin[gmem_idx + (1 << 4) * 5];
719  uint r7 = vin[gmem_idx + (1 << 4) * 6];
720  uint r8 = vin[gmem_idx + (1 << 4) * 7];
721  {
722    uint const t = min(r1, r5);
723    r5 = max(r1, r5);
724    r1 = t;
725  };
726  {
727    uint const t = min(r2, r6);
728    r6 = max(r2, r6);
729    r2 = t;
730  };
731  {
732    uint const t = min(r3, r7);
733    r7 = max(r3, r7);
734    r3 = t;
735  };
736  {
737    uint const t = min(r4, r8);
738    r8 = max(r4, r8);
739    r4 = t;
740  };
741  {
742    uint const t = min(r1, r3);
743    r3 = max(r1, r3);
744    r1 = t;
745  };
746  {
747    uint const t = min(r2, r4);
748    r4 = max(r2, r4);
749    r2 = t;
750  };
751  {
752    uint const t = min(r5, r7);
753    r7 = max(r5, r7);
754    r5 = t;
755  };
756  {
757    uint const t = min(r6, r8);
758    r8 = max(r6, r8);
759    r6 = t;
760  };
761  {
762    uint const t = min(r3, r5);
763    r5 = max(r3, r5);
764    r3 = t;
765  };
766  {
767    uint const t = min(r4, r6);
768    r6 = max(r4, r6);
769    r4 = t;
770  };
771  {
772    uint const t = min(r1, r2);
773    r2 = max(r1, r2);
774    r1 = t;
775  };
776  {
777    uint const t = min(r3, r4);
778    r4 = max(r3, r4);
779    r3 = t;
780  };
781  {
782    uint const t = min(r5, r6);
783    r6 = max(r5, r6);
784    r5 = t;
785  };
786  {
787    uint const t = min(r7, r8);
788    r8 = max(r7, r8);
789    r7 = t;
790  };
791  {
792    uint const t = min(r2, r5);
793    r5 = max(r2, r5);
794    r2 = t;
795  };
796  {
797    uint const t = min(r4, r7);
798    r7 = max(r4, r7);
799    r4 = t;
800  };
801  {
802    uint const t = min(r2, r3);
803    r3 = max(r2, r3);
804    r2 = t;
805  };
806  {
807    uint const t = min(r4, r5);
808    r5 = max(r4, r5);
809    r4 = t;
810  };
811  {
812    uint const t = min(r6, r7);
813    r7 = max(r6, r7);
814    r6 = t;
815  };
816  {
817    uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
818    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
819    ;
820    {
821      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
822      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
823      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
824      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
825    };
826    {
827      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
828      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
829      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
830      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
831    };
832    {
833      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
834      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
835      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
836      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
837    };
838    {
839      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
840      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
841      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
842      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
843    };
844  }
845  {
846    uint const t = min(r1, r5);
847    r5 = max(r1, r5);
848    r1 = t;
849  };
850  {
851    uint const t = min(r3, r7);
852    r7 = max(r3, r7);
853    r3 = t;
854  };
855  {
856    uint const t = min(r1, r3);
857    r3 = max(r1, r3);
858    r1 = t;
859  };
860  {
861    uint const t = min(r5, r7);
862    r7 = max(r5, r7);
863    r5 = t;
864  };
865  {
866    uint const t = min(r2, r6);
867    r6 = max(r2, r6);
868    r2 = t;
869  };
870  {
871    uint const t = min(r4, r8);
872    r8 = max(r4, r8);
873    r4 = t;
874  };
875  {
876    uint const t = min(r2, r4);
877    r4 = max(r2, r4);
878    r2 = t;
879  };
880  {
881    uint const t = min(r6, r8);
882    r8 = max(r6, r8);
883    r6 = t;
884  };
885  {
886    uint const t = min(r1, r2);
887    r2 = max(r1, r2);
888    r1 = t;
889  };
890  {
891    uint const t = min(r3, r4);
892    r4 = max(r3, r4);
893    r3 = t;
894  };
895  {
896    uint const t = min(r5, r6);
897    r6 = max(r5, r6);
898    r5 = t;
899  };
900  {
901    uint const t = min(r7, r8);
902    r8 = max(r7, r8);
903    r7 = t;
904  };
905  {
906    uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
907    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
908    ;
909    {
910      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
911      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
912      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
913      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
914    };
915    {
916      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
917      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
918      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
919      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
920    };
921    {
922      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
923      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
924      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
925      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
926    };
927    {
928      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
929      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
930      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
931      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
932    };
933  }
934  {
935    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
936    int const t_lt = get_sub_group_local_id() < half_lane_idx;
937    ;
938    {
939      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
940      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
941    };
942    {
943      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
944      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
945    };
946    {
947      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
948      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
949    };
950    {
951      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
952      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
953    };
954    {
955      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
956      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
957    };
958    {
959      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
960      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
961    };
962    {
963      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
964      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
965    };
966    {
967      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
968      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
969    };
970  }
971  {
972    uint const t = min(r1, r5);
973    r5 = max(r1, r5);
974    r1 = t;
975  };
976  {
977    uint const t = min(r3, r7);
978    r7 = max(r3, r7);
979    r3 = t;
980  };
981  {
982    uint const t = min(r1, r3);
983    r3 = max(r1, r3);
984    r1 = t;
985  };
986  {
987    uint const t = min(r5, r7);
988    r7 = max(r5, r7);
989    r5 = t;
990  };
991  {
992    uint const t = min(r2, r6);
993    r6 = max(r2, r6);
994    r2 = t;
995  };
996  {
997    uint const t = min(r4, r8);
998    r8 = max(r4, r8);
999    r4 = t;
1000  };
1001  {
1002    uint const t = min(r2, r4);
1003    r4 = max(r2, r4);
1004    r2 = t;
1005  };
1006  {
1007    uint const t = min(r6, r8);
1008    r8 = max(r6, r8);
1009    r6 = t;
1010  };
1011  {
1012    uint const t = min(r1, r2);
1013    r2 = max(r1, r2);
1014    r1 = t;
1015  };
1016  {
1017    uint const t = min(r3, r4);
1018    r4 = max(r3, r4);
1019    r3 = t;
1020  };
1021  {
1022    uint const t = min(r5, r6);
1023    r6 = max(r5, r6);
1024    r5 = t;
1025  };
1026  {
1027    uint const t = min(r7, r8);
1028    r8 = max(r7, r8);
1029    r7 = t;
1030  };
1031  {
1032    uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
1033    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
1034    ;
1035    {
1036      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
1037      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
1038      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
1039      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1040    };
1041    {
1042      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
1043      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
1044      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
1045      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1046    };
1047    {
1048      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
1049      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
1050      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
1051      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1052    };
1053    {
1054      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
1055      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
1056      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
1057      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1058    };
1059  }
1060  {
1061    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
1062    int const t_lt = get_sub_group_local_id() < half_lane_idx;
1063    ;
1064    {
1065      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1066      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1067    };
1068    {
1069      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1070      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1071    };
1072    {
1073      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1074      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1075    };
1076    {
1077      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1078      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1079    };
1080    {
1081      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1082      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1083    };
1084    {
1085      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1086      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1087    };
1088    {
1089      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1090      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1091    };
1092    {
1093      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1094      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1095    };
1096  }
1097  {
1098    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
1099    int const t_lt = get_sub_group_local_id() < half_lane_idx;
1100    ;
1101    {
1102      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1103      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1104    };
1105    {
1106      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1107      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1108    };
1109    {
1110      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1111      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1112    };
1113    {
1114      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1115      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1116    };
1117    {
1118      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1119      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1120    };
1121    {
1122      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1123      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1124    };
1125    {
1126      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1127      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1128    };
1129    {
1130      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1131      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1132    };
1133  }
1134  {
1135    uint const t = min(r1, r5);
1136    r5 = max(r1, r5);
1137    r1 = t;
1138  };
1139  {
1140    uint const t = min(r3, r7);
1141    r7 = max(r3, r7);
1142    r3 = t;
1143  };
1144  {
1145    uint const t = min(r1, r3);
1146    r3 = max(r1, r3);
1147    r1 = t;
1148  };
1149  {
1150    uint const t = min(r5, r7);
1151    r7 = max(r5, r7);
1152    r5 = t;
1153  };
1154  {
1155    uint const t = min(r2, r6);
1156    r6 = max(r2, r6);
1157    r2 = t;
1158  };
1159  {
1160    uint const t = min(r4, r8);
1161    r8 = max(r4, r8);
1162    r4 = t;
1163  };
1164  {
1165    uint const t = min(r2, r4);
1166    r4 = max(r2, r4);
1167    r2 = t;
1168  };
1169  {
1170    uint const t = min(r6, r8);
1171    r8 = max(r6, r8);
1172    r6 = t;
1173  };
1174  {
1175    uint const t = min(r1, r2);
1176    r2 = max(r1, r2);
1177    r1 = t;
1178  };
1179  {
1180    uint const t = min(r3, r4);
1181    r4 = max(r3, r4);
1182    r3 = t;
1183  };
1184  {
1185    uint const t = min(r5, r6);
1186    r6 = max(r5, r6);
1187    r5 = t;
1188  };
1189  {
1190    uint const t = min(r7, r8);
1191    r8 = max(r7, r8);
1192    r7 = t;
1193  };
1194  {
1195    uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
1196    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
1197    ;
1198    {
1199      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
1200      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
1201      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
1202      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1203    };
1204    {
1205      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
1206      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
1207      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
1208      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1209    };
1210    {
1211      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
1212      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
1213      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
1214      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1215    };
1216    {
1217      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
1218      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
1219      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
1220      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1221    };
1222  }
1223  {
1224    uint const half_lane_idx = get_sub_group_local_id() ^ 4;
1225    int const t_lt = get_sub_group_local_id() < half_lane_idx;
1226    ;
1227    {
1228      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1229      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1230    };
1231    {
1232      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1233      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1234    };
1235    {
1236      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1237      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1238    };
1239    {
1240      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1241      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1242    };
1243    {
1244      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1245      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1246    };
1247    {
1248      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1249      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1250    };
1251    {
1252      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1253      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1254    };
1255    {
1256      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1257      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1258    };
1259  }
1260  {
1261    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
1262    int const t_lt = get_sub_group_local_id() < half_lane_idx;
1263    ;
1264    {
1265      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1266      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1267    };
1268    {
1269      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1270      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1271    };
1272    {
1273      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1274      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1275    };
1276    {
1277      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1278      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1279    };
1280    {
1281      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1282      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1283    };
1284    {
1285      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1286      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1287    };
1288    {
1289      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1290      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1291    };
1292    {
1293      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1294      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1295    };
1296  }
1297  {
1298    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
1299    int const t_lt = get_sub_group_local_id() < half_lane_idx;
1300    ;
1301    {
1302      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1303      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1304    };
1305    {
1306      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1307      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1308    };
1309    {
1310      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1311      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1312    };
1313    {
1314      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1315      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1316    };
1317    {
1318      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1319      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1320    };
1321    {
1322      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1323      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1324    };
1325    {
1326      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1327      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1328    };
1329    {
1330      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1331      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1332    };
1333  }
1334  {
1335    uint const t = min(r1, r5);
1336    r5 = max(r1, r5);
1337    r1 = t;
1338  };
1339  {
1340    uint const t = min(r3, r7);
1341    r7 = max(r3, r7);
1342    r3 = t;
1343  };
1344  {
1345    uint const t = min(r1, r3);
1346    r3 = max(r1, r3);
1347    r1 = t;
1348  };
1349  {
1350    uint const t = min(r5, r7);
1351    r7 = max(r5, r7);
1352    r5 = t;
1353  };
1354  {
1355    uint const t = min(r2, r6);
1356    r6 = max(r2, r6);
1357    r2 = t;
1358  };
1359  {
1360    uint const t = min(r4, r8);
1361    r8 = max(r4, r8);
1362    r4 = t;
1363  };
1364  {
1365    uint const t = min(r2, r4);
1366    r4 = max(r2, r4);
1367    r2 = t;
1368  };
1369  {
1370    uint const t = min(r6, r8);
1371    r8 = max(r6, r8);
1372    r6 = t;
1373  };
1374  {
1375    uint const t = min(r1, r2);
1376    r2 = max(r1, r2);
1377    r1 = t;
1378  };
1379  {
1380    uint const t = min(r3, r4);
1381    r4 = max(r3, r4);
1382    r3 = t;
1383  };
1384  {
1385    uint const t = min(r5, r6);
1386    r6 = max(r5, r6);
1387    r5 = t;
1388  };
1389  {
1390    uint const t = min(r7, r8);
1391    r8 = max(r7, r8);
1392    r7 = t;
1393  };
1394  uint const smem_l_idx =
1395    get_sub_group_id() * ((1 << 4) * 2) + get_sub_group_local_id();
1396  uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 2) +
1397                          (get_sub_group_local_id() ^ ((1 << 4) - 1));
1398  shared.m[get_local_id(0) + (2 * (1 << 4) * 0)] = r1;
1399  shared.m[get_local_id(0) + (2 * (1 << 4) * 1)] = r8;
1400  shared.m[get_local_id(0) + (2 * (1 << 4) * 2)] = r2;
1401  shared.m[get_local_id(0) + (2 * (1 << 4) * 3)] = r7;
1402  shared.m[get_local_id(0) + (2 * (1 << 4) * 4)] = r3;
1403  shared.m[get_local_id(0) + (2 * (1 << 4) * 5)] = r6;
1404  shared.m[get_local_id(0) + (2 * (1 << 4) * 6)] = r4;
1405  shared.m[get_local_id(0) + (2 * (1 << 4) * 7)] = r5;
1406  barrier(CLK_LOCAL_MEM_FENCE);
1407  {
1408    {
1409      uint r0_1 = shared.m[smem_l_idx + (0)];
1410      uint r0_2 = shared.m[smem_r_idx + (16)];
1411      {
1412        uint const t = min(r0_1, r0_2);
1413        r0_2 = max(r0_1, r0_2);
1414        r0_1 = t;
1415      };
1416      shared.m[smem_l_idx + (0)] = r0_1;
1417      shared.m[smem_r_idx + (16)] = r0_2;
1418    }
1419    {
1420      uint r0_1 = shared.m[smem_l_idx + (64)];
1421      uint r0_2 = shared.m[smem_r_idx + (80)];
1422      {
1423        uint const t = min(r0_1, r0_2);
1424        r0_2 = max(r0_1, r0_2);
1425        r0_1 = t;
1426      };
1427      shared.m[smem_l_idx + (64)] = r0_1;
1428      shared.m[smem_r_idx + (80)] = r0_2;
1429    }
1430    {
1431      uint r0_1 = shared.m[smem_l_idx + (128)];
1432      uint r0_2 = shared.m[smem_r_idx + (144)];
1433      {
1434        uint const t = min(r0_1, r0_2);
1435        r0_2 = max(r0_1, r0_2);
1436        r0_1 = t;
1437      };
1438      shared.m[smem_l_idx + (128)] = r0_1;
1439      shared.m[smem_r_idx + (144)] = r0_2;
1440    }
1441    {
1442      uint r0_1 = shared.m[smem_l_idx + (192)];
1443      uint r0_2 = shared.m[smem_r_idx + (208)];
1444      {
1445        uint const t = min(r0_1, r0_2);
1446        r0_2 = max(r0_1, r0_2);
1447        r0_1 = t;
1448      };
1449      shared.m[smem_l_idx + (192)] = r0_1;
1450      shared.m[smem_r_idx + (208)] = r0_2;
1451    }
1452  }
1453  barrier(CLK_LOCAL_MEM_FENCE);
1454  r1 = shared.m[get_local_id(0) + (2 * (1 << 4) * 0)];
1455  r8 = shared.m[get_local_id(0) + (2 * (1 << 4) * 1)];
1456  r2 = shared.m[get_local_id(0) + (2 * (1 << 4) * 2)];
1457  r7 = shared.m[get_local_id(0) + (2 * (1 << 4) * 3)];
1458  r3 = shared.m[get_local_id(0) + (2 * (1 << 4) * 4)];
1459  r6 = shared.m[get_local_id(0) + (2 * (1 << 4) * 5)];
1460  r4 = shared.m[get_local_id(0) + (2 * (1 << 4) * 6)];
1461  r5 = shared.m[get_local_id(0) + (2 * (1 << 4) * 7)];
1462  {
1463    {
1464      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
1465      int const t_lt = get_sub_group_local_id() < half_lane_idx;
1466      ;
1467      {
1468        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1469        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1470      };
1471      {
1472        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1473        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1474      };
1475      {
1476        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1477        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1478      };
1479      {
1480        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1481        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1482      };
1483      {
1484        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1485        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1486      };
1487      {
1488        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1489        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1490      };
1491      {
1492        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1493        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1494      };
1495      {
1496        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1497        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1498      };
1499    }
1500    {
1501      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
1502      int const t_lt = get_sub_group_local_id() < half_lane_idx;
1503      ;
1504      {
1505        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1506        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1507      };
1508      {
1509        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1510        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1511      };
1512      {
1513        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1514        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1515      };
1516      {
1517        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1518        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1519      };
1520      {
1521        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1522        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1523      };
1524      {
1525        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1526        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1527      };
1528      {
1529        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1530        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1531      };
1532      {
1533        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1534        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1535      };
1536    }
1537    {
1538      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
1539      int const t_lt = get_sub_group_local_id() < half_lane_idx;
1540      ;
1541      {
1542        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1543        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1544      };
1545      {
1546        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1547        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1548      };
1549      {
1550        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1551        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1552      };
1553      {
1554        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1555        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1556      };
1557      {
1558        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1559        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1560      };
1561      {
1562        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1563        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1564      };
1565      {
1566        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1567        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1568      };
1569      {
1570        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1571        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1572      };
1573    }
1574    {
1575      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
1576      int const t_lt = get_sub_group_local_id() < half_lane_idx;
1577      ;
1578      {
1579        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1580        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1581      };
1582      {
1583        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1584        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1585      };
1586      {
1587        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1588        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1589      };
1590      {
1591        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1592        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1593      };
1594      {
1595        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1596        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1597      };
1598      {
1599        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1600        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1601      };
1602      {
1603        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1604        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1605      };
1606      {
1607        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1608        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1609      };
1610    }
1611    {
1612      uint const t = min(r1, r5);
1613      r5 = max(r1, r5);
1614      r1 = t;
1615    };
1616    {
1617      uint const t = min(r3, r7);
1618      r7 = max(r3, r7);
1619      r3 = t;
1620    };
1621    {
1622      uint const t = min(r1, r3);
1623      r3 = max(r1, r3);
1624      r1 = t;
1625    };
1626    {
1627      uint const t = min(r5, r7);
1628      r7 = max(r5, r7);
1629      r5 = t;
1630    };
1631    {
1632      uint const t = min(r2, r6);
1633      r6 = max(r2, r6);
1634      r2 = t;
1635    };
1636    {
1637      uint const t = min(r4, r8);
1638      r8 = max(r4, r8);
1639      r4 = t;
1640    };
1641    {
1642      uint const t = min(r2, r4);
1643      r4 = max(r2, r4);
1644      r2 = t;
1645    };
1646    {
1647      uint const t = min(r6, r8);
1648      r8 = max(r6, r8);
1649      r6 = t;
1650    };
1651    {
1652      uint const t = min(r1, r2);
1653      r2 = max(r1, r2);
1654      r1 = t;
1655    };
1656    {
1657      uint const t = min(r3, r4);
1658      r4 = max(r3, r4);
1659      r3 = t;
1660    };
1661    {
1662      uint const t = min(r5, r6);
1663      r6 = max(r5, r6);
1664      r5 = t;
1665    };
1666    {
1667      uint const t = min(r7, r8);
1668      r8 = max(r7, r8);
1669      r7 = t;
1670    };
1671  }
1672  vout[gmem_idx + (1 << 4) * 0] = r1;
1673  vout[gmem_idx + (1 << 4) * 1] = r2;
1674  vout[gmem_idx + (1 << 4) * 2] = r3;
1675  vout[gmem_idx + (1 << 4) * 3] = r4;
1676  vout[gmem_idx + (1 << 4) * 4] = r5;
1677  vout[gmem_idx + (1 << 4) * 5] = r6;
1678  vout[gmem_idx + (1 << 4) * 6] = r7;
1679  vout[gmem_idx + (1 << 4) * 7] = r8;
1680}
1681
1682__kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
1683__attribute__((reqd_work_group_size((1 << 4) * 4, 1, 1))) void
1684hs_kernel_bs_2(__global uint const* const restrict vin,
1685               __global uint* const restrict vout)
1686{
1687  __local struct
1688  {
1689    uint m[64 * 8];
1690  } shared;
1691
1692  uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
1693                        (get_local_id(0) & ((1 << 4) - 1));
1694  uint r1 = vin[gmem_idx + (1 << 4) * 0];
1695  uint r2 = vin[gmem_idx + (1 << 4) * 1];
1696  uint r3 = vin[gmem_idx + (1 << 4) * 2];
1697  uint r4 = vin[gmem_idx + (1 << 4) * 3];
1698  uint r5 = vin[gmem_idx + (1 << 4) * 4];
1699  uint r6 = vin[gmem_idx + (1 << 4) * 5];
1700  uint r7 = vin[gmem_idx + (1 << 4) * 6];
1701  uint r8 = vin[gmem_idx + (1 << 4) * 7];
1702  {
1703    uint const t = min(r1, r5);
1704    r5 = max(r1, r5);
1705    r1 = t;
1706  };
1707  {
1708    uint const t = min(r2, r6);
1709    r6 = max(r2, r6);
1710    r2 = t;
1711  };
1712  {
1713    uint const t = min(r3, r7);
1714    r7 = max(r3, r7);
1715    r3 = t;
1716  };
1717  {
1718    uint const t = min(r4, r8);
1719    r8 = max(r4, r8);
1720    r4 = t;
1721  };
1722  {
1723    uint const t = min(r1, r3);
1724    r3 = max(r1, r3);
1725    r1 = t;
1726  };
1727  {
1728    uint const t = min(r2, r4);
1729    r4 = max(r2, r4);
1730    r2 = t;
1731  };
1732  {
1733    uint const t = min(r5, r7);
1734    r7 = max(r5, r7);
1735    r5 = t;
1736  };
1737  {
1738    uint const t = min(r6, r8);
1739    r8 = max(r6, r8);
1740    r6 = t;
1741  };
1742  {
1743    uint const t = min(r3, r5);
1744    r5 = max(r3, r5);
1745    r3 = t;
1746  };
1747  {
1748    uint const t = min(r4, r6);
1749    r6 = max(r4, r6);
1750    r4 = t;
1751  };
1752  {
1753    uint const t = min(r1, r2);
1754    r2 = max(r1, r2);
1755    r1 = t;
1756  };
1757  {
1758    uint const t = min(r3, r4);
1759    r4 = max(r3, r4);
1760    r3 = t;
1761  };
1762  {
1763    uint const t = min(r5, r6);
1764    r6 = max(r5, r6);
1765    r5 = t;
1766  };
1767  {
1768    uint const t = min(r7, r8);
1769    r8 = max(r7, r8);
1770    r7 = t;
1771  };
1772  {
1773    uint const t = min(r2, r5);
1774    r5 = max(r2, r5);
1775    r2 = t;
1776  };
1777  {
1778    uint const t = min(r4, r7);
1779    r7 = max(r4, r7);
1780    r4 = t;
1781  };
1782  {
1783    uint const t = min(r2, r3);
1784    r3 = max(r2, r3);
1785    r2 = t;
1786  };
1787  {
1788    uint const t = min(r4, r5);
1789    r5 = max(r4, r5);
1790    r4 = t;
1791  };
1792  {
1793    uint const t = min(r6, r7);
1794    r7 = max(r6, r7);
1795    r6 = t;
1796  };
1797  {
1798    uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
1799    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
1800    ;
1801    {
1802      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
1803      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
1804      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
1805      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1806    };
1807    {
1808      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
1809      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
1810      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
1811      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1812    };
1813    {
1814      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
1815      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
1816      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
1817      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1818    };
1819    {
1820      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
1821      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
1822      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
1823      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1824    };
1825  }
1826  {
1827    uint const t = min(r1, r5);
1828    r5 = max(r1, r5);
1829    r1 = t;
1830  };
1831  {
1832    uint const t = min(r3, r7);
1833    r7 = max(r3, r7);
1834    r3 = t;
1835  };
1836  {
1837    uint const t = min(r1, r3);
1838    r3 = max(r1, r3);
1839    r1 = t;
1840  };
1841  {
1842    uint const t = min(r5, r7);
1843    r7 = max(r5, r7);
1844    r5 = t;
1845  };
1846  {
1847    uint const t = min(r2, r6);
1848    r6 = max(r2, r6);
1849    r2 = t;
1850  };
1851  {
1852    uint const t = min(r4, r8);
1853    r8 = max(r4, r8);
1854    r4 = t;
1855  };
1856  {
1857    uint const t = min(r2, r4);
1858    r4 = max(r2, r4);
1859    r2 = t;
1860  };
1861  {
1862    uint const t = min(r6, r8);
1863    r8 = max(r6, r8);
1864    r6 = t;
1865  };
1866  {
1867    uint const t = min(r1, r2);
1868    r2 = max(r1, r2);
1869    r1 = t;
1870  };
1871  {
1872    uint const t = min(r3, r4);
1873    r4 = max(r3, r4);
1874    r3 = t;
1875  };
1876  {
1877    uint const t = min(r5, r6);
1878    r6 = max(r5, r6);
1879    r5 = t;
1880  };
1881  {
1882    uint const t = min(r7, r8);
1883    r8 = max(r7, r8);
1884    r7 = t;
1885  };
1886  {
1887    uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
1888    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
1889    ;
1890    {
1891      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
1892      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
1893      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
1894      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1895    };
1896    {
1897      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
1898      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
1899      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
1900      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1901    };
1902    {
1903      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
1904      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
1905      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
1906      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1907    };
1908    {
1909      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
1910      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
1911      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
1912      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1913    };
1914  }
1915  {
1916    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
1917    int const t_lt = get_sub_group_local_id() < half_lane_idx;
1918    ;
1919    {
1920      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
1921      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
1922    };
1923    {
1924      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
1925      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
1926    };
1927    {
1928      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
1929      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
1930    };
1931    {
1932      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
1933      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
1934    };
1935    {
1936      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
1937      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
1938    };
1939    {
1940      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
1941      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
1942    };
1943    {
1944      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
1945      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
1946    };
1947    {
1948      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
1949      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
1950    };
1951  }
1952  {
1953    uint const t = min(r1, r5);
1954    r5 = max(r1, r5);
1955    r1 = t;
1956  };
1957  {
1958    uint const t = min(r3, r7);
1959    r7 = max(r3, r7);
1960    r3 = t;
1961  };
1962  {
1963    uint const t = min(r1, r3);
1964    r3 = max(r1, r3);
1965    r1 = t;
1966  };
1967  {
1968    uint const t = min(r5, r7);
1969    r7 = max(r5, r7);
1970    r5 = t;
1971  };
1972  {
1973    uint const t = min(r2, r6);
1974    r6 = max(r2, r6);
1975    r2 = t;
1976  };
1977  {
1978    uint const t = min(r4, r8);
1979    r8 = max(r4, r8);
1980    r4 = t;
1981  };
1982  {
1983    uint const t = min(r2, r4);
1984    r4 = max(r2, r4);
1985    r2 = t;
1986  };
1987  {
1988    uint const t = min(r6, r8);
1989    r8 = max(r6, r8);
1990    r6 = t;
1991  };
1992  {
1993    uint const t = min(r1, r2);
1994    r2 = max(r1, r2);
1995    r1 = t;
1996  };
1997  {
1998    uint const t = min(r3, r4);
1999    r4 = max(r3, r4);
2000    r3 = t;
2001  };
2002  {
2003    uint const t = min(r5, r6);
2004    r6 = max(r5, r6);
2005    r5 = t;
2006  };
2007  {
2008    uint const t = min(r7, r8);
2009    r8 = max(r7, r8);
2010    r7 = t;
2011  };
2012  {
2013    uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
2014    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
2015    ;
2016    {
2017      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
2018      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
2019      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
2020      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2021    };
2022    {
2023      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
2024      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
2025      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
2026      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2027    };
2028    {
2029      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
2030      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
2031      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
2032      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2033    };
2034    {
2035      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
2036      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
2037      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
2038      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2039    };
2040  }
2041  {
2042    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
2043    int const t_lt = get_sub_group_local_id() < half_lane_idx;
2044    ;
2045    {
2046      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2047      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2048    };
2049    {
2050      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2051      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2052    };
2053    {
2054      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2055      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2056    };
2057    {
2058      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2059      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2060    };
2061    {
2062      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2063      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2064    };
2065    {
2066      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2067      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2068    };
2069    {
2070      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2071      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2072    };
2073    {
2074      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2075      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2076    };
2077  }
2078  {
2079    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
2080    int const t_lt = get_sub_group_local_id() < half_lane_idx;
2081    ;
2082    {
2083      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2084      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2085    };
2086    {
2087      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2088      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2089    };
2090    {
2091      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2092      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2093    };
2094    {
2095      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2096      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2097    };
2098    {
2099      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2100      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2101    };
2102    {
2103      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2104      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2105    };
2106    {
2107      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2108      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2109    };
2110    {
2111      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2112      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2113    };
2114  }
2115  {
2116    uint const t = min(r1, r5);
2117    r5 = max(r1, r5);
2118    r1 = t;
2119  };
2120  {
2121    uint const t = min(r3, r7);
2122    r7 = max(r3, r7);
2123    r3 = t;
2124  };
2125  {
2126    uint const t = min(r1, r3);
2127    r3 = max(r1, r3);
2128    r1 = t;
2129  };
2130  {
2131    uint const t = min(r5, r7);
2132    r7 = max(r5, r7);
2133    r5 = t;
2134  };
2135  {
2136    uint const t = min(r2, r6);
2137    r6 = max(r2, r6);
2138    r2 = t;
2139  };
2140  {
2141    uint const t = min(r4, r8);
2142    r8 = max(r4, r8);
2143    r4 = t;
2144  };
2145  {
2146    uint const t = min(r2, r4);
2147    r4 = max(r2, r4);
2148    r2 = t;
2149  };
2150  {
2151    uint const t = min(r6, r8);
2152    r8 = max(r6, r8);
2153    r6 = t;
2154  };
2155  {
2156    uint const t = min(r1, r2);
2157    r2 = max(r1, r2);
2158    r1 = t;
2159  };
2160  {
2161    uint const t = min(r3, r4);
2162    r4 = max(r3, r4);
2163    r3 = t;
2164  };
2165  {
2166    uint const t = min(r5, r6);
2167    r6 = max(r5, r6);
2168    r5 = t;
2169  };
2170  {
2171    uint const t = min(r7, r8);
2172    r8 = max(r7, r8);
2173    r7 = t;
2174  };
2175  {
2176    uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
2177    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
2178    ;
2179    {
2180      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
2181      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
2182      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
2183      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2184    };
2185    {
2186      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
2187      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
2188      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
2189      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2190    };
2191    {
2192      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
2193      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
2194      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
2195      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2196    };
2197    {
2198      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
2199      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
2200      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
2201      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2202    };
2203  }
2204  {
2205    uint const half_lane_idx = get_sub_group_local_id() ^ 4;
2206    int const t_lt = get_sub_group_local_id() < half_lane_idx;
2207    ;
2208    {
2209      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2210      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2211    };
2212    {
2213      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2214      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2215    };
2216    {
2217      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2218      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2219    };
2220    {
2221      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2222      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2223    };
2224    {
2225      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2226      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2227    };
2228    {
2229      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2230      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2231    };
2232    {
2233      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2234      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2235    };
2236    {
2237      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2238      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2239    };
2240  }
2241  {
2242    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
2243    int const t_lt = get_sub_group_local_id() < half_lane_idx;
2244    ;
2245    {
2246      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2247      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2248    };
2249    {
2250      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2251      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2252    };
2253    {
2254      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2255      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2256    };
2257    {
2258      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2259      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2260    };
2261    {
2262      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2263      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2264    };
2265    {
2266      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2267      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2268    };
2269    {
2270      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2271      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2272    };
2273    {
2274      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2275      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2276    };
2277  }
2278  {
2279    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
2280    int const t_lt = get_sub_group_local_id() < half_lane_idx;
2281    ;
2282    {
2283      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2284      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2285    };
2286    {
2287      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2288      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2289    };
2290    {
2291      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2292      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2293    };
2294    {
2295      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2296      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2297    };
2298    {
2299      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2300      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2301    };
2302    {
2303      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2304      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2305    };
2306    {
2307      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2308      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2309    };
2310    {
2311      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2312      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2313    };
2314  }
2315  {
2316    uint const t = min(r1, r5);
2317    r5 = max(r1, r5);
2318    r1 = t;
2319  };
2320  {
2321    uint const t = min(r3, r7);
2322    r7 = max(r3, r7);
2323    r3 = t;
2324  };
2325  {
2326    uint const t = min(r1, r3);
2327    r3 = max(r1, r3);
2328    r1 = t;
2329  };
2330  {
2331    uint const t = min(r5, r7);
2332    r7 = max(r5, r7);
2333    r5 = t;
2334  };
2335  {
2336    uint const t = min(r2, r6);
2337    r6 = max(r2, r6);
2338    r2 = t;
2339  };
2340  {
2341    uint const t = min(r4, r8);
2342    r8 = max(r4, r8);
2343    r4 = t;
2344  };
2345  {
2346    uint const t = min(r2, r4);
2347    r4 = max(r2, r4);
2348    r2 = t;
2349  };
2350  {
2351    uint const t = min(r6, r8);
2352    r8 = max(r6, r8);
2353    r6 = t;
2354  };
2355  {
2356    uint const t = min(r1, r2);
2357    r2 = max(r1, r2);
2358    r1 = t;
2359  };
2360  {
2361    uint const t = min(r3, r4);
2362    r4 = max(r3, r4);
2363    r3 = t;
2364  };
2365  {
2366    uint const t = min(r5, r6);
2367    r6 = max(r5, r6);
2368    r5 = t;
2369  };
2370  {
2371    uint const t = min(r7, r8);
2372    r8 = max(r7, r8);
2373    r7 = t;
2374  };
2375  uint const smem_l_idx =
2376    get_sub_group_id() * ((1 << 4) * 4) + get_sub_group_local_id();
2377  uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 4) +
2378                          (get_sub_group_local_id() ^ ((1 << 4) - 1));
2379  shared.m[get_local_id(0) + (4 * (1 << 4) * 0)] = r1;
2380  shared.m[get_local_id(0) + (4 * (1 << 4) * 1)] = r8;
2381  shared.m[get_local_id(0) + (4 * (1 << 4) * 2)] = r2;
2382  shared.m[get_local_id(0) + (4 * (1 << 4) * 3)] = r7;
2383  shared.m[get_local_id(0) + (4 * (1 << 4) * 4)] = r3;
2384  shared.m[get_local_id(0) + (4 * (1 << 4) * 5)] = r6;
2385  shared.m[get_local_id(0) + (4 * (1 << 4) * 6)] = r4;
2386  shared.m[get_local_id(0) + (4 * (1 << 4) * 7)] = r5;
2387  barrier(CLK_LOCAL_MEM_FENCE);
2388  {
2389    {
2390      uint r0_1 = shared.m[smem_l_idx + (0)];
2391      uint r0_2 = shared.m[smem_r_idx + (16)];
2392      {
2393        uint const t = min(r0_1, r0_2);
2394        r0_2 = max(r0_1, r0_2);
2395        r0_1 = t;
2396      };
2397      shared.m[smem_l_idx + (0)] = r0_1;
2398      shared.m[smem_r_idx + (16)] = r0_2;
2399    }
2400    {
2401      uint r1_1 = shared.m[smem_l_idx + (32)];
2402      uint r1_2 = shared.m[smem_r_idx + (48)];
2403      {
2404        uint const t = min(r1_1, r1_2);
2405        r1_2 = max(r1_1, r1_2);
2406        r1_1 = t;
2407      };
2408      shared.m[smem_l_idx + (32)] = r1_1;
2409      shared.m[smem_r_idx + (48)] = r1_2;
2410    }
2411    {
2412      uint r0_1 = shared.m[smem_l_idx + (256)];
2413      uint r0_2 = shared.m[smem_r_idx + (272)];
2414      {
2415        uint const t = min(r0_1, r0_2);
2416        r0_2 = max(r0_1, r0_2);
2417        r0_1 = t;
2418      };
2419      shared.m[smem_l_idx + (256)] = r0_1;
2420      shared.m[smem_r_idx + (272)] = r0_2;
2421    }
2422    {
2423      uint r1_1 = shared.m[smem_l_idx + (288)];
2424      uint r1_2 = shared.m[smem_r_idx + (304)];
2425      {
2426        uint const t = min(r1_1, r1_2);
2427        r1_2 = max(r1_1, r1_2);
2428        r1_1 = t;
2429      };
2430      shared.m[smem_l_idx + (288)] = r1_1;
2431      shared.m[smem_r_idx + (304)] = r1_2;
2432    }
2433  }
2434  barrier(CLK_LOCAL_MEM_FENCE);
2435  r1 = shared.m[get_local_id(0) + (4 * (1 << 4) * 0)];
2436  r8 = shared.m[get_local_id(0) + (4 * (1 << 4) * 1)];
2437  r2 = shared.m[get_local_id(0) + (4 * (1 << 4) * 2)];
2438  r7 = shared.m[get_local_id(0) + (4 * (1 << 4) * 3)];
2439  r3 = shared.m[get_local_id(0) + (4 * (1 << 4) * 4)];
2440  r6 = shared.m[get_local_id(0) + (4 * (1 << 4) * 5)];
2441  r4 = shared.m[get_local_id(0) + (4 * (1 << 4) * 6)];
2442  r5 = shared.m[get_local_id(0) + (4 * (1 << 4) * 7)];
2443  {
2444    {
2445      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
2446      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2447      ;
2448      {
2449        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2450        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2451      };
2452      {
2453        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2454        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2455      };
2456      {
2457        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2458        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2459      };
2460      {
2461        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2462        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2463      };
2464      {
2465        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2466        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2467      };
2468      {
2469        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2470        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2471      };
2472      {
2473        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2474        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2475      };
2476      {
2477        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2478        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2479      };
2480    }
2481    {
2482      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
2483      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2484      ;
2485      {
2486        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2487        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2488      };
2489      {
2490        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2491        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2492      };
2493      {
2494        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2495        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2496      };
2497      {
2498        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2499        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2500      };
2501      {
2502        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2503        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2504      };
2505      {
2506        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2507        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2508      };
2509      {
2510        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2511        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2512      };
2513      {
2514        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2515        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2516      };
2517    }
2518    {
2519      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
2520      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2521      ;
2522      {
2523        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2524        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2525      };
2526      {
2527        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2528        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2529      };
2530      {
2531        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2532        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2533      };
2534      {
2535        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2536        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2537      };
2538      {
2539        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2540        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2541      };
2542      {
2543        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2544        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2545      };
2546      {
2547        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2548        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2549      };
2550      {
2551        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2552        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2553      };
2554    }
2555    {
2556      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
2557      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2558      ;
2559      {
2560        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2561        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2562      };
2563      {
2564        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2565        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2566      };
2567      {
2568        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2569        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2570      };
2571      {
2572        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2573        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2574      };
2575      {
2576        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2577        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2578      };
2579      {
2580        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2581        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2582      };
2583      {
2584        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2585        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2586      };
2587      {
2588        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2589        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2590      };
2591    }
2592    {
2593      uint const t = min(r1, r5);
2594      r5 = max(r1, r5);
2595      r1 = t;
2596    };
2597    {
2598      uint const t = min(r3, r7);
2599      r7 = max(r3, r7);
2600      r3 = t;
2601    };
2602    {
2603      uint const t = min(r1, r3);
2604      r3 = max(r1, r3);
2605      r1 = t;
2606    };
2607    {
2608      uint const t = min(r5, r7);
2609      r7 = max(r5, r7);
2610      r5 = t;
2611    };
2612    {
2613      uint const t = min(r2, r6);
2614      r6 = max(r2, r6);
2615      r2 = t;
2616    };
2617    {
2618      uint const t = min(r4, r8);
2619      r8 = max(r4, r8);
2620      r4 = t;
2621    };
2622    {
2623      uint const t = min(r2, r4);
2624      r4 = max(r2, r4);
2625      r2 = t;
2626    };
2627    {
2628      uint const t = min(r6, r8);
2629      r8 = max(r6, r8);
2630      r6 = t;
2631    };
2632    {
2633      uint const t = min(r1, r2);
2634      r2 = max(r1, r2);
2635      r1 = t;
2636    };
2637    {
2638      uint const t = min(r3, r4);
2639      r4 = max(r3, r4);
2640      r3 = t;
2641    };
2642    {
2643      uint const t = min(r5, r6);
2644      r6 = max(r5, r6);
2645      r5 = t;
2646    };
2647    {
2648      uint const t = min(r7, r8);
2649      r8 = max(r7, r8);
2650      r7 = t;
2651    };
2652  }
2653  shared.m[get_local_id(0) + (4 * (1 << 4) * 0)] = r1;
2654  shared.m[get_local_id(0) + (4 * (1 << 4) * 1)] = r8;
2655  shared.m[get_local_id(0) + (4 * (1 << 4) * 2)] = r2;
2656  shared.m[get_local_id(0) + (4 * (1 << 4) * 3)] = r7;
2657  shared.m[get_local_id(0) + (4 * (1 << 4) * 4)] = r3;
2658  shared.m[get_local_id(0) + (4 * (1 << 4) * 5)] = r6;
2659  shared.m[get_local_id(0) + (4 * (1 << 4) * 6)] = r4;
2660  shared.m[get_local_id(0) + (4 * (1 << 4) * 7)] = r5;
2661  barrier(CLK_LOCAL_MEM_FENCE);
2662  {
2663    {
2664      uint r0_1 = shared.m[smem_l_idx + (0)];
2665      uint r0_2 = shared.m[smem_l_idx + (16)];
2666      uint r0_3 = shared.m[smem_r_idx + (32)];
2667      uint r0_4 = shared.m[smem_r_idx + (48)];
2668      {
2669        uint const t = min(r0_2, r0_3);
2670        r0_3 = max(r0_2, r0_3);
2671        r0_2 = t;
2672      };
2673      {
2674        uint const t = min(r0_1, r0_4);
2675        r0_4 = max(r0_1, r0_4);
2676        r0_1 = t;
2677      };
2678      {
2679        uint const t = min(r0_3, r0_4);
2680        r0_4 = max(r0_3, r0_4);
2681        r0_3 = t;
2682      };
2683      {
2684        uint const t = min(r0_1, r0_2);
2685        r0_2 = max(r0_1, r0_2);
2686        r0_1 = t;
2687      };
2688      shared.m[smem_l_idx + (0)] = r0_1;
2689      shared.m[smem_l_idx + (16)] = r0_2;
2690      shared.m[smem_r_idx + (32)] = r0_3;
2691      shared.m[smem_r_idx + (48)] = r0_4;
2692    }
2693    {
2694      uint r0_1 = shared.m[smem_l_idx + (256)];
2695      uint r0_2 = shared.m[smem_l_idx + (272)];
2696      uint r0_3 = shared.m[smem_r_idx + (288)];
2697      uint r0_4 = shared.m[smem_r_idx + (304)];
2698      {
2699        uint const t = min(r0_2, r0_3);
2700        r0_3 = max(r0_2, r0_3);
2701        r0_2 = t;
2702      };
2703      {
2704        uint const t = min(r0_1, r0_4);
2705        r0_4 = max(r0_1, r0_4);
2706        r0_1 = t;
2707      };
2708      {
2709        uint const t = min(r0_3, r0_4);
2710        r0_4 = max(r0_3, r0_4);
2711        r0_3 = t;
2712      };
2713      {
2714        uint const t = min(r0_1, r0_2);
2715        r0_2 = max(r0_1, r0_2);
2716        r0_1 = t;
2717      };
2718      shared.m[smem_l_idx + (256)] = r0_1;
2719      shared.m[smem_l_idx + (272)] = r0_2;
2720      shared.m[smem_r_idx + (288)] = r0_3;
2721      shared.m[smem_r_idx + (304)] = r0_4;
2722    }
2723  }
2724  barrier(CLK_LOCAL_MEM_FENCE);
2725  r1 = shared.m[get_local_id(0) + (4 * (1 << 4) * 0)];
2726  r8 = shared.m[get_local_id(0) + (4 * (1 << 4) * 1)];
2727  r2 = shared.m[get_local_id(0) + (4 * (1 << 4) * 2)];
2728  r7 = shared.m[get_local_id(0) + (4 * (1 << 4) * 3)];
2729  r3 = shared.m[get_local_id(0) + (4 * (1 << 4) * 4)];
2730  r6 = shared.m[get_local_id(0) + (4 * (1 << 4) * 5)];
2731  r4 = shared.m[get_local_id(0) + (4 * (1 << 4) * 6)];
2732  r5 = shared.m[get_local_id(0) + (4 * (1 << 4) * 7)];
2733  {
2734    {
2735      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
2736      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2737      ;
2738      {
2739        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2740        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2741      };
2742      {
2743        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2744        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2745      };
2746      {
2747        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2748        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2749      };
2750      {
2751        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2752        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2753      };
2754      {
2755        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2756        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2757      };
2758      {
2759        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2760        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2761      };
2762      {
2763        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2764        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2765      };
2766      {
2767        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2768        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2769      };
2770    }
2771    {
2772      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
2773      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2774      ;
2775      {
2776        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2777        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2778      };
2779      {
2780        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2781        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2782      };
2783      {
2784        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2785        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2786      };
2787      {
2788        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2789        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2790      };
2791      {
2792        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2793        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2794      };
2795      {
2796        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2797        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2798      };
2799      {
2800        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2801        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2802      };
2803      {
2804        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2805        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2806      };
2807    }
2808    {
2809      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
2810      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2811      ;
2812      {
2813        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2814        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2815      };
2816      {
2817        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2818        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2819      };
2820      {
2821        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2822        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2823      };
2824      {
2825        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2826        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2827      };
2828      {
2829        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2830        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2831      };
2832      {
2833        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2834        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2835      };
2836      {
2837        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2838        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2839      };
2840      {
2841        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2842        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2843      };
2844    }
2845    {
2846      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
2847      int const t_lt = get_sub_group_local_id() < half_lane_idx;
2848      ;
2849      {
2850        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
2851        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
2852      };
2853      {
2854        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
2855        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
2856      };
2857      {
2858        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
2859        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
2860      };
2861      {
2862        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
2863        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
2864      };
2865      {
2866        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
2867        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
2868      };
2869      {
2870        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
2871        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
2872      };
2873      {
2874        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
2875        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
2876      };
2877      {
2878        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
2879        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
2880      };
2881    }
2882    {
2883      uint const t = min(r1, r5);
2884      r5 = max(r1, r5);
2885      r1 = t;
2886    };
2887    {
2888      uint const t = min(r3, r7);
2889      r7 = max(r3, r7);
2890      r3 = t;
2891    };
2892    {
2893      uint const t = min(r1, r3);
2894      r3 = max(r1, r3);
2895      r1 = t;
2896    };
2897    {
2898      uint const t = min(r5, r7);
2899      r7 = max(r5, r7);
2900      r5 = t;
2901    };
2902    {
2903      uint const t = min(r2, r6);
2904      r6 = max(r2, r6);
2905      r2 = t;
2906    };
2907    {
2908      uint const t = min(r4, r8);
2909      r8 = max(r4, r8);
2910      r4 = t;
2911    };
2912    {
2913      uint const t = min(r2, r4);
2914      r4 = max(r2, r4);
2915      r2 = t;
2916    };
2917    {
2918      uint const t = min(r6, r8);
2919      r8 = max(r6, r8);
2920      r6 = t;
2921    };
2922    {
2923      uint const t = min(r1, r2);
2924      r2 = max(r1, r2);
2925      r1 = t;
2926    };
2927    {
2928      uint const t = min(r3, r4);
2929      r4 = max(r3, r4);
2930      r3 = t;
2931    };
2932    {
2933      uint const t = min(r5, r6);
2934      r6 = max(r5, r6);
2935      r5 = t;
2936    };
2937    {
2938      uint const t = min(r7, r8);
2939      r8 = max(r7, r8);
2940      r7 = t;
2941    };
2942  }
2943  vout[gmem_idx + (1 << 4) * 0] = r1;
2944  vout[gmem_idx + (1 << 4) * 1] = r2;
2945  vout[gmem_idx + (1 << 4) * 2] = r3;
2946  vout[gmem_idx + (1 << 4) * 3] = r4;
2947  vout[gmem_idx + (1 << 4) * 4] = r5;
2948  vout[gmem_idx + (1 << 4) * 5] = r6;
2949  vout[gmem_idx + (1 << 4) * 6] = r7;
2950  vout[gmem_idx + (1 << 4) * 7] = r8;
2951}
2952
2953__kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
2954__attribute__((reqd_work_group_size((1 << 4) * 8, 1, 1))) void
2955hs_kernel_bs_3(__global uint const* const restrict vin,
2956               __global uint* const restrict vout)
2957{
2958  __local struct
2959  {
2960    uint m[128 * 8];
2961  } shared;
2962
2963  uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
2964                        (get_local_id(0) & ((1 << 4) - 1));
2965  uint r1 = vin[gmem_idx + (1 << 4) * 0];
2966  uint r2 = vin[gmem_idx + (1 << 4) * 1];
2967  uint r3 = vin[gmem_idx + (1 << 4) * 2];
2968  uint r4 = vin[gmem_idx + (1 << 4) * 3];
2969  uint r5 = vin[gmem_idx + (1 << 4) * 4];
2970  uint r6 = vin[gmem_idx + (1 << 4) * 5];
2971  uint r7 = vin[gmem_idx + (1 << 4) * 6];
2972  uint r8 = vin[gmem_idx + (1 << 4) * 7];
2973  {
2974    uint const t = min(r1, r5);
2975    r5 = max(r1, r5);
2976    r1 = t;
2977  };
2978  {
2979    uint const t = min(r2, r6);
2980    r6 = max(r2, r6);
2981    r2 = t;
2982  };
2983  {
2984    uint const t = min(r3, r7);
2985    r7 = max(r3, r7);
2986    r3 = t;
2987  };
2988  {
2989    uint const t = min(r4, r8);
2990    r8 = max(r4, r8);
2991    r4 = t;
2992  };
2993  {
2994    uint const t = min(r1, r3);
2995    r3 = max(r1, r3);
2996    r1 = t;
2997  };
2998  {
2999    uint const t = min(r2, r4);
3000    r4 = max(r2, r4);
3001    r2 = t;
3002  };
3003  {
3004    uint const t = min(r5, r7);
3005    r7 = max(r5, r7);
3006    r5 = t;
3007  };
3008  {
3009    uint const t = min(r6, r8);
3010    r8 = max(r6, r8);
3011    r6 = t;
3012  };
3013  {
3014    uint const t = min(r3, r5);
3015    r5 = max(r3, r5);
3016    r3 = t;
3017  };
3018  {
3019    uint const t = min(r4, r6);
3020    r6 = max(r4, r6);
3021    r4 = t;
3022  };
3023  {
3024    uint const t = min(r1, r2);
3025    r2 = max(r1, r2);
3026    r1 = t;
3027  };
3028  {
3029    uint const t = min(r3, r4);
3030    r4 = max(r3, r4);
3031    r3 = t;
3032  };
3033  {
3034    uint const t = min(r5, r6);
3035    r6 = max(r5, r6);
3036    r5 = t;
3037  };
3038  {
3039    uint const t = min(r7, r8);
3040    r8 = max(r7, r8);
3041    r7 = t;
3042  };
3043  {
3044    uint const t = min(r2, r5);
3045    r5 = max(r2, r5);
3046    r2 = t;
3047  };
3048  {
3049    uint const t = min(r4, r7);
3050    r7 = max(r4, r7);
3051    r4 = t;
3052  };
3053  {
3054    uint const t = min(r2, r3);
3055    r3 = max(r2, r3);
3056    r2 = t;
3057  };
3058  {
3059    uint const t = min(r4, r5);
3060    r5 = max(r4, r5);
3061    r4 = t;
3062  };
3063  {
3064    uint const t = min(r6, r7);
3065    r7 = max(r6, r7);
3066    r6 = t;
3067  };
3068  {
3069    uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
3070    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
3071    ;
3072    {
3073      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
3074      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
3075      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
3076      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3077    };
3078    {
3079      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
3080      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
3081      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
3082      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3083    };
3084    {
3085      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
3086      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
3087      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
3088      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3089    };
3090    {
3091      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
3092      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
3093      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
3094      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3095    };
3096  }
3097  {
3098    uint const t = min(r1, r5);
3099    r5 = max(r1, r5);
3100    r1 = t;
3101  };
3102  {
3103    uint const t = min(r3, r7);
3104    r7 = max(r3, r7);
3105    r3 = t;
3106  };
3107  {
3108    uint const t = min(r1, r3);
3109    r3 = max(r1, r3);
3110    r1 = t;
3111  };
3112  {
3113    uint const t = min(r5, r7);
3114    r7 = max(r5, r7);
3115    r5 = t;
3116  };
3117  {
3118    uint const t = min(r2, r6);
3119    r6 = max(r2, r6);
3120    r2 = t;
3121  };
3122  {
3123    uint const t = min(r4, r8);
3124    r8 = max(r4, r8);
3125    r4 = t;
3126  };
3127  {
3128    uint const t = min(r2, r4);
3129    r4 = max(r2, r4);
3130    r2 = t;
3131  };
3132  {
3133    uint const t = min(r6, r8);
3134    r8 = max(r6, r8);
3135    r6 = t;
3136  };
3137  {
3138    uint const t = min(r1, r2);
3139    r2 = max(r1, r2);
3140    r1 = t;
3141  };
3142  {
3143    uint const t = min(r3, r4);
3144    r4 = max(r3, r4);
3145    r3 = t;
3146  };
3147  {
3148    uint const t = min(r5, r6);
3149    r6 = max(r5, r6);
3150    r5 = t;
3151  };
3152  {
3153    uint const t = min(r7, r8);
3154    r8 = max(r7, r8);
3155    r7 = t;
3156  };
3157  {
3158    uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
3159    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
3160    ;
3161    {
3162      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
3163      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
3164      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
3165      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3166    };
3167    {
3168      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
3169      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
3170      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
3171      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3172    };
3173    {
3174      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
3175      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
3176      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
3177      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3178    };
3179    {
3180      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
3181      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
3182      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
3183      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3184    };
3185  }
3186  {
3187    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
3188    int const t_lt = get_sub_group_local_id() < half_lane_idx;
3189    ;
3190    {
3191      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3192      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3193    };
3194    {
3195      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3196      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3197    };
3198    {
3199      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3200      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3201    };
3202    {
3203      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3204      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3205    };
3206    {
3207      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3208      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3209    };
3210    {
3211      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3212      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3213    };
3214    {
3215      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3216      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3217    };
3218    {
3219      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3220      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3221    };
3222  }
3223  {
3224    uint const t = min(r1, r5);
3225    r5 = max(r1, r5);
3226    r1 = t;
3227  };
3228  {
3229    uint const t = min(r3, r7);
3230    r7 = max(r3, r7);
3231    r3 = t;
3232  };
3233  {
3234    uint const t = min(r1, r3);
3235    r3 = max(r1, r3);
3236    r1 = t;
3237  };
3238  {
3239    uint const t = min(r5, r7);
3240    r7 = max(r5, r7);
3241    r5 = t;
3242  };
3243  {
3244    uint const t = min(r2, r6);
3245    r6 = max(r2, r6);
3246    r2 = t;
3247  };
3248  {
3249    uint const t = min(r4, r8);
3250    r8 = max(r4, r8);
3251    r4 = t;
3252  };
3253  {
3254    uint const t = min(r2, r4);
3255    r4 = max(r2, r4);
3256    r2 = t;
3257  };
3258  {
3259    uint const t = min(r6, r8);
3260    r8 = max(r6, r8);
3261    r6 = t;
3262  };
3263  {
3264    uint const t = min(r1, r2);
3265    r2 = max(r1, r2);
3266    r1 = t;
3267  };
3268  {
3269    uint const t = min(r3, r4);
3270    r4 = max(r3, r4);
3271    r3 = t;
3272  };
3273  {
3274    uint const t = min(r5, r6);
3275    r6 = max(r5, r6);
3276    r5 = t;
3277  };
3278  {
3279    uint const t = min(r7, r8);
3280    r8 = max(r7, r8);
3281    r7 = t;
3282  };
3283  {
3284    uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
3285    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
3286    ;
3287    {
3288      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
3289      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
3290      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
3291      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3292    };
3293    {
3294      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
3295      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
3296      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
3297      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3298    };
3299    {
3300      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
3301      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
3302      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
3303      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3304    };
3305    {
3306      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
3307      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
3308      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
3309      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3310    };
3311  }
3312  {
3313    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
3314    int const t_lt = get_sub_group_local_id() < half_lane_idx;
3315    ;
3316    {
3317      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3318      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3319    };
3320    {
3321      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3322      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3323    };
3324    {
3325      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3326      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3327    };
3328    {
3329      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3330      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3331    };
3332    {
3333      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3334      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3335    };
3336    {
3337      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3338      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3339    };
3340    {
3341      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3342      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3343    };
3344    {
3345      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3346      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3347    };
3348  }
3349  {
3350    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
3351    int const t_lt = get_sub_group_local_id() < half_lane_idx;
3352    ;
3353    {
3354      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3355      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3356    };
3357    {
3358      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3359      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3360    };
3361    {
3362      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3363      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3364    };
3365    {
3366      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3367      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3368    };
3369    {
3370      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3371      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3372    };
3373    {
3374      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3375      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3376    };
3377    {
3378      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3379      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3380    };
3381    {
3382      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3383      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3384    };
3385  }
3386  {
3387    uint const t = min(r1, r5);
3388    r5 = max(r1, r5);
3389    r1 = t;
3390  };
3391  {
3392    uint const t = min(r3, r7);
3393    r7 = max(r3, r7);
3394    r3 = t;
3395  };
3396  {
3397    uint const t = min(r1, r3);
3398    r3 = max(r1, r3);
3399    r1 = t;
3400  };
3401  {
3402    uint const t = min(r5, r7);
3403    r7 = max(r5, r7);
3404    r5 = t;
3405  };
3406  {
3407    uint const t = min(r2, r6);
3408    r6 = max(r2, r6);
3409    r2 = t;
3410  };
3411  {
3412    uint const t = min(r4, r8);
3413    r8 = max(r4, r8);
3414    r4 = t;
3415  };
3416  {
3417    uint const t = min(r2, r4);
3418    r4 = max(r2, r4);
3419    r2 = t;
3420  };
3421  {
3422    uint const t = min(r6, r8);
3423    r8 = max(r6, r8);
3424    r6 = t;
3425  };
3426  {
3427    uint const t = min(r1, r2);
3428    r2 = max(r1, r2);
3429    r1 = t;
3430  };
3431  {
3432    uint const t = min(r3, r4);
3433    r4 = max(r3, r4);
3434    r3 = t;
3435  };
3436  {
3437    uint const t = min(r5, r6);
3438    r6 = max(r5, r6);
3439    r5 = t;
3440  };
3441  {
3442    uint const t = min(r7, r8);
3443    r8 = max(r7, r8);
3444    r7 = t;
3445  };
3446  {
3447    uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
3448    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
3449    ;
3450    {
3451      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
3452      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
3453      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
3454      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3455    };
3456    {
3457      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
3458      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
3459      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
3460      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3461    };
3462    {
3463      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
3464      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
3465      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
3466      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3467    };
3468    {
3469      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
3470      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
3471      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
3472      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3473    };
3474  }
3475  {
3476    uint const half_lane_idx = get_sub_group_local_id() ^ 4;
3477    int const t_lt = get_sub_group_local_id() < half_lane_idx;
3478    ;
3479    {
3480      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3481      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3482    };
3483    {
3484      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3485      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3486    };
3487    {
3488      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3489      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3490    };
3491    {
3492      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3493      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3494    };
3495    {
3496      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3497      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3498    };
3499    {
3500      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3501      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3502    };
3503    {
3504      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3505      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3506    };
3507    {
3508      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3509      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3510    };
3511  }
3512  {
3513    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
3514    int const t_lt = get_sub_group_local_id() < half_lane_idx;
3515    ;
3516    {
3517      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3518      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3519    };
3520    {
3521      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3522      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3523    };
3524    {
3525      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3526      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3527    };
3528    {
3529      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3530      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3531    };
3532    {
3533      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3534      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3535    };
3536    {
3537      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3538      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3539    };
3540    {
3541      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3542      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3543    };
3544    {
3545      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3546      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3547    };
3548  }
3549  {
3550    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
3551    int const t_lt = get_sub_group_local_id() < half_lane_idx;
3552    ;
3553    {
3554      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3555      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3556    };
3557    {
3558      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3559      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3560    };
3561    {
3562      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3563      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3564    };
3565    {
3566      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3567      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3568    };
3569    {
3570      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3571      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3572    };
3573    {
3574      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3575      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3576    };
3577    {
3578      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3579      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3580    };
3581    {
3582      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3583      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3584    };
3585  }
3586  {
3587    uint const t = min(r1, r5);
3588    r5 = max(r1, r5);
3589    r1 = t;
3590  };
3591  {
3592    uint const t = min(r3, r7);
3593    r7 = max(r3, r7);
3594    r3 = t;
3595  };
3596  {
3597    uint const t = min(r1, r3);
3598    r3 = max(r1, r3);
3599    r1 = t;
3600  };
3601  {
3602    uint const t = min(r5, r7);
3603    r7 = max(r5, r7);
3604    r5 = t;
3605  };
3606  {
3607    uint const t = min(r2, r6);
3608    r6 = max(r2, r6);
3609    r2 = t;
3610  };
3611  {
3612    uint const t = min(r4, r8);
3613    r8 = max(r4, r8);
3614    r4 = t;
3615  };
3616  {
3617    uint const t = min(r2, r4);
3618    r4 = max(r2, r4);
3619    r2 = t;
3620  };
3621  {
3622    uint const t = min(r6, r8);
3623    r8 = max(r6, r8);
3624    r6 = t;
3625  };
3626  {
3627    uint const t = min(r1, r2);
3628    r2 = max(r1, r2);
3629    r1 = t;
3630  };
3631  {
3632    uint const t = min(r3, r4);
3633    r4 = max(r3, r4);
3634    r3 = t;
3635  };
3636  {
3637    uint const t = min(r5, r6);
3638    r6 = max(r5, r6);
3639    r5 = t;
3640  };
3641  {
3642    uint const t = min(r7, r8);
3643    r8 = max(r7, r8);
3644    r7 = t;
3645  };
3646  uint const smem_l_idx =
3647    get_sub_group_id() * ((1 << 4) * 8) + get_sub_group_local_id();
3648  uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 8) +
3649                          (get_sub_group_local_id() ^ ((1 << 4) - 1));
3650  shared.m[get_local_id(0) + (8 * (1 << 4) * 0)] = r1;
3651  shared.m[get_local_id(0) + (8 * (1 << 4) * 1)] = r8;
3652  shared.m[get_local_id(0) + (8 * (1 << 4) * 2)] = r2;
3653  shared.m[get_local_id(0) + (8 * (1 << 4) * 3)] = r7;
3654  shared.m[get_local_id(0) + (8 * (1 << 4) * 4)] = r3;
3655  shared.m[get_local_id(0) + (8 * (1 << 4) * 5)] = r6;
3656  shared.m[get_local_id(0) + (8 * (1 << 4) * 6)] = r4;
3657  shared.m[get_local_id(0) + (8 * (1 << 4) * 7)] = r5;
3658  barrier(CLK_LOCAL_MEM_FENCE);
3659  {
3660    {
3661      uint r0_1 = shared.m[smem_l_idx + (0)];
3662      uint r0_2 = shared.m[smem_r_idx + (16)];
3663      {
3664        uint const t = min(r0_1, r0_2);
3665        r0_2 = max(r0_1, r0_2);
3666        r0_1 = t;
3667      };
3668      shared.m[smem_l_idx + (0)] = r0_1;
3669      shared.m[smem_r_idx + (16)] = r0_2;
3670    }
3671    {
3672      uint r1_1 = shared.m[smem_l_idx + (32)];
3673      uint r1_2 = shared.m[smem_r_idx + (48)];
3674      {
3675        uint const t = min(r1_1, r1_2);
3676        r1_2 = max(r1_1, r1_2);
3677        r1_1 = t;
3678      };
3679      shared.m[smem_l_idx + (32)] = r1_1;
3680      shared.m[smem_r_idx + (48)] = r1_2;
3681    }
3682    {
3683      uint r2_1 = shared.m[smem_l_idx + (64)];
3684      uint r2_2 = shared.m[smem_r_idx + (80)];
3685      {
3686        uint const t = min(r2_1, r2_2);
3687        r2_2 = max(r2_1, r2_2);
3688        r2_1 = t;
3689      };
3690      shared.m[smem_l_idx + (64)] = r2_1;
3691      shared.m[smem_r_idx + (80)] = r2_2;
3692    }
3693    {
3694      uint r3_1 = shared.m[smem_l_idx + (96)];
3695      uint r3_2 = shared.m[smem_r_idx + (112)];
3696      {
3697        uint const t = min(r3_1, r3_2);
3698        r3_2 = max(r3_1, r3_2);
3699        r3_1 = t;
3700      };
3701      shared.m[smem_l_idx + (96)] = r3_1;
3702      shared.m[smem_r_idx + (112)] = r3_2;
3703    }
3704  }
3705  barrier(CLK_LOCAL_MEM_FENCE);
3706  r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)];
3707  r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)];
3708  r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)];
3709  r7 = shared.m[get_local_id(0) + (8 * (1 << 4) * 3)];
3710  r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)];
3711  r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)];
3712  r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)];
3713  r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)];
3714  {
3715    {
3716      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
3717      int const t_lt = get_sub_group_local_id() < half_lane_idx;
3718      ;
3719      {
3720        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3721        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3722      };
3723      {
3724        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3725        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3726      };
3727      {
3728        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3729        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3730      };
3731      {
3732        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3733        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3734      };
3735      {
3736        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3737        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3738      };
3739      {
3740        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3741        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3742      };
3743      {
3744        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3745        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3746      };
3747      {
3748        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3749        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3750      };
3751    }
3752    {
3753      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
3754      int const t_lt = get_sub_group_local_id() < half_lane_idx;
3755      ;
3756      {
3757        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3758        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3759      };
3760      {
3761        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3762        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3763      };
3764      {
3765        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3766        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3767      };
3768      {
3769        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3770        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3771      };
3772      {
3773        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3774        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3775      };
3776      {
3777        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3778        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3779      };
3780      {
3781        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3782        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3783      };
3784      {
3785        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3786        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3787      };
3788    }
3789    {
3790      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
3791      int const t_lt = get_sub_group_local_id() < half_lane_idx;
3792      ;
3793      {
3794        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3795        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3796      };
3797      {
3798        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3799        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3800      };
3801      {
3802        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3803        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3804      };
3805      {
3806        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3807        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3808      };
3809      {
3810        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3811        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3812      };
3813      {
3814        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3815        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3816      };
3817      {
3818        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3819        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3820      };
3821      {
3822        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3823        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3824      };
3825    }
3826    {
3827      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
3828      int const t_lt = get_sub_group_local_id() < half_lane_idx;
3829      ;
3830      {
3831        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
3832        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
3833      };
3834      {
3835        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
3836        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
3837      };
3838      {
3839        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
3840        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
3841      };
3842      {
3843        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
3844        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
3845      };
3846      {
3847        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
3848        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
3849      };
3850      {
3851        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
3852        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
3853      };
3854      {
3855        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
3856        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
3857      };
3858      {
3859        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
3860        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
3861      };
3862    }
3863    {
3864      uint const t = min(r1, r5);
3865      r5 = max(r1, r5);
3866      r1 = t;
3867    };
3868    {
3869      uint const t = min(r3, r7);
3870      r7 = max(r3, r7);
3871      r3 = t;
3872    };
3873    {
3874      uint const t = min(r1, r3);
3875      r3 = max(r1, r3);
3876      r1 = t;
3877    };
3878    {
3879      uint const t = min(r5, r7);
3880      r7 = max(r5, r7);
3881      r5 = t;
3882    };
3883    {
3884      uint const t = min(r2, r6);
3885      r6 = max(r2, r6);
3886      r2 = t;
3887    };
3888    {
3889      uint const t = min(r4, r8);
3890      r8 = max(r4, r8);
3891      r4 = t;
3892    };
3893    {
3894      uint const t = min(r2, r4);
3895      r4 = max(r2, r4);
3896      r2 = t;
3897    };
3898    {
3899      uint const t = min(r6, r8);
3900      r8 = max(r6, r8);
3901      r6 = t;
3902    };
3903    {
3904      uint const t = min(r1, r2);
3905      r2 = max(r1, r2);
3906      r1 = t;
3907    };
3908    {
3909      uint const t = min(r3, r4);
3910      r4 = max(r3, r4);
3911      r3 = t;
3912    };
3913    {
3914      uint const t = min(r5, r6);
3915      r6 = max(r5, r6);
3916      r5 = t;
3917    };
3918    {
3919      uint const t = min(r7, r8);
3920      r8 = max(r7, r8);
3921      r7 = t;
3922    };
3923  }
3924  shared.m[get_local_id(0) + (8 * (1 << 4) * 0)] = r1;
3925  shared.m[get_local_id(0) + (8 * (1 << 4) * 1)] = r8;
3926  shared.m[get_local_id(0) + (8 * (1 << 4) * 2)] = r2;
3927  shared.m[get_local_id(0) + (8 * (1 << 4) * 3)] = r7;
3928  shared.m[get_local_id(0) + (8 * (1 << 4) * 4)] = r3;
3929  shared.m[get_local_id(0) + (8 * (1 << 4) * 5)] = r6;
3930  shared.m[get_local_id(0) + (8 * (1 << 4) * 6)] = r4;
3931  shared.m[get_local_id(0) + (8 * (1 << 4) * 7)] = r5;
3932  barrier(CLK_LOCAL_MEM_FENCE);
3933  {
3934    {
3935      uint r0_1 = shared.m[smem_l_idx + (0)];
3936      uint r0_2 = shared.m[smem_l_idx + (16)];
3937      uint r0_3 = shared.m[smem_r_idx + (32)];
3938      uint r0_4 = shared.m[smem_r_idx + (48)];
3939      {
3940        uint const t = min(r0_2, r0_3);
3941        r0_3 = max(r0_2, r0_3);
3942        r0_2 = t;
3943      };
3944      {
3945        uint const t = min(r0_1, r0_4);
3946        r0_4 = max(r0_1, r0_4);
3947        r0_1 = t;
3948      };
3949      {
3950        uint const t = min(r0_3, r0_4);
3951        r0_4 = max(r0_3, r0_4);
3952        r0_3 = t;
3953      };
3954      {
3955        uint const t = min(r0_1, r0_2);
3956        r0_2 = max(r0_1, r0_2);
3957        r0_1 = t;
3958      };
3959      shared.m[smem_l_idx + (0)] = r0_1;
3960      shared.m[smem_l_idx + (16)] = r0_2;
3961      shared.m[smem_r_idx + (32)] = r0_3;
3962      shared.m[smem_r_idx + (48)] = r0_4;
3963    }
3964    {
3965      uint r1_1 = shared.m[smem_l_idx + (64)];
3966      uint r1_2 = shared.m[smem_l_idx + (80)];
3967      uint r1_3 = shared.m[smem_r_idx + (96)];
3968      uint r1_4 = shared.m[smem_r_idx + (112)];
3969      {
3970        uint const t = min(r1_2, r1_3);
3971        r1_3 = max(r1_2, r1_3);
3972        r1_2 = t;
3973      };
3974      {
3975        uint const t = min(r1_1, r1_4);
3976        r1_4 = max(r1_1, r1_4);
3977        r1_1 = t;
3978      };
3979      {
3980        uint const t = min(r1_3, r1_4);
3981        r1_4 = max(r1_3, r1_4);
3982        r1_3 = t;
3983      };
3984      {
3985        uint const t = min(r1_1, r1_2);
3986        r1_2 = max(r1_1, r1_2);
3987        r1_1 = t;
3988      };
3989      shared.m[smem_l_idx + (64)] = r1_1;
3990      shared.m[smem_l_idx + (80)] = r1_2;
3991      shared.m[smem_r_idx + (96)] = r1_3;
3992      shared.m[smem_r_idx + (112)] = r1_4;
3993    }
3994  }
3995  barrier(CLK_LOCAL_MEM_FENCE);
3996  r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)];
3997  r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)];
3998  r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)];
3999  r7 = shared.m[get_local_id(0) + (8 * (1 << 4) * 3)];
4000  r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)];
4001  r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)];
4002  r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)];
4003  r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)];
4004  {
4005    {
4006      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
4007      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4008      ;
4009      {
4010        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4011        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4012      };
4013      {
4014        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4015        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4016      };
4017      {
4018        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4019        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4020      };
4021      {
4022        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4023        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4024      };
4025      {
4026        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4027        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4028      };
4029      {
4030        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4031        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4032      };
4033      {
4034        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4035        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4036      };
4037      {
4038        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4039        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4040      };
4041    }
4042    {
4043      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
4044      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4045      ;
4046      {
4047        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4048        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4049      };
4050      {
4051        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4052        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4053      };
4054      {
4055        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4056        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4057      };
4058      {
4059        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4060        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4061      };
4062      {
4063        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4064        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4065      };
4066      {
4067        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4068        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4069      };
4070      {
4071        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4072        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4073      };
4074      {
4075        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4076        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4077      };
4078    }
4079    {
4080      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
4081      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4082      ;
4083      {
4084        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4085        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4086      };
4087      {
4088        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4089        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4090      };
4091      {
4092        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4093        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4094      };
4095      {
4096        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4097        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4098      };
4099      {
4100        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4101        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4102      };
4103      {
4104        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4105        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4106      };
4107      {
4108        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4109        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4110      };
4111      {
4112        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4113        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4114      };
4115    }
4116    {
4117      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
4118      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4119      ;
4120      {
4121        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4122        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4123      };
4124      {
4125        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4126        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4127      };
4128      {
4129        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4130        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4131      };
4132      {
4133        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4134        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4135      };
4136      {
4137        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4138        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4139      };
4140      {
4141        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4142        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4143      };
4144      {
4145        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4146        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4147      };
4148      {
4149        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4150        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4151      };
4152    }
4153    {
4154      uint const t = min(r1, r5);
4155      r5 = max(r1, r5);
4156      r1 = t;
4157    };
4158    {
4159      uint const t = min(r3, r7);
4160      r7 = max(r3, r7);
4161      r3 = t;
4162    };
4163    {
4164      uint const t = min(r1, r3);
4165      r3 = max(r1, r3);
4166      r1 = t;
4167    };
4168    {
4169      uint const t = min(r5, r7);
4170      r7 = max(r5, r7);
4171      r5 = t;
4172    };
4173    {
4174      uint const t = min(r2, r6);
4175      r6 = max(r2, r6);
4176      r2 = t;
4177    };
4178    {
4179      uint const t = min(r4, r8);
4180      r8 = max(r4, r8);
4181      r4 = t;
4182    };
4183    {
4184      uint const t = min(r2, r4);
4185      r4 = max(r2, r4);
4186      r2 = t;
4187    };
4188    {
4189      uint const t = min(r6, r8);
4190      r8 = max(r6, r8);
4191      r6 = t;
4192    };
4193    {
4194      uint const t = min(r1, r2);
4195      r2 = max(r1, r2);
4196      r1 = t;
4197    };
4198    {
4199      uint const t = min(r3, r4);
4200      r4 = max(r3, r4);
4201      r3 = t;
4202    };
4203    {
4204      uint const t = min(r5, r6);
4205      r6 = max(r5, r6);
4206      r5 = t;
4207    };
4208    {
4209      uint const t = min(r7, r8);
4210      r8 = max(r7, r8);
4211      r7 = t;
4212    };
4213  }
4214  shared.m[get_local_id(0) + (8 * (1 << 4) * 0)] = r1;
4215  shared.m[get_local_id(0) + (8 * (1 << 4) * 1)] = r8;
4216  shared.m[get_local_id(0) + (8 * (1 << 4) * 2)] = r2;
4217  shared.m[get_local_id(0) + (8 * (1 << 4) * 3)] = r7;
4218  shared.m[get_local_id(0) + (8 * (1 << 4) * 4)] = r3;
4219  shared.m[get_local_id(0) + (8 * (1 << 4) * 5)] = r6;
4220  shared.m[get_local_id(0) + (8 * (1 << 4) * 6)] = r4;
4221  shared.m[get_local_id(0) + (8 * (1 << 4) * 7)] = r5;
4222  barrier(CLK_LOCAL_MEM_FENCE);
4223  {
4224    {
4225      uint r0_1 = shared.m[smem_l_idx + (0)];
4226      uint r0_2 = shared.m[smem_l_idx + (16)];
4227      uint r0_3 = shared.m[smem_l_idx + (32)];
4228      uint r0_4 = shared.m[smem_l_idx + (48)];
4229      uint r0_5 = shared.m[smem_r_idx + (64)];
4230      uint r0_6 = shared.m[smem_r_idx + (80)];
4231      uint r0_7 = shared.m[smem_r_idx + (96)];
4232      uint r0_8 = shared.m[smem_r_idx + (112)];
4233      {
4234        uint const t = min(r0_4, r0_5);
4235        r0_5 = max(r0_4, r0_5);
4236        r0_4 = t;
4237      };
4238      {
4239        uint const t = min(r0_3, r0_6);
4240        r0_6 = max(r0_3, r0_6);
4241        r0_3 = t;
4242      };
4243      {
4244        uint const t = min(r0_2, r0_7);
4245        r0_7 = max(r0_2, r0_7);
4246        r0_2 = t;
4247      };
4248      {
4249        uint const t = min(r0_1, r0_8);
4250        r0_8 = max(r0_1, r0_8);
4251        r0_1 = t;
4252      };
4253      {
4254        uint const t = min(r0_5, r0_7);
4255        r0_7 = max(r0_5, r0_7);
4256        r0_5 = t;
4257      };
4258      {
4259        uint const t = min(r0_6, r0_8);
4260        r0_8 = max(r0_6, r0_8);
4261        r0_6 = t;
4262      };
4263      {
4264        uint const t = min(r0_5, r0_6);
4265        r0_6 = max(r0_5, r0_6);
4266        r0_5 = t;
4267      };
4268      {
4269        uint const t = min(r0_7, r0_8);
4270        r0_8 = max(r0_7, r0_8);
4271        r0_7 = t;
4272      };
4273      {
4274        uint const t = min(r0_1, r0_3);
4275        r0_3 = max(r0_1, r0_3);
4276        r0_1 = t;
4277      };
4278      {
4279        uint const t = min(r0_2, r0_4);
4280        r0_4 = max(r0_2, r0_4);
4281        r0_2 = t;
4282      };
4283      {
4284        uint const t = min(r0_1, r0_2);
4285        r0_2 = max(r0_1, r0_2);
4286        r0_1 = t;
4287      };
4288      {
4289        uint const t = min(r0_3, r0_4);
4290        r0_4 = max(r0_3, r0_4);
4291        r0_3 = t;
4292      };
4293      shared.m[smem_l_idx + (0)] = r0_1;
4294      shared.m[smem_l_idx + (16)] = r0_2;
4295      shared.m[smem_l_idx + (32)] = r0_3;
4296      shared.m[smem_l_idx + (48)] = r0_4;
4297      shared.m[smem_r_idx + (64)] = r0_5;
4298      shared.m[smem_r_idx + (80)] = r0_6;
4299      shared.m[smem_r_idx + (96)] = r0_7;
4300      shared.m[smem_r_idx + (112)] = r0_8;
4301    }
4302  }
4303  barrier(CLK_LOCAL_MEM_FENCE);
4304  r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)];
4305  r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)];
4306  r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)];
4307  r7 = shared.m[get_local_id(0) + (8 * (1 << 4) * 3)];
4308  r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)];
4309  r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)];
4310  r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)];
4311  r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)];
4312  {
4313    {
4314      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
4315      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4316      ;
4317      {
4318        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4319        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4320      };
4321      {
4322        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4323        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4324      };
4325      {
4326        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4327        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4328      };
4329      {
4330        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4331        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4332      };
4333      {
4334        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4335        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4336      };
4337      {
4338        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4339        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4340      };
4341      {
4342        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4343        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4344      };
4345      {
4346        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4347        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4348      };
4349    }
4350    {
4351      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
4352      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4353      ;
4354      {
4355        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4356        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4357      };
4358      {
4359        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4360        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4361      };
4362      {
4363        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4364        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4365      };
4366      {
4367        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4368        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4369      };
4370      {
4371        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4372        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4373      };
4374      {
4375        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4376        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4377      };
4378      {
4379        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4380        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4381      };
4382      {
4383        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4384        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4385      };
4386    }
4387    {
4388      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
4389      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4390      ;
4391      {
4392        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4393        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4394      };
4395      {
4396        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4397        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4398      };
4399      {
4400        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4401        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4402      };
4403      {
4404        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4405        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4406      };
4407      {
4408        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4409        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4410      };
4411      {
4412        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4413        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4414      };
4415      {
4416        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4417        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4418      };
4419      {
4420        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4421        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4422      };
4423    }
4424    {
4425      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
4426      int const t_lt = get_sub_group_local_id() < half_lane_idx;
4427      ;
4428      {
4429        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4430        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4431      };
4432      {
4433        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4434        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4435      };
4436      {
4437        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4438        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4439      };
4440      {
4441        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4442        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4443      };
4444      {
4445        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4446        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4447      };
4448      {
4449        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4450        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4451      };
4452      {
4453        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4454        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4455      };
4456      {
4457        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4458        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4459      };
4460    }
4461    {
4462      uint const t = min(r1, r5);
4463      r5 = max(r1, r5);
4464      r1 = t;
4465    };
4466    {
4467      uint const t = min(r3, r7);
4468      r7 = max(r3, r7);
4469      r3 = t;
4470    };
4471    {
4472      uint const t = min(r1, r3);
4473      r3 = max(r1, r3);
4474      r1 = t;
4475    };
4476    {
4477      uint const t = min(r5, r7);
4478      r7 = max(r5, r7);
4479      r5 = t;
4480    };
4481    {
4482      uint const t = min(r2, r6);
4483      r6 = max(r2, r6);
4484      r2 = t;
4485    };
4486    {
4487      uint const t = min(r4, r8);
4488      r8 = max(r4, r8);
4489      r4 = t;
4490    };
4491    {
4492      uint const t = min(r2, r4);
4493      r4 = max(r2, r4);
4494      r2 = t;
4495    };
4496    {
4497      uint const t = min(r6, r8);
4498      r8 = max(r6, r8);
4499      r6 = t;
4500    };
4501    {
4502      uint const t = min(r1, r2);
4503      r2 = max(r1, r2);
4504      r1 = t;
4505    };
4506    {
4507      uint const t = min(r3, r4);
4508      r4 = max(r3, r4);
4509      r3 = t;
4510    };
4511    {
4512      uint const t = min(r5, r6);
4513      r6 = max(r5, r6);
4514      r5 = t;
4515    };
4516    {
4517      uint const t = min(r7, r8);
4518      r8 = max(r7, r8);
4519      r7 = t;
4520    };
4521  }
4522  vout[gmem_idx + (1 << 4) * 0] = r1;
4523  vout[gmem_idx + (1 << 4) * 1] = r2;
4524  vout[gmem_idx + (1 << 4) * 2] = r3;
4525  vout[gmem_idx + (1 << 4) * 3] = r4;
4526  vout[gmem_idx + (1 << 4) * 4] = r5;
4527  vout[gmem_idx + (1 << 4) * 5] = r6;
4528  vout[gmem_idx + (1 << 4) * 6] = r7;
4529  vout[gmem_idx + (1 << 4) * 7] = r8;
4530}
4531
4532__kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
4533__attribute__((reqd_work_group_size((1 << 4) * 16, 1, 1))) void
4534hs_kernel_bs_4(__global uint const* const restrict vin,
4535               __global uint* const restrict vout)
4536{
4537  __local struct
4538  {
4539    uint m[256 * 8];
4540  } shared;
4541
4542  uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
4543                        (get_local_id(0) & ((1 << 4) - 1));
4544  uint r1 = vin[gmem_idx + (1 << 4) * 0];
4545  uint r2 = vin[gmem_idx + (1 << 4) * 1];
4546  uint r3 = vin[gmem_idx + (1 << 4) * 2];
4547  uint r4 = vin[gmem_idx + (1 << 4) * 3];
4548  uint r5 = vin[gmem_idx + (1 << 4) * 4];
4549  uint r6 = vin[gmem_idx + (1 << 4) * 5];
4550  uint r7 = vin[gmem_idx + (1 << 4) * 6];
4551  uint r8 = vin[gmem_idx + (1 << 4) * 7];
4552  {
4553    uint const t = min(r1, r5);
4554    r5 = max(r1, r5);
4555    r1 = t;
4556  };
4557  {
4558    uint const t = min(r2, r6);
4559    r6 = max(r2, r6);
4560    r2 = t;
4561  };
4562  {
4563    uint const t = min(r3, r7);
4564    r7 = max(r3, r7);
4565    r3 = t;
4566  };
4567  {
4568    uint const t = min(r4, r8);
4569    r8 = max(r4, r8);
4570    r4 = t;
4571  };
4572  {
4573    uint const t = min(r1, r3);
4574    r3 = max(r1, r3);
4575    r1 = t;
4576  };
4577  {
4578    uint const t = min(r2, r4);
4579    r4 = max(r2, r4);
4580    r2 = t;
4581  };
4582  {
4583    uint const t = min(r5, r7);
4584    r7 = max(r5, r7);
4585    r5 = t;
4586  };
4587  {
4588    uint const t = min(r6, r8);
4589    r8 = max(r6, r8);
4590    r6 = t;
4591  };
4592  {
4593    uint const t = min(r3, r5);
4594    r5 = max(r3, r5);
4595    r3 = t;
4596  };
4597  {
4598    uint const t = min(r4, r6);
4599    r6 = max(r4, r6);
4600    r4 = t;
4601  };
4602  {
4603    uint const t = min(r1, r2);
4604    r2 = max(r1, r2);
4605    r1 = t;
4606  };
4607  {
4608    uint const t = min(r3, r4);
4609    r4 = max(r3, r4);
4610    r3 = t;
4611  };
4612  {
4613    uint const t = min(r5, r6);
4614    r6 = max(r5, r6);
4615    r5 = t;
4616  };
4617  {
4618    uint const t = min(r7, r8);
4619    r8 = max(r7, r8);
4620    r7 = t;
4621  };
4622  {
4623    uint const t = min(r2, r5);
4624    r5 = max(r2, r5);
4625    r2 = t;
4626  };
4627  {
4628    uint const t = min(r4, r7);
4629    r7 = max(r4, r7);
4630    r4 = t;
4631  };
4632  {
4633    uint const t = min(r2, r3);
4634    r3 = max(r2, r3);
4635    r2 = t;
4636  };
4637  {
4638    uint const t = min(r4, r5);
4639    r5 = max(r4, r5);
4640    r4 = t;
4641  };
4642  {
4643    uint const t = min(r6, r7);
4644    r7 = max(r6, r7);
4645    r6 = t;
4646  };
4647  {
4648    uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
4649    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
4650    ;
4651    {
4652      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
4653      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
4654      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
4655      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4656    };
4657    {
4658      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
4659      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
4660      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
4661      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4662    };
4663    {
4664      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
4665      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
4666      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
4667      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4668    };
4669    {
4670      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
4671      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
4672      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
4673      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4674    };
4675  }
4676  {
4677    uint const t = min(r1, r5);
4678    r5 = max(r1, r5);
4679    r1 = t;
4680  };
4681  {
4682    uint const t = min(r3, r7);
4683    r7 = max(r3, r7);
4684    r3 = t;
4685  };
4686  {
4687    uint const t = min(r1, r3);
4688    r3 = max(r1, r3);
4689    r1 = t;
4690  };
4691  {
4692    uint const t = min(r5, r7);
4693    r7 = max(r5, r7);
4694    r5 = t;
4695  };
4696  {
4697    uint const t = min(r2, r6);
4698    r6 = max(r2, r6);
4699    r2 = t;
4700  };
4701  {
4702    uint const t = min(r4, r8);
4703    r8 = max(r4, r8);
4704    r4 = t;
4705  };
4706  {
4707    uint const t = min(r2, r4);
4708    r4 = max(r2, r4);
4709    r2 = t;
4710  };
4711  {
4712    uint const t = min(r6, r8);
4713    r8 = max(r6, r8);
4714    r6 = t;
4715  };
4716  {
4717    uint const t = min(r1, r2);
4718    r2 = max(r1, r2);
4719    r1 = t;
4720  };
4721  {
4722    uint const t = min(r3, r4);
4723    r4 = max(r3, r4);
4724    r3 = t;
4725  };
4726  {
4727    uint const t = min(r5, r6);
4728    r6 = max(r5, r6);
4729    r5 = t;
4730  };
4731  {
4732    uint const t = min(r7, r8);
4733    r8 = max(r7, r8);
4734    r7 = t;
4735  };
4736  {
4737    uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
4738    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
4739    ;
4740    {
4741      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
4742      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
4743      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
4744      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4745    };
4746    {
4747      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
4748      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
4749      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
4750      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4751    };
4752    {
4753      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
4754      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
4755      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
4756      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4757    };
4758    {
4759      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
4760      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
4761      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
4762      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4763    };
4764  }
4765  {
4766    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
4767    int const t_lt = get_sub_group_local_id() < half_lane_idx;
4768    ;
4769    {
4770      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4771      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4772    };
4773    {
4774      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4775      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4776    };
4777    {
4778      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4779      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4780    };
4781    {
4782      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4783      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4784    };
4785    {
4786      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4787      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4788    };
4789    {
4790      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4791      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4792    };
4793    {
4794      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4795      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4796    };
4797    {
4798      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4799      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4800    };
4801  }
4802  {
4803    uint const t = min(r1, r5);
4804    r5 = max(r1, r5);
4805    r1 = t;
4806  };
4807  {
4808    uint const t = min(r3, r7);
4809    r7 = max(r3, r7);
4810    r3 = t;
4811  };
4812  {
4813    uint const t = min(r1, r3);
4814    r3 = max(r1, r3);
4815    r1 = t;
4816  };
4817  {
4818    uint const t = min(r5, r7);
4819    r7 = max(r5, r7);
4820    r5 = t;
4821  };
4822  {
4823    uint const t = min(r2, r6);
4824    r6 = max(r2, r6);
4825    r2 = t;
4826  };
4827  {
4828    uint const t = min(r4, r8);
4829    r8 = max(r4, r8);
4830    r4 = t;
4831  };
4832  {
4833    uint const t = min(r2, r4);
4834    r4 = max(r2, r4);
4835    r2 = t;
4836  };
4837  {
4838    uint const t = min(r6, r8);
4839    r8 = max(r6, r8);
4840    r6 = t;
4841  };
4842  {
4843    uint const t = min(r1, r2);
4844    r2 = max(r1, r2);
4845    r1 = t;
4846  };
4847  {
4848    uint const t = min(r3, r4);
4849    r4 = max(r3, r4);
4850    r3 = t;
4851  };
4852  {
4853    uint const t = min(r5, r6);
4854    r6 = max(r5, r6);
4855    r5 = t;
4856  };
4857  {
4858    uint const t = min(r7, r8);
4859    r8 = max(r7, r8);
4860    r7 = t;
4861  };
4862  {
4863    uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
4864    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
4865    ;
4866    {
4867      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
4868      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
4869      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
4870      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4871    };
4872    {
4873      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
4874      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
4875      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
4876      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4877    };
4878    {
4879      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
4880      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
4881      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
4882      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4883    };
4884    {
4885      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
4886      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
4887      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
4888      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4889    };
4890  }
4891  {
4892    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
4893    int const t_lt = get_sub_group_local_id() < half_lane_idx;
4894    ;
4895    {
4896      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4897      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4898    };
4899    {
4900      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4901      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4902    };
4903    {
4904      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4905      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4906    };
4907    {
4908      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4909      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4910    };
4911    {
4912      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4913      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4914    };
4915    {
4916      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4917      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4918    };
4919    {
4920      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4921      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4922    };
4923    {
4924      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4925      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4926    };
4927  }
4928  {
4929    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
4930    int const t_lt = get_sub_group_local_id() < half_lane_idx;
4931    ;
4932    {
4933      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
4934      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
4935    };
4936    {
4937      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
4938      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
4939    };
4940    {
4941      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
4942      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
4943    };
4944    {
4945      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
4946      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
4947    };
4948    {
4949      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
4950      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
4951    };
4952    {
4953      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
4954      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
4955    };
4956    {
4957      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
4958      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
4959    };
4960    {
4961      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
4962      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
4963    };
4964  }
4965  {
4966    uint const t = min(r1, r5);
4967    r5 = max(r1, r5);
4968    r1 = t;
4969  };
4970  {
4971    uint const t = min(r3, r7);
4972    r7 = max(r3, r7);
4973    r3 = t;
4974  };
4975  {
4976    uint const t = min(r1, r3);
4977    r3 = max(r1, r3);
4978    r1 = t;
4979  };
4980  {
4981    uint const t = min(r5, r7);
4982    r7 = max(r5, r7);
4983    r5 = t;
4984  };
4985  {
4986    uint const t = min(r2, r6);
4987    r6 = max(r2, r6);
4988    r2 = t;
4989  };
4990  {
4991    uint const t = min(r4, r8);
4992    r8 = max(r4, r8);
4993    r4 = t;
4994  };
4995  {
4996    uint const t = min(r2, r4);
4997    r4 = max(r2, r4);
4998    r2 = t;
4999  };
5000  {
5001    uint const t = min(r6, r8);
5002    r8 = max(r6, r8);
5003    r6 = t;
5004  };
5005  {
5006    uint const t = min(r1, r2);
5007    r2 = max(r1, r2);
5008    r1 = t;
5009  };
5010  {
5011    uint const t = min(r3, r4);
5012    r4 = max(r3, r4);
5013    r3 = t;
5014  };
5015  {
5016    uint const t = min(r5, r6);
5017    r6 = max(r5, r6);
5018    r5 = t;
5019  };
5020  {
5021    uint const t = min(r7, r8);
5022    r8 = max(r7, r8);
5023    r7 = t;
5024  };
5025  {
5026    uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
5027    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
5028    ;
5029    {
5030      uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
5031      uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
5032      r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
5033      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5034    };
5035    {
5036      uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
5037      uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
5038      r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
5039      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5040    };
5041    {
5042      uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
5043      uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
5044      r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
5045      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5046    };
5047    {
5048      uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
5049      uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
5050      r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
5051      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5052    };
5053  }
5054  {
5055    uint const half_lane_idx = get_sub_group_local_id() ^ 4;
5056    int const t_lt = get_sub_group_local_id() < half_lane_idx;
5057    ;
5058    {
5059      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5060      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5061    };
5062    {
5063      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5064      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5065    };
5066    {
5067      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5068      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5069    };
5070    {
5071      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5072      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5073    };
5074    {
5075      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5076      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5077    };
5078    {
5079      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5080      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5081    };
5082    {
5083      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5084      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5085    };
5086    {
5087      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5088      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5089    };
5090  }
5091  {
5092    uint const half_lane_idx = get_sub_group_local_id() ^ 2;
5093    int const t_lt = get_sub_group_local_id() < half_lane_idx;
5094    ;
5095    {
5096      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5097      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5098    };
5099    {
5100      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5101      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5102    };
5103    {
5104      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5105      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5106    };
5107    {
5108      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5109      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5110    };
5111    {
5112      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5113      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5114    };
5115    {
5116      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5117      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5118    };
5119    {
5120      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5121      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5122    };
5123    {
5124      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5125      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5126    };
5127  }
5128  {
5129    uint const half_lane_idx = get_sub_group_local_id() ^ 1;
5130    int const t_lt = get_sub_group_local_id() < half_lane_idx;
5131    ;
5132    {
5133      uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5134      r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5135    };
5136    {
5137      uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5138      r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5139    };
5140    {
5141      uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5142      r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5143    };
5144    {
5145      uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5146      r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5147    };
5148    {
5149      uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5150      r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5151    };
5152    {
5153      uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5154      r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5155    };
5156    {
5157      uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5158      r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5159    };
5160    {
5161      uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5162      r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5163    };
5164  }
5165  {
5166    uint const t = min(r1, r5);
5167    r5 = max(r1, r5);
5168    r1 = t;
5169  };
5170  {
5171    uint const t = min(r3, r7);
5172    r7 = max(r3, r7);
5173    r3 = t;
5174  };
5175  {
5176    uint const t = min(r1, r3);
5177    r3 = max(r1, r3);
5178    r1 = t;
5179  };
5180  {
5181    uint const t = min(r5, r7);
5182    r7 = max(r5, r7);
5183    r5 = t;
5184  };
5185  {
5186    uint const t = min(r2, r6);
5187    r6 = max(r2, r6);
5188    r2 = t;
5189  };
5190  {
5191    uint const t = min(r4, r8);
5192    r8 = max(r4, r8);
5193    r4 = t;
5194  };
5195  {
5196    uint const t = min(r2, r4);
5197    r4 = max(r2, r4);
5198    r2 = t;
5199  };
5200  {
5201    uint const t = min(r6, r8);
5202    r8 = max(r6, r8);
5203    r6 = t;
5204  };
5205  {
5206    uint const t = min(r1, r2);
5207    r2 = max(r1, r2);
5208    r1 = t;
5209  };
5210  {
5211    uint const t = min(r3, r4);
5212    r4 = max(r3, r4);
5213    r3 = t;
5214  };
5215  {
5216    uint const t = min(r5, r6);
5217    r6 = max(r5, r6);
5218    r5 = t;
5219  };
5220  {
5221    uint const t = min(r7, r8);
5222    r8 = max(r7, r8);
5223    r7 = t;
5224  };
5225  uint const smem_l_idx =
5226    get_sub_group_id() * ((1 << 4) * 16) + get_sub_group_local_id();
5227  uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 16) +
5228                          (get_sub_group_local_id() ^ ((1 << 4) - 1));
5229  shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1;
5230  shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8;
5231  shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2;
5232  shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7;
5233  shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3;
5234  shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6;
5235  shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4;
5236  shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5;
5237  barrier(CLK_LOCAL_MEM_FENCE);
5238  if (get_sub_group_id() < 8) {
5239    {
5240      uint r0_1 = shared.m[smem_l_idx + (0)];
5241      uint r0_2 = shared.m[smem_r_idx + (16)];
5242      {
5243        uint const t = min(r0_1, r0_2);
5244        r0_2 = max(r0_1, r0_2);
5245        r0_1 = t;
5246      };
5247      shared.m[smem_l_idx + (0)] = r0_1;
5248      shared.m[smem_r_idx + (16)] = r0_2;
5249    }
5250    {
5251      uint r1_1 = shared.m[smem_l_idx + (32)];
5252      uint r1_2 = shared.m[smem_r_idx + (48)];
5253      {
5254        uint const t = min(r1_1, r1_2);
5255        r1_2 = max(r1_1, r1_2);
5256        r1_1 = t;
5257      };
5258      shared.m[smem_l_idx + (32)] = r1_1;
5259      shared.m[smem_r_idx + (48)] = r1_2;
5260    }
5261    {
5262      uint r2_1 = shared.m[smem_l_idx + (64)];
5263      uint r2_2 = shared.m[smem_r_idx + (80)];
5264      {
5265        uint const t = min(r2_1, r2_2);
5266        r2_2 = max(r2_1, r2_2);
5267        r2_1 = t;
5268      };
5269      shared.m[smem_l_idx + (64)] = r2_1;
5270      shared.m[smem_r_idx + (80)] = r2_2;
5271    }
5272    {
5273      uint r3_1 = shared.m[smem_l_idx + (96)];
5274      uint r3_2 = shared.m[smem_r_idx + (112)];
5275      {
5276        uint const t = min(r3_1, r3_2);
5277        r3_2 = max(r3_1, r3_2);
5278        r3_1 = t;
5279      };
5280      shared.m[smem_l_idx + (96)] = r3_1;
5281      shared.m[smem_r_idx + (112)] = r3_2;
5282    }
5283    {
5284      uint r4_1 = shared.m[smem_l_idx + (128)];
5285      uint r4_2 = shared.m[smem_r_idx + (144)];
5286      {
5287        uint const t = min(r4_1, r4_2);
5288        r4_2 = max(r4_1, r4_2);
5289        r4_1 = t;
5290      };
5291      shared.m[smem_l_idx + (128)] = r4_1;
5292      shared.m[smem_r_idx + (144)] = r4_2;
5293    }
5294    {
5295      uint r5_1 = shared.m[smem_l_idx + (160)];
5296      uint r5_2 = shared.m[smem_r_idx + (176)];
5297      {
5298        uint const t = min(r5_1, r5_2);
5299        r5_2 = max(r5_1, r5_2);
5300        r5_1 = t;
5301      };
5302      shared.m[smem_l_idx + (160)] = r5_1;
5303      shared.m[smem_r_idx + (176)] = r5_2;
5304    }
5305    {
5306      uint r6_1 = shared.m[smem_l_idx + (192)];
5307      uint r6_2 = shared.m[smem_r_idx + (208)];
5308      {
5309        uint const t = min(r6_1, r6_2);
5310        r6_2 = max(r6_1, r6_2);
5311        r6_1 = t;
5312      };
5313      shared.m[smem_l_idx + (192)] = r6_1;
5314      shared.m[smem_r_idx + (208)] = r6_2;
5315    }
5316    {
5317      uint r7_1 = shared.m[smem_l_idx + (224)];
5318      uint r7_2 = shared.m[smem_r_idx + (240)];
5319      {
5320        uint const t = min(r7_1, r7_2);
5321        r7_2 = max(r7_1, r7_2);
5322        r7_1 = t;
5323      };
5324      shared.m[smem_l_idx + (224)] = r7_1;
5325      shared.m[smem_r_idx + (240)] = r7_2;
5326    }
5327  }
5328  barrier(CLK_LOCAL_MEM_FENCE);
5329  r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)];
5330  r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)];
5331  r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)];
5332  r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)];
5333  r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)];
5334  r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)];
5335  r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)];
5336  r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)];
5337  {
5338    {
5339      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
5340      int const t_lt = get_sub_group_local_id() < half_lane_idx;
5341      ;
5342      {
5343        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5344        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5345      };
5346      {
5347        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5348        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5349      };
5350      {
5351        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5352        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5353      };
5354      {
5355        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5356        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5357      };
5358      {
5359        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5360        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5361      };
5362      {
5363        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5364        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5365      };
5366      {
5367        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5368        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5369      };
5370      {
5371        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5372        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5373      };
5374    }
5375    {
5376      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
5377      int const t_lt = get_sub_group_local_id() < half_lane_idx;
5378      ;
5379      {
5380        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5381        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5382      };
5383      {
5384        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5385        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5386      };
5387      {
5388        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5389        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5390      };
5391      {
5392        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5393        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5394      };
5395      {
5396        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5397        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5398      };
5399      {
5400        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5401        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5402      };
5403      {
5404        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5405        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5406      };
5407      {
5408        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5409        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5410      };
5411    }
5412    {
5413      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
5414      int const t_lt = get_sub_group_local_id() < half_lane_idx;
5415      ;
5416      {
5417        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5418        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5419      };
5420      {
5421        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5422        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5423      };
5424      {
5425        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5426        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5427      };
5428      {
5429        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5430        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5431      };
5432      {
5433        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5434        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5435      };
5436      {
5437        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5438        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5439      };
5440      {
5441        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5442        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5443      };
5444      {
5445        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5446        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5447      };
5448    }
5449    {
5450      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
5451      int const t_lt = get_sub_group_local_id() < half_lane_idx;
5452      ;
5453      {
5454        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5455        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5456      };
5457      {
5458        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5459        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5460      };
5461      {
5462        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5463        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5464      };
5465      {
5466        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5467        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5468      };
5469      {
5470        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5471        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5472      };
5473      {
5474        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5475        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5476      };
5477      {
5478        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5479        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5480      };
5481      {
5482        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5483        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5484      };
5485    }
5486    {
5487      uint const t = min(r1, r5);
5488      r5 = max(r1, r5);
5489      r1 = t;
5490    };
5491    {
5492      uint const t = min(r3, r7);
5493      r7 = max(r3, r7);
5494      r3 = t;
5495    };
5496    {
5497      uint const t = min(r1, r3);
5498      r3 = max(r1, r3);
5499      r1 = t;
5500    };
5501    {
5502      uint const t = min(r5, r7);
5503      r7 = max(r5, r7);
5504      r5 = t;
5505    };
5506    {
5507      uint const t = min(r2, r6);
5508      r6 = max(r2, r6);
5509      r2 = t;
5510    };
5511    {
5512      uint const t = min(r4, r8);
5513      r8 = max(r4, r8);
5514      r4 = t;
5515    };
5516    {
5517      uint const t = min(r2, r4);
5518      r4 = max(r2, r4);
5519      r2 = t;
5520    };
5521    {
5522      uint const t = min(r6, r8);
5523      r8 = max(r6, r8);
5524      r6 = t;
5525    };
5526    {
5527      uint const t = min(r1, r2);
5528      r2 = max(r1, r2);
5529      r1 = t;
5530    };
5531    {
5532      uint const t = min(r3, r4);
5533      r4 = max(r3, r4);
5534      r3 = t;
5535    };
5536    {
5537      uint const t = min(r5, r6);
5538      r6 = max(r5, r6);
5539      r5 = t;
5540    };
5541    {
5542      uint const t = min(r7, r8);
5543      r8 = max(r7, r8);
5544      r7 = t;
5545    };
5546  }
5547  shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1;
5548  shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8;
5549  shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2;
5550  shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7;
5551  shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3;
5552  shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6;
5553  shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4;
5554  shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5;
5555  barrier(CLK_LOCAL_MEM_FENCE);
5556  if (get_sub_group_id() < 8) {
5557    {
5558      uint r0_1 = shared.m[smem_l_idx + (0)];
5559      uint r0_2 = shared.m[smem_l_idx + (16)];
5560      uint r0_3 = shared.m[smem_r_idx + (32)];
5561      uint r0_4 = shared.m[smem_r_idx + (48)];
5562      {
5563        uint const t = min(r0_2, r0_3);
5564        r0_3 = max(r0_2, r0_3);
5565        r0_2 = t;
5566      };
5567      {
5568        uint const t = min(r0_1, r0_4);
5569        r0_4 = max(r0_1, r0_4);
5570        r0_1 = t;
5571      };
5572      {
5573        uint const t = min(r0_3, r0_4);
5574        r0_4 = max(r0_3, r0_4);
5575        r0_3 = t;
5576      };
5577      {
5578        uint const t = min(r0_1, r0_2);
5579        r0_2 = max(r0_1, r0_2);
5580        r0_1 = t;
5581      };
5582      shared.m[smem_l_idx + (0)] = r0_1;
5583      shared.m[smem_l_idx + (16)] = r0_2;
5584      shared.m[smem_r_idx + (32)] = r0_3;
5585      shared.m[smem_r_idx + (48)] = r0_4;
5586    }
5587    {
5588      uint r1_1 = shared.m[smem_l_idx + (64)];
5589      uint r1_2 = shared.m[smem_l_idx + (80)];
5590      uint r1_3 = shared.m[smem_r_idx + (96)];
5591      uint r1_4 = shared.m[smem_r_idx + (112)];
5592      {
5593        uint const t = min(r1_2, r1_3);
5594        r1_3 = max(r1_2, r1_3);
5595        r1_2 = t;
5596      };
5597      {
5598        uint const t = min(r1_1, r1_4);
5599        r1_4 = max(r1_1, r1_4);
5600        r1_1 = t;
5601      };
5602      {
5603        uint const t = min(r1_3, r1_4);
5604        r1_4 = max(r1_3, r1_4);
5605        r1_3 = t;
5606      };
5607      {
5608        uint const t = min(r1_1, r1_2);
5609        r1_2 = max(r1_1, r1_2);
5610        r1_1 = t;
5611      };
5612      shared.m[smem_l_idx + (64)] = r1_1;
5613      shared.m[smem_l_idx + (80)] = r1_2;
5614      shared.m[smem_r_idx + (96)] = r1_3;
5615      shared.m[smem_r_idx + (112)] = r1_4;
5616    }
5617    {
5618      uint r2_1 = shared.m[smem_l_idx + (128)];
5619      uint r2_2 = shared.m[smem_l_idx + (144)];
5620      uint r2_3 = shared.m[smem_r_idx + (160)];
5621      uint r2_4 = shared.m[smem_r_idx + (176)];
5622      {
5623        uint const t = min(r2_2, r2_3);
5624        r2_3 = max(r2_2, r2_3);
5625        r2_2 = t;
5626      };
5627      {
5628        uint const t = min(r2_1, r2_4);
5629        r2_4 = max(r2_1, r2_4);
5630        r2_1 = t;
5631      };
5632      {
5633        uint const t = min(r2_3, r2_4);
5634        r2_4 = max(r2_3, r2_4);
5635        r2_3 = t;
5636      };
5637      {
5638        uint const t = min(r2_1, r2_2);
5639        r2_2 = max(r2_1, r2_2);
5640        r2_1 = t;
5641      };
5642      shared.m[smem_l_idx + (128)] = r2_1;
5643      shared.m[smem_l_idx + (144)] = r2_2;
5644      shared.m[smem_r_idx + (160)] = r2_3;
5645      shared.m[smem_r_idx + (176)] = r2_4;
5646    }
5647    {
5648      uint r3_1 = shared.m[smem_l_idx + (192)];
5649      uint r3_2 = shared.m[smem_l_idx + (208)];
5650      uint r3_3 = shared.m[smem_r_idx + (224)];
5651      uint r3_4 = shared.m[smem_r_idx + (240)];
5652      {
5653        uint const t = min(r3_2, r3_3);
5654        r3_3 = max(r3_2, r3_3);
5655        r3_2 = t;
5656      };
5657      {
5658        uint const t = min(r3_1, r3_4);
5659        r3_4 = max(r3_1, r3_4);
5660        r3_1 = t;
5661      };
5662      {
5663        uint const t = min(r3_3, r3_4);
5664        r3_4 = max(r3_3, r3_4);
5665        r3_3 = t;
5666      };
5667      {
5668        uint const t = min(r3_1, r3_2);
5669        r3_2 = max(r3_1, r3_2);
5670        r3_1 = t;
5671      };
5672      shared.m[smem_l_idx + (192)] = r3_1;
5673      shared.m[smem_l_idx + (208)] = r3_2;
5674      shared.m[smem_r_idx + (224)] = r3_3;
5675      shared.m[smem_r_idx + (240)] = r3_4;
5676    }
5677  }
5678  barrier(CLK_LOCAL_MEM_FENCE);
5679  r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)];
5680  r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)];
5681  r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)];
5682  r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)];
5683  r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)];
5684  r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)];
5685  r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)];
5686  r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)];
5687  {
5688    {
5689      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
5690      int const t_lt = get_sub_group_local_id() < half_lane_idx;
5691      ;
5692      {
5693        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5694        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5695      };
5696      {
5697        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5698        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5699      };
5700      {
5701        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5702        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5703      };
5704      {
5705        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5706        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5707      };
5708      {
5709        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5710        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5711      };
5712      {
5713        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5714        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5715      };
5716      {
5717        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5718        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5719      };
5720      {
5721        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5722        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5723      };
5724    }
5725    {
5726      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
5727      int const t_lt = get_sub_group_local_id() < half_lane_idx;
5728      ;
5729      {
5730        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5731        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5732      };
5733      {
5734        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5735        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5736      };
5737      {
5738        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5739        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5740      };
5741      {
5742        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5743        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5744      };
5745      {
5746        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5747        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5748      };
5749      {
5750        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5751        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5752      };
5753      {
5754        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5755        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5756      };
5757      {
5758        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5759        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5760      };
5761    }
5762    {
5763      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
5764      int const t_lt = get_sub_group_local_id() < half_lane_idx;
5765      ;
5766      {
5767        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5768        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5769      };
5770      {
5771        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5772        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5773      };
5774      {
5775        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5776        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5777      };
5778      {
5779        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5780        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5781      };
5782      {
5783        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5784        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5785      };
5786      {
5787        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5788        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5789      };
5790      {
5791        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5792        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5793      };
5794      {
5795        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5796        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5797      };
5798    }
5799    {
5800      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
5801      int const t_lt = get_sub_group_local_id() < half_lane_idx;
5802      ;
5803      {
5804        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
5805        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
5806      };
5807      {
5808        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
5809        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
5810      };
5811      {
5812        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
5813        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
5814      };
5815      {
5816        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
5817        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
5818      };
5819      {
5820        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
5821        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
5822      };
5823      {
5824        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
5825        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
5826      };
5827      {
5828        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
5829        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
5830      };
5831      {
5832        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
5833        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
5834      };
5835    }
5836    {
5837      uint const t = min(r1, r5);
5838      r5 = max(r1, r5);
5839      r1 = t;
5840    };
5841    {
5842      uint const t = min(r3, r7);
5843      r7 = max(r3, r7);
5844      r3 = t;
5845    };
5846    {
5847      uint const t = min(r1, r3);
5848      r3 = max(r1, r3);
5849      r1 = t;
5850    };
5851    {
5852      uint const t = min(r5, r7);
5853      r7 = max(r5, r7);
5854      r5 = t;
5855    };
5856    {
5857      uint const t = min(r2, r6);
5858      r6 = max(r2, r6);
5859      r2 = t;
5860    };
5861    {
5862      uint const t = min(r4, r8);
5863      r8 = max(r4, r8);
5864      r4 = t;
5865    };
5866    {
5867      uint const t = min(r2, r4);
5868      r4 = max(r2, r4);
5869      r2 = t;
5870    };
5871    {
5872      uint const t = min(r6, r8);
5873      r8 = max(r6, r8);
5874      r6 = t;
5875    };
5876    {
5877      uint const t = min(r1, r2);
5878      r2 = max(r1, r2);
5879      r1 = t;
5880    };
5881    {
5882      uint const t = min(r3, r4);
5883      r4 = max(r3, r4);
5884      r3 = t;
5885    };
5886    {
5887      uint const t = min(r5, r6);
5888      r6 = max(r5, r6);
5889      r5 = t;
5890    };
5891    {
5892      uint const t = min(r7, r8);
5893      r8 = max(r7, r8);
5894      r7 = t;
5895    };
5896  }
5897  shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1;
5898  shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8;
5899  shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2;
5900  shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7;
5901  shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3;
5902  shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6;
5903  shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4;
5904  shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5;
5905  barrier(CLK_LOCAL_MEM_FENCE);
5906  if (get_sub_group_id() < 8) {
5907    {
5908      uint r0_1 = shared.m[smem_l_idx + (0)];
5909      uint r0_2 = shared.m[smem_l_idx + (16)];
5910      uint r0_3 = shared.m[smem_l_idx + (32)];
5911      uint r0_4 = shared.m[smem_l_idx + (48)];
5912      uint r0_5 = shared.m[smem_r_idx + (64)];
5913      uint r0_6 = shared.m[smem_r_idx + (80)];
5914      uint r0_7 = shared.m[smem_r_idx + (96)];
5915      uint r0_8 = shared.m[smem_r_idx + (112)];
5916      {
5917        uint const t = min(r0_4, r0_5);
5918        r0_5 = max(r0_4, r0_5);
5919        r0_4 = t;
5920      };
5921      {
5922        uint const t = min(r0_3, r0_6);
5923        r0_6 = max(r0_3, r0_6);
5924        r0_3 = t;
5925      };
5926      {
5927        uint const t = min(r0_2, r0_7);
5928        r0_7 = max(r0_2, r0_7);
5929        r0_2 = t;
5930      };
5931      {
5932        uint const t = min(r0_1, r0_8);
5933        r0_8 = max(r0_1, r0_8);
5934        r0_1 = t;
5935      };
5936      {
5937        uint const t = min(r0_5, r0_7);
5938        r0_7 = max(r0_5, r0_7);
5939        r0_5 = t;
5940      };
5941      {
5942        uint const t = min(r0_6, r0_8);
5943        r0_8 = max(r0_6, r0_8);
5944        r0_6 = t;
5945      };
5946      {
5947        uint const t = min(r0_5, r0_6);
5948        r0_6 = max(r0_5, r0_6);
5949        r0_5 = t;
5950      };
5951      {
5952        uint const t = min(r0_7, r0_8);
5953        r0_8 = max(r0_7, r0_8);
5954        r0_7 = t;
5955      };
5956      {
5957        uint const t = min(r0_1, r0_3);
5958        r0_3 = max(r0_1, r0_3);
5959        r0_1 = t;
5960      };
5961      {
5962        uint const t = min(r0_2, r0_4);
5963        r0_4 = max(r0_2, r0_4);
5964        r0_2 = t;
5965      };
5966      {
5967        uint const t = min(r0_1, r0_2);
5968        r0_2 = max(r0_1, r0_2);
5969        r0_1 = t;
5970      };
5971      {
5972        uint const t = min(r0_3, r0_4);
5973        r0_4 = max(r0_3, r0_4);
5974        r0_3 = t;
5975      };
5976      shared.m[smem_l_idx + (0)] = r0_1;
5977      shared.m[smem_l_idx + (16)] = r0_2;
5978      shared.m[smem_l_idx + (32)] = r0_3;
5979      shared.m[smem_l_idx + (48)] = r0_4;
5980      shared.m[smem_r_idx + (64)] = r0_5;
5981      shared.m[smem_r_idx + (80)] = r0_6;
5982      shared.m[smem_r_idx + (96)] = r0_7;
5983      shared.m[smem_r_idx + (112)] = r0_8;
5984    }
5985    {
5986      uint r1_1 = shared.m[smem_l_idx + (128)];
5987      uint r1_2 = shared.m[smem_l_idx + (144)];
5988      uint r1_3 = shared.m[smem_l_idx + (160)];
5989      uint r1_4 = shared.m[smem_l_idx + (176)];
5990      uint r1_5 = shared.m[smem_r_idx + (192)];
5991      uint r1_6 = shared.m[smem_r_idx + (208)];
5992      uint r1_7 = shared.m[smem_r_idx + (224)];
5993      uint r1_8 = shared.m[smem_r_idx + (240)];
5994      {
5995        uint const t = min(r1_4, r1_5);
5996        r1_5 = max(r1_4, r1_5);
5997        r1_4 = t;
5998      };
5999      {
6000        uint const t = min(r1_3, r1_6);
6001        r1_6 = max(r1_3, r1_6);
6002        r1_3 = t;
6003      };
6004      {
6005        uint const t = min(r1_2, r1_7);
6006        r1_7 = max(r1_2, r1_7);
6007        r1_2 = t;
6008      };
6009      {
6010        uint const t = min(r1_1, r1_8);
6011        r1_8 = max(r1_1, r1_8);
6012        r1_1 = t;
6013      };
6014      {
6015        uint const t = min(r1_5, r1_7);
6016        r1_7 = max(r1_5, r1_7);
6017        r1_5 = t;
6018      };
6019      {
6020        uint const t = min(r1_6, r1_8);
6021        r1_8 = max(r1_6, r1_8);
6022        r1_6 = t;
6023      };
6024      {
6025        uint const t = min(r1_5, r1_6);
6026        r1_6 = max(r1_5, r1_6);
6027        r1_5 = t;
6028      };
6029      {
6030        uint const t = min(r1_7, r1_8);
6031        r1_8 = max(r1_7, r1_8);
6032        r1_7 = t;
6033      };
6034      {
6035        uint const t = min(r1_1, r1_3);
6036        r1_3 = max(r1_1, r1_3);
6037        r1_1 = t;
6038      };
6039      {
6040        uint const t = min(r1_2, r1_4);
6041        r1_4 = max(r1_2, r1_4);
6042        r1_2 = t;
6043      };
6044      {
6045        uint const t = min(r1_1, r1_2);
6046        r1_2 = max(r1_1, r1_2);
6047        r1_1 = t;
6048      };
6049      {
6050        uint const t = min(r1_3, r1_4);
6051        r1_4 = max(r1_3, r1_4);
6052        r1_3 = t;
6053      };
6054      shared.m[smem_l_idx + (128)] = r1_1;
6055      shared.m[smem_l_idx + (144)] = r1_2;
6056      shared.m[smem_l_idx + (160)] = r1_3;
6057      shared.m[smem_l_idx + (176)] = r1_4;
6058      shared.m[smem_r_idx + (192)] = r1_5;
6059      shared.m[smem_r_idx + (208)] = r1_6;
6060      shared.m[smem_r_idx + (224)] = r1_7;
6061      shared.m[smem_r_idx + (240)] = r1_8;
6062    }
6063  }
6064  barrier(CLK_LOCAL_MEM_FENCE);
6065  r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)];
6066  r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)];
6067  r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)];
6068  r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)];
6069  r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)];
6070  r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)];
6071  r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)];
6072  r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)];
6073  {
6074    {
6075      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
6076      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6077      ;
6078      {
6079        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6080        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6081      };
6082      {
6083        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6084        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6085      };
6086      {
6087        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6088        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6089      };
6090      {
6091        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6092        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6093      };
6094      {
6095        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6096        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6097      };
6098      {
6099        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6100        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6101      };
6102      {
6103        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6104        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6105      };
6106      {
6107        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6108        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6109      };
6110    }
6111    {
6112      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
6113      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6114      ;
6115      {
6116        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6117        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6118      };
6119      {
6120        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6121        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6122      };
6123      {
6124        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6125        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6126      };
6127      {
6128        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6129        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6130      };
6131      {
6132        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6133        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6134      };
6135      {
6136        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6137        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6138      };
6139      {
6140        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6141        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6142      };
6143      {
6144        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6145        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6146      };
6147    }
6148    {
6149      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
6150      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6151      ;
6152      {
6153        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6154        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6155      };
6156      {
6157        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6158        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6159      };
6160      {
6161        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6162        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6163      };
6164      {
6165        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6166        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6167      };
6168      {
6169        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6170        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6171      };
6172      {
6173        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6174        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6175      };
6176      {
6177        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6178        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6179      };
6180      {
6181        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6182        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6183      };
6184    }
6185    {
6186      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
6187      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6188      ;
6189      {
6190        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6191        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6192      };
6193      {
6194        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6195        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6196      };
6197      {
6198        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6199        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6200      };
6201      {
6202        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6203        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6204      };
6205      {
6206        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6207        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6208      };
6209      {
6210        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6211        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6212      };
6213      {
6214        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6215        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6216      };
6217      {
6218        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6219        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6220      };
6221    }
6222    {
6223      uint const t = min(r1, r5);
6224      r5 = max(r1, r5);
6225      r1 = t;
6226    };
6227    {
6228      uint const t = min(r3, r7);
6229      r7 = max(r3, r7);
6230      r3 = t;
6231    };
6232    {
6233      uint const t = min(r1, r3);
6234      r3 = max(r1, r3);
6235      r1 = t;
6236    };
6237    {
6238      uint const t = min(r5, r7);
6239      r7 = max(r5, r7);
6240      r5 = t;
6241    };
6242    {
6243      uint const t = min(r2, r6);
6244      r6 = max(r2, r6);
6245      r2 = t;
6246    };
6247    {
6248      uint const t = min(r4, r8);
6249      r8 = max(r4, r8);
6250      r4 = t;
6251    };
6252    {
6253      uint const t = min(r2, r4);
6254      r4 = max(r2, r4);
6255      r2 = t;
6256    };
6257    {
6258      uint const t = min(r6, r8);
6259      r8 = max(r6, r8);
6260      r6 = t;
6261    };
6262    {
6263      uint const t = min(r1, r2);
6264      r2 = max(r1, r2);
6265      r1 = t;
6266    };
6267    {
6268      uint const t = min(r3, r4);
6269      r4 = max(r3, r4);
6270      r3 = t;
6271    };
6272    {
6273      uint const t = min(r5, r6);
6274      r6 = max(r5, r6);
6275      r5 = t;
6276    };
6277    {
6278      uint const t = min(r7, r8);
6279      r8 = max(r7, r8);
6280      r7 = t;
6281    };
6282  }
6283  shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1;
6284  shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8;
6285  shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2;
6286  shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7;
6287  shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3;
6288  shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6;
6289  shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4;
6290  shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5;
6291  barrier(CLK_LOCAL_MEM_FENCE);
6292  if (get_sub_group_id() < 8) {
6293    {
6294      uint r0_1 = shared.m[smem_l_idx + (0)];
6295      uint r0_2 = shared.m[smem_l_idx + (16)];
6296      uint r0_3 = shared.m[smem_l_idx + (32)];
6297      uint r0_4 = shared.m[smem_l_idx + (48)];
6298      uint r0_5 = shared.m[smem_l_idx + (64)];
6299      uint r0_6 = shared.m[smem_l_idx + (80)];
6300      uint r0_7 = shared.m[smem_l_idx + (96)];
6301      uint r0_8 = shared.m[smem_l_idx + (112)];
6302      uint r0_9 = shared.m[smem_r_idx + (128)];
6303      uint r0_10 = shared.m[smem_r_idx + (144)];
6304      uint r0_11 = shared.m[smem_r_idx + (160)];
6305      uint r0_12 = shared.m[smem_r_idx + (176)];
6306      uint r0_13 = shared.m[smem_r_idx + (192)];
6307      uint r0_14 = shared.m[smem_r_idx + (208)];
6308      uint r0_15 = shared.m[smem_r_idx + (224)];
6309      uint r0_16 = shared.m[smem_r_idx + (240)];
6310      {
6311        uint const t = min(r0_8, r0_9);
6312        r0_9 = max(r0_8, r0_9);
6313        r0_8 = t;
6314      };
6315      {
6316        uint const t = min(r0_7, r0_10);
6317        r0_10 = max(r0_7, r0_10);
6318        r0_7 = t;
6319      };
6320      {
6321        uint const t = min(r0_6, r0_11);
6322        r0_11 = max(r0_6, r0_11);
6323        r0_6 = t;
6324      };
6325      {
6326        uint const t = min(r0_5, r0_12);
6327        r0_12 = max(r0_5, r0_12);
6328        r0_5 = t;
6329      };
6330      {
6331        uint const t = min(r0_4, r0_13);
6332        r0_13 = max(r0_4, r0_13);
6333        r0_4 = t;
6334      };
6335      {
6336        uint const t = min(r0_3, r0_14);
6337        r0_14 = max(r0_3, r0_14);
6338        r0_3 = t;
6339      };
6340      {
6341        uint const t = min(r0_2, r0_15);
6342        r0_15 = max(r0_2, r0_15);
6343        r0_2 = t;
6344      };
6345      {
6346        uint const t = min(r0_1, r0_16);
6347        r0_16 = max(r0_1, r0_16);
6348        r0_1 = t;
6349      };
6350      {
6351        uint const t = min(r0_9, r0_13);
6352        r0_13 = max(r0_9, r0_13);
6353        r0_9 = t;
6354      };
6355      {
6356        uint const t = min(r0_11, r0_15);
6357        r0_15 = max(r0_11, r0_15);
6358        r0_11 = t;
6359      };
6360      {
6361        uint const t = min(r0_9, r0_11);
6362        r0_11 = max(r0_9, r0_11);
6363        r0_9 = t;
6364      };
6365      {
6366        uint const t = min(r0_13, r0_15);
6367        r0_15 = max(r0_13, r0_15);
6368        r0_13 = t;
6369      };
6370      {
6371        uint const t = min(r0_10, r0_14);
6372        r0_14 = max(r0_10, r0_14);
6373        r0_10 = t;
6374      };
6375      {
6376        uint const t = min(r0_12, r0_16);
6377        r0_16 = max(r0_12, r0_16);
6378        r0_12 = t;
6379      };
6380      {
6381        uint const t = min(r0_10, r0_12);
6382        r0_12 = max(r0_10, r0_12);
6383        r0_10 = t;
6384      };
6385      {
6386        uint const t = min(r0_14, r0_16);
6387        r0_16 = max(r0_14, r0_16);
6388        r0_14 = t;
6389      };
6390      {
6391        uint const t = min(r0_9, r0_10);
6392        r0_10 = max(r0_9, r0_10);
6393        r0_9 = t;
6394      };
6395      {
6396        uint const t = min(r0_11, r0_12);
6397        r0_12 = max(r0_11, r0_12);
6398        r0_11 = t;
6399      };
6400      {
6401        uint const t = min(r0_13, r0_14);
6402        r0_14 = max(r0_13, r0_14);
6403        r0_13 = t;
6404      };
6405      {
6406        uint const t = min(r0_15, r0_16);
6407        r0_16 = max(r0_15, r0_16);
6408        r0_15 = t;
6409      };
6410      {
6411        uint const t = min(r0_1, r0_5);
6412        r0_5 = max(r0_1, r0_5);
6413        r0_1 = t;
6414      };
6415      {
6416        uint const t = min(r0_3, r0_7);
6417        r0_7 = max(r0_3, r0_7);
6418        r0_3 = t;
6419      };
6420      {
6421        uint const t = min(r0_1, r0_3);
6422        r0_3 = max(r0_1, r0_3);
6423        r0_1 = t;
6424      };
6425      {
6426        uint const t = min(r0_5, r0_7);
6427        r0_7 = max(r0_5, r0_7);
6428        r0_5 = t;
6429      };
6430      {
6431        uint const t = min(r0_2, r0_6);
6432        r0_6 = max(r0_2, r0_6);
6433        r0_2 = t;
6434      };
6435      {
6436        uint const t = min(r0_4, r0_8);
6437        r0_8 = max(r0_4, r0_8);
6438        r0_4 = t;
6439      };
6440      {
6441        uint const t = min(r0_2, r0_4);
6442        r0_4 = max(r0_2, r0_4);
6443        r0_2 = t;
6444      };
6445      {
6446        uint const t = min(r0_6, r0_8);
6447        r0_8 = max(r0_6, r0_8);
6448        r0_6 = t;
6449      };
6450      {
6451        uint const t = min(r0_1, r0_2);
6452        r0_2 = max(r0_1, r0_2);
6453        r0_1 = t;
6454      };
6455      {
6456        uint const t = min(r0_3, r0_4);
6457        r0_4 = max(r0_3, r0_4);
6458        r0_3 = t;
6459      };
6460      {
6461        uint const t = min(r0_5, r0_6);
6462        r0_6 = max(r0_5, r0_6);
6463        r0_5 = t;
6464      };
6465      {
6466        uint const t = min(r0_7, r0_8);
6467        r0_8 = max(r0_7, r0_8);
6468        r0_7 = t;
6469      };
6470      shared.m[smem_l_idx + (0)] = r0_1;
6471      shared.m[smem_l_idx + (16)] = r0_2;
6472      shared.m[smem_l_idx + (32)] = r0_3;
6473      shared.m[smem_l_idx + (48)] = r0_4;
6474      shared.m[smem_l_idx + (64)] = r0_5;
6475      shared.m[smem_l_idx + (80)] = r0_6;
6476      shared.m[smem_l_idx + (96)] = r0_7;
6477      shared.m[smem_l_idx + (112)] = r0_8;
6478      shared.m[smem_r_idx + (128)] = r0_9;
6479      shared.m[smem_r_idx + (144)] = r0_10;
6480      shared.m[smem_r_idx + (160)] = r0_11;
6481      shared.m[smem_r_idx + (176)] = r0_12;
6482      shared.m[smem_r_idx + (192)] = r0_13;
6483      shared.m[smem_r_idx + (208)] = r0_14;
6484      shared.m[smem_r_idx + (224)] = r0_15;
6485      shared.m[smem_r_idx + (240)] = r0_16;
6486    }
6487  }
6488  barrier(CLK_LOCAL_MEM_FENCE);
6489  r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)];
6490  r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)];
6491  r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)];
6492  r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)];
6493  r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)];
6494  r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)];
6495  r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)];
6496  r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)];
6497  {
6498    {
6499      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
6500      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6501      ;
6502      {
6503        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6504        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6505      };
6506      {
6507        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6508        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6509      };
6510      {
6511        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6512        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6513      };
6514      {
6515        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6516        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6517      };
6518      {
6519        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6520        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6521      };
6522      {
6523        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6524        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6525      };
6526      {
6527        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6528        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6529      };
6530      {
6531        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6532        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6533      };
6534    }
6535    {
6536      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
6537      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6538      ;
6539      {
6540        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6541        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6542      };
6543      {
6544        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6545        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6546      };
6547      {
6548        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6549        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6550      };
6551      {
6552        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6553        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6554      };
6555      {
6556        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6557        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6558      };
6559      {
6560        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6561        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6562      };
6563      {
6564        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6565        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6566      };
6567      {
6568        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6569        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6570      };
6571    }
6572    {
6573      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
6574      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6575      ;
6576      {
6577        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6578        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6579      };
6580      {
6581        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6582        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6583      };
6584      {
6585        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6586        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6587      };
6588      {
6589        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6590        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6591      };
6592      {
6593        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6594        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6595      };
6596      {
6597        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6598        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6599      };
6600      {
6601        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6602        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6603      };
6604      {
6605        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6606        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6607      };
6608    }
6609    {
6610      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
6611      int const t_lt = get_sub_group_local_id() < half_lane_idx;
6612      ;
6613      {
6614        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
6615        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
6616      };
6617      {
6618        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
6619        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
6620      };
6621      {
6622        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
6623        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
6624      };
6625      {
6626        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
6627        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
6628      };
6629      {
6630        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
6631        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
6632      };
6633      {
6634        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
6635        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
6636      };
6637      {
6638        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
6639        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
6640      };
6641      {
6642        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
6643        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
6644      };
6645    }
6646    {
6647      uint const t = min(r1, r5);
6648      r5 = max(r1, r5);
6649      r1 = t;
6650    };
6651    {
6652      uint const t = min(r3, r7);
6653      r7 = max(r3, r7);
6654      r3 = t;
6655    };
6656    {
6657      uint const t = min(r1, r3);
6658      r3 = max(r1, r3);
6659      r1 = t;
6660    };
6661    {
6662      uint const t = min(r5, r7);
6663      r7 = max(r5, r7);
6664      r5 = t;
6665    };
6666    {
6667      uint const t = min(r2, r6);
6668      r6 = max(r2, r6);
6669      r2 = t;
6670    };
6671    {
6672      uint const t = min(r4, r8);
6673      r8 = max(r4, r8);
6674      r4 = t;
6675    };
6676    {
6677      uint const t = min(r2, r4);
6678      r4 = max(r2, r4);
6679      r2 = t;
6680    };
6681    {
6682      uint const t = min(r6, r8);
6683      r8 = max(r6, r8);
6684      r6 = t;
6685    };
6686    {
6687      uint const t = min(r1, r2);
6688      r2 = max(r1, r2);
6689      r1 = t;
6690    };
6691    {
6692      uint const t = min(r3, r4);
6693      r4 = max(r3, r4);
6694      r3 = t;
6695    };
6696    {
6697      uint const t = min(r5, r6);
6698      r6 = max(r5, r6);
6699      r5 = t;
6700    };
6701    {
6702      uint const t = min(r7, r8);
6703      r8 = max(r7, r8);
6704      r7 = t;
6705    };
6706  }
6707  vout[gmem_idx + (1 << 4) * 0] = r1;
6708  vout[gmem_idx + (1 << 4) * 1] = r2;
6709  vout[gmem_idx + (1 << 4) * 2] = r3;
6710  vout[gmem_idx + (1 << 4) * 3] = r4;
6711  vout[gmem_idx + (1 << 4) * 4] = r5;
6712  vout[gmem_idx + (1 << 4) * 5] = r6;
6713  vout[gmem_idx + (1 << 4) * 6] = r7;
6714  vout[gmem_idx + (1 << 4) * 7] = r8;
6715}
6716
/*
 * hs_kernel_bc_0
 *
 * In-place pass over `vout` that needs no local memory.  Each work-item
 * holds eight values in r1..r8, one from each of eight rows that lie 16
 * elements apart, starting at `slab`.
 *
 *   1. Four cross-lane steps: every register is traded with the lane whose
 *      sub-group index differs in bit 8, then 4, then 2, then 1.  The lane
 *      with the lower index keeps the smaller value, the other lane keeps
 *      the larger one.
 *   2. Twelve min/max exchanges between the work-item's own registers.
 *   3. r1..r8 are stored back to the rows they were loaded from.
 *
 * NOTE(review): the exchange pattern has the shape of a bitonic merge
 * network; what ordering the caller guarantees on entry is not visible
 * from this kernel -- confirm against the host-side launch code.
 *
 * The helper macros below are private to this kernel and are removed again
 * with #undef right after it.
 */

/* Element of row `k` that belongs to this work-item (rows are 16 apart). */
#define HS_BC_0_ROW(k) vout[slab + (1 << 4) * (k)]

/* After this, `lo` holds the smaller and `hi` the larger of the two. */
#define HS_BC_0_CMP_XCHG(lo, hi)                                               \
  do {                                                                         \
    uint const smaller = min(lo, hi);                                          \
    hi                 = max(lo, hi);                                          \
    lo                 = smaller;                                              \
  } while (0)

/*
 * Trade `reg` with the partner lane.  Relies on `partner` and `is_lower`
 * from the enclosing HS_BC_0_HALF_STEP.  The XOR flips the selection so the
 * lower lane ends up with the minimum and the upper lane with the maximum.
 */
#define HS_BC_0_CMP_HALF(reg)                                                  \
  do {                                                                         \
    uint const theirs = intel_sub_group_shuffle(reg, partner);                 \
    reg               = ((reg <= theirs) ^ is_lower) ? theirs : reg;           \
  } while (0)

/* One cross-lane step over all eight registers for partner = lane ^ mask. */
#define HS_BC_0_HALF_STEP(mask)                                                \
  do {                                                                         \
    uint const partner  = get_sub_group_local_id() ^ (mask);                   \
    int const  is_lower = get_sub_group_local_id() < partner;                  \
    HS_BC_0_CMP_HALF(r1);                                                      \
    HS_BC_0_CMP_HALF(r2);                                                      \
    HS_BC_0_CMP_HALF(r3);                                                      \
    HS_BC_0_CMP_HALF(r4);                                                      \
    HS_BC_0_CMP_HALF(r5);                                                      \
    HS_BC_0_CMP_HALF(r6);                                                      \
    HS_BC_0_CMP_HALF(r7);                                                      \
    HS_BC_0_CMP_HALF(r8);                                                      \
  } while (0)

__kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
__attribute__((reqd_work_group_size((1 << 4) * 1, 1, 1))) void
hs_kernel_bc_0(__global uint* const restrict vout)
{
  /* Global id rounded down to a multiple of 16, times 8 rows, plus the
     position of this work-item inside its group of 16. */
  uint const slab = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
                    (get_local_id(0) & ((1 << 4) - 1));

  uint r1 = HS_BC_0_ROW(0);
  uint r2 = HS_BC_0_ROW(1);
  uint r3 = HS_BC_0_ROW(2);
  uint r4 = HS_BC_0_ROW(3);
  uint r5 = HS_BC_0_ROW(4);
  uint r6 = HS_BC_0_ROW(5);
  uint r7 = HS_BC_0_ROW(6);
  uint r8 = HS_BC_0_ROW(7);

  /* Cross-lane exchanges, widest partner distance first. */
  HS_BC_0_HALF_STEP(8);
  HS_BC_0_HALF_STEP(4);
  HS_BC_0_HALF_STEP(2);
  HS_BC_0_HALF_STEP(1);

  /* Odd registers: distance 4, then distance 2. */
  HS_BC_0_CMP_XCHG(r1, r5);
  HS_BC_0_CMP_XCHG(r3, r7);
  HS_BC_0_CMP_XCHG(r1, r3);
  HS_BC_0_CMP_XCHG(r5, r7);

  /* Even registers: distance 4, then distance 2. */
  HS_BC_0_CMP_XCHG(r2, r6);
  HS_BC_0_CMP_XCHG(r4, r8);
  HS_BC_0_CMP_XCHG(r2, r4);
  HS_BC_0_CMP_XCHG(r6, r8);

  /* Adjacent pairs. */
  HS_BC_0_CMP_XCHG(r1, r2);
  HS_BC_0_CMP_XCHG(r3, r4);
  HS_BC_0_CMP_XCHG(r5, r6);
  HS_BC_0_CMP_XCHG(r7, r8);

  HS_BC_0_ROW(0) = r1;
  HS_BC_0_ROW(1) = r2;
  HS_BC_0_ROW(2) = r3;
  HS_BC_0_ROW(3) = r4;
  HS_BC_0_ROW(4) = r5;
  HS_BC_0_ROW(5) = r6;
  HS_BC_0_ROW(6) = r7;
  HS_BC_0_ROW(7) = r8;
}

#undef HS_BC_0_HALF_STEP
#undef HS_BC_0_CMP_HALF
#undef HS_BC_0_CMP_XCHG
#undef HS_BC_0_ROW
6950
/*
 * hs_kernel_bc_1
 *
 * In-place pass over `vout` for a work-group of 32 work-items (two
 * sub-groups of 16 lanes) that stages data through 256 uints of local
 * memory.
 *
 *   1. Staging: each work-item loads four pairs of values from `vout`; the
 *      two values of a pair lie 8 rows (8 * 16 elements) apart.  Each pair
 *      is ordered with min/max and written to two local-memory slots that
 *      are 16 entries apart.
 *   2. A local-memory barrier makes the staged values visible to the whole
 *      work-group.
 *   3. Each work-item reads eight values r1..r8 from local memory at a
 *      stride of 32 entries, runs four cross-lane steps (partner lane =
 *      lane ^ 8, 4, 2, 1; the lower lane keeps the smaller value) and then
 *      twelve min/max exchanges between its own registers.
 *   4. r1..r8 are written to eight rows of `vout`, 16 elements apart,
 *      starting at `out_base`.
 *
 * NOTE(review): the exchange pattern has the shape of a bitonic merge
 * network; what ordering the caller guarantees on entry is not visible
 * from this kernel -- confirm against the host-side launch code.
 *
 * The helper macros below are private to this kernel and are removed again
 * with #undef right after it.
 */

/* After this, `lo` holds the smaller and `hi` the larger of the two. */
#define HS_BC_1_CMP_XCHG(lo, hi)                                               \
  do {                                                                         \
    uint const smaller = min(lo, hi);                                          \
    hi                 = max(lo, hi);                                          \
    lo                 = smaller;                                              \
  } while (0)

/*
 * Load the values at rows `row_a` and `row_b` (relative to `in_base`),
 * order them, and stage them at local slots `slot_a` and `slot_b`
 * (relative to `stage_base`).
 */
#define HS_BC_1_STAGE_PAIR(row_a, row_b, slot_a, slot_b)                       \
  do {                                                                         \
    uint first  = vout[in_base + ((1 << 4) * (row_a))];                        \
    uint second = vout[in_base + ((1 << 4) * (row_b))];                        \
    HS_BC_1_CMP_XCHG(first, second);                                           \
    shared.m[stage_base + (slot_a)] = first;                                   \
    shared.m[stage_base + (slot_b)] = second;                                  \
  } while (0)

/* Local-memory entry `k` strides (of 32) above this work-item's local id. */
#define HS_BC_1_SLOT(k) shared.m[get_local_id(0) + (2 * (1 << 4) * (k))]

/* Element of output row `k` that belongs to this work-item. */
#define HS_BC_1_ROW(k) vout[out_base + (1 << 4) * (k)]

/*
 * Trade `reg` with the partner lane.  Relies on `partner` and `is_lower`
 * from the enclosing HS_BC_1_HALF_STEP.  The XOR flips the selection so the
 * lower lane ends up with the minimum and the upper lane with the maximum.
 */
#define HS_BC_1_CMP_HALF(reg)                                                  \
  do {                                                                         \
    uint const theirs = intel_sub_group_shuffle(reg, partner);                 \
    reg               = ((reg <= theirs) ^ is_lower) ? theirs : reg;           \
  } while (0)

/* One cross-lane step over all eight registers for partner = lane ^ mask. */
#define HS_BC_1_HALF_STEP(mask)                                                \
  do {                                                                         \
    uint const partner  = get_sub_group_local_id() ^ (mask);                   \
    int const  is_lower = get_sub_group_local_id() < partner;                  \
    HS_BC_1_CMP_HALF(r1);                                                      \
    HS_BC_1_CMP_HALF(r2);                                                      \
    HS_BC_1_CMP_HALF(r3);                                                      \
    HS_BC_1_CMP_HALF(r4);                                                      \
    HS_BC_1_CMP_HALF(r5);                                                      \
    HS_BC_1_CMP_HALF(r6);                                                      \
    HS_BC_1_CMP_HALF(r7);                                                      \
    HS_BC_1_CMP_HALF(r8);                                                      \
  } while (0)

__kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
__attribute__((reqd_work_group_size((1 << 4) * 2, 1, 1))) void
hs_kernel_bc_1(__global uint* const restrict vout)
{
  __local struct
  {
    uint m[32 * 8];
  } shared;

  /* Where this work-item writes its final eight rows. */
  uint const out_base = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
                        (get_local_id(0) & ((1 << 4) - 1));
  /* Where this work-item reads during staging: the global id is rounded
     down to a multiple of 32 here, and the full local id is added. */
  uint const in_base =
    (get_global_id(0) & ~((1 << 4) * 2 - 1)) * 8 + get_local_id(0);
  /* First local-memory slot this work-item writes during staging. */
  uint const stage_base =
    get_sub_group_id() * ((1 << 4) * 2) + get_sub_group_local_id();

  /* Stage 1: order pairs that are 8 rows apart and park them in local
     memory. */
  HS_BC_1_STAGE_PAIR(0, 8, 0, 16);
  HS_BC_1_STAGE_PAIR(2, 10, 64, 80);
  HS_BC_1_STAGE_PAIR(4, 12, 128, 144);
  HS_BC_1_STAGE_PAIR(6, 14, 192, 208);

  /* All staged values must be visible before anyone reads them back. */
  barrier(CLK_LOCAL_MEM_FENCE);

  uint r1 = HS_BC_1_SLOT(0);
  uint r2 = HS_BC_1_SLOT(1);
  uint r3 = HS_BC_1_SLOT(2);
  uint r4 = HS_BC_1_SLOT(3);
  uint r5 = HS_BC_1_SLOT(4);
  uint r6 = HS_BC_1_SLOT(5);
  uint r7 = HS_BC_1_SLOT(6);
  uint r8 = HS_BC_1_SLOT(7);

  /* Cross-lane exchanges, widest partner distance first. */
  HS_BC_1_HALF_STEP(8);
  HS_BC_1_HALF_STEP(4);
  HS_BC_1_HALF_STEP(2);
  HS_BC_1_HALF_STEP(1);

  /* Odd registers: distance 4, then distance 2. */
  HS_BC_1_CMP_XCHG(r1, r5);
  HS_BC_1_CMP_XCHG(r3, r7);
  HS_BC_1_CMP_XCHG(r1, r3);
  HS_BC_1_CMP_XCHG(r5, r7);

  /* Even registers: distance 4, then distance 2. */
  HS_BC_1_CMP_XCHG(r2, r6);
  HS_BC_1_CMP_XCHG(r4, r8);
  HS_BC_1_CMP_XCHG(r2, r4);
  HS_BC_1_CMP_XCHG(r6, r8);

  /* Adjacent pairs. */
  HS_BC_1_CMP_XCHG(r1, r2);
  HS_BC_1_CMP_XCHG(r3, r4);
  HS_BC_1_CMP_XCHG(r5, r6);
  HS_BC_1_CMP_XCHG(r7, r8);

  HS_BC_1_ROW(0) = r1;
  HS_BC_1_ROW(1) = r2;
  HS_BC_1_ROW(2) = r3;
  HS_BC_1_ROW(3) = r4;
  HS_BC_1_ROW(4) = r5;
  HS_BC_1_ROW(5) = r6;
  HS_BC_1_ROW(6) = r7;
  HS_BC_1_ROW(7) = r8;
}

#undef HS_BC_1_HALF_STEP
#undef HS_BC_1_CMP_HALF
#undef HS_BC_1_ROW
#undef HS_BC_1_SLOT
#undef HS_BC_1_STAGE_PAIR
#undef HS_BC_1_CMP_XCHG
7240
7241__kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
7242__attribute__((reqd_work_group_size((1 << 4) * 4, 1, 1))) void
7243hs_kernel_bc_2(__global uint* const restrict vout)
7244{
7245  __local struct
7246  {
7247    uint m[64 * 8];
7248  } shared;
7249
7250  uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
7251                        (get_local_id(0) & ((1 << 4) - 1));
7252  uint const gmem_l_idx =
7253    (get_global_id(0) & ~((1 << 4) * 4 - 1)) * 8 + get_local_id(0);
7254  uint const smem_l_idx =
7255    get_sub_group_id() * ((1 << 4) * 4) + get_sub_group_local_id();
7256  {
7257    {
7258      uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 0)];
7259      uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 8)];
7260      uint r0_3 = vout[gmem_l_idx + ((1 << 4) * 16)];
7261      uint r0_4 = vout[gmem_l_idx + ((1 << 4) * 24)];
7262      {
7263        uint const t = min(r0_1, r0_3);
7264        r0_3 = max(r0_1, r0_3);
7265        r0_1 = t;
7266      };
7267      {
7268        uint const t = min(r0_2, r0_4);
7269        r0_4 = max(r0_2, r0_4);
7270        r0_2 = t;
7271      };
7272      {
7273        uint const t = min(r0_1, r0_2);
7274        r0_2 = max(r0_1, r0_2);
7275        r0_1 = t;
7276      };
7277      {
7278        uint const t = min(r0_3, r0_4);
7279        r0_4 = max(r0_3, r0_4);
7280        r0_3 = t;
7281      };
7282      shared.m[smem_l_idx + (0)] = r0_1;
7283      shared.m[smem_l_idx + (16)] = r0_2;
7284      shared.m[smem_l_idx + (32)] = r0_3;
7285      shared.m[smem_l_idx + (48)] = r0_4;
7286    }
7287    {
7288      uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 4)];
7289      uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 12)];
7290      uint r0_3 = vout[gmem_l_idx + ((1 << 4) * 20)];
7291      uint r0_4 = vout[gmem_l_idx + ((1 << 4) * 28)];
7292      {
7293        uint const t = min(r0_1, r0_3);
7294        r0_3 = max(r0_1, r0_3);
7295        r0_1 = t;
7296      };
7297      {
7298        uint const t = min(r0_2, r0_4);
7299        r0_4 = max(r0_2, r0_4);
7300        r0_2 = t;
7301      };
7302      {
7303        uint const t = min(r0_1, r0_2);
7304        r0_2 = max(r0_1, r0_2);
7305        r0_1 = t;
7306      };
7307      {
7308        uint const t = min(r0_3, r0_4);
7309        r0_4 = max(r0_3, r0_4);
7310        r0_3 = t;
7311      };
7312      shared.m[smem_l_idx + (256)] = r0_1;
7313      shared.m[smem_l_idx + (272)] = r0_2;
7314      shared.m[smem_l_idx + (288)] = r0_3;
7315      shared.m[smem_l_idx + (304)] = r0_4;
7316    }
7317  }
7318  barrier(CLK_LOCAL_MEM_FENCE);
7319  uint r1 = shared.m[get_local_id(0) + (4 * (1 << 4) * 0)];
7320  uint r2 = shared.m[get_local_id(0) + (4 * (1 << 4) * 1)];
7321  uint r3 = shared.m[get_local_id(0) + (4 * (1 << 4) * 2)];
7322  uint r4 = shared.m[get_local_id(0) + (4 * (1 << 4) * 3)];
7323  uint r5 = shared.m[get_local_id(0) + (4 * (1 << 4) * 4)];
7324  uint r6 = shared.m[get_local_id(0) + (4 * (1 << 4) * 5)];
7325  uint r7 = shared.m[get_local_id(0) + (4 * (1 << 4) * 6)];
7326  uint r8 = shared.m[get_local_id(0) + (4 * (1 << 4) * 7)];
7327  {
7328    {
7329      uint const half_lane_idx = get_sub_group_local_id() ^ 8;
7330      int const t_lt = get_sub_group_local_id() < half_lane_idx;
7331      ;
7332      {
7333        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
7334        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
7335      };
7336      {
7337        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
7338        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
7339      };
7340      {
7341        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
7342        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
7343      };
7344      {
7345        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
7346        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
7347      };
7348      {
7349        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
7350        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
7351      };
7352      {
7353        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
7354        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
7355      };
7356      {
7357        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
7358        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
7359      };
7360      {
7361        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
7362        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
7363      };
7364    }
7365    {
7366      uint const half_lane_idx = get_sub_group_local_id() ^ 4;
7367      int const t_lt = get_sub_group_local_id() < half_lane_idx;
7368      ;
7369      {
7370        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
7371        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
7372      };
7373      {
7374        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
7375        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
7376      };
7377      {
7378        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
7379        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
7380      };
7381      {
7382        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
7383        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
7384      };
7385      {
7386        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
7387        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
7388      };
7389      {
7390        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
7391        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
7392      };
7393      {
7394        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
7395        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
7396      };
7397      {
7398        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
7399        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
7400      };
7401    }
7402    {
7403      uint const half_lane_idx = get_sub_group_local_id() ^ 2;
7404      int const t_lt = get_sub_group_local_id() < half_lane_idx;
7405      ;
7406      {
7407        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
7408        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
7409      };
7410      {
7411        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
7412        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
7413      };
7414      {
7415        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
7416        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
7417      };
7418      {
7419        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
7420        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
7421      };
7422      {
7423        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
7424        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
7425      };
7426      {
7427        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
7428        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
7429      };
7430      {
7431        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
7432        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
7433      };
7434      {
7435        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
7436        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
7437      };
7438    }
7439    {
7440      uint const half_lane_idx = get_sub_group_local_id() ^ 1;
7441      int const t_lt = get_sub_group_local_id() < half_lane_idx;
7442      ;
7443      {
7444        uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
7445        r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
7446      };
7447      {
7448        uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
7449        r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
7450      };
7451      {
7452        uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
7453        r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
7454      };
7455      {
7456        uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
7457        r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
7458      };
7459      {
7460        uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
7461        r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
7462      };
7463      {
7464        uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
7465        r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
7466      };
7467      {
7468        uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
7469        r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
7470      };
7471      {
7472        uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
7473        r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
7474      };
7475    }
7476    {
7477      uint const t = min(r1, r5);
7478      r5 = max(r1, r5);
7479      r1 = t;
7480    };
7481    {
7482      uint const t = min(r3, r7);
7483      r7 = max(r3, r7);
7484      r3 = t;
7485    };
7486    {
7487      uint const t = min(r1, r3);
7488      r3 = max(r1, r3);
7489      r1 = t;
7490    };
7491    {
7492      uint const t = min(r5, r7);
7493      r7 = max(r5, r7);
7494      r5 = t;
7495    };
7496    {
7497      uint const t = min(r2, r6);
7498      r6 = max(r2, r6);
7499      r2 = t;
7500    };
7501    {
7502      uint const t = min(r4, r8);
7503      r8 = max(r4, r8);
7504      r4 = t;
7505    };
7506    {
7507      uint const t = min(r2, r4);
7508      r4 = max(r2, r4);
7509      r2 = t;
7510    };
7511    {
7512      uint const t = min(r6, r8);
7513      r8 = max(r6, r8);
7514      r6 = t;
7515    };
7516    {
7517      uint const t = min(r1, r2);
7518      r2 = max(r1, r2);
7519      r1 = t;
7520    };
7521    {
7522      uint const t = min(r3, r4);
7523      r4 = max(r3, r4);
7524      r3 = t;
7525    };
7526    {
7527      uint const t = min(r5, r6);
7528      r6 = max(r5, r6);
7529      r5 = t;
7530    };
7531    {
7532      uint const t = min(r7, r8);
7533      r8 = max(r7, r8);
7534      r7 = t;
7535    };
7536  }
7537  vout[gmem_idx + (1 << 4) * 0] = r1;
7538  vout[gmem_idx + (1 << 4) * 1] = r2;
7539  vout[gmem_idx + (1 << 4) * 2] = r3;
7540  vout[gmem_idx + (1 << 4) * 3] = r4;
7541  vout[gmem_idx + (1 << 4) * 4] = r5;
7542  vout[gmem_idx + (1 << 4) * 5] = r6;
7543  vout[gmem_idx + (1 << 4) * 6] = r7;
7544  vout[gmem_idx + (1 << 4) * 7] = r8;
7545}
7546
/*
 * hs_kernel_bc_3 -- in-place pass over `vout` by a work-group of 128
 * work-items (8 sub-groups of 16 lanes).
 *
 * Step 1: each work-item reads 8 keys from `vout`, 128 elements apart,
 *         pushes them through a 12-exchange min/max network and stores
 *         them to __local memory, 16 elements apart.
 * Step 2: after a local-memory barrier each work-item reads 8 keys back,
 *         this time 128 elements apart, trades them with the lane whose
 *         index differs by XOR 8, 4, 2 and 1 (the lower lane of a pair
 *         keeps the smaller key, the upper lane the larger), runs the same
 *         12-exchange network again and writes the 8 keys to `vout`,
 *         16 elements apart.
 *
 * The helper macros below are private to this kernel and are #undef'd
 * right after it.
 */

/* Leave the smaller of the two keys in `x` and the larger in `y`. */
#define BC3_CMP_XCHG(x, y)                                                     \
  {                                                                            \
    uint const smaller = min(x, y);                                            \
    y                  = max(x, y);                                            \
    x                  = smaller;                                              \
  }

/* The 12 compare-exchanges applied to eight registers, in fixed order. */
#define BC3_NET8(a, b, c, d, e, f, g, h)                                       \
  BC3_CMP_XCHG(a, e);                                                          \
  BC3_CMP_XCHG(c, g);                                                          \
  BC3_CMP_XCHG(a, c);                                                          \
  BC3_CMP_XCHG(e, g);                                                          \
  BC3_CMP_XCHG(b, f);                                                          \
  BC3_CMP_XCHG(d, h);                                                          \
  BC3_CMP_XCHG(b, d);                                                          \
  BC3_CMP_XCHG(f, h);                                                          \
  BC3_CMP_XCHG(a, b);                                                          \
  BC3_CMP_XCHG(c, d);                                                          \
  BC3_CMP_XCHG(e, f);                                                          \
  BC3_CMP_XCHG(g, h)

/*
 * Fetch the partner lane's copy of `reg`; the lower lane of the pair keeps
 * the smaller key and the upper lane keeps the larger one.
 */
#define BC3_LANE_XCHG(reg, partner, is_low)                                    \
  {                                                                            \
    uint const theirs = intel_sub_group_shuffle(reg, partner);                 \
    reg               = ((reg <= theirs) ^ is_low) ? theirs : reg;             \
  }

/* One cross-lane stage: all eight registers against lane (id XOR mask). */
#define BC3_LANE_STAGE(mask, a, b, c, d, e, f, g, h)                           \
  {                                                                            \
    uint const partner = get_sub_group_local_id() ^ (mask);                    \
    int const  is_low  = get_sub_group_local_id() < partner;                   \
    BC3_LANE_XCHG(a, partner, is_low);                                         \
    BC3_LANE_XCHG(b, partner, is_low);                                         \
    BC3_LANE_XCHG(c, partner, is_low);                                         \
    BC3_LANE_XCHG(d, partner, is_low);                                         \
    BC3_LANE_XCHG(e, partner, is_low);                                         \
    BC3_LANE_XCHG(f, partner, is_low);                                         \
    BC3_LANE_XCHG(g, partner, is_low);                                         \
    BC3_LANE_XCHG(h, partner, is_low);                                         \
  }

__kernel __attribute__((intel_reqd_sub_group_size(16)))
__attribute__((reqd_work_group_size(128, 1, 1))) void
hs_kernel_bc_3(__global uint* const restrict vout)
{
  /* 8 keys for each of the 128 work-items. */
  __local uint staging[128 * 8];

  /* Where this work-item writes its final keys (16-lane granularity). */
  uint const out_base = (get_global_id(0) & ~15) * 8 + (get_local_id(0) & 15);
  /* Where this work-item reads its initial keys (128-item granularity). */
  uint const in_base = (get_global_id(0) & ~127) * 8 + get_local_id(0);
  /* Start of this work-item's slots in local memory. */
  uint const park_at = get_sub_group_id() * 128 + get_sub_group_local_id();

  /* Step 1: global -> registers -> network -> local. */
  {
    uint a1 = vout[in_base + 0 * 128];
    uint a2 = vout[in_base + 1 * 128];
    uint a3 = vout[in_base + 2 * 128];
    uint a4 = vout[in_base + 3 * 128];
    uint a5 = vout[in_base + 4 * 128];
    uint a6 = vout[in_base + 5 * 128];
    uint a7 = vout[in_base + 6 * 128];
    uint a8 = vout[in_base + 7 * 128];

    BC3_NET8(a1, a2, a3, a4, a5, a6, a7, a8);

    staging[park_at + 0 * 16] = a1;
    staging[park_at + 1 * 16] = a2;
    staging[park_at + 2 * 16] = a3;
    staging[park_at + 3 * 16] = a4;
    staging[park_at + 4 * 16] = a5;
    staging[park_at + 5 * 16] = a6;
    staging[park_at + 6 * 16] = a7;
    staging[park_at + 7 * 16] = a8;
  }

  /* Every store above must be visible before any work-item reads back. */
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Step 2: local -> registers. */
  uint k1 = staging[get_local_id(0) + 0 * 128];
  uint k2 = staging[get_local_id(0) + 1 * 128];
  uint k3 = staging[get_local_id(0) + 2 * 128];
  uint k4 = staging[get_local_id(0) + 3 * 128];
  uint k5 = staging[get_local_id(0) + 4 * 128];
  uint k6 = staging[get_local_id(0) + 5 * 128];
  uint k7 = staging[get_local_id(0) + 6 * 128];
  uint k8 = staging[get_local_id(0) + 7 * 128];

  /* Cross-lane exchanges at XOR distances 8, 4, 2, 1. */
  BC3_LANE_STAGE(8, k1, k2, k3, k4, k5, k6, k7, k8);
  BC3_LANE_STAGE(4, k1, k2, k3, k4, k5, k6, k7, k8);
  BC3_LANE_STAGE(2, k1, k2, k3, k4, k5, k6, k7, k8);
  BC3_LANE_STAGE(1, k1, k2, k3, k4, k5, k6, k7, k8);

  /* In-register network over the eight keys held by this work-item. */
  BC3_NET8(k1, k2, k3, k4, k5, k6, k7, k8);

  vout[out_base + 0 * 16] = k1;
  vout[out_base + 1 * 16] = k2;
  vout[out_base + 2 * 16] = k3;
  vout[out_base + 3 * 16] = k4;
  vout[out_base + 4 * 16] = k5;
  vout[out_base + 5 * 16] = k6;
  vout[out_base + 6 * 16] = k7;
  vout[out_base + 7 * 16] = k8;
}

#undef BC3_LANE_STAGE
#undef BC3_LANE_XCHG
#undef BC3_NET8
#undef BC3_CMP_XCHG
7870
/*
 * hs_kernel_bc_4 -- in-place pass over `vout` by a work-group of 256
 * work-items (16 sub-groups of 16 lanes).
 *
 * Step 1: only sub-groups 0..7 take part. Each of their work-items reads
 *         16 keys from `vout`, 128 elements apart, pushes them through a
 *         32-exchange min/max network and stores them to __local memory,
 *         16 elements apart.
 * Step 2: after a local-memory barrier (reached by all 256 work-items)
 *         each work-item reads 8 keys back, 256 elements apart, trades
 *         them with the lane whose index differs by XOR 8, 4, 2 and 1
 *         (the lower lane of a pair keeps the smaller key, the upper lane
 *         the larger), runs a 12-exchange network over its registers and
 *         writes the 8 keys to `vout`, 16 elements apart.
 *
 * The helper macros below are private to this kernel and are #undef'd
 * right after it.
 */

/* Leave the smaller of the two keys in `x` and the larger in `y`. */
#define BC4_CMP_XCHG(x, y)                                                     \
  {                                                                            \
    uint const smaller = min(x, y);                                            \
    y                  = max(x, y);                                            \
    x                  = smaller;                                              \
  }

/*
 * Fetch the partner lane's copy of `reg`; the lower lane of the pair keeps
 * the smaller key and the upper lane keeps the larger one.
 */
#define BC4_LANE_XCHG(reg, partner, is_low)                                    \
  {                                                                            \
    uint const theirs = intel_sub_group_shuffle(reg, partner);                 \
    reg               = ((reg <= theirs) ^ is_low) ? theirs : reg;             \
  }

/* One cross-lane stage: all eight registers against lane (id XOR mask). */
#define BC4_LANE_STAGE(mask, a, b, c, d, e, f, g, h)                           \
  {                                                                            \
    uint const partner = get_sub_group_local_id() ^ (mask);                    \
    int const  is_low  = get_sub_group_local_id() < partner;                   \
    BC4_LANE_XCHG(a, partner, is_low);                                         \
    BC4_LANE_XCHG(b, partner, is_low);                                         \
    BC4_LANE_XCHG(c, partner, is_low);                                         \
    BC4_LANE_XCHG(d, partner, is_low);                                         \
    BC4_LANE_XCHG(e, partner, is_low);                                         \
    BC4_LANE_XCHG(f, partner, is_low);                                         \
    BC4_LANE_XCHG(g, partner, is_low);                                         \
    BC4_LANE_XCHG(h, partner, is_low);                                         \
  }

__kernel __attribute__((intel_reqd_sub_group_size(16)))
__attribute__((reqd_work_group_size(256, 1, 1))) void
hs_kernel_bc_4(__global uint* const restrict vout)
{
  /* 8 keys for each of the 256 work-items. */
  __local uint staging[256 * 8];

  /* Where this work-item writes its final keys (16-lane granularity). */
  uint const out_base = (get_global_id(0) & ~15) * 8 + (get_local_id(0) & 15);
  /* Where this work-item reads its initial keys (256-item granularity). */
  uint const in_base = (get_global_id(0) & ~255) * 8 + get_local_id(0);
  /* Start of this work-item's slots in local memory. */
  uint const park_at = get_sub_group_id() * 256 + get_sub_group_local_id();

  /* Step 1: global -> registers -> network -> local (sub-groups 0..7). */
  if (get_sub_group_id() < 8) {
    uint a1  = vout[in_base + 0 * 128];
    uint a2  = vout[in_base + 1 * 128];
    uint a3  = vout[in_base + 2 * 128];
    uint a4  = vout[in_base + 3 * 128];
    uint a5  = vout[in_base + 4 * 128];
    uint a6  = vout[in_base + 5 * 128];
    uint a7  = vout[in_base + 6 * 128];
    uint a8  = vout[in_base + 7 * 128];
    uint a9  = vout[in_base + 8 * 128];
    uint a10 = vout[in_base + 9 * 128];
    uint a11 = vout[in_base + 10 * 128];
    uint a12 = vout[in_base + 11 * 128];
    uint a13 = vout[in_base + 12 * 128];
    uint a14 = vout[in_base + 13 * 128];
    uint a15 = vout[in_base + 14 * 128];
    uint a16 = vout[in_base + 15 * 128];

    /* Odd-numbered registers: distances 8, 4, then 2. */
    BC4_CMP_XCHG(a1, a9);
    BC4_CMP_XCHG(a5, a13);
    BC4_CMP_XCHG(a1, a5);
    BC4_CMP_XCHG(a9, a13);
    BC4_CMP_XCHG(a3, a11);
    BC4_CMP_XCHG(a7, a15);
    BC4_CMP_XCHG(a3, a7);
    BC4_CMP_XCHG(a11, a15);
    BC4_CMP_XCHG(a1, a3);
    BC4_CMP_XCHG(a5, a7);
    BC4_CMP_XCHG(a9, a11);
    BC4_CMP_XCHG(a13, a15);

    /* Even-numbered registers: distances 8, 4, then 2. */
    BC4_CMP_XCHG(a2, a10);
    BC4_CMP_XCHG(a6, a14);
    BC4_CMP_XCHG(a2, a6);
    BC4_CMP_XCHG(a10, a14);
    BC4_CMP_XCHG(a4, a12);
    BC4_CMP_XCHG(a8, a16);
    BC4_CMP_XCHG(a4, a8);
    BC4_CMP_XCHG(a12, a16);
    BC4_CMP_XCHG(a2, a4);
    BC4_CMP_XCHG(a6, a8);
    BC4_CMP_XCHG(a10, a12);
    BC4_CMP_XCHG(a14, a16);

    /* Adjacent pairs. */
    BC4_CMP_XCHG(a1, a2);
    BC4_CMP_XCHG(a3, a4);
    BC4_CMP_XCHG(a5, a6);
    BC4_CMP_XCHG(a7, a8);
    BC4_CMP_XCHG(a9, a10);
    BC4_CMP_XCHG(a11, a12);
    BC4_CMP_XCHG(a13, a14);
    BC4_CMP_XCHG(a15, a16);

    staging[park_at + 0 * 16]  = a1;
    staging[park_at + 1 * 16]  = a2;
    staging[park_at + 2 * 16]  = a3;
    staging[park_at + 3 * 16]  = a4;
    staging[park_at + 4 * 16]  = a5;
    staging[park_at + 5 * 16]  = a6;
    staging[park_at + 6 * 16]  = a7;
    staging[park_at + 7 * 16]  = a8;
    staging[park_at + 8 * 16]  = a9;
    staging[park_at + 9 * 16]  = a10;
    staging[park_at + 10 * 16] = a11;
    staging[park_at + 11 * 16] = a12;
    staging[park_at + 12 * 16] = a13;
    staging[park_at + 13 * 16] = a14;
    staging[park_at + 14 * 16] = a15;
    staging[park_at + 15 * 16] = a16;
  }

  /* Every store above must be visible before any work-item reads back. */
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Step 2: local -> registers. */
  uint k1 = staging[get_local_id(0) + 0 * 256];
  uint k2 = staging[get_local_id(0) + 1 * 256];
  uint k3 = staging[get_local_id(0) + 2 * 256];
  uint k4 = staging[get_local_id(0) + 3 * 256];
  uint k5 = staging[get_local_id(0) + 4 * 256];
  uint k6 = staging[get_local_id(0) + 5 * 256];
  uint k7 = staging[get_local_id(0) + 6 * 256];
  uint k8 = staging[get_local_id(0) + 7 * 256];

  /* Cross-lane exchanges at XOR distances 8, 4, 2, 1. */
  BC4_LANE_STAGE(8, k1, k2, k3, k4, k5, k6, k7, k8);
  BC4_LANE_STAGE(4, k1, k2, k3, k4, k5, k6, k7, k8);
  BC4_LANE_STAGE(2, k1, k2, k3, k4, k5, k6, k7, k8);
  BC4_LANE_STAGE(1, k1, k2, k3, k4, k5, k6, k7, k8);

  /* In-register 12-exchange network over this work-item's eight keys. */
  BC4_CMP_XCHG(k1, k5);
  BC4_CMP_XCHG(k3, k7);
  BC4_CMP_XCHG(k1, k3);
  BC4_CMP_XCHG(k5, k7);
  BC4_CMP_XCHG(k2, k6);
  BC4_CMP_XCHG(k4, k8);
  BC4_CMP_XCHG(k2, k4);
  BC4_CMP_XCHG(k6, k8);
  BC4_CMP_XCHG(k1, k2);
  BC4_CMP_XCHG(k3, k4);
  BC4_CMP_XCHG(k5, k6);
  BC4_CMP_XCHG(k7, k8);

  vout[out_base + 0 * 16] = k1;
  vout[out_base + 1 * 16] = k2;
  vout[out_base + 2 * 16] = k3;
  vout[out_base + 3 * 16] = k4;
  vout[out_base + 4 * 16] = k5;
  vout[out_base + 5 * 16] = k6;
  vout[out_base + 6 * 16] = k7;
  vout[out_base + 7 * 16] = k8;
}

#undef BC4_LANE_STAGE
#undef BC4_LANE_XCHG
#undef BC4_CMP_XCHG
8310
/*
 * hs_kernel_fm_0_0 -- in-place pass over `vout` on a 2-D NDRange.
 *
 * Dimension 1 selects a span of 16 * get_global_size(0) keys; dimension 0
 * selects a column inside that span. With stride = get_global_size(0),
 * each work-item:
 *   - loads eight keys at left + k * stride (k = 0..7), where
 *     left = span base + column;
 *   - loads one more key at right = span base + 9 * stride - column - 1,
 *     i.e. from the ninth stride-sized row with the column index reversed;
 *   - compare-exchanges the eighth left key with the right key;
 *   - runs a 12-exchange min/max network over the eight left keys;
 *   - writes all nine keys back to the positions they were loaded from.
 *
 * The helper macro below is private to this kernel and is #undef'd right
 * after it.
 */

/* Leave the smaller of the two keys in `x` and the larger in `y`. */
#define FM00_CMP_XCHG(x, y)                                                    \
  {                                                                            \
    uint const smaller = min(x, y);                                            \
    y                  = max(x, y);                                            \
    x                  = smaller;                                              \
  }

__kernel __attribute__((intel_reqd_sub_group_size(16))) void
hs_kernel_fm_0_0(__global uint* const restrict vout)
{
  uint const which_span = get_global_id(1);
  uint const stride     = get_global_size(0);
  uint const span_keys  = stride * 8 * 2;
  uint const first_key  = which_span * span_keys;
  uint const column     = get_global_id(0);

  uint const left  = first_key + column;
  uint const right = first_key + stride * 9 - column - 1;

  uint k1 = vout[left + stride * 0];
  uint k2 = vout[left + stride * 1];
  uint k3 = vout[left + stride * 2];
  uint k4 = vout[left + stride * 3];
  uint k5 = vout[left + stride * 4];
  uint k6 = vout[left + stride * 5];
  uint k7 = vout[left + stride * 6];
  uint k8 = vout[left + stride * 7];
  uint k9 = vout[right];

  /* Last left-hand key against the single right-hand key. */
  FM00_CMP_XCHG(k8, k9);

  /* 12-exchange network over the eight left-hand keys. */
  FM00_CMP_XCHG(k1, k5);
  FM00_CMP_XCHG(k3, k7);
  FM00_CMP_XCHG(k1, k3);
  FM00_CMP_XCHG(k5, k7);
  FM00_CMP_XCHG(k2, k6);
  FM00_CMP_XCHG(k4, k8);
  FM00_CMP_XCHG(k2, k4);
  FM00_CMP_XCHG(k6, k8);
  FM00_CMP_XCHG(k1, k2);
  FM00_CMP_XCHG(k3, k4);
  FM00_CMP_XCHG(k5, k6);
  FM00_CMP_XCHG(k7, k8);

  vout[left + stride * 0] = k1;
  vout[left + stride * 1] = k2;
  vout[left + stride * 2] = k3;
  vout[left + stride * 3] = k4;
  vout[left + stride * 4] = k5;
  vout[left + stride * 5] = k6;
  vout[left + stride * 6] = k7;
  vout[left + stride * 7] = k8;
  vout[right]             = k9;
}

#undef FM00_CMP_XCHG
8405
8406__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
8407hs_kernel_fm_0_1(__global uint* const restrict vout)
8408{
8409  uint const span_idx = get_global_id(1);
8410  uint const span_stride = get_global_size(0);
8411  uint const span_size = span_stride * 8 * 2;
8412  uint const span_base = span_idx * span_size;
8413  uint const span_off = get_global_id(0);
8414  uint const span_l = span_base + span_off;
8415  uint const span_r = span_base + span_stride * (8 + 1) - span_off - 1;
8416  uint r1 = vout[span_l + span_stride * 0];
8417  uint r2 = vout[span_l + span_stride * 1];
8418  uint r3 = vout[span_l + span_stride * 2];
8419  uint r4 = vout[span_l + span_stride * 3];
8420  uint r5 = vout[span_l + span_stride * 4];
8421  uint r6 = vout[span_l + span_stride * 5];
8422  uint r7 = vout[span_l + span_stride * 6];
8423  uint r8 = vout[span_l + span_stride * 7];
8424  uint r9 = vout[span_r + span_stride * 0];
8425  uint r10 = vout[span_r + span_stride * 1];
8426  {
8427    uint const t = min(r8, r9);
8428    r9 = max(r8, r9);
8429    r8 = t;
8430  };
8431  {
8432    uint const t = min(r7, r10);
8433    r10 = max(r7, r10);
8434    r7 = t;
8435  };
8436  {
8437    uint const t = min(r1, r5);
8438    r5 = max(r1, r5);
8439    r1 = t;
8440  };
8441  {
8442    uint const t = min(r3, r7);
8443    r7 = max(r3, r7);
8444    r3 = t;
8445  };
8446  {
8447    uint const t = min(r1, r3);
8448    r3 = max(r1, r3);
8449    r1 = t;
8450  };
8451  {
8452    uint const t = min(r5, r7);
8453    r7 = max(r5, r7);
8454    r5 = t;
8455  };
8456  {
8457    uint const t = min(r2, r6);
8458    r6 = max(r2, r6);
8459    r2 = t;
8460  };
8461  {
8462    uint const t = min(r4, r8);
8463    r8 = max(r4, r8);
8464    r4 = t;
8465  };
8466  {
8467    uint const t = min(r2, r4);
8468    r4 = max(r2, r4);
8469    r2 = t;
8470  };
8471  {
8472    uint const t = min(r6, r8);
8473    r8 = max(r6, r8);
8474    r6 = t;
8475  };
8476  {
8477    uint const t = min(r1, r2);
8478    r2 = max(r1, r2);
8479    r1 = t;
8480  };
8481  {
8482    uint const t = min(r3, r4);
8483    r4 = max(r3, r4);
8484    r3 = t;
8485  };
8486  {
8487    uint const t = min(r5, r6);
8488    r6 = max(r5, r6);
8489    r5 = t;
8490  };
8491  {
8492    uint const t = min(r7, r8);
8493    r8 = max(r7, r8);
8494    r7 = t;
8495  };
8496  {
8497    uint const t = min(r9, r10);
8498    r10 = max(r9, r10);
8499    r9 = t;
8500  };
8501  vout[span_l + span_stride * 0] = r1;
8502  vout[span_l + span_stride * 1] = r2;
8503  vout[span_l + span_stride * 2] = r3;
8504  vout[span_l + span_stride * 3] = r4;
8505  vout[span_l + span_stride * 4] = r5;
8506  vout[span_l + span_stride * 5] = r6;
8507  vout[span_l + span_stride * 6] = r7;
8508  vout[span_l + span_stride * 7] = r8;
8509  vout[span_r + span_stride * 0] = r9;
8510  vout[span_r + span_stride * 1] = r10;
8511}
8512
8513__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
8514hs_kernel_fm_0_2(__global uint* const restrict vout)
8515{
8516  uint const span_idx = get_global_id(1);
8517  uint const span_stride = get_global_size(0);
8518  uint const span_size = span_stride * 8 * 2;
8519  uint const span_base = span_idx * span_size;
8520  uint const span_off = get_global_id(0);
8521  uint const span_l = span_base + span_off;
8522  uint const span_r = span_base + span_stride * (8 + 1) - span_off - 1;
8523  uint r1 = vout[span_l + span_stride * 0];
8524  uint r2 = vout[span_l + span_stride * 1];
8525  uint r3 = vout[span_l + span_stride * 2];
8526  uint r4 = vout[span_l + span_stride * 3];
8527  uint r5 = vout[span_l + span_stride * 4];
8528  uint r6 = vout[span_l + span_stride * 5];
8529  uint r7 = vout[span_l + span_stride * 6];
8530  uint r8 = vout[span_l + span_stride * 7];
8531  uint r9 = vout[span_r + span_stride * 0];
8532  uint r10 = vout[span_r + span_stride * 1];
8533  uint r11 = vout[span_r + span_stride * 2];
8534  uint r12 = vout[span_r + span_stride * 3];
8535  {
8536    uint const t = min(r8, r9);
8537    r9 = max(r8, r9);
8538    r8 = t;
8539  };
8540  {
8541    uint const t = min(r7, r10);
8542    r10 = max(r7, r10);
8543    r7 = t;
8544  };
8545  {
8546    uint const t = min(r6, r11);
8547    r11 = max(r6, r11);
8548    r6 = t;
8549  };
8550  {
8551    uint const t = min(r5, r12);
8552    r12 = max(r5, r12);
8553    r5 = t;
8554  };
8555  {
8556    uint const t = min(r1, r5);
8557    r5 = max(r1, r5);
8558    r1 = t;
8559  };
8560  {
8561    uint const t = min(r3, r7);
8562    r7 = max(r3, r7);
8563    r3 = t;
8564  };
8565  {
8566    uint const t = min(r1, r3);
8567    r3 = max(r1, r3);
8568    r1 = t;
8569  };
8570  {
8571    uint const t = min(r5, r7);
8572    r7 = max(r5, r7);
8573    r5 = t;
8574  };
8575  {
8576    uint const t = min(r2, r6);
8577    r6 = max(r2, r6);
8578    r2 = t;
8579  };
8580  {
8581    uint const t = min(r4, r8);
8582    r8 = max(r4, r8);
8583    r4 = t;
8584  };
8585  {
8586    uint const t = min(r2, r4);
8587    r4 = max(r2, r4);
8588    r2 = t;
8589  };
8590  {
8591    uint const t = min(r6, r8);
8592    r8 = max(r6, r8);
8593    r6 = t;
8594  };
8595  {
8596    uint const t = min(r1, r2);
8597    r2 = max(r1, r2);
8598    r1 = t;
8599  };
8600  {
8601    uint const t = min(r3, r4);
8602    r4 = max(r3, r4);
8603    r3 = t;
8604  };
8605  {
8606    uint const t = min(r5, r6);
8607    r6 = max(r5, r6);
8608    r5 = t;
8609  };
8610  {
8611    uint const t = min(r7, r8);
8612    r8 = max(r7, r8);
8613    r7 = t;
8614  };
8615  {
8616    uint const t = min(r9, r11);
8617    r11 = max(r9, r11);
8618    r9 = t;
8619  };
8620  {
8621    uint const t = min(r10, r12);
8622    r12 = max(r10, r12);
8623    r10 = t;
8624  };
8625  {
8626    uint const t = min(r9, r10);
8627    r10 = max(r9, r10);
8628    r9 = t;
8629  };
8630  {
8631    uint const t = min(r11, r12);
8632    r12 = max(r11, r12);
8633    r11 = t;
8634  };
8635  vout[span_l + span_stride * 0] = r1;
8636  vout[span_l + span_stride * 1] = r2;
8637  vout[span_l + span_stride * 2] = r3;
8638  vout[span_l + span_stride * 3] = r4;
8639  vout[span_l + span_stride * 4] = r5;
8640  vout[span_l + span_stride * 5] = r6;
8641  vout[span_l + span_stride * 6] = r7;
8642  vout[span_l + span_stride * 7] = r8;
8643  vout[span_r + span_stride * 0] = r9;
8644  vout[span_r + span_stride * 1] = r10;
8645  vout[span_r + span_stride * 2] = r11;
8646  vout[span_r + span_stride * 3] = r12;
8647}
8648
8649__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
8650hs_kernel_fm_0_3(__global uint* const restrict vout)
8651{
8652  uint const span_idx = get_global_id(1);
8653  uint const span_stride = get_global_size(0);
8654  uint const span_size = span_stride * 8 * 2;
8655  uint const span_base = span_idx * span_size;
8656  uint const span_off = get_global_id(0);
8657  uint const span_l = span_base + span_off;
8658  uint const span_r = span_base + span_stride * (8 + 1) - span_off - 1;
8659  uint r1 = vout[span_l + span_stride * 0];
8660  uint r2 = vout[span_l + span_stride * 1];
8661  uint r3 = vout[span_l + span_stride * 2];
8662  uint r4 = vout[span_l + span_stride * 3];
8663  uint r5 = vout[span_l + span_stride * 4];
8664  uint r6 = vout[span_l + span_stride * 5];
8665  uint r7 = vout[span_l + span_stride * 6];
8666  uint r8 = vout[span_l + span_stride * 7];
8667  uint r9 = vout[span_r + span_stride * 0];
8668  uint r10 = vout[span_r + span_stride * 1];
8669  uint r11 = vout[span_r + span_stride * 2];
8670  uint r12 = vout[span_r + span_stride * 3];
8671  uint r13 = vout[span_r + span_stride * 4];
8672  uint r14 = vout[span_r + span_stride * 5];
8673  uint r15 = vout[span_r + span_stride * 6];
8674  uint r16 = vout[span_r + span_stride * 7];
8675  {
8676    uint const t = min(r8, r9);
8677    r9 = max(r8, r9);
8678    r8 = t;
8679  };
8680  {
8681    uint const t = min(r7, r10);
8682    r10 = max(r7, r10);
8683    r7 = t;
8684  };
8685  {
8686    uint const t = min(r6, r11);
8687    r11 = max(r6, r11);
8688    r6 = t;
8689  };
8690  {
8691    uint const t = min(r5, r12);
8692    r12 = max(r5, r12);
8693    r5 = t;
8694  };
8695  {
8696    uint const t = min(r4, r13);
8697    r13 = max(r4, r13);
8698    r4 = t;
8699  };
8700  {
8701    uint const t = min(r3, r14);
8702    r14 = max(r3, r14);
8703    r3 = t;
8704  };
8705  {
8706    uint const t = min(r2, r15);
8707    r15 = max(r2, r15);
8708    r2 = t;
8709  };
8710  {
8711    uint const t = min(r1, r16);
8712    r16 = max(r1, r16);
8713    r1 = t;
8714  };
8715  {
8716    uint const t = min(r1, r5);
8717    r5 = max(r1, r5);
8718    r1 = t;
8719  };
8720  {
8721    uint const t = min(r3, r7);
8722    r7 = max(r3, r7);
8723    r3 = t;
8724  };
8725  {
8726    uint const t = min(r1, r3);
8727    r3 = max(r1, r3);
8728    r1 = t;
8729  };
8730  {
8731    uint const t = min(r5, r7);
8732    r7 = max(r5, r7);
8733    r5 = t;
8734  };
8735  {
8736    uint const t = min(r2, r6);
8737    r6 = max(r2, r6);
8738    r2 = t;
8739  };
8740  {
8741    uint const t = min(r4, r8);
8742    r8 = max(r4, r8);
8743    r4 = t;
8744  };
8745  {
8746    uint const t = min(r2, r4);
8747    r4 = max(r2, r4);
8748    r2 = t;
8749  };
8750  {
8751    uint const t = min(r6, r8);
8752    r8 = max(r6, r8);
8753    r6 = t;
8754  };
8755  {
8756    uint const t = min(r1, r2);
8757    r2 = max(r1, r2);
8758    r1 = t;
8759  };
8760  {
8761    uint const t = min(r3, r4);
8762    r4 = max(r3, r4);
8763    r3 = t;
8764  };
8765  {
8766    uint const t = min(r5, r6);
8767    r6 = max(r5, r6);
8768    r5 = t;
8769  };
8770  {
8771    uint const t = min(r7, r8);
8772    r8 = max(r7, r8);
8773    r7 = t;
8774  };
8775  {
8776    uint const t = min(r9, r13);
8777    r13 = max(r9, r13);
8778    r9 = t;
8779  };
8780  {
8781    uint const t = min(r11, r15);
8782    r15 = max(r11, r15);
8783    r11 = t;
8784  };
8785  {
8786    uint const t = min(r9, r11);
8787    r11 = max(r9, r11);
8788    r9 = t;
8789  };
8790  {
8791    uint const t = min(r13, r15);
8792    r15 = max(r13, r15);
8793    r13 = t;
8794  };
8795  {
8796    uint const t = min(r10, r14);
8797    r14 = max(r10, r14);
8798    r10 = t;
8799  };
8800  {
8801    uint const t = min(r12, r16);
8802    r16 = max(r12, r16);
8803    r12 = t;
8804  };
8805  {
8806    uint const t = min(r10, r12);
8807    r12 = max(r10, r12);
8808    r10 = t;
8809  };
8810  {
8811    uint const t = min(r14, r16);
8812    r16 = max(r14, r16);
8813    r14 = t;
8814  };
8815  {
8816    uint const t = min(r9, r10);
8817    r10 = max(r9, r10);
8818    r9 = t;
8819  };
8820  {
8821    uint const t = min(r11, r12);
8822    r12 = max(r11, r12);
8823    r11 = t;
8824  };
8825  {
8826    uint const t = min(r13, r14);
8827    r14 = max(r13, r14);
8828    r13 = t;
8829  };
8830  {
8831    uint const t = min(r15, r16);
8832    r16 = max(r15, r16);
8833    r15 = t;
8834  };
8835  vout[span_l + span_stride * 0] = r1;
8836  vout[span_l + span_stride * 1] = r2;
8837  vout[span_l + span_stride * 2] = r3;
8838  vout[span_l + span_stride * 3] = r4;
8839  vout[span_l + span_stride * 4] = r5;
8840  vout[span_l + span_stride * 5] = r6;
8841  vout[span_l + span_stride * 6] = r7;
8842  vout[span_l + span_stride * 7] = r8;
8843  vout[span_r + span_stride * 0] = r9;
8844  vout[span_r + span_stride * 1] = r10;
8845  vout[span_r + span_stride * 2] = r11;
8846  vout[span_r + span_stride * 3] = r12;
8847  vout[span_r + span_stride * 4] = r13;
8848  vout[span_r + span_stride * 5] = r14;
8849  vout[span_r + span_stride * 6] = r15;
8850  vout[span_r + span_stride * 7] = r16;
8851}
8852
8853__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
8854hs_kernel_hm_0(__global uint* const restrict vout)
8855{
8856  uint const span_idx = get_global_id(1);
8857  uint const span_stride = get_global_size(0);
8858  uint const span_size = span_stride * 8 * 2;
8859  uint const span_base = span_idx * span_size;
8860  uint const span_off = get_global_id(0);
8861  uint const span_l = span_base + span_off;
8862  uint r1 = vout[span_l + span_stride * 0];
8863  uint r2 = vout[span_l + span_stride * 1];
8864  uint r3 = vout[span_l + span_stride * 2];
8865  uint r4 = vout[span_l + span_stride * 3];
8866  uint r5 = vout[span_l + span_stride * 4];
8867  uint r6 = vout[span_l + span_stride * 5];
8868  uint r7 = vout[span_l + span_stride * 6];
8869  uint r8 = vout[span_l + span_stride * 7];
8870  uint r9 = vout[span_l + span_stride * 8];
8871  uint r10 = vout[span_l + span_stride * 9];
8872  uint r11 = vout[span_l + span_stride * 10];
8873  uint r12 = vout[span_l + span_stride * 11];
8874  uint r13 = vout[span_l + span_stride * 12];
8875  uint r14 = vout[span_l + span_stride * 13];
8876  uint r15 = vout[span_l + span_stride * 14];
8877  uint r16 = vout[span_l + span_stride * 15];
8878  {
8879    uint const t = min(r1, r9);
8880    r9 = max(r1, r9);
8881    r1 = t;
8882  };
8883  {
8884    uint const t = min(r5, r13);
8885    r13 = max(r5, r13);
8886    r5 = t;
8887  };
8888  {
8889    uint const t = min(r1, r5);
8890    r5 = max(r1, r5);
8891    r1 = t;
8892  };
8893  {
8894    uint const t = min(r9, r13);
8895    r13 = max(r9, r13);
8896    r9 = t;
8897  };
8898  {
8899    uint const t = min(r3, r11);
8900    r11 = max(r3, r11);
8901    r3 = t;
8902  };
8903  {
8904    uint const t = min(r7, r15);
8905    r15 = max(r7, r15);
8906    r7 = t;
8907  };
8908  {
8909    uint const t = min(r3, r7);
8910    r7 = max(r3, r7);
8911    r3 = t;
8912  };
8913  {
8914    uint const t = min(r11, r15);
8915    r15 = max(r11, r15);
8916    r11 = t;
8917  };
8918  {
8919    uint const t = min(r1, r3);
8920    r3 = max(r1, r3);
8921    r1 = t;
8922  };
8923  {
8924    uint const t = min(r5, r7);
8925    r7 = max(r5, r7);
8926    r5 = t;
8927  };
8928  {
8929    uint const t = min(r9, r11);
8930    r11 = max(r9, r11);
8931    r9 = t;
8932  };
8933  {
8934    uint const t = min(r13, r15);
8935    r15 = max(r13, r15);
8936    r13 = t;
8937  };
8938  {
8939    uint const t = min(r2, r10);
8940    r10 = max(r2, r10);
8941    r2 = t;
8942  };
8943  {
8944    uint const t = min(r6, r14);
8945    r14 = max(r6, r14);
8946    r6 = t;
8947  };
8948  {
8949    uint const t = min(r2, r6);
8950    r6 = max(r2, r6);
8951    r2 = t;
8952  };
8953  {
8954    uint const t = min(r10, r14);
8955    r14 = max(r10, r14);
8956    r10 = t;
8957  };
8958  {
8959    uint const t = min(r4, r12);
8960    r12 = max(r4, r12);
8961    r4 = t;
8962  };
8963  {
8964    uint const t = min(r8, r16);
8965    r16 = max(r8, r16);
8966    r8 = t;
8967  };
8968  {
8969    uint const t = min(r4, r8);
8970    r8 = max(r4, r8);
8971    r4 = t;
8972  };
8973  {
8974    uint const t = min(r12, r16);
8975    r16 = max(r12, r16);
8976    r12 = t;
8977  };
8978  {
8979    uint const t = min(r2, r4);
8980    r4 = max(r2, r4);
8981    r2 = t;
8982  };
8983  {
8984    uint const t = min(r6, r8);
8985    r8 = max(r6, r8);
8986    r6 = t;
8987  };
8988  {
8989    uint const t = min(r10, r12);
8990    r12 = max(r10, r12);
8991    r10 = t;
8992  };
8993  {
8994    uint const t = min(r14, r16);
8995    r16 = max(r14, r16);
8996    r14 = t;
8997  };
8998  {
8999    uint const t = min(r1, r2);
9000    r2 = max(r1, r2);
9001    r1 = t;
9002  };
9003  {
9004    uint const t = min(r3, r4);
9005    r4 = max(r3, r4);
9006    r3 = t;
9007  };
9008  {
9009    uint const t = min(r5, r6);
9010    r6 = max(r5, r6);
9011    r5 = t;
9012  };
9013  {
9014    uint const t = min(r7, r8);
9015    r8 = max(r7, r8);
9016    r7 = t;
9017  };
9018  {
9019    uint const t = min(r9, r10);
9020    r10 = max(r9, r10);
9021    r9 = t;
9022  };
9023  {
9024    uint const t = min(r11, r12);
9025    r12 = max(r11, r12);
9026    r11 = t;
9027  };
9028  {
9029    uint const t = min(r13, r14);
9030    r14 = max(r13, r14);
9031    r13 = t;
9032  };
9033  {
9034    uint const t = min(r15, r16);
9035    r16 = max(r15, r16);
9036    r15 = t;
9037  };
9038  vout[span_l + span_stride * 0] = r1;
9039  vout[span_l + span_stride * 1] = r2;
9040  vout[span_l + span_stride * 2] = r3;
9041  vout[span_l + span_stride * 3] = r4;
9042  vout[span_l + span_stride * 4] = r5;
9043  vout[span_l + span_stride * 5] = r6;
9044  vout[span_l + span_stride * 6] = r7;
9045  vout[span_l + span_stride * 7] = r8;
9046  vout[span_l + span_stride * 8] = r9;
9047  vout[span_l + span_stride * 9] = r10;
9048  vout[span_l + span_stride * 10] = r11;
9049  vout[span_l + span_stride * 11] = r12;
9050  vout[span_l + span_stride * 12] = r13;
9051  vout[span_l + span_stride * 13] = r14;
9052  vout[span_l + span_stride * 14] = r15;
9053  vout[span_l + span_stride * 15] = r16;
9054}
9055
9056__kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
9057hs_kernel_transpose(__global uint* const restrict vout)
9058{
9059  uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
9060                        (get_local_id(0) & ((1 << 4) - 1));
9061  uint r1 = vout[gmem_idx + (1 << 4) * 0];
9062  uint r2 = vout[gmem_idx + (1 << 4) * 1];
9063  uint r3 = vout[gmem_idx + (1 << 4) * 2];
9064  uint r4 = vout[gmem_idx + (1 << 4) * 3];
9065  uint r5 = vout[gmem_idx + (1 << 4) * 4];
9066  uint r6 = vout[gmem_idx + (1 << 4) * 5];
9067  uint r7 = vout[gmem_idx + (1 << 4) * 6];
9068  uint r8 = vout[gmem_idx + (1 << 4) * 7];
9069  bool const is_lo_1 = (get_sub_group_local_id() & (1 << (1 - 1))) == 0;
9070  bool const is_lo_2 = (get_sub_group_local_id() & (1 << (2 - 1))) == 0;
9071  bool const is_lo_3 = (get_sub_group_local_id() & (1 << (3 - 1))) == 0;
9072  bool const is_lo_4 = (get_sub_group_local_id() & (1 << (4 - 1))) == 0;
9073  uint const s2_1 =
9074    intel_sub_group_shuffle_xor(is_lo_1 ? r2 : r1, 1 << (1 - 1));
9075  uint const s2 = is_lo_1 ? s2_1 : r2;
9076  uint const s1 = is_lo_1 ? r1 : s2_1;
9077  uint const s4_3 =
9078    intel_sub_group_shuffle_xor(is_lo_1 ? r4 : r3, 1 << (1 - 1));
9079  uint const s4 = is_lo_1 ? s4_3 : r4;
9080  uint const s3 = is_lo_1 ? r3 : s4_3;
9081  uint const s6_5 =
9082    intel_sub_group_shuffle_xor(is_lo_1 ? r6 : r5, 1 << (1 - 1));
9083  uint const s6 = is_lo_1 ? s6_5 : r6;
9084  uint const s5 = is_lo_1 ? r5 : s6_5;
9085  uint const s8_7 =
9086    intel_sub_group_shuffle_xor(is_lo_1 ? r8 : r7, 1 << (1 - 1));
9087  uint const s8 = is_lo_1 ? s8_7 : r8;
9088  uint const s7 = is_lo_1 ? r7 : s8_7;
9089  uint const t3_1 =
9090    intel_sub_group_shuffle_xor(is_lo_2 ? s3 : s1, 1 << (2 - 1));
9091  uint const t3 = is_lo_2 ? t3_1 : s3;
9092  uint const t1 = is_lo_2 ? s1 : t3_1;
9093  uint const t4_2 =
9094    intel_sub_group_shuffle_xor(is_lo_2 ? s4 : s2, 1 << (2 - 1));
9095  uint const t4 = is_lo_2 ? t4_2 : s4;
9096  uint const t2 = is_lo_2 ? s2 : t4_2;
9097  uint const t7_5 =
9098    intel_sub_group_shuffle_xor(is_lo_2 ? s7 : s5, 1 << (2 - 1));
9099  uint const t7 = is_lo_2 ? t7_5 : s7;
9100  uint const t5 = is_lo_2 ? s5 : t7_5;
9101  uint const t8_6 =
9102    intel_sub_group_shuffle_xor(is_lo_2 ? s8 : s6, 1 << (2 - 1));
9103  uint const t8 = is_lo_2 ? t8_6 : s8;
9104  uint const t6 = is_lo_2 ? s6 : t8_6;
9105  uint const u5_1 =
9106    intel_sub_group_shuffle_xor(is_lo_3 ? t5 : t1, 1 << (3 - 1));
9107  uint const u5 = is_lo_3 ? u5_1 : t5;
9108  uint const u1 = is_lo_3 ? t1 : u5_1;
9109  uint const u6_2 =
9110    intel_sub_group_shuffle_xor(is_lo_3 ? t6 : t2, 1 << (3 - 1));
9111  uint const u6 = is_lo_3 ? u6_2 : t6;
9112  uint const u2 = is_lo_3 ? t2 : u6_2;
9113  uint const u7_3 =
9114    intel_sub_group_shuffle_xor(is_lo_3 ? t7 : t3, 1 << (3 - 1));
9115  uint const u7 = is_lo_3 ? u7_3 : t7;
9116  uint const u3 = is_lo_3 ? t3 : u7_3;
9117  uint const u8_4 =
9118    intel_sub_group_shuffle_xor(is_lo_3 ? t8 : t4, 1 << (3 - 1));
9119  uint const u8 = is_lo_3 ? u8_4 : t8;
9120  uint const u4 = is_lo_3 ? t4 : u8_4;
9121  uint const v2_1 =
9122    intel_sub_group_shuffle_xor(is_lo_4 ? u2 : u1, 1 << (4 - 1));
9123  uint const v2 = is_lo_4 ? v2_1 : u2;
9124  uint const v1 = is_lo_4 ? u1 : v2_1;
9125  uint const v4_3 =
9126    intel_sub_group_shuffle_xor(is_lo_4 ? u4 : u3, 1 << (4 - 1));
9127  uint const v4 = is_lo_4 ? v4_3 : u4;
9128  uint const v3 = is_lo_4 ? u3 : v4_3;
9129  uint const v6_5 =
9130    intel_sub_group_shuffle_xor(is_lo_4 ? u6 : u5, 1 << (4 - 1));
9131  uint const v6 = is_lo_4 ? v6_5 : u6;
9132  uint const v5 = is_lo_4 ? u5 : v6_5;
9133  uint const v8_7 =
9134    intel_sub_group_shuffle_xor(is_lo_4 ? u8 : u7, 1 << (4 - 1));
9135  uint const v8 = is_lo_4 ? v8_7 : u8;
9136  uint const v7 = is_lo_4 ? u7 : v8_7;
9137  vout[gmem_idx + ((1 - 1) << 4)] = v1;
9138  vout[gmem_idx + ((5 - 1) << 4)] = v2;
9139  vout[gmem_idx + ((2 - 1) << 4)] = v3;
9140  vout[gmem_idx + ((6 - 1) << 4)] = v4;
9141  vout[gmem_idx + ((3 - 1) << 4)] = v5;
9142  vout[gmem_idx + ((7 - 1) << 4)] = v6;
9143  vout[gmem_idx + ((4 - 1) << 4)] = v7;
9144  vout[gmem_idx + ((8 - 1) << 4)] = v8;
9145}
9146