; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s


; The clamped value is produced by an fadd carrying the nnan flag. Every run
; configuration expects the max(x, 2.0) / min(.., 4.0) pair to be selected as a
; single v_med3_f32 fed by the add.
; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %x = load float, float addrspace(1)* %src.addr
  %x.plus1 = fadd nnan float %x, 1.0
  %lo.clamped = call float @llvm.maxnum.f32(float %x.plus1, float 2.0)
  %clamped = call float @llvm.minnum.f32(float %lo.clamped, float 4.0)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}

; min(max(x, 2.0), 4.0) applied directly to a loaded value. Runs without
; +fp-exceptions expect one v_med3_f32; runs with +fp-exceptions expect the
; separate v_max_f32 / v_min_f32 pair.
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f32:
; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0

; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %x = load float, float addrspace(1)* %src.addr

  %lo.clamped = call float @llvm.maxnum.f32(float %x, float 2.0)
  %clamped = call float @llvm.minnum.f32(float %lo.clamped, float 4.0)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}

; Same clamp as v_test_fmed3_r_i_i_f32, but the constant is the first operand
; of both the maxnum and the minnum call. The expected output is the same.
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute0_f32:
; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0

; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %x = load float, float addrspace(1)* %src.addr

  %lo.clamped = call float @llvm.maxnum.f32(float 2.0, float %x)
  %clamped = call float @llvm.minnum.f32(float 4.0, float %lo.clamped)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}

; Same clamp as v_test_fmed3_r_i_i_f32, but only the minnum call has its
; constant as the first operand. The expected output is the same.
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute1_f32:
; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0

; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %x = load float, float addrspace(1)* %src.addr

  %lo.clamped = call float @llvm.maxnum.f32(float %x, float 2.0)
  %clamped = call float @llvm.minnum.f32(float 4.0, float %lo.clamped)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}

; The constants are swapped relative to the other clamp tests: the max uses
; 4.0 and the min uses 2.0. Every run configuration expects the separate
; v_max_f32 / v_min_f32 pair here rather than v_med3_f32.
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32:
; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %x = load float, float addrspace(1)* %src.addr

  %max.with4 = call float @llvm.maxnum.f32(float %x, float 4.0)
  %min.with2 = call float @llvm.minnum.f32(float %max.with4, float 2.0)

  store float %min.with2, float addrspace(1)* %dst.addr
  ret void
}


; The intermediate max result has a second use (it is stored as well), and
; every run configuration expects the separate v_max_f32 / v_min_f32 pair.
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32:
; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %x = load float, float addrspace(1)* %src.addr

  %lo.clamped = call float @llvm.maxnum.f32(float %x, float 2.0)
  %clamped = call float @llvm.minnum.f32(float %lo.clamped, float 4.0)

  ; Both stores are volatile and target the same address; the second one is
  ; the extra use of the max result.
  store volatile float %clamped, float addrspace(1)* %dst.addr
  store volatile float %lo.clamped, float addrspace(1)* %dst.addr
  ret void
}

; Double-precision version of the 2.0 / 4.0 clamp. Every run configuration
; expects v_max_f64 followed by v_min_f64.
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr double, double addrspace(1)* %aptr, i32 %id
  %dst.addr = getelementptr double, double addrspace(1)* %out, i32 %id
  %x = load double, double addrspace(1)* %src.addr

  %lo.clamped = call double @llvm.maxnum.f64(double %x, double 2.0)
  %clamped = call double @llvm.minnum.f64(double %lo.clamped, double 4.0)

  store double %clamped, double addrspace(1)* %dst.addr
  ret void
}

; Same IR body as v_test_fmed3_r_i_i_f32, but the kernel uses attribute group
; #2 instead of #1. Every run configuration expects v_med3_f32 here.
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %x = load float, float addrspace(1)* %src.addr

  %lo.clamped = call float @llvm.maxnum.f32(float %x, float 2.0)
  %clamped = call float @llvm.minnum.f32(float %lo.clamped, float 4.0)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}

; The clamp is written with fcmp + select instead of the maxnum/minnum
; intrinsics. The expected output matches v_test_fmed3_r_i_i_f32.
; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32:
; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0

; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %x = load float, float addrspace(1)* %src.addr

  ; fmax_legacy form: take 2.0 when the compare is unordered or %x <= 2.0.
  %pick.lower = fcmp ule float %x, 2.0
  %lo.clamped = select i1 %pick.lower, float 2.0, float %x

  ; fmin_legacy form: take 4.0 when the compare is unordered or the value >= 4.0.
  %pick.upper = fcmp uge float %lo.clamped, 4.0
  %clamped = select i1 %pick.upper, float 4.0, float %lo.clamped

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}

; Three-input median pattern max(min(x, y), min(max(x, y), z)) where the first
; input is negated (fsub from -0.0). The negation is expected to be folded into
; v_med3_f32 as a source modifier on the first operand.
; The load checks use the same {{buffer|flat|global}}_load_dword spelling as
; the other tests in this file.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  ; Volatile loads keep the three loads in a, b, c order for the checks above.
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %a.neg = fsub float -0.0, %a
  %lo = call float @llvm.minnum.f32(float %a.neg, float %b)
  %hi = call float @llvm.maxnum.f32(float %a.neg, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Three-input median pattern with the second input negated (fsub from -0.0).
; The negation is expected as a source modifier on the second v_med3_f32
; operand.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod1:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %b.neg = fsub float -0.0, %b
  %lo = call float @llvm.minnum.f32(float %a, float %b.neg)
  %hi = call float @llvm.maxnum.f32(float %a, float %b.neg)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Three-input median pattern with the third input negated (fsub from -0.0).
; The negation is expected as a source modifier on the third v_med3_f32
; operand.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod2:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %c.neg = fsub float -0.0, %c
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c.neg)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Three-input median pattern with a different modifier on each input:
; negated a, fabs of b, and negated fabs of c. All three are expected to be
; folded into the v_med3_f32 operands.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod012:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr

  %a.neg = fsub float -0.0, %a
  %b.abs = call float @llvm.fabs.f32(float %b)
  %c.abs = call float @llvm.fabs.f32(float %c)
  %c.negabs = fsub float -0.0, %c.abs

  %lo = call float @llvm.minnum.f32(float %a.neg, float %b.abs)
  %hi = call float @llvm.maxnum.f32(float %a.neg, float %b.abs)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c.negabs)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)

  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Three-input median pattern where every input is the negated fabs of a loaded
; value. All three neg+abs modifiers are expected on the v_med3_f32 operands.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_negabs012:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr

  %a.abs = call float @llvm.fabs.f32(float %a)
  %a.negabs = fsub float -0.0, %a.abs
  %b.abs = call float @llvm.fabs.f32(float %b)
  %b.negabs = fsub float -0.0, %b.abs
  %c.abs = call float @llvm.fabs.f32(float %c)
  %c.negabs = fsub float -0.0, %c.abs

  %lo = call float @llvm.minnum.f32(float %a.negabs, float %b.negabs)
  %hi = call float @llvm.maxnum.f32(float %a.negabs, float %b.negabs)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c.negabs)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)

  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Three-input median pattern in a kernel using attribute group #1, where each
; input is an fadd carrying the nnan flag. Every run configuration expects the
; three adds followed by a single v_med3_f32 on their results.
; GCN-LABEL: {{^}}v_nnan_inputs_med3_f32_pat0:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr

  %a.add = fadd nnan float %a, 1.0
  %b.add = fadd nnan float %b, 2.0
  %c.add = fadd nnan float %c, 4.0

  %lo = call float @llvm.minnum.f32(float %a.add, float %b.add)
  %hi = call float @llvm.maxnum.f32(float %a.add, float %b.add)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c.add)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; 16 combinations: the 8 operand orderings listed below, plus the same 8 with
; the operands of the outermost max commuted.

; 0: max(min(x, y), min(max(x, y), z))
; 1: max(min(x, y), min(max(y, x), z))
; 2: max(min(x, y), min(z, max(x, y)))
; 3: max(min(x, y), min(z, max(y, x)))
; 4: max(min(y, x), min(max(x, y), z))
; 5: max(min(y, x), min(max(y, x), z))
; 6: max(min(y, x), min(z, max(x, y)))
; 7: max(min(y, x), min(z, max(y, x)))
;
; + commute outermost max

; Combination 0: max(min(a, b), min(max(a, b), c)).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 1: max(min(a, b), min(max(b, a), c)).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat1:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %b, float %a)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 2: max(min(a, b), min(c, max(a, b))).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat2:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 3: max(min(a, b), min(c, max(b, a))).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat3:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %b, float %a)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 4: max(min(b, a), min(max(a, b), c)).
; The body follows entry 4 of the combinations table in this file; it
; previously repeated the operand order of pat7, leaving this combination
; untested.
; NOTE(review): the expected operand order is unchanged and was derived by
; symmetry with pat1 (the same shape with a and b swapped); confirm with llc.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat4:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  ; Volatile loads keep the three loads in a, b, c order for the checks above.
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %b, float %a)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 5: max(min(b, a), min(max(b, a), c)).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat5:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %b, float %a)
  %hi = call float @llvm.maxnum.f32(float %b, float %a)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 6: max(min(b, a), min(c, max(a, b))).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat6:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %b, float %a)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 7: max(min(b, a), min(c, max(b, a))).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat7:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %b, float %a)
  %hi = call float @llvm.maxnum.f32(float %b, float %a)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 0 with the outermost max commuted:
; max(min(max(a, b), c), min(a, b)).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat8:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %median = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 1 with the outermost max commuted:
; max(min(max(b, a), c), min(a, b)).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat9:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %b, float %a)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %median = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 2 with the outermost max commuted:
; max(min(c, max(a, b)), min(a, b)).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat10:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %median = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Combination 3 with the outermost max commuted:
; max(min(c, max(b, a)), min(a, b)).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat11:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %id
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %id
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %id
  %out.addr = getelementptr float, float addrspace(1)* %out, i32 %id
  %a = load volatile float, float addrspace(1)* %a.addr
  %b = load volatile float, float addrspace(1)* %b.addr
  %c = load volatile float, float addrspace(1)* %c.addr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %b, float %a)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %median = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %median, float addrspace(1)* %out.addr
  ret void
}

; Median-of-three pattern, operand ordering 12.
; The function uses attribute group #2 (no-nans-fp-math enabled). The IR computes
;   max(min(c, max(b, a)), min(b, a))
; with both the inner min and the inner max written as (b, a). The checks expect
; a single v_med3_f32 with sources in B, A, C order.
603; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat12:
604; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
605; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
606; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
607; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
608define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
609  %tid = call i32 @llvm.amdgcn.workitem.id.x()
610  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
611  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
612  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
613  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
; The loads are volatile; the checks above capture them as A, B, C in this order.
614  %a = load volatile float, float addrspace(1)* %gep0
615  %b = load volatile float, float addrspace(1)* %gep1
616  %c = load volatile float, float addrspace(1)* %gep2
617  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
618  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
619  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
620  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
621  store float %med3, float addrspace(1)* %outgep
622  ret void
623}
624
; Median-of-three pattern, operand ordering 13.
; The function uses attribute group #2 (no-nans-fp-math enabled). The IR computes
;   max(min(max(b, a), c), min(b, a))
; i.e. the outer min takes the inner max as its first operand and c as its
; second. The checks expect a single v_med3_f32 with sources in B, A, C order.
625; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat13:
626; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
627; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
628; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
629; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
630define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
631  %tid = call i32 @llvm.amdgcn.workitem.id.x()
632  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
633  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
634  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
635  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
; The loads are volatile; the checks above capture them as A, B, C in this order.
636  %a = load volatile float, float addrspace(1)* %gep0
637  %b = load volatile float, float addrspace(1)* %gep1
638  %c = load volatile float, float addrspace(1)* %gep2
639  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
640  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
641  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
642  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
643  store float %med3, float addrspace(1)* %outgep
644  ret void
645}
646
; Median-of-three pattern, operand ordering 14.
; The function uses attribute group #2 (no-nans-fp-math enabled). The IR computes
;   max(min(c, max(a, b)), min(b, a))
; i.e. the inner min is written (b, a) while the inner max is written (a, b).
; The checks expect a single v_med3_f32 with sources in A, B, C order.
647; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat14:
648; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
649; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
650; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
651; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
652define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
653  %tid = call i32 @llvm.amdgcn.workitem.id.x()
654  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
655  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
656  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
657  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
; The loads are volatile; the checks above capture them as A, B, C in this order.
658  %a = load volatile float, float addrspace(1)* %gep0
659  %b = load volatile float, float addrspace(1)* %gep1
660  %c = load volatile float, float addrspace(1)* %gep2
661  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
662  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
663  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
664  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
665  store float %med3, float addrspace(1)* %outgep
666  ret void
667}
668
; Median-of-three pattern, operand ordering 15.
; The function uses attribute group #2 (no-nans-fp-math enabled). The IR computes
;   max(min(c, max(b, a)), min(b, a))
; and the checks expect a single v_med3_f32 with sources in B, A, C order.
; NOTE(review): the four min/max calls below have exactly the same operand
; order as in v_test_global_nnans_med3_f32_pat12, so this test appears to add
; no new coverage -- confirm whether a different commutation was intended.
669; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat15:
670; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
671; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
672; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
673; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
674define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
675  %tid = call i32 @llvm.amdgcn.workitem.id.x()
676  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
677  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
678  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
679  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
; The loads are volatile; the checks above capture them as A, B, C in this order.
680  %a = load volatile float, float addrspace(1)* %gep0
681  %b = load volatile float, float addrspace(1)* %gep1
682  %c = load volatile float, float addrspace(1)* %gep2
683  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
684  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
685  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
686  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
687  store float %med3, float addrspace(1)* %outgep
688  ret void
689}
690
691; ---------------------------------------------------------------------
692; Negative patterns
693; ---------------------------------------------------------------------
694
; Negative test: median-of-three shape max(min(a, b), min(max(a, b), c)) where
; the inner min (%tmp0) has a second use through a volatile store.
; The checks expect separate instructions: one v_min_f32 and one v_max_f32 in
; either order, followed by another v_min_f32 and then another v_max_f32.
; NOTE(review): this function uses attribute group #1 (no-nans-fp-math
; disabled) and plain loads, so the extra use may not be the only thing that
; prevents a med3 from being formed -- confirm what this test isolates.
695; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0:
696; GCN-DAG: v_min_f32
697; GCN-DAG: v_max_f32
698; GCN: v_min_f32
699; GCN: v_max_f32
700define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
701  %tid = call i32 @llvm.amdgcn.workitem.id.x()
702  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
703  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
704  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
705  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
706  %a = load volatile float, float addrspace(1)* %gep0
707  %b = load volatile float, float addrspace(1)* %gep1
708  %c = load volatile float, float addrspace(1)* %gep2
709  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
; Extra use of %tmp0: the volatile store keeps the inner min live on its own.
710  store volatile float %tmp0, float addrspace(1)* undef
711  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
712  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
713  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
714  store float %med3, float addrspace(1)* %outgep
715  ret void
716}
717
; Negative test: median-of-three shape max(min(a, b), min(max(a, b), c)) where
; the inner max (%tmp1) has a second use through a volatile store. The function
; uses attribute group #1 (no-nans-fp-math disabled) and its inputs are plain
; loads with no nnan information.
; The label previously had no assertions attached, so the test could not fail
; on a wrong fold; the check below requires that no v_med3_f32 is emitted.
718; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1:
; GCN-NOT: v_med3_f32
719define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
720  %tid = call i32 @llvm.amdgcn.workitem.id.x()
721  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
722  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
723  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
724  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
725  %a = load volatile float, float addrspace(1)* %gep0
726  %b = load volatile float, float addrspace(1)* %gep1
727  %c = load volatile float, float addrspace(1)* %gep2
728  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
729  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
; Extra use of %tmp1: the volatile store keeps the inner max live on its own.
730  store volatile float %tmp1, float addrspace(1)* undef
731  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
732  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
733  store float %med3, float addrspace(1)* %outgep
734  ret void
735}
736
; Negative test: median-of-three shape max(min(a, b), min(max(a, b), c)) where
; the outer min (%tmp2) has a second use through a volatile store. The function
; uses attribute group #1 (no-nans-fp-math disabled) and its inputs are plain
; loads with no nnan information.
; The label previously had no assertions attached, so the test could not fail
; on a wrong fold; the check below requires that no v_med3_f32 is emitted.
737; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2:
; GCN-NOT: v_med3_f32
738define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
739  %tid = call i32 @llvm.amdgcn.workitem.id.x()
740  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
741  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
742  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
743  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
744  %a = load volatile float, float addrspace(1)* %gep0
745  %b = load volatile float, float addrspace(1)* %gep1
746  %c = load volatile float, float addrspace(1)* %gep2
747  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
748  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
749  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
; Extra use of %tmp2: the volatile store keeps the outer min live on its own.
750  store volatile float %tmp2, float addrspace(1)* undef
751  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
752  store float %med3, float addrspace(1)* %outgep
753  ret void
754}
755
756
; Negative test: the plain median-of-three shape
;   max(min(a, b), min(max(a, b), c))
; in a function that uses attribute group #1 (no-nans-fp-math disabled), with
; inputs that are plain loads carrying no nnan information.
; The label previously had no assertions attached, so the test could not fail
; on a wrong fold; the check below requires that no v_med3_f32 is emitted.
757; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0:
; GCN-NOT: v_med3_f32
758define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
759  %tid = call i32 @llvm.amdgcn.workitem.id.x()
760  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
761  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
762  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
763  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
764  %a = load volatile float, float addrspace(1)* %gep0
765  %b = load volatile float, float addrspace(1)* %gep1
766  %c = load volatile float, float addrspace(1)* %gep2
767  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
768  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
769  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
770  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
771  store float %med3, float addrspace(1)* %outgep
772  ret void
773}
774
; Negative test: median-of-three shape over three fadd results where only the
; second and third fadd carry the nnan flag; the first one (%a.nnan, despite
; its name) does not. The function uses attribute group #1 (no-nans-fp-math
; disabled), so the first input is not known to be NaN-free.
; The label previously had no assertions attached, so the test could not fail
; on a wrong fold; the check below requires that no v_med3_f32 is emitted.
775; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0:
; GCN-NOT: v_med3_f32
776define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
777  %tid = call i32 @llvm.amdgcn.workitem.id.x()
778  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
779  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
780  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
781  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
782  %a = load volatile float, float addrspace(1)* %gep0
783  %b = load volatile float, float addrspace(1)* %gep1
784  %c = load volatile float, float addrspace(1)* %gep2
785
; The nnan flag is deliberately absent on the first fadd only.
786  %a.nnan = fadd float %a, 1.0
787  %b.nnan = fadd nnan float %b, 2.0
788  %c.nnan = fadd nnan float %c, 4.0
789
790  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
791  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
792  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
793  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
794  store float %med3, float addrspace(1)* %outgep
795  ret void
796}
797
; Negative test: median-of-three shape over three fadd results where only the
; first and third fadd carry the nnan flag; the second one (%b.nnan, despite
; its name) does not. The function uses attribute group #1 (no-nans-fp-math
; disabled), so the second input is not known to be NaN-free.
; The label previously had no assertions attached, so the test could not fail
; on a wrong fold; the check below requires that no v_med3_f32 is emitted.
798; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0:
; GCN-NOT: v_med3_f32
799define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
800  %tid = call i32 @llvm.amdgcn.workitem.id.x()
801  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
802  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
803  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
804  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
805  %a = load volatile float, float addrspace(1)* %gep0
806  %b = load volatile float, float addrspace(1)* %gep1
807  %c = load volatile float, float addrspace(1)* %gep2
808
; The nnan flag is deliberately absent on the second fadd only.
809  %a.nnan = fadd nnan float %a, 1.0
810  %b.nnan = fadd float %b, 2.0
811  %c.nnan = fadd nnan float %c, 4.0
812
813  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
814  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
815  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
816  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
817  store float %med3, float addrspace(1)* %outgep
818  ret void
819}
820
; Negative test: median-of-three shape over three fadd results where only the
; first and second fadd carry the nnan flag; the third one (%c.nnan, despite
; its name) does not. The function uses attribute group #1 (no-nans-fp-math
; disabled), so the third input is not known to be NaN-free.
; The label previously had no assertions attached, so the test could not fail
; on a wrong fold; the check below requires that no v_med3_f32 is emitted.
821; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0:
; GCN-NOT: v_med3_f32
822define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
823  %tid = call i32 @llvm.amdgcn.workitem.id.x()
824  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
825  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
826  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
827  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
828  %a = load volatile float, float addrspace(1)* %gep0
829  %b = load volatile float, float addrspace(1)* %gep1
830  %c = load volatile float, float addrspace(1)* %gep2
831
; The nnan flag is deliberately absent on the third fadd only.
832  %a.nnan = fadd nnan float %a, 1.0
833  %b.nnan = fadd nnan float %b, 2.0
834  %c.nnan = fadd float %c, 4.0
835
836  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
837  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
838  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
839  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
840  store float %med3, float addrspace(1)* %outgep
841  ret void
842}
843
; Negative test: median-of-three shape in which the inner min uses the negated
; first input (%a.fneg) while the inner max uses the un-negated %a, so the two
; inner operations do not share the same first operand.
; The function uses attribute group #2 (no-nans-fp-math enabled). The checks
; expect two v_min_f32 and two v_max_f32 to remain, matched in any order.
844; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
845; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
846; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
847; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
848; GCN-DAG: v_min_f32
849; GCN-DAG: v_max_f32
850; GCN-DAG: v_min_f32
851; GCN-DAG: v_max_f32
852define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
853  %tid = call i32 @llvm.amdgcn.workitem.id.x()
854  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
855  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
856  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
857  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
858  %a = load volatile float, float addrspace(1)* %gep0
859  %b = load volatile float, float addrspace(1)* %gep1
860  %c = load volatile float, float addrspace(1)* %gep2
; Negation of %a written as (-0.0 - a); only the min below consumes it.
861  %a.fneg = fsub float -0.0, %a
862  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
863  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
864  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
865  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
866  store float %med3, float addrspace(1)* %outgep
867  ret void
868}
869
; Negative test: only min(max(a, b), c) is computed, without the second
; min/max pair of the median-of-three shape. The function uses attribute
; group #2 (no-nans-fp-math enabled). The checks expect a v_max_f32_e32 of
; A and B followed by a v_min_f32_e32 of that result and C.
870; A simple min and max is not sufficient
871; GCN-LABEL: {{^}}v_test_global_nnans_min_max_f32:
872; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
873; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
874; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
875; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]]
876; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]]
877define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
878  %tid = call i32 @llvm.amdgcn.workitem.id.x()
879  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
880  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
881  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
882  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
; The loads are volatile; the checks above capture them as A, B, C in this order.
883  %a = load volatile float, float addrspace(1)* %gep0
884  %b = load volatile float, float addrspace(1)* %gep1
885  %c = load volatile float, float addrspace(1)* %gep2
886  %max = call float @llvm.maxnum.f32(float %a, float %b)
887  %minmax = call float @llvm.minnum.f32(float %max, float %c)
888  store float %minmax, float addrspace(1)* %outgep
889  ret void
890}
891
; Half-precision register/immediate/immediate pattern: min(max(a + 1.0, 2.0), 4.0)
; where the fadd producing the first operand carries the nnan flag. The
; function uses attribute group #1 (no-nans-fp-math disabled).
; Expectations per check prefix, as written below:
;  - SI prefix   -- convert to f32, add, v_med3_f32 with 2.0 and 4.0, convert back
;  - VI prefix   -- f16 add followed by separate v_max_f16 and v_min_f16
;  - GFX9 prefix -- f16 add followed by a single v_med3_f16 with 2.0 and 4.0
892; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16:
893; SI: v_cvt_f32_f16
894; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
895; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
896; SI: v_cvt_f16_f32
897
898; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0
899; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0
900; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0
901
902; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0
903; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
904define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
905  %tid = call i32 @llvm.amdgcn.workitem.id.x()
906  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
907  %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
908  %a = load half, half addrspace(1)* %gep0
; The nnan flag on this fadd is the only no-NaN information in the function.
909  %a.add = fadd nnan half %a, 1.0
910  %max = call half @llvm.maxnum.f16(half %a.add, half 2.0)
911  %med = call half @llvm.minnum.f16(half %max, half 4.0)
912
913  store half %med, half addrspace(1)* %outgep
914  ret void
915}
916
; Half-precision median-of-three shape max(min(a, b), min(max(a, b), c)) where
; each of the three inputs is the result of an fadd carrying the nnan flag.
; The function uses attribute group #1 (no-nans-fp-math disabled).
; Expectations per check prefix, as written below:
;  - SI prefix    -- f32 conversions, three f32 adds, one v_med3_f32, convert back
;  - GFX89 prefix -- three f16 adds of 1.0, 2.0 and 4.0 to A, B and C, any order
;  - VI prefix    -- separate v_min_f16 / v_max_f16 instructions (no med3)
;  - GFX9 prefix  -- a single v_med3_f16 of the three add results
917; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0:
918; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
919; GCN: {{buffer|flat|global}}_load_ushort [[B:v[0-9]+]]
920; GCN: {{buffer|flat|global}}_load_ushort [[C:v[0-9]+]]
921
922; SI: v_cvt_f32_f16
923; SI: v_cvt_f32_f16
924; SI: v_add_f32_e32
925; SI: v_add_f32_e32
926; SI: v_add_f32_e32
927; SI: v_med3_f32
928; SI: v_cvt_f16_f32_e32
929
930
931; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
932; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
933; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
934
935; VI-DAG: v_min_f16
936; VI-DAG: v_max_f16
937; VI: v_min_f16
938; VI: v_max_f16
939
940; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
941define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
942  %tid = call i32 @llvm.amdgcn.workitem.id.x()
943  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
944  %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid
945  %gep2 = getelementptr half, half addrspace(1)* %cptr, i32 %tid
946  %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
947  %a = load volatile half, half addrspace(1)* %gep0
948  %b = load volatile half, half addrspace(1)* %gep1
949  %c = load volatile half, half addrspace(1)* %gep2
950
; All three inputs go through an fadd that carries the nnan flag.
951  %a.nnan = fadd nnan half %a, 1.0
952  %b.nnan = fadd nnan half %b, 2.0
953  %c.nnan = fadd nnan half %c, 4.0
954
955  %tmp0 = call half @llvm.minnum.f16(half %a.nnan, half %b.nnan)
956  %tmp1 = call half @llvm.maxnum.f16(half %a.nnan, half %b.nnan)
957  %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %c.nnan)
958  %med3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
959  store half %med3, half addrspace(1)* %outgep
960  ret void
961}
962
; Intrinsic declarations. The fabs and f64 min/max declarations are not
; referenced by the functions near the end of the file; presumably earlier
; tests use them -- verify before removing any.
963declare i32 @llvm.amdgcn.workitem.id.x() #0
964declare float @llvm.fabs.f32(float) #0
965declare float @llvm.minnum.f32(float, float) #0
966declare float @llvm.maxnum.f32(float, float) #0
967declare double @llvm.minnum.f64(double, double) #0
968declare double @llvm.maxnum.f64(double, double) #0
969declare half @llvm.fabs.f16(half) #0
970declare half @llvm.minnum.f16(half, half) #0
971declare half @llvm.maxnum.f16(half, half) #0
972
; Attribute groups:
;  #0 -- attached to the intrinsic declarations above (nounwind readnone).
;  #1 -- kernels compiled with no-nans-fp-math disabled; NaN-free inputs must be
;        established per instruction (e.g. an nnan flag on the defining fadd).
;  #2 -- kernels compiled with no-nans-fp-math enabled.
; Both #1 and #2 keep unsafe-fp-math disabled.
973attributes #0 = { nounwind readnone }
974attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
975attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
976