• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
3; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
4; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
5
; Positive case, compiled with attribute group #2 ("no-nans-fp-math"="true").
; The kernel loads a, b and c (volatile, indexed by the workitem id) and
; computes max(min(-a, b), min(max(-a, b), c)). All three expected-output
; blocks below contain a single v_med3_f32 for this expression; the negation
; of a is expected as a separate v_sub_f32 from 0x80000000 rather than as a
; source modifier on the v_med3_f32.
6define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
7; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
8; SI:       ; %bb.0:
9; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
10; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
11; SI-NEXT:    v_mov_b32_e32 v1, 0
12; SI-NEXT:    s_mov_b32 s10, 0
13; SI-NEXT:    s_mov_b32 s11, 0xf000
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
16; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
17; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
18; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
19; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
20; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
21; SI-NEXT:    s_waitcnt vmcnt(2)
22; SI-NEXT:    v_sub_f32_e32 v2, 0x80000000, v2
23; SI-NEXT:    s_waitcnt vmcnt(0)
24; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
25; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
26; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
27; SI-NEXT:    s_endpgm
28;
29; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
30; VI:       ; %bb.0:
31; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
32; VI-NEXT:    v_lshlrev_b32_e32 v8, 2, v0
33; VI-NEXT:    s_waitcnt lgkmcnt(0)
34; VI-NEXT:    v_mov_b32_e32 v0, s2
35; VI-NEXT:    v_mov_b32_e32 v1, s3
36; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
37; VI-NEXT:    v_mov_b32_e32 v2, s4
38; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
39; VI-NEXT:    v_mov_b32_e32 v3, s5
40; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
41; VI-NEXT:    v_mov_b32_e32 v4, s6
42; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
43; VI-NEXT:    v_mov_b32_e32 v5, s7
44; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
45; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
46; VI-NEXT:    flat_load_dword v0, v[0:1]
47; VI-NEXT:    flat_load_dword v1, v[2:3]
48; VI-NEXT:    flat_load_dword v2, v[4:5]
49; VI-NEXT:    v_mov_b32_e32 v7, s1
50; VI-NEXT:    v_mov_b32_e32 v6, s0
51; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
52; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
53; VI-NEXT:    s_waitcnt vmcnt(2)
54; VI-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
55; VI-NEXT:    s_waitcnt vmcnt(0)
56; VI-NEXT:    v_med3_f32 v0, v0, v1, v2
57; VI-NEXT:    flat_store_dword v[6:7], v0
58; VI-NEXT:    s_endpgm
59;
60; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
61; GFX9:       ; %bb.0:
62; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
63; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
64; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
65; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
66; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
67; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
68; GFX9-NEXT:    s_waitcnt vmcnt(2)
69; GFX9-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
70; GFX9-NEXT:    s_waitcnt vmcnt(0)
71; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
72; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
73; GFX9-NEXT:    s_endpgm
  ; Per-workitem element addresses for the three inputs and the output.
74  %tid = call i32 @llvm.amdgcn.workitem.id.x()
75  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
76  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
77  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
78  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
79  %a = load volatile float, float addrspace(1)* %gep0
80  %b = load volatile float, float addrspace(1)* %gep1
81  %c = load volatile float, float addrspace(1)* %gep2
  ; Negation of a, written as (-0.0 - a).
82  %a.fneg = fsub float -0.0, %a
  ; Median-of-three spelled as max(min(x, y), min(max(x, y), z)).
83  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
84  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
85  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
86  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
87  store float %med3, float addrspace(1)* %outgep
88  ret void
89}
90
; Counterpart of the global-nnans srcmod0 kernel: the IR body is the same
; max(min(-a, b), min(max(-a, b), c)) expression, but the function uses
; attribute group #1 ("no-nans-fp-math"="false"). None of the expected-output
; blocks below contains v_med3_f32; the two min and two max operations stay
; separate. Their operands are first passed through v_mul_f32 by 1.0 (SI and VI
; blocks) or v_max_f32 x, x (GFX9 block) -- presumably NaN-quieting
; canonicalization; TODO confirm against the AMDGPU backend.
91define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
92; SI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
93; SI:       ; %bb.0:
94; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
95; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
96; SI-NEXT:    v_mov_b32_e32 v1, 0
97; SI-NEXT:    s_mov_b32 s10, 0
98; SI-NEXT:    s_mov_b32 s11, 0xf000
99; SI-NEXT:    s_waitcnt lgkmcnt(0)
100; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
101; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
102; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
103; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
104; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
105; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
106; SI-NEXT:    s_waitcnt vmcnt(2)
107; SI-NEXT:    v_sub_f32_e32 v2, 0x80000000, v2
108; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
109; SI-NEXT:    s_waitcnt vmcnt(1)
110; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
111; SI-NEXT:    v_min_f32_e32 v5, v2, v3
112; SI-NEXT:    v_max_f32_e32 v2, v2, v3
113; SI-NEXT:    s_waitcnt vmcnt(0)
114; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
115; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
116; SI-NEXT:    v_min_f32_e32 v2, v2, v3
117; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
118; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
119; SI-NEXT:    v_max_f32_e32 v2, v3, v2
120; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
121; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
122; SI-NEXT:    s_endpgm
123;
124; VI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
125; VI:       ; %bb.0:
126; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
127; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
128; VI-NEXT:    s_waitcnt lgkmcnt(0)
129; VI-NEXT:    v_mov_b32_e32 v0, s2
130; VI-NEXT:    v_mov_b32_e32 v1, s3
131; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
132; VI-NEXT:    v_mov_b32_e32 v2, s4
133; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
134; VI-NEXT:    v_mov_b32_e32 v3, s5
135; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
136; VI-NEXT:    v_mov_b32_e32 v4, s6
137; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
138; VI-NEXT:    v_mov_b32_e32 v5, s7
139; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
140; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
141; VI-NEXT:    flat_load_dword v7, v[0:1]
142; VI-NEXT:    flat_load_dword v2, v[2:3]
143; VI-NEXT:    flat_load_dword v3, v[4:5]
144; VI-NEXT:    v_mov_b32_e32 v0, s0
145; VI-NEXT:    v_mov_b32_e32 v1, s1
146; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
147; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
148; VI-NEXT:    s_waitcnt vmcnt(2)
149; VI-NEXT:    v_sub_f32_e32 v4, 0x80000000, v7
150; VI-NEXT:    s_waitcnt vmcnt(1)
151; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
152; VI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
153; VI-NEXT:    v_min_f32_e32 v5, v4, v2
154; VI-NEXT:    v_max_f32_e32 v2, v4, v2
155; VI-NEXT:    s_waitcnt vmcnt(0)
156; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
157; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
158; VI-NEXT:    v_min_f32_e32 v2, v2, v3
159; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
160; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
161; VI-NEXT:    v_max_f32_e32 v2, v3, v2
162; VI-NEXT:    flat_store_dword v[0:1], v2
163; VI-NEXT:    s_endpgm
164;
165; GFX9-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
166; GFX9:       ; %bb.0:
167; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
168; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
171; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
172; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
173; GFX9-NEXT:    s_waitcnt vmcnt(2)
174; GFX9-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
175; GFX9-NEXT:    s_waitcnt vmcnt(1)
176; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
177; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
178; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
179; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
180; GFX9-NEXT:    s_waitcnt vmcnt(0)
181; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
182; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
183; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
184; GFX9-NEXT:    v_max_f32_e32 v2, v4, v4
185; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
186; GFX9-NEXT:    v_max_f32_e32 v1, v2, v1
187; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
188; GFX9-NEXT:    s_endpgm
  ; Per-workitem element addresses for the three inputs and the output.
189  %tid = call i32 @llvm.amdgcn.workitem.id.x()
190  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
191  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
192  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
193  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
194  %a = load volatile float, float addrspace(1)* %gep0
195  %b = load volatile float, float addrspace(1)* %gep1
196  %c = load volatile float, float addrspace(1)* %gep2
  ; Negation of a, written as (-0.0 - a).
197  %a.fneg = fsub float -0.0, %a
  ; Median-of-three spelled as max(min(x, y), min(max(x, y), z)).
198  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
199  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
200  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
201  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
202  store float %med3, float addrspace(1)* %outgep
203  ret void
204}
205
; Positive case with a different modifier on each operand, compiled with
; attribute group #2 ("no-nans-fp-math"="true"). The median-of-three is taken
; over -a, |b| and -|c|. Every expected-output block below contains one
; v_med3_f32 in which only |b| appears as an operand modifier; -a and -|c| are
; expected as separate v_sub_f32 instructions subtracting from 0x80000000
; (held in s2), with the abs of c applied as a modifier on that subtract.
206define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
207; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
208; SI:       ; %bb.0:
209; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
210; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
211; SI-NEXT:    v_mov_b32_e32 v1, 0
212; SI-NEXT:    s_mov_b32 s10, 0
213; SI-NEXT:    s_mov_b32 s11, 0xf000
214; SI-NEXT:    s_waitcnt lgkmcnt(0)
215; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
216; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
217; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
218; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
219; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
220; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
221; SI-NEXT:    s_mov_b32 s2, 0x80000000
222; SI-NEXT:    s_waitcnt vmcnt(2)
223; SI-NEXT:    v_sub_f32_e32 v2, s2, v2
224; SI-NEXT:    s_waitcnt vmcnt(0)
225; SI-NEXT:    v_sub_f32_e64 v4, s2, |v4|
226; SI-NEXT:    v_med3_f32 v2, v2, |v3|, v4
227; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
228; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
229; SI-NEXT:    s_endpgm
230;
231; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
232; VI:       ; %bb.0:
233; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
234; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
235; VI-NEXT:    s_waitcnt lgkmcnt(0)
236; VI-NEXT:    v_mov_b32_e32 v0, s2
237; VI-NEXT:    v_mov_b32_e32 v1, s3
238; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
239; VI-NEXT:    v_mov_b32_e32 v2, s4
240; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
241; VI-NEXT:    v_mov_b32_e32 v3, s5
242; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
243; VI-NEXT:    v_mov_b32_e32 v4, s6
244; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
245; VI-NEXT:    v_mov_b32_e32 v5, s7
246; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
247; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
248; VI-NEXT:    flat_load_dword v7, v[0:1]
249; VI-NEXT:    flat_load_dword v2, v[2:3]
250; VI-NEXT:    flat_load_dword v3, v[4:5]
251; VI-NEXT:    v_mov_b32_e32 v0, s0
252; VI-NEXT:    s_mov_b32 s2, 0x80000000
253; VI-NEXT:    v_mov_b32_e32 v1, s1
254; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
255; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
256; VI-NEXT:    s_waitcnt vmcnt(2)
257; VI-NEXT:    v_sub_f32_e32 v4, s2, v7
258; VI-NEXT:    s_waitcnt vmcnt(0)
259; VI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
260; VI-NEXT:    v_med3_f32 v2, v4, |v2|, v3
261; VI-NEXT:    flat_store_dword v[0:1], v2
262; VI-NEXT:    s_endpgm
263;
264; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
265; GFX9:       ; %bb.0:
266; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
267; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
268; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
269; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
270; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
271; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
272; GFX9-NEXT:    s_mov_b32 s2, 0x80000000
273; GFX9-NEXT:    s_waitcnt vmcnt(2)
274; GFX9-NEXT:    v_sub_f32_e32 v1, s2, v1
275; GFX9-NEXT:    s_waitcnt vmcnt(0)
276; GFX9-NEXT:    v_sub_f32_e64 v3, s2, |v3|
277; GFX9-NEXT:    v_med3_f32 v1, v1, |v2|, v3
278; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
279; GFX9-NEXT:    s_endpgm
  ; Per-workitem element addresses for the three inputs and the output.
280  %tid = call i32 @llvm.amdgcn.workitem.id.x()
281  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
282  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
283  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
284  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
285  %a = load volatile float, float addrspace(1)* %gep0
286  %b = load volatile float, float addrspace(1)* %gep1
287  %c = load volatile float, float addrspace(1)* %gep2
288
  ; Operand modifiers: -a, |b| and -|c| (negations written as -0.0 - x).
289  %a.fneg = fsub float -0.0, %a
290  %b.fabs = call float @llvm.fabs.f32(float %b)
291  %c.fabs = call float @llvm.fabs.f32(float %c)
292  %c.fabs.fneg = fsub float -0.0, %c.fabs
293
  ; Median-of-three spelled as max(min(x, y), min(max(x, y), z)).
294  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
295  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
296  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
297  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
298
299  store float %med3, float addrspace(1)* %outgep
300  ret void
301}
302
; Positive case where all three operands are negated absolute values
; (-|a|, -|b|, -|c|), compiled with attribute group #2
; ("no-nans-fp-math"="true"). Every expected-output block below contains three
; v_sub_f32_e64 instructions of the form (s2 - |x|) with s2 = 0x80000000,
; followed by one v_med3_f32 that carries no operand modifiers.
303define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
304; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
305; SI:       ; %bb.0:
306; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
307; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
308; SI-NEXT:    v_mov_b32_e32 v1, 0
309; SI-NEXT:    s_mov_b32 s10, 0
310; SI-NEXT:    s_mov_b32 s11, 0xf000
311; SI-NEXT:    s_waitcnt lgkmcnt(0)
312; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
313; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
314; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
315; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
316; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
317; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
318; SI-NEXT:    s_mov_b32 s2, 0x80000000
319; SI-NEXT:    s_waitcnt vmcnt(2)
320; SI-NEXT:    v_sub_f32_e64 v2, s2, |v2|
321; SI-NEXT:    s_waitcnt vmcnt(1)
322; SI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
323; SI-NEXT:    s_waitcnt vmcnt(0)
324; SI-NEXT:    v_sub_f32_e64 v4, s2, |v4|
325; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
326; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
327; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
328; SI-NEXT:    s_endpgm
329;
330; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
331; VI:       ; %bb.0:
332; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
333; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
334; VI-NEXT:    s_waitcnt lgkmcnt(0)
335; VI-NEXT:    v_mov_b32_e32 v0, s2
336; VI-NEXT:    v_mov_b32_e32 v1, s3
337; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
338; VI-NEXT:    v_mov_b32_e32 v2, s4
339; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
340; VI-NEXT:    v_mov_b32_e32 v3, s5
341; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
342; VI-NEXT:    v_mov_b32_e32 v4, s6
343; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
344; VI-NEXT:    v_mov_b32_e32 v5, s7
345; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
346; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
347; VI-NEXT:    flat_load_dword v7, v[0:1]
348; VI-NEXT:    flat_load_dword v2, v[2:3]
349; VI-NEXT:    flat_load_dword v3, v[4:5]
350; VI-NEXT:    s_mov_b32 s2, 0x80000000
351; VI-NEXT:    v_mov_b32_e32 v0, s0
352; VI-NEXT:    v_mov_b32_e32 v1, s1
353; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
354; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
355; VI-NEXT:    s_waitcnt vmcnt(2)
356; VI-NEXT:    v_sub_f32_e64 v4, s2, |v7|
357; VI-NEXT:    s_waitcnt vmcnt(1)
358; VI-NEXT:    v_sub_f32_e64 v2, s2, |v2|
359; VI-NEXT:    s_waitcnt vmcnt(0)
360; VI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
361; VI-NEXT:    v_med3_f32 v2, v4, v2, v3
362; VI-NEXT:    flat_store_dword v[0:1], v2
363; VI-NEXT:    s_endpgm
364;
365; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
366; GFX9:       ; %bb.0:
367; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
368; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
369; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
371; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
372; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
373; GFX9-NEXT:    s_mov_b32 s2, 0x80000000
374; GFX9-NEXT:    s_waitcnt vmcnt(2)
375; GFX9-NEXT:    v_sub_f32_e64 v1, s2, |v1|
376; GFX9-NEXT:    s_waitcnt vmcnt(1)
377; GFX9-NEXT:    v_sub_f32_e64 v2, s2, |v2|
378; GFX9-NEXT:    s_waitcnt vmcnt(0)
379; GFX9-NEXT:    v_sub_f32_e64 v3, s2, |v3|
380; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
381; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
382; GFX9-NEXT:    s_endpgm
  ; Per-workitem element addresses for the three inputs and the output.
383  %tid = call i32 @llvm.amdgcn.workitem.id.x()
384  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
385  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
386  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
387  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
388  %a = load volatile float, float addrspace(1)* %gep0
389  %b = load volatile float, float addrspace(1)* %gep1
390  %c = load volatile float, float addrspace(1)* %gep2
391
  ; Each operand becomes -|x|: fabs followed by (-0.0 - |x|).
392  %a.fabs = call float @llvm.fabs.f32(float %a)
393  %a.fabs.fneg = fsub float -0.0, %a.fabs
394  %b.fabs = call float @llvm.fabs.f32(float %b)
395  %b.fabs.fneg = fsub float -0.0, %b.fabs
396  %c.fabs = call float @llvm.fabs.f32(float %c)
397  %c.fabs.fneg = fsub float -0.0, %c.fabs
398
  ; Median-of-three spelled as max(min(x, y), min(max(x, y), z)).
399  %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
400  %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
401  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
402  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
403
404  store float %med3, float addrspace(1)* %outgep
405  ret void
406}
407
; Median-of-three whose inputs are produced by fadd instructions carrying the
; nnan fast-math flag, in a function using attribute group #1
; ("no-nans-fp-math"="false"). None of the expected-output blocks below
; contains v_med3_f32: the min/max operations stay separate. The fadd results
; feed v_min_f32 / v_max_f32 directly, while the intermediate min/max results
; are passed through v_mul_f32 by 1.0 (SI and VI blocks) or v_max_f32 x, x
; (GFX9 block) before being used again.
408define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
409; SI-LABEL: v_nnan_inputs_med3_f32_pat0:
410; SI:       ; %bb.0:
411; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
412; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
413; SI-NEXT:    v_mov_b32_e32 v1, 0
414; SI-NEXT:    s_mov_b32 s10, 0
415; SI-NEXT:    s_mov_b32 s11, 0xf000
416; SI-NEXT:    s_waitcnt lgkmcnt(0)
417; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
418; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
419; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
420; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
421; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
422; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
423; SI-NEXT:    s_waitcnt vmcnt(2)
424; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
425; SI-NEXT:    s_waitcnt vmcnt(1)
426; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
427; SI-NEXT:    s_waitcnt vmcnt(0)
428; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
429; SI-NEXT:    v_min_f32_e32 v5, v2, v3
430; SI-NEXT:    v_max_f32_e32 v2, v2, v3
431; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
432; SI-NEXT:    v_min_f32_e32 v2, v2, v4
433; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
434; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
435; SI-NEXT:    v_max_f32_e32 v2, v3, v2
436; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
437; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
438; SI-NEXT:    s_endpgm
439;
440; VI-LABEL: v_nnan_inputs_med3_f32_pat0:
441; VI:       ; %bb.0:
442; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
443; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
444; VI-NEXT:    s_waitcnt lgkmcnt(0)
445; VI-NEXT:    v_mov_b32_e32 v0, s2
446; VI-NEXT:    v_mov_b32_e32 v1, s3
447; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
448; VI-NEXT:    v_mov_b32_e32 v2, s4
449; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
450; VI-NEXT:    v_mov_b32_e32 v3, s5
451; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
452; VI-NEXT:    v_mov_b32_e32 v4, s6
453; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
454; VI-NEXT:    v_mov_b32_e32 v5, s7
455; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
456; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
457; VI-NEXT:    flat_load_dword v7, v[0:1]
458; VI-NEXT:    flat_load_dword v2, v[2:3]
459; VI-NEXT:    flat_load_dword v3, v[4:5]
460; VI-NEXT:    v_mov_b32_e32 v0, s0
461; VI-NEXT:    v_mov_b32_e32 v1, s1
462; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
463; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
464; VI-NEXT:    s_waitcnt vmcnt(2)
465; VI-NEXT:    v_add_f32_e32 v4, 1.0, v7
466; VI-NEXT:    s_waitcnt vmcnt(1)
467; VI-NEXT:    v_add_f32_e32 v2, 2.0, v2
468; VI-NEXT:    v_min_f32_e32 v5, v4, v2
469; VI-NEXT:    v_max_f32_e32 v2, v4, v2
470; VI-NEXT:    s_waitcnt vmcnt(0)
471; VI-NEXT:    v_add_f32_e32 v3, 4.0, v3
472; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
473; VI-NEXT:    v_min_f32_e32 v2, v2, v3
474; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
475; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
476; VI-NEXT:    v_max_f32_e32 v2, v3, v2
477; VI-NEXT:    flat_store_dword v[0:1], v2
478; VI-NEXT:    s_endpgm
479;
480; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
481; GFX9:       ; %bb.0:
482; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
483; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
484; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
485; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
486; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
487; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
488; GFX9-NEXT:    s_waitcnt vmcnt(2)
489; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
490; GFX9-NEXT:    s_waitcnt vmcnt(1)
491; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
492; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
493; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
494; GFX9-NEXT:    s_waitcnt vmcnt(0)
495; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
496; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
497; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
498; GFX9-NEXT:    v_max_f32_e32 v2, v4, v4
499; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
500; GFX9-NEXT:    v_max_f32_e32 v1, v2, v1
501; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
502; GFX9-NEXT:    s_endpgm
  ; Per-workitem element addresses for the three inputs and the output.
503  %tid = call i32 @llvm.amdgcn.workitem.id.x()
504  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
505  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
506  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
507  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
508  %a = load volatile float, float addrspace(1)* %gep0
509  %b = load volatile float, float addrspace(1)* %gep1
510  %c = load volatile float, float addrspace(1)* %gep2
511
  ; The nnan flag is attached to the instructions producing the min/max inputs,
  ; not to the function: each input gets a distinct constant added.
512  %a.nnan = fadd nnan float %a, 1.0
513  %b.nnan = fadd nnan float %b, 2.0
514  %c.nnan = fadd nnan float %c, 4.0
515
  ; Median-of-three spelled as max(min(x, y), min(max(x, y), z)).
516  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
517  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
518  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
519  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
520  store float %med3, float addrspace(1)* %outgep
521  ret void
522}
523
524
525; ---------------------------------------------------------------------
526; Negative patterns
527; ---------------------------------------------------------------------
528
; Multi-use case: the inner min(a, b) value (%tmp0) is also written out with a
; volatile store to an undef address, so it has a second user besides the
; final maxnum. The function uses attribute group #1
; ("no-nans-fp-math"="false"). None of the expected-output blocks below
; contains v_med3_f32; each shows the separate min/max sequence plus one extra
; store of the min(a, b) register in addition to the store of the result.
529define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
530; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
531; SI:       ; %bb.0:
532; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
533; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
534; SI-NEXT:    v_mov_b32_e32 v1, 0
535; SI-NEXT:    s_mov_b32 s10, 0
536; SI-NEXT:    s_mov_b32 s11, 0xf000
537; SI-NEXT:    s_waitcnt lgkmcnt(0)
538; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
539; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
540; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
541; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
542; SI-NEXT:    s_mov_b32 s2, -1
543; SI-NEXT:    s_mov_b32 s3, s11
544; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
545; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
546; SI-NEXT:    s_waitcnt vmcnt(2)
547; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
548; SI-NEXT:    s_waitcnt vmcnt(1)
549; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
550; SI-NEXT:    v_min_f32_e32 v5, v2, v3
551; SI-NEXT:    v_max_f32_e32 v2, v2, v3
552; SI-NEXT:    s_waitcnt vmcnt(0)
553; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
554; SI-NEXT:    buffer_store_dword v5, off, s[0:3], 0
555; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
556; SI-NEXT:    v_min_f32_e32 v2, v2, v3
557; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
558; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
559; SI-NEXT:    v_max_f32_e32 v2, v3, v2
560; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
561; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
562; SI-NEXT:    s_endpgm
563;
564; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
565; VI:       ; %bb.0:
566; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
567; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
568; VI-NEXT:    s_waitcnt lgkmcnt(0)
569; VI-NEXT:    v_mov_b32_e32 v0, s2
570; VI-NEXT:    v_mov_b32_e32 v1, s3
571; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
572; VI-NEXT:    v_mov_b32_e32 v2, s4
573; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
574; VI-NEXT:    v_mov_b32_e32 v3, s5
575; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
576; VI-NEXT:    v_mov_b32_e32 v4, s6
577; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
578; VI-NEXT:    v_mov_b32_e32 v5, s7
579; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
580; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
581; VI-NEXT:    flat_load_dword v7, v[0:1]
582; VI-NEXT:    flat_load_dword v2, v[2:3]
583; VI-NEXT:    flat_load_dword v3, v[4:5]
584; VI-NEXT:    v_mov_b32_e32 v0, s0
585; VI-NEXT:    v_mov_b32_e32 v1, s1
586; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
587; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
588; VI-NEXT:    s_waitcnt vmcnt(2)
589; VI-NEXT:    v_mul_f32_e32 v4, 1.0, v7
590; VI-NEXT:    s_waitcnt vmcnt(1)
591; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
592; VI-NEXT:    v_min_f32_e32 v5, v4, v2
593; VI-NEXT:    v_max_f32_e32 v2, v4, v2
594; VI-NEXT:    s_waitcnt vmcnt(0)
595; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
596; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
597; VI-NEXT:    v_min_f32_e32 v2, v2, v3
598; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
599; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
600; VI-NEXT:    v_max_f32_e32 v2, v3, v2
601; VI-NEXT:    flat_store_dword v[0:1], v5
602; VI-NEXT:    flat_store_dword v[0:1], v2
603; VI-NEXT:    s_endpgm
604;
605; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
606; GFX9:       ; %bb.0:
607; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
608; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
609; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
610; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
611; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
612; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
613; GFX9-NEXT:    s_waitcnt vmcnt(2)
614; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
615; GFX9-NEXT:    s_waitcnt vmcnt(1)
616; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
617; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
618; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
619; GFX9-NEXT:    global_store_dword v[0:1], v4, off
620; GFX9-NEXT:    s_waitcnt vmcnt(1)
621; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
622; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
623; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
624; GFX9-NEXT:    v_max_f32_e32 v2, v4, v4
625; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
626; GFX9-NEXT:    v_max_f32_e32 v1, v2, v1
627; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
628; GFX9-NEXT:    s_endpgm
  ; Per-workitem element addresses for the three inputs and the output.
629  %tid = call i32 @llvm.amdgcn.workitem.id.x()
630  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
631  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
632  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
633  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
634  %a = load volatile float, float addrspace(1)* %gep0
635  %b = load volatile float, float addrspace(1)* %gep1
636  %c = load volatile float, float addrspace(1)* %gep2
637  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  ; Second use of %tmp0: a volatile store whose destination is undef.
638  store volatile float %tmp0, float addrspace(1)* undef
639  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
640  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
641  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
642  store float %med3, float addrspace(1)* %outgep
643  ret void
644}
645
; Intrinsic declarations, all tagged with attribute group #0. Only
; workitem.id.x, fabs.f32, minnum.f32 and maxnum.f32 are called by the
; functions visible in this file; the f64 and f16 variants have no caller here.
646declare i32 @llvm.amdgcn.workitem.id.x() #0
647declare float @llvm.fabs.f32(float) #0
648declare float @llvm.minnum.f32(float, float) #0
649declare float @llvm.maxnum.f32(float, float) #0
650declare double @llvm.minnum.f64(double, double) #0
651declare double @llvm.maxnum.f64(double, double) #0
652declare half @llvm.fabs.f16(half) #0
653declare half @llvm.minnum.f16(half, half) #0
654declare half @llvm.maxnum.f16(half, half) #0
655
; Attribute groups:
;   #0 - applied to the intrinsic declarations (nounwind readnone).
;   #1 - kernels built with "no-nans-fp-math"="false".
;   #2 - kernels built with "no-nans-fp-math"="true".
; Both #1 and #2 set "unsafe-fp-math"="false", so the two kernel groups differ
; only in the no-NaNs setting.
656attributes #0 = { nounwind readnone }
657attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
658attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
659