• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5
; Declarations of the llvm.minnum intrinsic overloads called by the tests in
; this file: scalar half plus 2-, 3- and 4-element half vectors.
6declare half @llvm.minnum.f16(half %a, half %b)
7declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
8declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
9declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
10
; Kernel test: scalar half minnum of two values read with volatile loads from
; %a and %b; the result is stored to %r.
; Code pinned by the autogenerated checks below:
;   - tahiti run: both inputs are extended to f32 (v_cvt_f32_f16), each is
;     multiplied by 1.0, v_min_f32 is taken, and the result is converted back
;     with v_cvt_f16_f32.
;   - fiji and gfx900 runs: each input goes through v_max_f16 x, x and the two
;     results feed a single v_min_f16.
; NOTE(review): the multiply-by-1.0 / max-with-self steps look like input
; canonicalization tied to the "ieee" in the test name - confirm against the
; AMDGPU backend before relying on that reading.
11define amdgpu_kernel void @minnum_f16_ieee(
12; SI-LABEL: minnum_f16_ieee:
13; SI:       ; %bb.0: ; %entry
14; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
15; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
16; SI-NEXT:    s_mov_b32 s3, 0xf000
17; SI-NEXT:    s_mov_b32 s2, -1
18; SI-NEXT:    s_mov_b32 s14, s2
19; SI-NEXT:    s_waitcnt lgkmcnt(0)
20; SI-NEXT:    s_mov_b32 s12, s6
21; SI-NEXT:    s_mov_b32 s13, s7
22; SI-NEXT:    s_mov_b32 s15, s3
23; SI-NEXT:    s_mov_b32 s10, s2
24; SI-NEXT:    s_mov_b32 s11, s3
25; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
26; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
27; SI-NEXT:    s_mov_b32 s0, s4
28; SI-NEXT:    s_mov_b32 s1, s5
29; SI-NEXT:    s_waitcnt vmcnt(1)
30; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
31; SI-NEXT:    s_waitcnt vmcnt(0)
32; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
33; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
34; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35; SI-NEXT:    v_min_f32_e32 v0, v0, v1
36; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
37; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
38; SI-NEXT:    s_endpgm
39;
40; VI-LABEL: minnum_f16_ieee:
41; VI:       ; %bb.0: ; %entry
42; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
43; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
44; VI-NEXT:    s_mov_b32 s3, 0xf000
45; VI-NEXT:    s_mov_b32 s2, -1
46; VI-NEXT:    s_mov_b32 s14, s2
47; VI-NEXT:    s_waitcnt lgkmcnt(0)
48; VI-NEXT:    s_mov_b32 s12, s6
49; VI-NEXT:    s_mov_b32 s13, s7
50; VI-NEXT:    s_mov_b32 s15, s3
51; VI-NEXT:    s_mov_b32 s10, s2
52; VI-NEXT:    s_mov_b32 s11, s3
53; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
54; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
55; VI-NEXT:    s_mov_b32 s0, s4
56; VI-NEXT:    s_mov_b32 s1, s5
57; VI-NEXT:    s_waitcnt vmcnt(1)
58; VI-NEXT:    v_max_f16_e32 v0, v0, v0
59; VI-NEXT:    s_waitcnt vmcnt(0)
60; VI-NEXT:    v_max_f16_e32 v1, v1, v1
61; VI-NEXT:    v_min_f16_e32 v0, v0, v1
62; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
63; VI-NEXT:    s_endpgm
64;
65; GFX9-LABEL: minnum_f16_ieee:
66; GFX9:       ; %bb.0: ; %entry
67; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
68; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
69; GFX9-NEXT:    s_mov_b32 s3, 0xf000
70; GFX9-NEXT:    s_mov_b32 s2, -1
71; GFX9-NEXT:    s_mov_b32 s14, s2
72; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX9-NEXT:    s_mov_b32 s12, s6
74; GFX9-NEXT:    s_mov_b32 s13, s7
75; GFX9-NEXT:    s_mov_b32 s15, s3
76; GFX9-NEXT:    s_mov_b32 s10, s2
77; GFX9-NEXT:    s_mov_b32 s11, s3
78; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
79; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
80; GFX9-NEXT:    s_mov_b32 s0, s4
81; GFX9-NEXT:    s_mov_b32 s1, s5
82; GFX9-NEXT:    s_waitcnt vmcnt(1)
83; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
84; GFX9-NEXT:    s_waitcnt vmcnt(0)
85; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
86; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
87; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
88; GFX9-NEXT:    s_endpgm
89    half addrspace(1)* %r,
90    half addrspace(1)* %a,
91    half addrspace(1)* %b) #0 {
92entry:
93  %a.val = load volatile half, half addrspace(1)* %a
94  %b.val = load volatile half, half addrspace(1)* %b
95  %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
96  store half %r.val, half addrspace(1)* %r
97  ret void
98}
99
; Pixel-shader (amdgpu_ps) test: scalar half minnum of two by-value arguments,
; returned directly.
; Code pinned by the autogenerated checks below:
;   - tahiti run: each argument is round-tripped f32 -> f16 -> f32 and then a
;     single v_min_f32 is emitted; there is no multiply by 1.0.
;   - fiji and gfx900 runs: a single v_min_f16 with no preceding v_max_f16.
; Compare with @minnum_f16_ieee, whose checks include the extra per-input
; multiply / max instructions.
100define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
101; SI-LABEL: minnum_f16_no_ieee:
102; SI:       ; %bb.0:
103; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
104; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
105; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
106; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
107; SI-NEXT:    v_min_f32_e32 v0, v0, v1
108; SI-NEXT:    ; return to shader part epilog
109;
110; VI-LABEL: minnum_f16_no_ieee:
111; VI:       ; %bb.0:
112; VI-NEXT:    v_min_f16_e32 v0, v0, v1
113; VI-NEXT:    ; return to shader part epilog
114;
115; GFX9-LABEL: minnum_f16_no_ieee:
116; GFX9:       ; %bb.0:
117; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
118; GFX9-NEXT:    ; return to shader part epilog
119  %r.val = call half @llvm.minnum.f16(half %a, half %b)
120  ret half %r.val
121}
122
; Kernel test: scalar half minnum with the constant 3.0 as the first operand
; and a value loaded from %b as the second; the result is stored to %r.
; Code pinned by the autogenerated checks below:
;   - tahiti run: the constant appears as the f32 literal 0x40400000 in
;     v_min_f32, after the loaded value is extended and multiplied by 1.0.
;   - fiji and gfx900 runs: the constant appears as the f16 literal 0x4200 in
;     v_min_f16, after v_max_f16 x, x on the loaded value.
123define amdgpu_kernel void @minnum_f16_imm_a(
124; SI-LABEL: minnum_f16_imm_a:
125; SI:       ; %bb.0: ; %entry
126; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
127; SI-NEXT:    s_mov_b32 s3, 0xf000
128; SI-NEXT:    s_mov_b32 s2, -1
129; SI-NEXT:    s_mov_b32 s10, s2
130; SI-NEXT:    s_mov_b32 s11, s3
131; SI-NEXT:    s_waitcnt lgkmcnt(0)
132; SI-NEXT:    s_mov_b32 s8, s6
133; SI-NEXT:    s_mov_b32 s9, s7
134; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
135; SI-NEXT:    s_mov_b32 s0, s4
136; SI-NEXT:    s_mov_b32 s1, s5
137; SI-NEXT:    s_waitcnt vmcnt(0)
138; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
139; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
140; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
141; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
142; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
143; SI-NEXT:    s_endpgm
144;
145; VI-LABEL: minnum_f16_imm_a:
146; VI:       ; %bb.0: ; %entry
147; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
148; VI-NEXT:    s_mov_b32 s3, 0xf000
149; VI-NEXT:    s_mov_b32 s2, -1
150; VI-NEXT:    s_waitcnt lgkmcnt(0)
151; VI-NEXT:    s_mov_b32 s0, s4
152; VI-NEXT:    s_mov_b32 s1, s5
153; VI-NEXT:    s_mov_b32 s4, s6
154; VI-NEXT:    s_mov_b32 s5, s7
155; VI-NEXT:    s_mov_b32 s6, s2
156; VI-NEXT:    s_mov_b32 s7, s3
157; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
158; VI-NEXT:    s_waitcnt vmcnt(0)
159; VI-NEXT:    v_max_f16_e32 v0, v0, v0
160; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
161; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
162; VI-NEXT:    s_endpgm
163;
164; GFX9-LABEL: minnum_f16_imm_a:
165; GFX9:       ; %bb.0: ; %entry
166; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
167; GFX9-NEXT:    s_mov_b32 s3, 0xf000
168; GFX9-NEXT:    s_mov_b32 s2, -1
169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX9-NEXT:    s_mov_b32 s0, s4
171; GFX9-NEXT:    s_mov_b32 s1, s5
172; GFX9-NEXT:    s_mov_b32 s4, s6
173; GFX9-NEXT:    s_mov_b32 s5, s7
174; GFX9-NEXT:    s_mov_b32 s6, s2
175; GFX9-NEXT:    s_mov_b32 s7, s3
176; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
177; GFX9-NEXT:    s_waitcnt vmcnt(0)
178; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
179; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
180; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
181; GFX9-NEXT:    s_endpgm
182    half addrspace(1)* %r,
183    half addrspace(1)* %b) #0 {
184entry:
185  %b.val = load half, half addrspace(1)* %b
186  %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
187  store half %r.val, half addrspace(1)* %r
188  ret void
189}
190
; Kernel test: scalar half minnum with a value loaded from %a as the first
; operand and the constant 4.0 as the second; the result is stored to %r.
; Code pinned by the autogenerated checks below: on all three runs the
; constant is written as the operand "4.0" (no hex literal) and sits in the
; first source slot of the min instruction, even though it is the second
; operand in the IR. The tahiti run uses v_min_f32 after extend + multiply by
; 1.0; the fiji and gfx900 runs use v_min_f16 after v_max_f16 x, x.
191define amdgpu_kernel void @minnum_f16_imm_b(
192; SI-LABEL: minnum_f16_imm_b:
193; SI:       ; %bb.0: ; %entry
194; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
195; SI-NEXT:    s_mov_b32 s3, 0xf000
196; SI-NEXT:    s_mov_b32 s2, -1
197; SI-NEXT:    s_mov_b32 s10, s2
198; SI-NEXT:    s_mov_b32 s11, s3
199; SI-NEXT:    s_waitcnt lgkmcnt(0)
200; SI-NEXT:    s_mov_b32 s8, s6
201; SI-NEXT:    s_mov_b32 s9, s7
202; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
203; SI-NEXT:    s_mov_b32 s0, s4
204; SI-NEXT:    s_mov_b32 s1, s5
205; SI-NEXT:    s_waitcnt vmcnt(0)
206; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
207; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
208; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
209; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
210; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
211; SI-NEXT:    s_endpgm
212;
213; VI-LABEL: minnum_f16_imm_b:
214; VI:       ; %bb.0: ; %entry
215; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
216; VI-NEXT:    s_mov_b32 s3, 0xf000
217; VI-NEXT:    s_mov_b32 s2, -1
218; VI-NEXT:    s_waitcnt lgkmcnt(0)
219; VI-NEXT:    s_mov_b32 s0, s4
220; VI-NEXT:    s_mov_b32 s1, s5
221; VI-NEXT:    s_mov_b32 s4, s6
222; VI-NEXT:    s_mov_b32 s5, s7
223; VI-NEXT:    s_mov_b32 s6, s2
224; VI-NEXT:    s_mov_b32 s7, s3
225; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
226; VI-NEXT:    s_waitcnt vmcnt(0)
227; VI-NEXT:    v_max_f16_e32 v0, v0, v0
228; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
229; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
230; VI-NEXT:    s_endpgm
231;
232; GFX9-LABEL: minnum_f16_imm_b:
233; GFX9:       ; %bb.0: ; %entry
234; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
235; GFX9-NEXT:    s_mov_b32 s3, 0xf000
236; GFX9-NEXT:    s_mov_b32 s2, -1
237; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
238; GFX9-NEXT:    s_mov_b32 s0, s4
239; GFX9-NEXT:    s_mov_b32 s1, s5
240; GFX9-NEXT:    s_mov_b32 s4, s6
241; GFX9-NEXT:    s_mov_b32 s5, s7
242; GFX9-NEXT:    s_mov_b32 s6, s2
243; GFX9-NEXT:    s_mov_b32 s7, s3
244; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
245; GFX9-NEXT:    s_waitcnt vmcnt(0)
246; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
247; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
248; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
249; GFX9-NEXT:    s_endpgm
250    half addrspace(1)* %r,
251    half addrspace(1)* %a) #0 {
252entry:
253  %a.val = load half, half addrspace(1)* %a
254  %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
255  store half %r.val, half addrspace(1)* %r
256  ret void
257}
258
; Kernel test: <2 x half> minnum of two vectors loaded from %a and %b; the
; result is stored to %r. Both inputs are fetched with scalar s_load_dword.
; Code pinned by the autogenerated checks below:
;   - tahiti run: the high halves are extracted with s_lshr_b32 by 16, all four
;     elements are extended to f32 and multiplied by 1.0, two v_min_f32 are
;     taken, and the results are repacked with v_lshlrev_b32 16 + v_or_b32.
;   - fiji run: v_max_f16_e64 x, x on each element, v_min_f16 for the low
;     halves, an SDWA v_min_f16 writing WORD_1 for the high halves, and an SDWA
;     v_or_b32 to merge them.
;   - gfx900 run: v_pk_max_f16 x, x on each packed input, then one v_pk_min_f16.
259define amdgpu_kernel void @minnum_v2f16_ieee(
260; SI-LABEL: minnum_v2f16_ieee:
261; SI:       ; %bb.0: ; %entry
262; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
263; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
264; SI-NEXT:    s_mov_b32 s3, 0xf000
265; SI-NEXT:    s_mov_b32 s2, -1
266; SI-NEXT:    s_waitcnt lgkmcnt(0)
267; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
268; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
269; SI-NEXT:    s_waitcnt lgkmcnt(0)
270; SI-NEXT:    s_lshr_b32 s1, s6, 16
271; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
272; SI-NEXT:    s_lshr_b32 s0, s0, 16
273; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
274; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
275; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
276; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
277; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
278; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
279; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
280; SI-NEXT:    v_min_f32_e32 v2, v3, v2
281; SI-NEXT:    v_min_f32_e32 v0, v0, v1
282; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
283; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
284; SI-NEXT:    s_mov_b32 s0, s4
285; SI-NEXT:    s_mov_b32 s1, s5
286; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
287; SI-NEXT:    v_or_b32_e32 v0, v0, v1
288; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
289; SI-NEXT:    s_endpgm
290;
291; VI-LABEL: minnum_v2f16_ieee:
292; VI:       ; %bb.0: ; %entry
293; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
294; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
295; VI-NEXT:    s_mov_b32 s3, 0xf000
296; VI-NEXT:    s_mov_b32 s2, -1
297; VI-NEXT:    s_waitcnt lgkmcnt(0)
298; VI-NEXT:    s_mov_b32 s0, s4
299; VI-NEXT:    s_mov_b32 s1, s5
300; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
301; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
302; VI-NEXT:    s_waitcnt lgkmcnt(0)
303; VI-NEXT:    v_max_f16_e64 v1, s4, s4
304; VI-NEXT:    v_max_f16_e64 v0, s5, s5
305; VI-NEXT:    s_lshr_b32 s4, s4, 16
306; VI-NEXT:    s_lshr_b32 s5, s5, 16
307; VI-NEXT:    v_min_f16_e32 v0, v1, v0
308; VI-NEXT:    v_max_f16_e64 v1, s5, s5
309; VI-NEXT:    v_max_f16_e64 v2, s4, s4
310; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
311; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
312; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
313; VI-NEXT:    s_endpgm
314;
315; GFX9-LABEL: minnum_v2f16_ieee:
316; GFX9:       ; %bb.0: ; %entry
317; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
318; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
319; GFX9-NEXT:    s_mov_b32 s3, 0xf000
320; GFX9-NEXT:    s_mov_b32 s2, -1
321; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
323; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
324; GFX9-NEXT:    s_mov_b32 s0, s4
325; GFX9-NEXT:    s_mov_b32 s1, s5
326; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX9-NEXT:    v_pk_max_f16 v1, s6, s6
328; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
329; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
330; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
331; GFX9-NEXT:    s_endpgm
332    <2 x half> addrspace(1)* %r,
333    <2 x half> addrspace(1)* %a,
334    <2 x half> addrspace(1)* %b) #0 {
335entry:
336  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
337  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
338  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
339  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
340  ret void
341}
342
; Pixel-shader (amdgpu_ps) test: <2 x half> minnum of two by-value vector
; arguments, returned directly.
; Code pinned by the autogenerated checks below:
;   - tahiti run: the four elements arrive in v0..v3, each is round-tripped
;     f32 -> f16 -> f32, and two v_min_f32 pair v0 with v2 and v1 with v3.
;   - fiji run: an SDWA v_min_f16 on the WORD_1 halves, a plain v_min_f16 on
;     the low halves, and v_or_b32 to combine them.
;   - gfx900 run: a single v_pk_min_f16.
; None of the runs emit the per-input multiply / max instructions that the
; checks for @minnum_v2f16_ieee contain.
343define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
344; SI-LABEL: minnum_v2f16_no_ieee:
345; SI:       ; %bb.0:
346; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
347; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
348; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
349; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
350; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
351; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
352; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
353; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
354; SI-NEXT:    v_min_f32_e32 v0, v0, v2
355; SI-NEXT:    v_min_f32_e32 v1, v1, v3
356; SI-NEXT:    ; return to shader part epilog
357;
358; VI-LABEL: minnum_v2f16_no_ieee:
359; VI:       ; %bb.0:
360; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
361; VI-NEXT:    v_min_f16_e32 v0, v0, v1
362; VI-NEXT:    v_or_b32_e32 v0, v0, v2
363; VI-NEXT:    ; return to shader part epilog
364;
365; GFX9-LABEL: minnum_v2f16_no_ieee:
366; GFX9:       ; %bb.0:
367; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
368; GFX9-NEXT:    ; return to shader part epilog
369  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
370  ret <2 x half> %r.val
371}
372
; Kernel test: <2 x half> minnum with the constant vector <3.0, 4.0> as the
; first operand and a vector loaded from %b as the second; stored to %r.
; Code pinned by the autogenerated checks below:
;   - tahiti run: element 0 is min'ed against the f32 literal 0x40400000 and
;     element 1 against the operand 4.0, then the halves are repacked.
;   - fiji run: element 0 uses the f16 literal 0x4200 directly in v_min_f16;
;     element 1 uses 0x4400 first moved into v2, because it feeds the SDWA
;     v_min_f16 that writes WORD_1.
;   - gfx900 run: both constants are packed into one scalar, 0x44004200
;     (element 1 in the high half), consumed by a single v_pk_min_f16.
373define amdgpu_kernel void @minnum_v2f16_imm_a(
374; SI-LABEL: minnum_v2f16_imm_a:
375; SI:       ; %bb.0: ; %entry
376; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
377; SI-NEXT:    s_waitcnt lgkmcnt(0)
378; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
379; SI-NEXT:    s_mov_b32 s3, 0xf000
380; SI-NEXT:    s_waitcnt lgkmcnt(0)
381; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
382; SI-NEXT:    s_lshr_b32 s2, s2, 16
383; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
384; SI-NEXT:    s_mov_b32 s2, -1
385; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
386; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
387; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
388; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
389; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
390; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
391; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
392; SI-NEXT:    v_or_b32_e32 v0, v0, v1
393; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
394; SI-NEXT:    s_endpgm
395;
396; VI-LABEL: minnum_v2f16_imm_a:
397; VI:       ; %bb.0: ; %entry
398; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
399; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
400; VI-NEXT:    s_mov_b32 s3, 0xf000
401; VI-NEXT:    s_mov_b32 s2, -1
402; VI-NEXT:    s_waitcnt lgkmcnt(0)
403; VI-NEXT:    s_mov_b32 s0, s4
404; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
405; VI-NEXT:    s_mov_b32 s1, s5
406; VI-NEXT:    s_waitcnt lgkmcnt(0)
407; VI-NEXT:    v_max_f16_e64 v0, s4, s4
408; VI-NEXT:    s_lshr_b32 s4, s4, 16
409; VI-NEXT:    v_max_f16_e64 v1, s4, s4
410; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
411; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
412; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
413; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
414; VI-NEXT:    s_endpgm
415;
416; GFX9-LABEL: minnum_v2f16_imm_a:
417; GFX9:       ; %bb.0: ; %entry
418; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
419; GFX9-NEXT:    s_mov_b32 s7, 0xf000
420; GFX9-NEXT:    s_mov_b32 s6, -1
421; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
423; GFX9-NEXT:    s_mov_b32 s4, s0
424; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
425; GFX9-NEXT:    s_mov_b32 s5, s1
426; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX9-NEXT:    v_pk_max_f16 v0, s2, s2
428; GFX9-NEXT:    v_pk_min_f16 v0, v0, s0
429; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
430; GFX9-NEXT:    s_endpgm
431    <2 x half> addrspace(1)* %r,
432    <2 x half> addrspace(1)* %b) #0 {
433entry:
434  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
435  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
436  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
437  ret void
438}
439
; Kernel test: <2 x half> minnum with a vector loaded from %a as the first
; operand and the constant vector <4.0, 3.0> as the second; stored to %r.
; This mirrors @minnum_v2f16_imm_a with the constant on the other side and its
; elements swapped. Code pinned by the autogenerated checks below:
;   - tahiti run: element 0 is min'ed against the operand 4.0 and element 1
;     against the f32 literal 0x40400000, then the halves are repacked.
;   - fiji run: element 0 uses the operand 4.0 directly in v_min_f16;
;     element 1 uses 0x4200 first moved into v2 for the SDWA v_min_f16.
;   - gfx900 run: both constants are packed into one scalar, 0x42004400
;     (element 1 in the high half), consumed by a single v_pk_min_f16.
440define amdgpu_kernel void @minnum_v2f16_imm_b(
441; SI-LABEL: minnum_v2f16_imm_b:
442; SI:       ; %bb.0: ; %entry
443; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
444; SI-NEXT:    s_waitcnt lgkmcnt(0)
445; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
446; SI-NEXT:    s_mov_b32 s3, 0xf000
447; SI-NEXT:    s_waitcnt lgkmcnt(0)
448; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
449; SI-NEXT:    s_lshr_b32 s2, s2, 16
450; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
451; SI-NEXT:    s_mov_b32 s2, -1
452; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
453; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
454; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
455; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
456; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
457; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
458; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
459; SI-NEXT:    v_or_b32_e32 v0, v0, v1
460; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
461; SI-NEXT:    s_endpgm
462;
463; VI-LABEL: minnum_v2f16_imm_b:
464; VI:       ; %bb.0: ; %entry
465; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
466; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
467; VI-NEXT:    s_mov_b32 s3, 0xf000
468; VI-NEXT:    s_mov_b32 s2, -1
469; VI-NEXT:    s_waitcnt lgkmcnt(0)
470; VI-NEXT:    s_mov_b32 s0, s4
471; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
472; VI-NEXT:    s_mov_b32 s1, s5
473; VI-NEXT:    s_waitcnt lgkmcnt(0)
474; VI-NEXT:    v_max_f16_e64 v0, s4, s4
475; VI-NEXT:    s_lshr_b32 s4, s4, 16
476; VI-NEXT:    v_max_f16_e64 v1, s4, s4
477; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
478; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
479; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
480; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
481; VI-NEXT:    s_endpgm
482;
483; GFX9-LABEL: minnum_v2f16_imm_b:
484; GFX9:       ; %bb.0: ; %entry
485; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
486; GFX9-NEXT:    s_mov_b32 s7, 0xf000
487; GFX9-NEXT:    s_mov_b32 s6, -1
488; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
489; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
490; GFX9-NEXT:    s_mov_b32 s4, s0
491; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
492; GFX9-NEXT:    s_mov_b32 s5, s1
493; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
494; GFX9-NEXT:    v_pk_max_f16 v0, s2, s2
495; GFX9-NEXT:    v_pk_min_f16 v0, v0, s0
496; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
497; GFX9-NEXT:    s_endpgm
498    <2 x half> addrspace(1)* %r,
499    <2 x half> addrspace(1)* %a) #0 {
500entry:
501  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
502  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
503  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
504  ret void
505}
506
507; FIXME: Scalarize with undef half
; Kernel test: <3 x half> minnum of two vectors loaded from %a and %b; the
; result is stored to %r. Each input is fetched as two dwords (s_load_dwordx2)
; and the result is written as one dword (elements 0-1) plus one short at
; offset 4 (element 2).
; Code pinned by the autogenerated checks below:
;   - tahiti run: three f32 mins, each input multiplied by 1.0 first; elements
;     0 and 1 are repacked with v_lshlrev_b32 16 + v_or_b32.
;   - fiji run: three f16 mins (the one for element 1 is SDWA writing WORD_1),
;     each input passed through v_max_f16_e64 x, x first.
;   - gfx900 run: two packed groups of v_pk_max_f16 / v_pk_min_f16, one per
;     loaded dword; only a short of the second result is stored.
508define amdgpu_kernel void @minnum_v3f16(
509; SI-LABEL: minnum_v3f16:
510; SI:       ; %bb.0: ; %entry
511; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
512; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
513; SI-NEXT:    s_mov_b32 s3, 0xf000
514; SI-NEXT:    s_mov_b32 s2, -1
515; SI-NEXT:    s_waitcnt lgkmcnt(0)
516; SI-NEXT:    s_mov_b32 s0, s4
517; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
518; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
519; SI-NEXT:    s_waitcnt lgkmcnt(0)
520; SI-NEXT:    s_lshr_b32 s1, s6, 16
521; SI-NEXT:    s_lshr_b32 s4, s8, 16
522; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
523; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
524; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
525; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
526; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
527; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
528; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
529; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
530; SI-NEXT:    v_min_f32_e32 v2, v3, v2
531; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
532; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
533; SI-NEXT:    v_min_f32_e32 v1, v1, v3
534; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
535; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
536; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
537; SI-NEXT:    v_min_f32_e32 v0, v0, v3
538; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
539; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
540; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
541; SI-NEXT:    s_mov_b32 s1, s5
542; SI-NEXT:    v_or_b32_e32 v1, v1, v2
543; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
544; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
545; SI-NEXT:    s_endpgm
546;
547; VI-LABEL: minnum_v3f16:
548; VI:       ; %bb.0: ; %entry
549; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
550; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
551; VI-NEXT:    s_mov_b32 s3, 0xf000
552; VI-NEXT:    s_mov_b32 s2, -1
553; VI-NEXT:    s_waitcnt lgkmcnt(0)
554; VI-NEXT:    s_mov_b32 s0, s4
555; VI-NEXT:    s_mov_b32 s1, s5
556; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
557; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
558; VI-NEXT:    s_waitcnt lgkmcnt(0)
559; VI-NEXT:    v_max_f16_e64 v1, s4, s4
560; VI-NEXT:    v_max_f16_e64 v0, s6, s6
561; VI-NEXT:    s_lshr_b32 s4, s4, 16
562; VI-NEXT:    s_lshr_b32 s6, s6, 16
563; VI-NEXT:    v_min_f16_e32 v0, v1, v0
564; VI-NEXT:    v_max_f16_e64 v1, s6, s6
565; VI-NEXT:    v_max_f16_e64 v2, s4, s4
566; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
567; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
568; VI-NEXT:    v_max_f16_e64 v1, s7, s7
569; VI-NEXT:    v_max_f16_e64 v2, s5, s5
570; VI-NEXT:    v_min_f16_e32 v1, v2, v1
571; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
572; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
573; VI-NEXT:    s_endpgm
574;
575; GFX9-LABEL: minnum_v3f16:
576; GFX9:       ; %bb.0: ; %entry
577; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
578; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
579; GFX9-NEXT:    s_mov_b32 s3, 0xf000
580; GFX9-NEXT:    s_mov_b32 s2, -1
581; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX9-NEXT:    s_mov_b32 s0, s4
583; GFX9-NEXT:    s_mov_b32 s1, s5
584; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
585; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
586; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
587; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
588; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
589; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
590; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
591; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
592; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
593; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
594; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
595; GFX9-NEXT:    s_endpgm
596    <3 x half> addrspace(1)* %r,
597    <3 x half> addrspace(1)* %a,
598    <3 x half> addrspace(1)* %b) #0 {
599entry:
600  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
601  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
602  %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
603  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
604  ret void
605}
606
; Kernel test: <4 x half> minnum of two vectors loaded from %a and %b; the
; result is stored to %r with a single buffer_store_dwordx2. Each input is
; fetched with s_load_dwordx2.
; Code pinned by the autogenerated checks below:
;   - tahiti run: four f32 mins, each input multiplied by 1.0 first, with the
;     results converted to f16 and repacked into two dwords via
;     v_lshlrev_b32 16 + v_or_b32. The second input is loaded after the first
;     input's conversions have been issued.
;   - fiji run: four f16 mins (two plain, two SDWA writing WORD_1), each input
;     passed through v_max_f16_e64 x, x, merged with SDWA v_or_b32.
;   - gfx900 run: two v_pk_min_f16, each fed by v_pk_max_f16 x, x on both inputs.
607define amdgpu_kernel void @minnum_v4f16(
608; SI-LABEL: minnum_v4f16:
609; SI:       ; %bb.0: ; %entry
610; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
611; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
612; SI-NEXT:    s_mov_b32 s3, 0xf000
613; SI-NEXT:    s_mov_b32 s2, -1
614; SI-NEXT:    s_waitcnt lgkmcnt(0)
615; SI-NEXT:    s_mov_b32 s0, s4
616; SI-NEXT:    s_mov_b32 s1, s5
617; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
618; SI-NEXT:    s_waitcnt lgkmcnt(0)
619; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
620; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
621; SI-NEXT:    s_lshr_b32 s4, s4, 16
622; SI-NEXT:    s_lshr_b32 s5, s5, 16
623; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
624; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
625; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
626; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
627; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
628; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
629; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
630; SI-NEXT:    s_waitcnt lgkmcnt(0)
631; SI-NEXT:    s_lshr_b32 s6, s5, 16
632; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
633; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
634; SI-NEXT:    s_lshr_b32 s4, s4, 16
635; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
636; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
637; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
638; SI-NEXT:    v_min_f32_e32 v3, v3, v5
639; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
640; SI-NEXT:    v_min_f32_e32 v1, v1, v5
641; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
642; SI-NEXT:    v_min_f32_e32 v2, v2, v5
643; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
644; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
645; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
646; SI-NEXT:    v_min_f32_e32 v0, v0, v4
647; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
648; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
649; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
650; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
651; SI-NEXT:    v_or_b32_e32 v1, v1, v3
652; SI-NEXT:    v_or_b32_e32 v0, v0, v2
653; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
654; SI-NEXT:    s_endpgm
655;
656; VI-LABEL: minnum_v4f16:
657; VI:       ; %bb.0: ; %entry
658; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
659; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
660; VI-NEXT:    s_mov_b32 s3, 0xf000
661; VI-NEXT:    s_mov_b32 s2, -1
662; VI-NEXT:    s_waitcnt lgkmcnt(0)
663; VI-NEXT:    s_mov_b32 s0, s4
664; VI-NEXT:    s_mov_b32 s1, s5
665; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
666; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
667; VI-NEXT:    s_waitcnt lgkmcnt(0)
668; VI-NEXT:    v_max_f16_e64 v1, s5, s5
669; VI-NEXT:    v_max_f16_e64 v0, s7, s7
670; VI-NEXT:    s_lshr_b32 s5, s5, 16
671; VI-NEXT:    s_lshr_b32 s7, s7, 16
672; VI-NEXT:    v_min_f16_e32 v0, v1, v0
673; VI-NEXT:    v_max_f16_e64 v2, s5, s5
674; VI-NEXT:    v_max_f16_e64 v1, s7, s7
675; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
676; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
677; VI-NEXT:    v_max_f16_e64 v2, s4, s4
678; VI-NEXT:    v_max_f16_e64 v0, s6, s6
679; VI-NEXT:    s_lshr_b32 s4, s4, 16
680; VI-NEXT:    s_lshr_b32 s5, s6, 16
681; VI-NEXT:    v_min_f16_e32 v0, v2, v0
682; VI-NEXT:    v_max_f16_e64 v2, s5, s5
683; VI-NEXT:    v_max_f16_e64 v3, s4, s4
684; VI-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
685; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
686; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
687; VI-NEXT:    s_endpgm
688;
689; GFX9-LABEL: minnum_v4f16:
690; GFX9:       ; %bb.0: ; %entry
691; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
692; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
693; GFX9-NEXT:    s_mov_b32 s3, 0xf000
694; GFX9-NEXT:    s_mov_b32 s2, -1
695; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
696; GFX9-NEXT:    s_mov_b32 s0, s4
697; GFX9-NEXT:    s_mov_b32 s1, s5
698; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
699; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
700; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
701; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
702; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
703; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
704; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
705; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
706; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
707; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
708; GFX9-NEXT:    s_endpgm
709    <4 x half> addrspace(1)* %r,
710    <4 x half> addrspace(1)* %a,
711    <4 x half> addrspace(1)* %b) #0 {
712entry:
713  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
714  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
715  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
716  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
717  ret void
718}
719
; Kernel test: <4 x half> minnum with the constant vector
; <8.0, 2.0, 3.0, 4.0> as the first operand and a vector loaded from %b as the
; second; the result is stored to %r with buffer_store_dwordx2.
; Code pinned by the autogenerated checks below:
;   - tahiti run: per-element v_min_f32 against 0x41000000 (8.0), the operand
;     2.0, 0x40400000 (3.0) and the operand 4.0, after multiplying each loaded
;     element by 1.0.
;   - fiji run: elements 0 and 2 use the f16 literals 0x4800 and 0x4200
;     directly; elements 1 and 3 use 0x4000 and 0x4400 first moved into VGPRs
;     for the SDWA v_min_f16 that writes WORD_1.
;   - gfx900 run: the constants are packed pairwise into s9 = 0x40004800
;     (elements 0-1) and s8 = 0x44004200 (elements 2-3), each consumed by one
;     v_pk_min_f16.
; NOTE(review): this is the only test in the file named fmin_* rather than
; minnum_*; the name is left alone because the LABEL checks depend on it.
720define amdgpu_kernel void @fmin_v4f16_imm_a(
721; SI-LABEL: fmin_v4f16_imm_a:
722; SI:       ; %bb.0: ; %entry
723; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
724; SI-NEXT:    s_mov_b32 s3, 0xf000
725; SI-NEXT:    s_mov_b32 s2, -1
726; SI-NEXT:    s_waitcnt lgkmcnt(0)
727; SI-NEXT:    s_mov_b32 s0, s4
728; SI-NEXT:    s_mov_b32 s1, s5
729; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
730; SI-NEXT:    s_waitcnt lgkmcnt(0)
731; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
732; SI-NEXT:    s_lshr_b32 s5, s5, 16
733; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
734; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
735; SI-NEXT:    s_lshr_b32 s4, s4, 16
736; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
737; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
738; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
739; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
740; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
741; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
742; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
743; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
744; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
745; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
746; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
747; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
748; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
749; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
750; SI-NEXT:    v_or_b32_e32 v1, v1, v2
751; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
752; SI-NEXT:    v_or_b32_e32 v0, v0, v2
753; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
754; SI-NEXT:    s_endpgm
755;
756; VI-LABEL: fmin_v4f16_imm_a:
757; VI:       ; %bb.0: ; %entry
758; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
759; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
760; VI-NEXT:    s_mov_b32 s3, 0xf000
761; VI-NEXT:    s_mov_b32 s2, -1
762; VI-NEXT:    s_waitcnt lgkmcnt(0)
763; VI-NEXT:    s_mov_b32 s0, s4
764; VI-NEXT:    s_mov_b32 s1, s5
765; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
766; VI-NEXT:    s_waitcnt lgkmcnt(0)
767; VI-NEXT:    v_max_f16_e64 v1, s5, s5
768; VI-NEXT:    s_lshr_b32 s5, s5, 16
769; VI-NEXT:    v_max_f16_e64 v3, s5, s5
770; VI-NEXT:    v_max_f16_e64 v2, s4, s4
771; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
772; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
773; VI-NEXT:    s_lshr_b32 s4, s4, 16
774; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
775; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2
776; VI-NEXT:    v_max_f16_e64 v2, s4, s4
777; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
778; VI-NEXT:    v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
779; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
780; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
781; VI-NEXT:    s_endpgm
782;
783; GFX9-LABEL: fmin_v4f16_imm_a:
784; GFX9:       ; %bb.0: ; %entry
785; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
786; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
787; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
788; GFX9-NEXT:    s_mov_b32 s3, 0xf000
789; GFX9-NEXT:    s_mov_b32 s2, -1
790; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
791; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
792; GFX9-NEXT:    s_mov_b32 s0, s4
793; GFX9-NEXT:    s_mov_b32 s1, s5
794; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
795; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
796; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
797; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
798; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
799; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
800; GFX9-NEXT:    s_endpgm
801    <4 x half> addrspace(1)* %r,
802    <4 x half> addrspace(1)* %b) #0 {
803entry:
804  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
805  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
806  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
807  ret void
808}
809
; Attribute group #0, referenced by every function definition visible above.
; It sets the "denormal-fp-math-f32" string attribute to
; "preserve-sign,preserve-sign".
; NOTE(review): per the LLVM LangRef this should request sign-preserving
; flush-to-zero handling for f32 denormal results and inputs - confirm; the
; tahiti checks above operate on f32 after extending from f16.
810attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
811