• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SIVI %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5
; Declarations of the llvm.maxnum intrinsic for scalar half and for 2-, 3- and
; 4-element half vectors, as called by the kernels defined in this file.
6declare half @llvm.maxnum.f16(half %a, half %b)
7declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
8declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
9declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
10
; Scalar half case: volatile-loads one half from %a and one from %b, calls
; @llvm.maxnum.f16 on the pair, and stores the result through %r.
; In the expected output the tahiti run converts both inputs to f32, uses
; v_max_f32 and converts back, while the fiji and gfx900 runs use v_max_f16.
; NOTE(review): the v_mul_f32 by 1.0 (tahiti) and the v_max_f16 of a register
; with itself (fiji/gfx900) ahead of the real max look like input
; canonicalization -- confirm against the AMDGPU lowering.
11define amdgpu_kernel void @maxnum_f16(
12; SI-LABEL: maxnum_f16:
13; SI:       ; %bb.0: ; %entry
14; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
15; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
16; SI-NEXT:    s_mov_b32 s3, 0xf000
17; SI-NEXT:    s_mov_b32 s2, -1
18; SI-NEXT:    s_mov_b32 s14, s2
19; SI-NEXT:    s_waitcnt lgkmcnt(0)
20; SI-NEXT:    s_mov_b32 s12, s6
21; SI-NEXT:    s_mov_b32 s13, s7
22; SI-NEXT:    s_mov_b32 s15, s3
23; SI-NEXT:    s_mov_b32 s10, s2
24; SI-NEXT:    s_mov_b32 s11, s3
25; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
26; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
27; SI-NEXT:    s_mov_b32 s0, s4
28; SI-NEXT:    s_mov_b32 s1, s5
29; SI-NEXT:    s_waitcnt vmcnt(1)
30; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
31; SI-NEXT:    s_waitcnt vmcnt(0)
32; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
33; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
34; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
35; SI-NEXT:    v_max_f32_e32 v0, v0, v1
36; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
37; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
38; SI-NEXT:    s_endpgm
39;
40; VI-LABEL: maxnum_f16:
41; VI:       ; %bb.0: ; %entry
42; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
43; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
44; VI-NEXT:    s_mov_b32 s3, 0xf000
45; VI-NEXT:    s_mov_b32 s2, -1
46; VI-NEXT:    s_mov_b32 s14, s2
47; VI-NEXT:    s_waitcnt lgkmcnt(0)
48; VI-NEXT:    s_mov_b32 s12, s6
49; VI-NEXT:    s_mov_b32 s13, s7
50; VI-NEXT:    s_mov_b32 s15, s3
51; VI-NEXT:    s_mov_b32 s10, s2
52; VI-NEXT:    s_mov_b32 s11, s3
53; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
54; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
55; VI-NEXT:    s_mov_b32 s0, s4
56; VI-NEXT:    s_mov_b32 s1, s5
57; VI-NEXT:    s_waitcnt vmcnt(1)
58; VI-NEXT:    v_max_f16_e32 v0, v0, v0
59; VI-NEXT:    s_waitcnt vmcnt(0)
60; VI-NEXT:    v_max_f16_e32 v1, v1, v1
61; VI-NEXT:    v_max_f16_e32 v0, v0, v1
62; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
63; VI-NEXT:    s_endpgm
64;
65; GFX9-LABEL: maxnum_f16:
66; GFX9:       ; %bb.0: ; %entry
67; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
68; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
69; GFX9-NEXT:    s_mov_b32 s3, 0xf000
70; GFX9-NEXT:    s_mov_b32 s2, -1
71; GFX9-NEXT:    s_mov_b32 s14, s2
72; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX9-NEXT:    s_mov_b32 s12, s6
74; GFX9-NEXT:    s_mov_b32 s13, s7
75; GFX9-NEXT:    s_mov_b32 s15, s3
76; GFX9-NEXT:    s_mov_b32 s10, s2
77; GFX9-NEXT:    s_mov_b32 s11, s3
78; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
79; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
80; GFX9-NEXT:    s_mov_b32 s0, s4
81; GFX9-NEXT:    s_mov_b32 s1, s5
82; GFX9-NEXT:    s_waitcnt vmcnt(1)
83; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
84; GFX9-NEXT:    s_waitcnt vmcnt(0)
85; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
86; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
87; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
88; GFX9-NEXT:    s_endpgm
; Kernel arguments (result pointer first, then both inputs) and the IR body
; that llc compiles for the assertions above.
89    half addrspace(1)* %r,
90    half addrspace(1)* %a,
91    half addrspace(1)* %b) #0 {
92entry:
93  %a.val = load volatile half, half addrspace(1)* %a
94  %b.val = load volatile half, half addrspace(1)* %b
95  %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
96  store half %r.val, half addrspace(1)* %r
97  ret void
98}
99
; Scalar half case with a constant first operand: loads a half from %b and
; calls @llvm.maxnum.f16(3.0, %b.val), storing the result through %r.
; In the expected output 3.0 shows up as the literal 0x40400000 in the tahiti
; run (f32 max) and as 0x4200 in the fiji and gfx900 runs (f16 max).
100define amdgpu_kernel void @maxnum_f16_imm_a(
101; SI-LABEL: maxnum_f16_imm_a:
102; SI:       ; %bb.0: ; %entry
103; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
104; SI-NEXT:    s_mov_b32 s3, 0xf000
105; SI-NEXT:    s_mov_b32 s2, -1
106; SI-NEXT:    s_mov_b32 s10, s2
107; SI-NEXT:    s_mov_b32 s11, s3
108; SI-NEXT:    s_waitcnt lgkmcnt(0)
109; SI-NEXT:    s_mov_b32 s8, s6
110; SI-NEXT:    s_mov_b32 s9, s7
111; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
112; SI-NEXT:    s_mov_b32 s0, s4
113; SI-NEXT:    s_mov_b32 s1, s5
114; SI-NEXT:    s_waitcnt vmcnt(0)
115; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
116; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
117; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
118; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
119; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
120; SI-NEXT:    s_endpgm
121;
122; VI-LABEL: maxnum_f16_imm_a:
123; VI:       ; %bb.0: ; %entry
124; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
125; VI-NEXT:    s_mov_b32 s3, 0xf000
126; VI-NEXT:    s_mov_b32 s2, -1
127; VI-NEXT:    s_waitcnt lgkmcnt(0)
128; VI-NEXT:    s_mov_b32 s0, s4
129; VI-NEXT:    s_mov_b32 s1, s5
130; VI-NEXT:    s_mov_b32 s4, s6
131; VI-NEXT:    s_mov_b32 s5, s7
132; VI-NEXT:    s_mov_b32 s6, s2
133; VI-NEXT:    s_mov_b32 s7, s3
134; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
135; VI-NEXT:    s_waitcnt vmcnt(0)
136; VI-NEXT:    v_max_f16_e32 v0, v0, v0
137; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
138; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
139; VI-NEXT:    s_endpgm
140;
141; GFX9-LABEL: maxnum_f16_imm_a:
142; GFX9:       ; %bb.0: ; %entry
143; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
144; GFX9-NEXT:    s_mov_b32 s3, 0xf000
145; GFX9-NEXT:    s_mov_b32 s2, -1
146; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX9-NEXT:    s_mov_b32 s0, s4
148; GFX9-NEXT:    s_mov_b32 s1, s5
149; GFX9-NEXT:    s_mov_b32 s4, s6
150; GFX9-NEXT:    s_mov_b32 s5, s7
151; GFX9-NEXT:    s_mov_b32 s6, s2
152; GFX9-NEXT:    s_mov_b32 s7, s3
153; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
154; GFX9-NEXT:    s_waitcnt vmcnt(0)
155; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
156; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
157; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
158; GFX9-NEXT:    s_endpgm
; Kernel arguments (result pointer, then the single input) and the IR body
; that llc compiles for the assertions above.
159    half addrspace(1)* %r,
160    half addrspace(1)* %b) #0 {
161entry:
162  %b.val = load half, half addrspace(1)* %b
163  %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
164  store half %r.val, half addrspace(1)* %r
165  ret void
166}
167
; Scalar half case with a constant second operand: loads a half from %a and
; calls @llvm.maxnum.f16(%a.val, 4.0), storing the result through %r.
; In all three expected outputs the constant is printed as 4.0 and sits in the
; first source position of the max instruction, even though it is the second
; argument in the IR.
168define amdgpu_kernel void @maxnum_f16_imm_b(
169; SI-LABEL: maxnum_f16_imm_b:
170; SI:       ; %bb.0: ; %entry
171; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
172; SI-NEXT:    s_mov_b32 s3, 0xf000
173; SI-NEXT:    s_mov_b32 s2, -1
174; SI-NEXT:    s_mov_b32 s10, s2
175; SI-NEXT:    s_mov_b32 s11, s3
176; SI-NEXT:    s_waitcnt lgkmcnt(0)
177; SI-NEXT:    s_mov_b32 s8, s6
178; SI-NEXT:    s_mov_b32 s9, s7
179; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
180; SI-NEXT:    s_mov_b32 s0, s4
181; SI-NEXT:    s_mov_b32 s1, s5
182; SI-NEXT:    s_waitcnt vmcnt(0)
183; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
184; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
185; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
186; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
187; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
188; SI-NEXT:    s_endpgm
189;
190; VI-LABEL: maxnum_f16_imm_b:
191; VI:       ; %bb.0: ; %entry
192; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
193; VI-NEXT:    s_mov_b32 s3, 0xf000
194; VI-NEXT:    s_mov_b32 s2, -1
195; VI-NEXT:    s_waitcnt lgkmcnt(0)
196; VI-NEXT:    s_mov_b32 s0, s4
197; VI-NEXT:    s_mov_b32 s1, s5
198; VI-NEXT:    s_mov_b32 s4, s6
199; VI-NEXT:    s_mov_b32 s5, s7
200; VI-NEXT:    s_mov_b32 s6, s2
201; VI-NEXT:    s_mov_b32 s7, s3
202; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
203; VI-NEXT:    s_waitcnt vmcnt(0)
204; VI-NEXT:    v_max_f16_e32 v0, v0, v0
205; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
206; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
207; VI-NEXT:    s_endpgm
208;
209; GFX9-LABEL: maxnum_f16_imm_b:
210; GFX9:       ; %bb.0: ; %entry
211; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
212; GFX9-NEXT:    s_mov_b32 s3, 0xf000
213; GFX9-NEXT:    s_mov_b32 s2, -1
214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX9-NEXT:    s_mov_b32 s0, s4
216; GFX9-NEXT:    s_mov_b32 s1, s5
217; GFX9-NEXT:    s_mov_b32 s4, s6
218; GFX9-NEXT:    s_mov_b32 s5, s7
219; GFX9-NEXT:    s_mov_b32 s6, s2
220; GFX9-NEXT:    s_mov_b32 s7, s3
221; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
222; GFX9-NEXT:    s_waitcnt vmcnt(0)
223; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
224; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
225; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
226; GFX9-NEXT:    s_endpgm
; Kernel arguments (result pointer, then the single input) and the IR body
; that llc compiles for the assertions above.
227    half addrspace(1)* %r,
228    half addrspace(1)* %a) #0 {
229entry:
230  %a.val = load half, half addrspace(1)* %a
231  %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
232  store half %r.val, half addrspace(1)* %r
233  ret void
234}
235
; Two-element vector case: loads a <2 x half> from %a and from %b, calls
; @llvm.maxnum.v2f16, and stores the <2 x half> result through %r.
; Expected output per target:
;   - tahiti: each element is converted to f32, maxed with v_max_f32, converted
;     back, and the two halves are repacked with a shift and an or.
;   - fiji: two v_max_f16 operations; the high half is produced by the SDWA
;     form writing WORD_1 and merged with v_or_b32_sdwa.
;   - gfx900: a single packed v_pk_max_f16 on the two inputs.
236define amdgpu_kernel void @maxnum_v2f16(
237; SI-LABEL: maxnum_v2f16:
238; SI:       ; %bb.0: ; %entry
239; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
240; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
241; SI-NEXT:    s_mov_b32 s3, 0xf000
242; SI-NEXT:    s_mov_b32 s2, -1
243; SI-NEXT:    s_waitcnt lgkmcnt(0)
244; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
245; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
246; SI-NEXT:    s_waitcnt lgkmcnt(0)
247; SI-NEXT:    s_lshr_b32 s1, s6, 16
248; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
249; SI-NEXT:    s_lshr_b32 s0, s0, 16
250; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
251; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
252; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
253; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
254; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
255; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
256; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
257; SI-NEXT:    v_max_f32_e32 v2, v3, v2
258; SI-NEXT:    v_max_f32_e32 v0, v0, v1
259; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
260; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
261; SI-NEXT:    s_mov_b32 s0, s4
262; SI-NEXT:    s_mov_b32 s1, s5
263; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
264; SI-NEXT:    v_or_b32_e32 v0, v0, v1
265; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
266; SI-NEXT:    s_endpgm
267;
268; VI-LABEL: maxnum_v2f16:
269; VI:       ; %bb.0: ; %entry
270; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
271; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
272; VI-NEXT:    s_mov_b32 s3, 0xf000
273; VI-NEXT:    s_mov_b32 s2, -1
274; VI-NEXT:    s_waitcnt lgkmcnt(0)
275; VI-NEXT:    s_mov_b32 s0, s4
276; VI-NEXT:    s_mov_b32 s1, s5
277; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
278; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
279; VI-NEXT:    s_waitcnt lgkmcnt(0)
280; VI-NEXT:    v_max_f16_e64 v1, s4, s4
281; VI-NEXT:    v_max_f16_e64 v0, s5, s5
282; VI-NEXT:    s_lshr_b32 s4, s4, 16
283; VI-NEXT:    s_lshr_b32 s5, s5, 16
284; VI-NEXT:    v_max_f16_e32 v0, v1, v0
285; VI-NEXT:    v_max_f16_e64 v1, s5, s5
286; VI-NEXT:    v_max_f16_e64 v2, s4, s4
287; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
288; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
289; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
290; VI-NEXT:    s_endpgm
291;
292; GFX9-LABEL: maxnum_v2f16:
293; GFX9:       ; %bb.0: ; %entry
294; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
295; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
296; GFX9-NEXT:    s_mov_b32 s3, 0xf000
297; GFX9-NEXT:    s_mov_b32 s2, -1
298; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
300; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
301; GFX9-NEXT:    s_mov_b32 s0, s4
302; GFX9-NEXT:    s_mov_b32 s1, s5
303; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX9-NEXT:    v_pk_max_f16 v1, s6, s6
305; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
306; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
307; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
308; GFX9-NEXT:    s_endpgm
; Kernel arguments (result pointer first, then both inputs) and the IR body
; that llc compiles for the assertions above.
309    <2 x half> addrspace(1)* %r,
310    <2 x half> addrspace(1)* %a,
311    <2 x half> addrspace(1)* %b) #0 {
312entry:
313  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
314  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
315  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
316  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
317  ret void
318}
319
; Two-element vector case with a constant first operand: loads a <2 x half>
; from %b and calls @llvm.maxnum.v2f16(<3.0, 4.0>, %b.val).
; In the expected output element 0 is maxed against 3.0 (0x40400000 as f32 on
; tahiti, 0x4200 as f16 on fiji) and element 1 against 4.0 (inline 4.0 on
; tahiti, 0x4400 moved into v2 on fiji). The gfx900 run materializes the whole
; constant vector as the packed word 0x44004200 for v_pk_max_f16.
320define amdgpu_kernel void @maxnum_v2f16_imm_a(
321; SI-LABEL: maxnum_v2f16_imm_a:
322; SI:       ; %bb.0: ; %entry
323; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
324; SI-NEXT:    s_waitcnt lgkmcnt(0)
325; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
326; SI-NEXT:    s_mov_b32 s3, 0xf000
327; SI-NEXT:    s_waitcnt lgkmcnt(0)
328; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
329; SI-NEXT:    s_lshr_b32 s2, s2, 16
330; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
331; SI-NEXT:    s_mov_b32 s2, -1
332; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
333; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
334; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
335; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
336; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
337; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
338; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
339; SI-NEXT:    v_or_b32_e32 v0, v0, v1
340; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
341; SI-NEXT:    s_endpgm
342;
343; VI-LABEL: maxnum_v2f16_imm_a:
344; VI:       ; %bb.0: ; %entry
345; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
346; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
347; VI-NEXT:    s_mov_b32 s3, 0xf000
348; VI-NEXT:    s_mov_b32 s2, -1
349; VI-NEXT:    s_waitcnt lgkmcnt(0)
350; VI-NEXT:    s_mov_b32 s0, s4
351; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
352; VI-NEXT:    s_mov_b32 s1, s5
353; VI-NEXT:    s_waitcnt lgkmcnt(0)
354; VI-NEXT:    v_max_f16_e64 v0, s4, s4
355; VI-NEXT:    s_lshr_b32 s4, s4, 16
356; VI-NEXT:    v_max_f16_e64 v1, s4, s4
357; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
358; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
359; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
360; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
361; VI-NEXT:    s_endpgm
362;
363; GFX9-LABEL: maxnum_v2f16_imm_a:
364; GFX9:       ; %bb.0: ; %entry
365; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
366; GFX9-NEXT:    s_mov_b32 s7, 0xf000
367; GFX9-NEXT:    s_mov_b32 s6, -1
368; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
370; GFX9-NEXT:    s_mov_b32 s4, s0
371; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
372; GFX9-NEXT:    s_mov_b32 s5, s1
373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX9-NEXT:    v_pk_max_f16 v0, s2, s2
375; GFX9-NEXT:    v_pk_max_f16 v0, v0, s0
376; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
377; GFX9-NEXT:    s_endpgm
; Kernel arguments (result pointer, then the single input) and the IR body
; that llc compiles for the assertions above.
378    <2 x half> addrspace(1)* %r,
379    <2 x half> addrspace(1)* %b) #0 {
380entry:
381  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
382  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
383  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
384  ret void
385}
386
; Two-element vector case with a constant second operand: loads a <2 x half>
; from %a and calls @llvm.maxnum.v2f16(%a.val, <4.0, 3.0>).
; The constant elements are swapped relative to maxnum_v2f16_imm_a, so in the
; expected output element 0 is maxed against 4.0 and element 1 against 3.0
; (0x40400000 on tahiti, 0x4200 moved into v2 on fiji). The gfx900 run
; materializes the constant vector as the packed word 0x42004400.
387define amdgpu_kernel void @maxnum_v2f16_imm_b(
388; SI-LABEL: maxnum_v2f16_imm_b:
389; SI:       ; %bb.0: ; %entry
390; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
391; SI-NEXT:    s_waitcnt lgkmcnt(0)
392; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
393; SI-NEXT:    s_mov_b32 s3, 0xf000
394; SI-NEXT:    s_waitcnt lgkmcnt(0)
395; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
396; SI-NEXT:    s_lshr_b32 s2, s2, 16
397; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
398; SI-NEXT:    s_mov_b32 s2, -1
399; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
400; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
401; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
402; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
403; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
404; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
405; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
406; SI-NEXT:    v_or_b32_e32 v0, v0, v1
407; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
408; SI-NEXT:    s_endpgm
409;
410; VI-LABEL: maxnum_v2f16_imm_b:
411; VI:       ; %bb.0: ; %entry
412; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
413; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
414; VI-NEXT:    s_mov_b32 s3, 0xf000
415; VI-NEXT:    s_mov_b32 s2, -1
416; VI-NEXT:    s_waitcnt lgkmcnt(0)
417; VI-NEXT:    s_mov_b32 s0, s4
418; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
419; VI-NEXT:    s_mov_b32 s1, s5
420; VI-NEXT:    s_waitcnt lgkmcnt(0)
421; VI-NEXT:    v_max_f16_e64 v0, s4, s4
422; VI-NEXT:    s_lshr_b32 s4, s4, 16
423; VI-NEXT:    v_max_f16_e64 v1, s4, s4
424; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
425; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
426; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
427; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
428; VI-NEXT:    s_endpgm
429;
430; GFX9-LABEL: maxnum_v2f16_imm_b:
431; GFX9:       ; %bb.0: ; %entry
432; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
433; GFX9-NEXT:    s_mov_b32 s7, 0xf000
434; GFX9-NEXT:    s_mov_b32 s6, -1
435; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
436; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
437; GFX9-NEXT:    s_mov_b32 s4, s0
438; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
439; GFX9-NEXT:    s_mov_b32 s5, s1
440; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
441; GFX9-NEXT:    v_pk_max_f16 v0, s2, s2
442; GFX9-NEXT:    v_pk_max_f16 v0, v0, s0
443; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
444; GFX9-NEXT:    s_endpgm
; Kernel arguments (result pointer, then the single input) and the IR body
; that llc compiles for the assertions above.
445    <2 x half> addrspace(1)* %r,
446    <2 x half> addrspace(1)* %a) #0 {
447entry:
448  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
449  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
450  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
451  ret void
452}
453
454; FIXME: Scalarize with undef half
; Three-element vector case: loads a <3 x half> from %a and from %b, calls
; @llvm.maxnum.v3f16, and stores the <3 x half> result through %r.
; In every expected output each input is fetched with one s_load_dwordx2 and
; the result is written with two stores: a buffer_store_short at offset 4 for
; element 2 and a buffer_store_dword for elements 0 and 1.
; The gfx900 run uses v_pk_max_f16 for both the dword holding elements 0-1 and
; the dword holding element 2.
455define amdgpu_kernel void @maxnum_v3f16(
456; SI-LABEL: maxnum_v3f16:
457; SI:       ; %bb.0: ; %entry
458; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
459; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
460; SI-NEXT:    s_mov_b32 s3, 0xf000
461; SI-NEXT:    s_mov_b32 s2, -1
462; SI-NEXT:    s_waitcnt lgkmcnt(0)
463; SI-NEXT:    s_mov_b32 s0, s4
464; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
465; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
466; SI-NEXT:    s_waitcnt lgkmcnt(0)
467; SI-NEXT:    s_lshr_b32 s1, s6, 16
468; SI-NEXT:    s_lshr_b32 s4, s8, 16
469; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
470; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
471; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
472; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
473; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
474; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
475; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
476; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
477; SI-NEXT:    v_max_f32_e32 v2, v3, v2
478; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
479; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
480; SI-NEXT:    v_max_f32_e32 v1, v1, v3
481; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
482; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
483; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
484; SI-NEXT:    v_max_f32_e32 v0, v0, v3
485; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
486; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
487; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
488; SI-NEXT:    s_mov_b32 s1, s5
489; SI-NEXT:    v_or_b32_e32 v1, v1, v2
490; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
491; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
492; SI-NEXT:    s_endpgm
493;
494; VI-LABEL: maxnum_v3f16:
495; VI:       ; %bb.0: ; %entry
496; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
497; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
498; VI-NEXT:    s_mov_b32 s3, 0xf000
499; VI-NEXT:    s_mov_b32 s2, -1
500; VI-NEXT:    s_waitcnt lgkmcnt(0)
501; VI-NEXT:    s_mov_b32 s0, s4
502; VI-NEXT:    s_mov_b32 s1, s5
503; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
504; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
505; VI-NEXT:    s_waitcnt lgkmcnt(0)
506; VI-NEXT:    v_max_f16_e64 v1, s4, s4
507; VI-NEXT:    v_max_f16_e64 v0, s6, s6
508; VI-NEXT:    s_lshr_b32 s4, s4, 16
509; VI-NEXT:    s_lshr_b32 s6, s6, 16
510; VI-NEXT:    v_max_f16_e32 v0, v1, v0
511; VI-NEXT:    v_max_f16_e64 v1, s6, s6
512; VI-NEXT:    v_max_f16_e64 v2, s4, s4
513; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
514; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
515; VI-NEXT:    v_max_f16_e64 v1, s7, s7
516; VI-NEXT:    v_max_f16_e64 v2, s5, s5
517; VI-NEXT:    v_max_f16_e32 v1, v2, v1
518; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
519; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
520; VI-NEXT:    s_endpgm
521;
522; GFX9-LABEL: maxnum_v3f16:
523; GFX9:       ; %bb.0: ; %entry
524; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
525; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
526; GFX9-NEXT:    s_mov_b32 s3, 0xf000
527; GFX9-NEXT:    s_mov_b32 s2, -1
528; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX9-NEXT:    s_mov_b32 s0, s4
530; GFX9-NEXT:    s_mov_b32 s1, s5
531; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
532; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
533; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
535; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
536; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
537; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
538; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
539; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
540; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
541; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
542; GFX9-NEXT:    s_endpgm
; Kernel arguments (result pointer first, then both inputs) and the IR body
; that llc compiles for the assertions above.
543    <3 x half> addrspace(1)* %r,
544    <3 x half> addrspace(1)* %a,
545    <3 x half> addrspace(1)* %b) #0 {
546entry:
547  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
548  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
549  %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
550  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
551  ret void
552}
553
; Four-element vector case: loads a <4 x half> from %a and from %b, calls
; @llvm.maxnum.v4f16, and stores the <4 x half> result through %r with a
; single buffer_store_dwordx2.
; Expected output per target:
;   - tahiti: four f32 conversions per input, four v_max_f32, then two
;     shift-and-or repacks into v[0:1].
;   - fiji: four v_max_f16 operations, with each high half produced by the SDWA
;     form writing WORD_1 and merged by v_or_b32_sdwa.
;   - gfx900: one v_pk_max_f16 per result dword.
554define amdgpu_kernel void @maxnum_v4f16(
555; SI-LABEL: maxnum_v4f16:
556; SI:       ; %bb.0: ; %entry
557; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
558; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
559; SI-NEXT:    s_mov_b32 s3, 0xf000
560; SI-NEXT:    s_mov_b32 s2, -1
561; SI-NEXT:    s_waitcnt lgkmcnt(0)
562; SI-NEXT:    s_mov_b32 s0, s4
563; SI-NEXT:    s_mov_b32 s1, s5
564; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
565; SI-NEXT:    s_waitcnt lgkmcnt(0)
566; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
567; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
568; SI-NEXT:    s_lshr_b32 s4, s4, 16
569; SI-NEXT:    s_lshr_b32 s5, s5, 16
570; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
571; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
572; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
573; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
574; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
575; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
576; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
577; SI-NEXT:    s_waitcnt lgkmcnt(0)
578; SI-NEXT:    s_lshr_b32 s6, s5, 16
579; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
580; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
581; SI-NEXT:    s_lshr_b32 s4, s4, 16
582; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
583; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
584; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
585; SI-NEXT:    v_max_f32_e32 v3, v3, v5
586; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
587; SI-NEXT:    v_max_f32_e32 v1, v1, v5
588; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
589; SI-NEXT:    v_max_f32_e32 v2, v2, v5
590; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
591; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
592; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
593; SI-NEXT:    v_max_f32_e32 v0, v0, v4
594; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
595; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
596; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
597; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
598; SI-NEXT:    v_or_b32_e32 v1, v1, v3
599; SI-NEXT:    v_or_b32_e32 v0, v0, v2
600; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
601; SI-NEXT:    s_endpgm
602;
603; VI-LABEL: maxnum_v4f16:
604; VI:       ; %bb.0: ; %entry
605; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
606; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
607; VI-NEXT:    s_mov_b32 s3, 0xf000
608; VI-NEXT:    s_mov_b32 s2, -1
609; VI-NEXT:    s_waitcnt lgkmcnt(0)
610; VI-NEXT:    s_mov_b32 s0, s4
611; VI-NEXT:    s_mov_b32 s1, s5
612; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
613; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
614; VI-NEXT:    s_waitcnt lgkmcnt(0)
615; VI-NEXT:    v_max_f16_e64 v1, s5, s5
616; VI-NEXT:    v_max_f16_e64 v0, s7, s7
617; VI-NEXT:    s_lshr_b32 s5, s5, 16
618; VI-NEXT:    s_lshr_b32 s7, s7, 16
619; VI-NEXT:    v_max_f16_e32 v0, v1, v0
620; VI-NEXT:    v_max_f16_e64 v2, s5, s5
621; VI-NEXT:    v_max_f16_e64 v1, s7, s7
622; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
623; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
624; VI-NEXT:    v_max_f16_e64 v2, s4, s4
625; VI-NEXT:    v_max_f16_e64 v0, s6, s6
626; VI-NEXT:    s_lshr_b32 s4, s4, 16
627; VI-NEXT:    s_lshr_b32 s5, s6, 16
628; VI-NEXT:    v_max_f16_e32 v0, v2, v0
629; VI-NEXT:    v_max_f16_e64 v2, s5, s5
630; VI-NEXT:    v_max_f16_e64 v3, s4, s4
631; VI-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
632; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
633; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
634; VI-NEXT:    s_endpgm
635;
636; GFX9-LABEL: maxnum_v4f16:
637; GFX9:       ; %bb.0: ; %entry
638; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
639; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
640; GFX9-NEXT:    s_mov_b32 s3, 0xf000
641; GFX9-NEXT:    s_mov_b32 s2, -1
642; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX9-NEXT:    s_mov_b32 s0, s4
644; GFX9-NEXT:    s_mov_b32 s1, s5
645; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
646; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
647; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
649; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
650; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
651; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
652; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
653; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
654; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
655; GFX9-NEXT:    s_endpgm
; Kernel arguments (result pointer first, then both inputs) and the IR body
; that llc compiles for the assertions above.
656    <4 x half> addrspace(1)* %r,
657    <4 x half> addrspace(1)* %a,
658    <4 x half> addrspace(1)* %b) #0 {
659entry:
660  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
661  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
662  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
663  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
664  ret void
665}
666
; Four-element vector case with a constant first operand: loads a <4 x half>
; from %b and calls @llvm.maxnum.v4f16(<8.0, 2.0, 3.0, 4.0>, %b.val).
; In the expected output the gfx900 run materializes the constant as two
; packed words, 0x40004800 (elements 0-1) and 0x44004200 (elements 2-3); the
; fiji run uses the f16 literals 0x4800, 0x4000, 0x4200 and 0x4400; the tahiti
; run uses the f32 forms 0x41000000, 2.0, 0x40400000 and 4.0.
; NOTE(review): this kernel is named fmax_* while the other kernels in this
; file are named maxnum_*; renaming it would also require regenerating the
; label assertions below.
667define amdgpu_kernel void @fmax_v4f16_imm_a(
668; SI-LABEL: fmax_v4f16_imm_a:
669; SI:       ; %bb.0: ; %entry
670; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
671; SI-NEXT:    s_mov_b32 s3, 0xf000
672; SI-NEXT:    s_mov_b32 s2, -1
673; SI-NEXT:    s_waitcnt lgkmcnt(0)
674; SI-NEXT:    s_mov_b32 s0, s4
675; SI-NEXT:    s_mov_b32 s1, s5
676; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
677; SI-NEXT:    s_waitcnt lgkmcnt(0)
678; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
679; SI-NEXT:    s_lshr_b32 s5, s5, 16
680; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
681; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
682; SI-NEXT:    s_lshr_b32 s4, s4, 16
683; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
684; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
685; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
686; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
687; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
688; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
689; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
690; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
691; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
692; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
693; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
694; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
695; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
696; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
697; SI-NEXT:    v_or_b32_e32 v1, v1, v2
698; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
699; SI-NEXT:    v_or_b32_e32 v0, v0, v2
700; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
701; SI-NEXT:    s_endpgm
702;
703; VI-LABEL: fmax_v4f16_imm_a:
704; VI:       ; %bb.0: ; %entry
705; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
706; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
707; VI-NEXT:    s_mov_b32 s3, 0xf000
708; VI-NEXT:    s_mov_b32 s2, -1
709; VI-NEXT:    s_waitcnt lgkmcnt(0)
710; VI-NEXT:    s_mov_b32 s0, s4
711; VI-NEXT:    s_mov_b32 s1, s5
712; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
713; VI-NEXT:    s_waitcnt lgkmcnt(0)
714; VI-NEXT:    v_max_f16_e64 v1, s5, s5
715; VI-NEXT:    s_lshr_b32 s5, s5, 16
716; VI-NEXT:    v_max_f16_e64 v3, s5, s5
717; VI-NEXT:    v_max_f16_e64 v2, s4, s4
718; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
719; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
720; VI-NEXT:    s_lshr_b32 s4, s4, 16
721; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
722; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2
723; VI-NEXT:    v_max_f16_e64 v2, s4, s4
724; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
725; VI-NEXT:    v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
726; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
727; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
728; VI-NEXT:    s_endpgm
729;
730; GFX9-LABEL: fmax_v4f16_imm_a:
731; GFX9:       ; %bb.0: ; %entry
732; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
733; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
734; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
735; GFX9-NEXT:    s_mov_b32 s3, 0xf000
736; GFX9-NEXT:    s_mov_b32 s2, -1
737; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
738; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
739; GFX9-NEXT:    s_mov_b32 s0, s4
740; GFX9-NEXT:    s_mov_b32 s1, s5
741; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
742; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
743; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
744; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
745; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
746; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
747; GFX9-NEXT:    s_endpgm
; Kernel arguments (result pointer, then the single input) and the IR body
; that llc compiles for the assertions above.
748    <4 x half> addrspace(1)* %r,
749    <4 x half> addrspace(1)* %b) #0 {
750entry:
751  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
752  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
753  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
754  ret void
755}
756
; Attribute group #0, referenced by every kernel definition visible in this
; file: sets the "denormal-fp-math-f32" string attribute to
; "preserve-sign,preserve-sign".
; NOTE(review): per the LLVM LangRef the two comma-separated fields are the
; output and input f32 denormal modes -- confirm for the LLVM version in use.
757attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
758