; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s

; Code generation checks for the llvm.maxnum intrinsic on half and on vectors
; of 2, 3 and 4 halfs, compiled as amdgpu_kernel functions for tahiti, fiji and
; gfx900. The expected-output lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.

declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)

; Scalar half: both operands are read with volatile loads from %a and %b and
; the maxnum result is stored to %r.
define amdgpu_kernel void @maxnum_f16(
; SI-LABEL: maxnum_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_max_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s6
; VI-NEXT:    s_mov_b32 s13, s7
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v1, v1, v1
; VI-NEXT:    v_max_f16_e32 v0, v0, v1
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_mov_b32 s14, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s6
; GFX9-NEXT:    s_mov_b32 s13, s7
; GFX9-NEXT:    s_mov_b32 s15, s3
; GFX9-NEXT:    s_mov_b32 s10, s2
; GFX9-NEXT:    s_mov_b32 s11, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) #0 {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half with the constant 3.0 as the first maxnum operand.
define amdgpu_kernel void @maxnum_f16_imm_a(
; SI-LABEL: maxnum_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_mov_b32 s4, s6
; GFX9-NEXT:    s_mov_b32 s5, s7
; GFX9-NEXT:    s_mov_b32 s6, s2
; GFX9-NEXT:    s_mov_b32 s7, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b) #0 {
entry:
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half with the constant 4.0 as the second maxnum operand.
define amdgpu_kernel void @maxnum_f16_imm_b(
; SI-LABEL: maxnum_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_mov_b32 s4, s6
; GFX9-NEXT:    s_mov_b32 s5, s7
; GFX9-NEXT:    s_mov_b32 s6, s2
; GFX9-NEXT:    s_mov_b32 s7, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; <2 x half> with both operands loaded from memory.
define amdgpu_kernel void @maxnum_v2f16(
; SI-LABEL: maxnum_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s6, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; SI-NEXT:    s_lshr_b32 s0, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v2, v3, v2
; SI-NEXT:    v_max_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_max_f16_e64 v0, s5, s5
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    s_lshr_b32 s5, s5, 16
; VI-NEXT:    v_max_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s5, s5
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v2f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, s6, s6
; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> with the constant vector <3.0, 4.0> as the first maxnum operand.
define amdgpu_kernel void @maxnum_v2f16_imm_a(
; SI-LABEL: maxnum_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX9-NEXT:    v_pk_max_f16 v0, v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> with the constant vector <4.0, 3.0> as the second maxnum operand.
define amdgpu_kernel void @maxnum_v2f16_imm_b(
; SI-LABEL: maxnum_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX9-NEXT:    v_pk_max_f16 v0, v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <3 x half> with both operands loaded from memory.
; FIXME: Scalarize with undef half
define amdgpu_kernel void @maxnum_v3f16(
; SI-LABEL: maxnum_v3f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s6, 16
; SI-NEXT:    s_lshr_b32 s4, s8, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_max_f32_e32 v2, v3, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_max_f32_e32 v1, v1, v3
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_max_f32_e32 v0, v0, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v3f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_max_f16_e64 v0, s6, s6
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    s_lshr_b32 s6, s6, 16
; VI-NEXT:    v_max_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s6, s6
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_max_f16_e64 v1, s7, s7
; VI-NEXT:    v_max_f16_e64 v2, s5, s5
; VI-NEXT:    v_max_f16_e32 v1, v2, v1
; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v3f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    <3 x half> addrspace(1)* %r,
    <3 x half> addrspace(1)* %a,
    <3 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
  %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
  ret void
}

; <4 x half> with both operands loaded from memory.
define amdgpu_kernel void @maxnum_v4f16(
; SI-LABEL: maxnum_v4f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    s_lshr_b32 s5, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s6, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
; SI-NEXT:    v_max_f32_e32 v3, v3, v5
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
; SI-NEXT:    v_max_f32_e32 v1, v1, v5
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
; SI-NEXT:    v_max_f32_e32 v2, v2, v5
; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_max_f32_e32 v0, v0, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: maxnum_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v1, s5, s5
; VI-NEXT:    v_max_f16_e64 v0, s7, s7
; VI-NEXT:    s_lshr_b32 s5, s5, 16
; VI-NEXT:    s_lshr_b32 s7, s7, 16
; VI-NEXT:    v_max_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v2, s5, s5
; VI-NEXT:    v_max_f16_e64 v1, s7, s7
; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_max_f16_e64 v0, s6, s6
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    s_lshr_b32 s5, s6, 16
; VI-NEXT:    v_max_f16_e32 v0, v2, v0
; VI-NEXT:    v_max_f16_e64 v2, s5, s5
; VI-NEXT:    v_max_f16_e64 v3, s4, s4
; VI-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: maxnum_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %a,
    <4 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

; <4 x half> with the constant vector <8.0, 2.0, 3.0, 4.0> as the first maxnum
; operand.
define amdgpu_kernel void @fmax_v4f16_imm_a(
; SI-LABEL: fmax_v4f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
; SI-NEXT:    s_lshr_b32 s5, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fmax_v4f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v1, s5, s5
; VI-NEXT:    s_lshr_b32 s5, s5, 16
; VI-NEXT:    v_max_f16_e64 v3, s5, s5
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
; VI-NEXT:    v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fmax_v4f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %b) #0 {
entry:
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

; Attribute group shared by every kernel above: sets denormal-fp-math-f32 to
; preserve-sign.
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }