; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s

declare half @llvm.minnum.f16(half %a, half %b)
declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)

define amdgpu_kernel void @minnum_f16_ieee(
; SI-LABEL: minnum_f16_ieee:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_f16_ieee:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s6
; VI-NEXT:    s_mov_b32 s13, s7
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v1, v1, v1
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_f16_ieee:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_mov_b32 s14, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s6
; GFX9-NEXT:    s_mov_b32 s13, s7
; GFX9-NEXT:    s_mov_b32 s15, s3
; GFX9-NEXT:    s_mov_b32 s10, s2
; GFX9-NEXT:    s_mov_b32 s11, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) #0 {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
; SI-LABEL: minnum_f16_no_ieee:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_min_f32_e32 v0, v0, v1
; SI-NEXT:    ; return to shader part epilog
;
; VI-LABEL: minnum_f16_no_ieee:
; VI:       ; %bb.0:
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: minnum_f16_no_ieee:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
; GFX9-NEXT:    ; return to shader part epilog
  %r.val = call half @llvm.minnum.f16(half %a, half %b)
  ret half %r.val
}

define amdgpu_kernel void @minnum_f16_imm_a(
; SI-LABEL: minnum_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_mov_b32 s4, s6
; GFX9-NEXT:    s_mov_b32 s5, s7
; GFX9-NEXT:    s_mov_b32 s6, s2
; GFX9-NEXT:    s_mov_b32 s7, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b) #0 {
entry:
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @minnum_f16_imm_b(
; SI-LABEL: minnum_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_mov_b32 s4, s6
; GFX9-NEXT:    s_mov_b32 s5, s7
; GFX9-NEXT:    s_mov_b32 s6, s2
; GFX9-NEXT:    s_mov_b32 s7, s3
; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @minnum_v2f16_ieee(
; SI-LABEL: minnum_v2f16_ieee:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s6, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; SI-NEXT:    s_lshr_b32 s0, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v2, v3, v2
; SI-NEXT:    v_min_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v2f16_ieee:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_max_f16_e64 v0, s5, s5
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    s_lshr_b32 s5, s5, 16
; VI-NEXT:    v_min_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s5, s5
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v2f16_ieee:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, s6, s6
; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
; SI-LABEL: minnum_v2f16_no_ieee:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_min_f32_e32 v0, v0, v2
; SI-NEXT:    v_min_f32_e32 v1, v1, v3
; SI-NEXT:    ; return to shader part epilog
;
; VI-LABEL: minnum_v2f16_no_ieee:
; VI:       ; %bb.0:
; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_min_f16_e32 v0, v0, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: minnum_v2f16_no_ieee:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
; GFX9-NEXT:    ; return to shader part epilog
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
  ret <2 x half> %r.val
}

define amdgpu_kernel void @minnum_v2f16_imm_a(
; SI-LABEL: minnum_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX9-NEXT:    v_pk_min_f16 v0, v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) #0 {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @minnum_v2f16_imm_b(
; SI-LABEL: minnum_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_lshr_b32 s2, s2, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v0, s4, s4
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s2, s2
; GFX9-NEXT:    v_pk_min_f16 v0, v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; FIXME: Scalarize with undef half
define amdgpu_kernel void @minnum_v3f16(
; SI-LABEL: minnum_v3f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s6, 16
; SI-NEXT:    s_lshr_b32 s4, s8, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_min_f32_e32 v2, v3, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_min_f32_e32 v1, v1, v3
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_min_f32_e32 v0, v0, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v3f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v1, s4, s4
; VI-NEXT:    v_max_f16_e64 v0, s6, s6
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    s_lshr_b32 s6, s6, 16
; VI-NEXT:    v_min_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v1, s6, s6
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_max_f16_e64 v1, s7, s7
; VI-NEXT:    v_max_f16_e64 v2, s5, s5
; VI-NEXT:    v_min_f16_e32 v1, v2, v1
; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v3f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    <3 x half> addrspace(1)* %r,
    <3 x half> addrspace(1)* %a,
    <3 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
  %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
  %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
  store <3 x half> %r.val, <3 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @minnum_v4f16(
; SI-LABEL: minnum_v4f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    s_lshr_b32 s5, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s6, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
; SI-NEXT:    v_min_f32_e32 v3, v3, v5
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
; SI-NEXT:    v_min_f32_e32 v1, v1, v5
; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
; SI-NEXT:    v_min_f32_e32 v2, v2, v5
; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_min_f32_e32 v0, v0, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: minnum_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v1, s5, s5
; VI-NEXT:    v_max_f16_e64 v0, s7, s7
; VI-NEXT:    s_lshr_b32 s5, s5, 16
; VI-NEXT:    s_lshr_b32 s7, s7, 16
; VI-NEXT:    v_min_f16_e32 v0, v1, v0
; VI-NEXT:    v_max_f16_e64 v2, s5, s5
; VI-NEXT:    v_max_f16_e64 v1, s7, s7
; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_max_f16_e64 v0, s6, s6
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    s_lshr_b32 s5, s6, 16
; VI-NEXT:    v_min_f16_e32 v0, v2, v0
; VI-NEXT:    v_max_f16_e64 v2, s5, s5
; VI-NEXT:    v_max_f16_e64 v3, s4, s4
; VI-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: minnum_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %a,
    <4 x half> addrspace(1)* %b) #0 {
entry:
  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @fmin_v4f16_imm_a(
; SI-LABEL: fmin_v4f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
; SI-NEXT:    s_lshr_b32 s5, s5, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fmin_v4f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v1, s5, s5
; VI-NEXT:    s_lshr_b32 s5, s5, 16
; VI-NEXT:    v_max_f16_e64 v3, s5, s5
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
; VI-NEXT:    s_lshr_b32 s4, s4, 16
; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2
; VI-NEXT:    v_max_f16_e64 v2, s4, s4
; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
; VI-NEXT:    v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fmin_v4f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %b) #0 {
entry:
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }