; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9
;
; Codegen tests for the llvm.amdgcn.cvt.pkrtz intrinsic (pack two f32 values
; into a <2 x half> with round-toward-zero). Covers uniform (SGPR) and
; divergent (VGPR) operands, inline-immediate operands, and folding of
; fneg/fabs into the instruction's source modifiers.
; NOTE(review): do not hand-edit the CHECK lines below; regenerate them with
; utils/update_llc_test_checks.py instead.

; Both operands are uniform kernel arguments; the result is stored to global
; memory. One operand must be copied to a VGPR since VOP2 takes at most one
; SGPR source.
define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, v1
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
  store <2 x half> %result, <2 x half> addrspace(1)* %out
  ret void
}

; The same uniform value feeds both operands; SI needs the VOP3 (_e64)
; encoding to read one SGPR for both sources.
define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s2, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, s0
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
  store <2 x half> %result, <2 x half> addrspace(1)* %out
  ret void
}

; With both inputs undef the intrinsic call (and the store of its result)
; folds away completely; only the program end remains.
define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
; GCN-LABEL: s_cvt_pkrtz_undef_undef:
; GCN: ; %bb.0:
; GCN-NEXT: s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
  store <2 x half> %result, <2 x half> addrspace(1)* %out
  ret void
}

; Both operands are divergent values loaded per-workitem (volatile loads keep
; them from being combined); plain VOP2 encoding suffices.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Second operand is the inline immediate 1.0, folded directly into the
; instruction; SI uses the VOP3 (_e64) encoding to place the immediate in
; src1.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; First operand is the inline immediate 1.0; this time the VOP2 (_e32)
; encoding works on SI because the immediate sits in src0.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; fneg on the low operand folds into the src0 negate modifier (-v) instead of
; emitting a separate xor/sub.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; fneg on the high operand folds into the src1 negate modifier.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; fneg on both operands folds into negate modifiers on both sources.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; fneg(fabs(a)) on the low operand folds into the combined negate+abs source
; modifier (-|v|); plain fneg on the high operand folds as before.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %neg.fabs.a = fsub float -0.0, %fabs.a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare float @llvm.fabs.f32(float) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1


attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }