; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable

; GCN-LABEL: {{^}}s_sub_i32:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]]
define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %result = sub i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_sub_imm_i32:
; GCN: s_load_dword [[A:s[0-9]+]]
; GCN: s_sub_i32 s{{[0-9]+}}, 0x4d2, [[A]]
define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) {
  %result = sub i32 1234, %a
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_sub_i32:
; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %b = load i32, i32 addrspace(1)* %b_ptr
  %result = sub i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_sub_imm_i32:
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x7b, v{{[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load i32, i32 addrspace(1)* %in
  %result = sub i32 123, %a
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_sub_v2i32:
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}

; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = sub <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_sub_v4i32:
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}

; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = sub <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_sub_i16:
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i32 1
  %a = load volatile i16, i16 addrspace(1)* %gep
  %b = load volatile i16, i16 addrspace(1)* %b_ptr
  %result = sub i16 %a, %b
  store i16 %result, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_sub_v2i16:
; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9: v_pk_sub_i16
define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = sub <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_sub_v4i16:
; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9: v_pk_sub_i16
; GFX9: v_pk_sub_i16
define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = sub <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_sub_i64:
; GCN: s_sub_u32
; GCN: s_subb_u32
define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
  %result = sub i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}v_sub_i64:
; SI: v_sub_i32_e32
; SI: v_subb_u32_e32

; VI: v_sub_u32_e32
; VI: v_subb_u32_e32

; GFX9: v_sub_co_u32_e32
; GFX9: v_subb_co_u32_e32
define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
  %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
  %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
  %a = load i64, i64 addrspace(1)* %a_ptr
  %b = load i64, i64 addrspace(1)* %b_ptr
  %result = sub i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}v_test_sub_v2i64:
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,

; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,

; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
  %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
  %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
  %result = sub <2 x i64> %a, %b
  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_test_sub_v4i64:
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,

; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,

; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
  %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
  %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr
  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
  %result = sub <4 x i64> %a, %b
  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
  ret void
}

; Make sure the VOP3 form of sub is initially selected. Otherwise a pair
; of copies from/to VCC would be necessary.

; GCN-LABEL: {{^}}sub_select_vop3:
; SI: v_subrev_i32_e64 v0, s[0:1], s0, v0
; VI: v_subrev_u32_e64 v0, s[0:1], s0, v0
; GFX9: v_subrev_u32_e32 v0, s0, v0

; GCN: ; def vcc
; GCN: ds_write_b32
; GCN: ; use vcc
define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) {
  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
  %sub = sub i32 %v, %s
  store i32 %sub, i32 addrspace(3)* undef
  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
  ret void
}