; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s

; Integer subtraction codegen tests.
;
; The four llc invocations above compile this file for verde, fiji and gfx900
; (all -march=amdgcn) and for redwood (-march=r600). Each invocation pipes its
; output to FileCheck with its own set of check prefixes, so a check line only
; applies to the invocations whose prefix list names it.
;
; Every kernel below performs one `sub` of a given type and stores the result
; to %out; the check lines in front of it list the instructions expected in
; the llc output for that kernel.

; Intrinsic used by the i16 and i64 kernels to index their input pointers.
declare i32 @llvm.r600.read.tidig.x() readnone

; i32 sub of two kernel arguments. The second s_load_dwordx2 pattern captures
; the bounds of the loaded register pair as A and B, and s_sub_i32 must use
; exactly those two registers as its operands.
; FUNC-LABEL: {{^}}s_sub_i32:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]]
define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %result = sub i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; Constant minus kernel argument; 1234 is expected to appear as the literal
; 0x4d2 in the first source operand of s_sub_i32.
; FUNC-LABEL: {{^}}s_sub_imm_i32:
; GCN: s_load_dword [[A:s[0-9]+]]
; GCN: s_sub_i32 s{{[0-9]+}}, 0x4d2, [[A]]
define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) {
  %result = sub i32 1234, %a
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; i32 sub of two values loaded from consecutive elements of %in.
; FUNC-LABEL: {{^}}test_sub_i32:
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %b = load i32, i32 addrspace(1)* %b_ptr
  %result = sub i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; Constant minus loaded value; 123 is expected as the inline literal 0x7b.
; FUNC-LABEL: {{^}}test_sub_imm_i32:
; EG: SUB_INT

; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x7b, v{{[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load i32, i32 addrspace(1)* %in
  %result = sub i32 123, %a
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; <2 x i32> sub; the checks expect one subtract instruction per element.
; FUNC-LABEL: {{^}}test_sub_v2i32:
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}

; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = sub <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; <4 x i32> sub; the checks expect one subtract instruction per element.
; FUNC-LABEL: {{^}}test_sub_v4i32:
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}

; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = sub <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; i16 sub of two volatile loads indexed by the tidig.x intrinsic. The verde
; checks expect a 32-bit subtract; the fiji and gfx900 checks expect a 16-bit
; v_sub_u16.
; FUNC-LABEL: {{^}}test_sub_i16:
; SI: v_subrev_i32_e32 v{{[0-9]+}}, vcc,
; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i32 1
  %a = load volatile i16, i16 addrspace(1)* %gep
  %b = load volatile i16, i16 addrspace(1)* %b_ptr
  %result = sub i16 %a, %b
  store i16 %result, i16 addrspace(1)* %out
  ret void
}

; <2 x i16> sub. The fiji checks expect an e32 plus an sdwa v_sub_u16; the
; gfx900 checks expect a single v_pk_sub_i16.
; FUNC-LABEL: {{^}}test_sub_v2i16:
; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9: v_pk_sub_i16
define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = sub <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; <4 x i16> sub. The fiji checks expect two e32/sdwa pairs; the gfx900 checks
; expect two v_pk_sub_i16 instructions.
; FUNC-LABEL: {{^}}test_sub_v4i16:
; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9: v_pk_sub_i16
; GFX9: v_pk_sub_i16
define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = sub <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
  ret void
}

; i64 sub of two kernel arguments; the amdgcn checks expect an s_sub_u32
; followed by an s_subb_u32.
; FUNC-LABEL: {{^}}s_sub_i64:
; GCN: s_sub_u32
; GCN: s_subb_u32

; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
; EG-DAG: SUB_INT {{[* ]*}}
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
; EG-DAG: SUB_INT {{[* ]*}}
define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
  %result = sub i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out, align 8
  ret void
}

; i64 sub of two loaded values, each indexed by the tidig.x intrinsic. Each
; amdgcn target expects a sub/subb pair, with per-target mnemonics.
; FUNC-LABEL: {{^}}v_sub_i64:
; SI: v_sub_i32_e32
; SI: v_subb_u32_e32

; VI: v_sub_u32_e32
; VI: v_subb_u32_e32

; GFX9: v_sub_co_u32_e32
; GFX9: v_subb_co_u32_e32

; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
; EG-DAG: SUB_INT {{[* ]*}}
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
; EG-DAG: SUB_INT {{[* ]*}}
define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() readnone
  %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
  %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
  %a = load i64, i64 addrspace(1)* %a_ptr
  %b = load i64, i64 addrspace(1)* %b_ptr
  %result = sub i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out, align 8
  ret void
}

; <2 x i64> sub; the checks expect two sub/subb pairs per amdgcn target.
; FUNC-LABEL: {{^}}v_test_sub_v2i64:
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,

; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,

; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
  %tid = call i32 @llvm.r600.read.tidig.x() readnone
  %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
  %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
  %result = sub <2 x i64> %a, %b
  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
  ret void
}

; <4 x i64> sub; the checks expect four sub/subb pairs per amdgcn target.
; FUNC-LABEL: {{^}}v_test_sub_v4i64:
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,

; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,

; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
  %tid = call i32 @llvm.r600.read.tidig.x() readnone
  %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
  %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr
  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
  %result = sub <4 x i64> %a, %b
  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
  ret void
}