; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s

; GCN-LABEL: {{^}}s_sext_i1_to_i32:
; GCN: v_cndmask_b32_e64
; GCN: s_endpgm
define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}test_s_sext_i32_to_i64:
; GCN: s_ashr_i32
; GCN: s_endpgm
define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_sext_i1_to_i64:
; GCN: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_sext_i32_to_i64:
; GCN: s_ashr_i32
; GCN: s_endpgm
define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
  %sext = sext i32 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}v_sext_i32_to_i64:
; GCN: v_ashr
; GCN: s_endpgm
define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_sext_i16_to_i64:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
  %sext = sext i16 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_sext_i1_to_i16:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer, so that we verify
; it is selected correctly. In s_sext_i1_to_i16, the sign_extend node
; is optimized to a select very early.
; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010
; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24
; SI-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8

; VI-DAG: v_bfe_i32 [[VEXT1:v[0-9]+]], v{{[0-9]+}}, 0, 8

; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]]
; SI-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]]
; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]]

; GCN-DAG: buffer_store_dword [[VEXT0]]
; GCN-DAG: buffer_store_dword [[VEXT1]]
; GCN-DAG: buffer_store_dword [[VEXT2]]
; GCN-DAG: buffer_store_dword [[VEXT3]]

; GCN: s_endpgm
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; FIXME: need to optimize same sequence as above test to avoid
; this shift.
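; (Per the checks below, VI currently extracts byte 1 with a v_lshrrev_b16
; of the loaded value followed by a v_bfe_i32 of the shifted result, while
; SI extracts it directly with a single v_bfe_i32.)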
; VI-DAG: v_lshrrev_b16_e32 [[SH16:v[0-9]+]], 8, [[VAL]]
; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]]
; VI-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
; VI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
; VI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[SH16]], 0, 8

; SI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
; SI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
; SI: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8

; GCN: buffer_store_dword [[EXT0]]
; GCN: buffer_store_dword [[EXT1]]
; GCN: buffer_store_dword [[EXT2]]
; GCN: buffer_store_dword [[EXT3]]
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %a = load i32, i32 addrspace(1)* %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
; SI-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
; SI-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16

; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16

; GCN-DAG: s_sext_i32_i16
; GCN-DAG: s_sext_i32_i16
; GCN: s_endpgm
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: s_endpgm
define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %a = load i64, i64 addrspace(1)* %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }