; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

; Each kernel below loads a value, narrows it (trunc, lshr + trunc, or an
; and-mask) and stores the result. The checks pin the width of the load and
; of the store that are expected in the generated code.
;
; Kernels named *_kernarg_* read the value from a kernel argument and expect
; a scalar s_load_dword. Kernels named *_buffer_load_* read it from an
; addrspace(1) pointer indexed by the workitem id and expect a buffer_load.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; Make sure we don't turn the 32-bit argument load into a 16-bit
; load. There aren't extending scalar loads, so that would require
; using a buffer_load instruction.

; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
; SI: s_load_dword s
; SI: buffer_store_short v
define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
  %trunc = trunc i32 %arg to i16
  store i16 %trunc, i16 addrspace(1)* %out
  ret void
}

; It should be OK (and probably performance neutral) to reduce this,
; but we don't know if the load is uniform yet.

; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
; SI: buffer_load_dword v
; SI: buffer_store_short v
define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep.in
  %trunc = trunc i32 %load to i16
  store i16 %trunc, i16 addrspace(1)* %gep.out
  ret void
}

; i32 kernel argument truncated to i8: a dword scalar load and a byte store
; are expected.

; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
  %trunc = trunc i32 %arg to i8
  store i8 %trunc, i8 addrspace(1)* %out
  ret void
}

; Per-workitem i32 load truncated to i8: a dword load and a byte store are
; expected.

; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep.in
  %trunc = trunc i32 %load to i8
  store i8 %trunc, i8 addrspace(1)* %gep.out
  ret void
}

; i32 kernel argument truncated to i1: a dword scalar load is expected, and
; the i1 result is expected to be written with a byte store.

; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
; SI: s_load_dword s
; SI: buffer_store_byte v
define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
  %trunc = trunc i32 %arg to i1
  store i1 %trunc, i1 addrspace(1)* %out
  ret void
}

; Per-workitem i32 load truncated to i1: a dword load and a byte store are
; expected.

; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep.in
  %trunc = trunc i32 %load to i1
  store i1 %trunc, i1 addrspace(1)* %gep.out
  ret void
}

; i64 kernel argument truncated to i32: a single-dword scalar load is
; expected.
; NOTE(review): the unnamed [8 x i32] argument presumably pads the argument
; list so the %arg load stays separate from the %out pointer load - confirm.

; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
; SI: s_load_dword s
; SI: buffer_store_dword v
define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %trunc = trunc i64 %arg to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; Per-workitem i64 load truncated to i32: a single-dword load is expected.

; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
; SI: buffer_load_dword v
; SI: buffer_store_dword v
define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %load = load i64, i64 addrspace(1)* %gep.in
  %trunc = trunc i64 %load to i32
  store i32 %trunc, i32 addrspace(1)* %gep.out
  ret void
}

; lshr by 32 followed by trunc keeps only the upper 32 bits of the i64
; kernel argument: a single-dword scalar load is expected.

; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
; SI: s_load_dword s
; SI: buffer_store_dword v
define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %srl = lshr i64 %arg, 32
  %trunc = trunc i64 %srl to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; Same upper-32-bit extraction applied to a per-workitem i64 load: a
; single-dword load is expected.

; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
; SI: buffer_load_dword v
; SI: buffer_store_dword v
define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %load = load i64, i64 addrspace(1)* %gep.in
  %srl = lshr i64 %load, 32
  %trunc = trunc i64 %srl to i32
  store i32 %trunc, i32 addrspace(1)* %gep.out
  ret void
}

; Might as well reduce to 8-bit loads.
; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
  %trunc = trunc i16 %arg to i8
  store i8 %trunc, i8 addrspace(1)* %out
  ret void
}

; Per-workitem i16 load truncated to i8: here the load is expected to be
; narrowed to an unsigned byte load.

; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
; SI: buffer_load_ubyte v
; SI: buffer_store_byte v
define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
  %load = load i16, i16 addrspace(1)* %gep.in
  %trunc = trunc i16 %load to i8
  store i8 %trunc, i8 addrspace(1)* %gep.out
  ret void
}

; Upper 32 bits of an i64 kernel argument truncated to i8: a single-dword
; scalar load and a byte store are expected.

; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %srl = lshr i64 %arg, 32
  %trunc = trunc i64 %srl to i8
  store i8 %trunc, i8 addrspace(1)* %out
  ret void
}

; Upper 32 bits of a per-workitem i64 load truncated to i8: a single-dword
; load and a byte store are expected.

; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
  %load = load i64, i64 addrspace(1)* %gep.in
  %srl = lshr i64 %load, 32
  %trunc = trunc i64 %srl to i8
  store i8 %trunc, i8 addrspace(1)* %gep.out
  ret void
}

; i64 kernel argument truncated directly to i8: a single-dword scalar load
; and a byte store are expected.

; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %trunc = trunc i64 %arg to i8
  store i8 %trunc, i8 addrspace(1)* %out
  ret void
}

; Per-workitem i64 load truncated directly to i8: a single-dword load and a
; byte store are expected.

; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
  %load = load i64, i64 addrspace(1)* %gep.in
  %trunc = trunc i64 %load to i8
  store i8 %trunc, i8 addrspace(1)* %gep.out
  ret void
}

; i32 loaded through an addrspace(4) pointer and masked with 0xffff: the
; checks expect a full dword scalar load at offset 0x0 followed by an
; s_and_b32 with 0xffff, i.e. the mask is not folded into a narrower load.

; FUNC-LABEL: {{^}}smrd_mask_i32_to_i16:
; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0
; SI: s_waitcnt lgkmcnt(0)
; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff
define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %val = load i32, i32 addrspace(4)* %in
  %mask = and i32 %val, 65535
  store i32 %mask, i32 addrspace(1)* %out
  ret void
}

; <2 x i32> load bitcast to i64, shifted right by 32 and truncated to i32:
; a single-dword load is expected.

; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
; SI: buffer_load_dword v
; SI: buffer_store_dword v
define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %bc = bitcast <2 x i32> %ld to i64
  %hi = lshr i64 %bc, 32
  %trunc = trunc i64 %hi to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}