; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s

; Code-generation tests for ARM NEON single-lane stores.
;
; Layout of this file:
;   * vst1lane*  - a plain scalar store of an element extracted from a vector.
;   * vst2lane*, vst3lane*, vst4lane* - calls to the
;     llvm.arm.neon.vst{2,3,4}lane intrinsics.
;   * *_update variants also advance the pointer after the store and write it
;     back, to exercise the post-increment (writeback) addressing forms.
;
; NOTE(review): judging by the expected output below, the last two i32
; operands of the vstNlane intrinsics are the lane index and the alignment in
; bytes - confirm against the intrinsic definitions.
;
; Directives spelled with a hyphen inside the prefix are intentionally
; disabled: FileCheck does not recognise them, so only the function label is
; verified for those tests.

;-----------------------------------------------------------------------------
; VST1 (single lane), 64-bit source vectors
;-----------------------------------------------------------------------------

define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst1lanei8:
;Check the (default) alignment.
;CHECK: vst1.8 {d16[3]}, [r0]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = extractelement <8 x i8> %tmp1, i32 3
  store i8 %tmp2, i8* %A, align 8
  ret void
}

;Check for a post-increment updating store.
define void @vst1lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK: vst1lanei8_update:
;CHECK: vst1.8 {d16[3]}, [r2]!
  %A = load i8** %ptr
  %tmp1 = load <8 x i8>* %B
  %tmp2 = extractelement <8 x i8> %tmp1, i32 3
  store i8 %tmp2, i8* %A, align 8
  ; Advance by one element and write the pointer back.
  %tmp3 = getelementptr i8* %A, i32 1
  store i8* %tmp3, i8** %ptr
  ret void
}

define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst1lanei16:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vst1.16 {d16[2]}, [r0, :16]
  %tmp1 = load <4 x i16>* %B
  %tmp2 = extractelement <4 x i16> %tmp1, i32 2
  store i16 %tmp2, i16* %A, align 8
  ret void
}

define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst1lanei32:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vst1.32 {d16[1]}, [r0, :32]
  %tmp1 = load <2 x i32>* %B
  %tmp2 = extractelement <2 x i32> %tmp1, i32 1
  store i32 %tmp2, i32* %A, align 8
  ret void
}

define void @vst1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst1lanef:
;CHECK: vst1.32 {d16[1]}, [r0]
  %tmp1 = load <2 x float>* %B
  %tmp2 = extractelement <2 x float> %tmp1, i32 1
  store float %tmp2, float* %A
  ret void
}

;-----------------------------------------------------------------------------
; VST1 (single lane), 128-bit source vectors
;-----------------------------------------------------------------------------

define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst1laneQi8:
; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.8 {d17[1]}, [r0]
  %tmp1 = load <16 x i8>* %B
  %tmp2 = extractelement <16 x i8> %tmp1, i32 9
  store i8 %tmp2, i8* %A, align 8
  ret void
}

define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst1laneQi16:
;CHECK: vst1.16 {d17[1]}, [r0, :16]
  %tmp1 = load <8 x i16>* %B
  %tmp2 = extractelement <8 x i16> %tmp1, i32 5
  store i16 %tmp2, i16* %A, align 8
  ret void
}

define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst1laneQi32:
; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.32 {d17[1]}, [r0, :32]
  %tmp1 = load <4 x i32>* %B
  %tmp2 = extractelement <4 x i32> %tmp1, i32 3
  store i32 %tmp2, i32* %A, align 8
  ret void
}

;Check for a post-increment updating store.
define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
;CHECK: vst1laneQi32_update:
; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.32 {d17[1]}, [r1, :32]!
  %A = load i32** %ptr
  %tmp1 = load <4 x i32>* %B
  %tmp2 = extractelement <4 x i32> %tmp1, i32 3
  store i32 %tmp2, i32* %A, align 8
  ; Advance by one element and write the pointer back.
  %tmp3 = getelementptr i32* %A, i32 1
  store i32* %tmp3, i32** %ptr
  ret void
}

define void @vst1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst1laneQf:
; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.32 {d17[1]}, [r0]
  %tmp1 = load <4 x float>* %B
  %tmp2 = extractelement <4 x float> %tmp1, i32 3
  store float %tmp2, float* %A
  ret void
}

;-----------------------------------------------------------------------------
; VST2 (single lane)
;-----------------------------------------------------------------------------

define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst2lanei8:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vst2.8 {d16[1], d17[1]}, [r0, :16]
  %tmp1 = load <8 x i8>* %B
  call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  ret void
}

define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst2lanei16:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vst2.16 {d16[1], d17[1]}, [r0, :32]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  ret void
}

;Check for a post-increment updating store with register increment.
define void @vst2lanei16_update(i16** %ptr, <4 x i16>* %B, i32 %inc) nounwind {
;CHECK: vst2lanei16_update:
;CHECK: vst2.16 {d16[1], d17[1]}, [r1], r2
  %A = load i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
  ; Advance by a run-time element count (%inc) and write the pointer back.
  %tmp2 = getelementptr i16* %A, i32 %inc
  store i16* %tmp2, i16** %ptr
  ret void
}

define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst2lanei32:
;CHECK: vst2.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  call void @llvm.arm.neon.vst2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst2lanef:
;CHECK: vst2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  call void @llvm.arm.neon.vst2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst2laneQi16:
;Check the (default) alignment.
;CHECK: vst2.16 {d17[1], d19[1]}, [r0]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  ret void
}

define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst2laneQi32:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vst2.32 {d17[0], d19[0]}, [r0, :64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  ret void
}

define void @vst2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst2laneQf:
;CHECK: vst2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  call void @llvm.arm.neon.vst2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
  ret void
}

declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind

declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind

;-----------------------------------------------------------------------------
; VST3 (single lane)
;-----------------------------------------------------------------------------

define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst3lanei8:
;CHECK: vst3.8
  %tmp1 = load <8 x i8>* %B
  call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst3lanei16:
;Check the (default) alignment value.  VST3 does not support alignment.
;CHECK: vst3.16 {d16[1], d17[1], d18[1]}, [r0]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  call void @llvm.arm.neon.vst3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  ret void
}

define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst3lanei32:
;CHECK: vst3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  call void @llvm.arm.neon.vst3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst3lanef:
;CHECK: vst3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  call void @llvm.arm.neon.vst3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst3laneQi16:
;Check the (default) alignment value.  VST3 does not support alignment.
;CHECK: vst3.16 {d17[2], d19[2], d21[2]}, [r0]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  call void @llvm.arm.neon.vst3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8)
  ret void
}

define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst3laneQi32:
;CHECK: vst3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
  ret void
}

;Check for a post-increment updating store.
define void @vst3laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
;CHECK: vst3laneQi32_update:
;CHECK: vst3.32 {d16[0], d18[0], d20[0]}, [r1]!
  %A = load i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
  ; Advance by three elements (one per stored vector) and write the pointer back.
  %tmp2 = getelementptr i32* %A, i32 3
  store i32* %tmp2, i32** %ptr
  ret void
}

define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst3laneQf:
;CHECK: vst3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  call void @llvm.arm.neon.vst3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  ret void
}

declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind

declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind


;-----------------------------------------------------------------------------
; VST4 (single lane)
;-----------------------------------------------------------------------------

define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst4lanei8:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32]
  %tmp1 = load <8 x i8>* %B
  call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  ret void
}

;Check for a post-increment updating store.
define void @vst4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK: vst4lanei8_update:
;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]!
  %A = load i8** %ptr
  %tmp1 = load <8 x i8>* %B
  call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  ; Advance by four elements (one per stored vector) and write the pointer back.
  %tmp2 = getelementptr i8* %A, i32 4
  store i8* %tmp2, i8** %ptr
  ret void
}

define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst4lanei16:
;CHECK: vst4.16
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  call void @llvm.arm.neon.vst4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst4lanei32:
;Check the alignment value.  Max for this instruction is 128 bits:
;CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
  ret void
}

define void @vst4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst4lanef:
;CHECK: vst4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  call void @llvm.arm.neon.vst4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  ret void
}

define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst4laneQi16:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0, :64]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
  ret void
}

define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst4laneQi32:
;Check the (default) alignment.
;CHECK: vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  ret void
}

define void @vst4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst4laneQf:
;CHECK: vst4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  call void @llvm.arm.neon.vst4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  ret void
}

declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind

declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind