; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s

; Tests for NEON lane-wise loads (vld1/vld2/vld3/vld4 to a single lane).
; Function-name anchors use CHECK-LABEL so FileCheck partitions the asm
; output per function and a pattern cannot accidentally match across
; function boundaries.

define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld1lanei8:
;Check the (default) alignment value.
;CHECK: vld1.8 {d16[3]}, [r0]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = load i8* %A, align 8
  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
  ret <8 x i8> %tmp3
}

define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld1lanei16:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vld1.16 {d16[2]}, [r0, :16]
  %tmp1 = load <4 x i16>* %B
  %tmp2 = load i16* %A, align 8
  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
  ret <4 x i16> %tmp3
}

define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld1lanei32:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vld1.32 {d16[1]}, [r0, :32]
  %tmp1 = load <2 x i32>* %B
  %tmp2 = load i32* %A, align 8
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld1lanef:
;CHECK: vld1.32 {d16[1]}, [r0]
  %tmp1 = load <2 x float>* %B
  %tmp2 = load float* %A, align 4
  %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
  ret <2 x float> %tmp3
}

define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: vld1laneQi8:
;CHECK: vld1.8 {d17[1]}, [r0]
  %tmp1 = load <16 x i8>* %B
  %tmp2 = load i8* %A, align 8
  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
  ret <16 x i8> %tmp3
}

define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld1laneQi16:
;CHECK: vld1.16 {d17[1]}, [r0, :16]
  %tmp1 = load <8 x i16>* %B
  %tmp2 = load i16* %A, align 8
  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
  ret <8 x i16> %tmp3
}

define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld1laneQi32:
;CHECK: vld1.32 {d17[1]}, [r0, :32]
  %tmp1 = load <4 x i32>* %B
  %tmp2 = load i32* %A, align 8
  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
  ret <4 x i32> %tmp3
}

define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld1laneQf:
;CHECK: vld1.32 {d16[0]}, [r0]
  %tmp1 = load <4 x float>* %B
  %tmp2 = load float* %A
  %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
  ret <4 x float> %tmp3
}

%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld2lanei8:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld2lanei16:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32:
;CHECK: vld2.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

;Check for a post-increment updating load.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32_update:
;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
  %A = load i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32* %A, i32 2
  store i32* %tmp6, i32** %ptr
  ret <2 x i32> %tmp5
}

define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld2lanef:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}

define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld2laneQi16:
;Check the (default) alignment.
;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld2laneQi32:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}, :64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld2laneQf:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld3lanei8:
;CHECK: vld3.8
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
  %tmp6 = add <8 x i8> %tmp3, %tmp4
  %tmp7 = add <8 x i8> %tmp5, %tmp6
  ret <8 x i8> %tmp7
}

define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld3lanei16:
;Check the (default) alignment value.  VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
  %tmp6 = add <4 x i16> %tmp3, %tmp4
  %tmp7 = add <4 x i16> %tmp5, %tmp6
  ret <4 x i16> %tmp7
}

define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld3lanei32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
  %tmp6 = add <2 x i32> %tmp3, %tmp4
  %tmp7 = add <2 x i32> %tmp5, %tmp6
  ret <2 x i32> %tmp7
}

define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld3lanef:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
  %tmp6 = fadd <2 x float> %tmp3, %tmp4
  %tmp7 = fadd <2 x float> %tmp5, %tmp6
  ret <2 x float> %tmp7
}

define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld3laneQi16:
;Check the (default) alignment value.  VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

;Check for a post-increment updating load with register increment.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
;CHECK-LABEL: vld3laneQi16_update:
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
  %A = load i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  %tmp8 = getelementptr i16* %A, i32 %inc
  store i16* %tmp8, i16** %ptr
  ret <8 x i16> %tmp7
}

define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld3laneQi32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
  %tmp6 = add <4 x i32> %tmp3, %tmp4
  %tmp7 = add <4 x i32> %tmp5, %tmp6
  ret <4 x i32> %tmp7
}

define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld3laneQf:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
  %tmp6 = fadd <4 x float> %tmp3, %tmp4
  %tmp7 = fadd <4 x float> %tmp5, %tmp6
  ret <4 x float> %tmp7
}

declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}, :32]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  ret <8 x i8> %tmp9
}

;Check for a post-increment updating load.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8_update:
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :32]!
  %A = load i8** %ptr
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  %tmp10 = getelementptr i8* %A, i32 4
  store i8* %tmp10, i8** %ptr
  ret <8 x i8> %tmp9
}

define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld4lanei16:
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
  %tmp7 = add <4 x i16> %tmp3, %tmp4
  %tmp8 = add <4 x i16> %tmp5, %tmp6
  %tmp9 = add <4 x i16> %tmp7, %tmp8
  ret <4 x i16> %tmp9
}

define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld4lanei32:
;Check the alignment value.  An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
  %tmp7 = add <2 x i32> %tmp3, %tmp4
  %tmp8 = add <2 x i32> %tmp5, %tmp6
  %tmp9 = add <2 x i32> %tmp7, %tmp8
  ret <2 x i32> %tmp9
}

define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld4lanef:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
  %tmp7 = fadd <2 x float> %tmp3, %tmp4
  %tmp8 = fadd <2 x float> %tmp5, %tmp6
  %tmp9 = fadd <2 x float> %tmp7, %tmp8
  ret <2 x float> %tmp9
}

define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld4laneQi16:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}, :64]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
  %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
  %tmp7 = add <8 x i16> %tmp3, %tmp4
  %tmp8 = add <8 x i16> %tmp5, %tmp6
  %tmp9 = add <8 x i16> %tmp7, %tmp8
  ret <8 x i16> %tmp9
}

define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld4laneQi32:
;Check the (default) alignment.
;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
  %tmp7 = add <4 x i32> %tmp3, %tmp4
  %tmp8 = add <4 x i32> %tmp5, %tmp6
  %tmp9 = add <4 x i32> %tmp7, %tmp8
  ret <4 x i32> %tmp9
}

define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld4laneQf:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
  %tmp7 = fadd <4 x float> %tmp3, %tmp4
  %tmp8 = fadd <4 x float> %tmp5, %tmp6
  %tmp9 = fadd <4 x float> %tmp7, %tmp8
  ret <4 x float> %tmp9
}

declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass.  (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
;CHECK-LABEL: test_qqqq_regsequence_subreg
;CHECK: vld3.16
  %tmp63 = extractvalue [6 x i64] %b, 5
  %tmp64 = zext i64 %tmp63 to i128
  %tmp65 = shl i128 %tmp64, 64
  ; The low 64 bits are a constant zero, expected to be materialized with VLDRS
  ; into a QPR_VFP2-constrained register (see the comment above this function).
  %ins67 = or i128 %tmp65, 0
  %tmp78 = bitcast i128 %ins67 to <8 x i16>
  %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

; NOTE(review): @llvm.trap is declared but not called anywhere in this file;
; kept for compatibility in case an out-of-view test references it.
declare void @llvm.trap() nounwind