; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s

; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic %s -o - \
; RUN: | FileCheck %s

define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld1lanei8:
;Check the (default) alignment value.
;CHECK: vld1.8 {d16[3]}, [r0]
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = load i8, i8* %A, align 8
  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
  ret <8 x i8> %tmp3
}

define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld1lanei16:
;Check the alignment value. Max for this instruction is 16 bits:
;CHECK: vld1.16 {d16[2]}, [r0:16]
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = load i16, i16* %A, align 8
  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
  ret <4 x i16> %tmp3
}

define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld1lanei32:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = load i32, i32* %A, align 8
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld1lanei32a32:
;Check the alignment value. Legal values are none or :32.
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = load i32, i32* %A, align 4
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld1lanef:
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = load float, float* %A, align 4
  %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
  ret <2 x float> %tmp3
}

define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: vld1laneQi8:
;CHECK: vld1.8 {d17[1]}, [r0]
  %tmp1 = load <16 x i8>, <16 x i8>* %B
  %tmp2 = load i8, i8* %A, align 8
  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
  ret <16 x i8> %tmp3
}

define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld1laneQi16:
;CHECK: vld1.16 {d17[1]}, [r0:16]
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = load i16, i16* %A, align 8
  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
  ret <8 x i16> %tmp3
}

define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld1laneQi32:
;CHECK: vld1.32 {d17[1]}, [r0:32]
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = load i32, i32* %A, align 8
  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
  ret <4 x i32> %tmp3
}

define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld1laneQf:
;CHECK: vld1.32 {d16[0]}, [r0:32]
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = load float, float* %A
  %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
  ret <4 x float> %tmp3
}

%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld2lanei8:
;Check the alignment value. Max for this instruction is 16 bits:
;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16]
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld2lanei16:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32:
;CHECK: vld2.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

;Check for a post-increment updating load.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32_update:
;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
  %A = load i32*, i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32, i32* %A, i32 2
  store i32* %tmp6, i32** %ptr
  ret <2 x i32> %tmp5
}

define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld2lanef:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}

define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld2laneQi16:
;Check the (default) alignment.
;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld2laneQi32:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld2laneQf:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld3lanei8:
;CHECK: vld3.8
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
  %tmp6 = add <8 x i8> %tmp3, %tmp4
  %tmp7 = add <8 x i8> %tmp5, %tmp6
  ret <8 x i8> %tmp7
}

define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld3lanei16:
;Check the (default) alignment value. VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
  %tmp6 = add <4 x i16> %tmp3, %tmp4
  %tmp7 = add <4 x i16> %tmp5, %tmp6
  ret <4 x i16> %tmp7
}

define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld3lanei32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
  %tmp6 = add <2 x i32> %tmp3, %tmp4
  %tmp7 = add <2 x i32> %tmp5, %tmp6
  ret <2 x i32> %tmp7
}

define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld3lanef:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
  %tmp6 = fadd <2 x float> %tmp3, %tmp4
  %tmp7 = fadd <2 x float> %tmp5, %tmp6
  ret <2 x float> %tmp7
}

define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld3laneQi16:
;Check the (default) alignment value. VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

;Check for a post-increment updating load with register increment.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
;CHECK-LABEL: vld3laneQi16_update:
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
  %A = load i16*, i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  %tmp8 = getelementptr i16, i16* %A, i32 %inc
  store i16* %tmp8, i16** %ptr
  ret <8 x i16> %tmp7
}

define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld3laneQi32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
  %tmp6 = add <4 x i32> %tmp3, %tmp4
  %tmp7 = add <4 x i32> %tmp5, %tmp6
  ret <4 x i32> %tmp7
}

define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld3laneQf:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
  %tmp6 = fadd <4 x float> %tmp3, %tmp4
  %tmp7 = fadd <4 x float> %tmp5, %tmp6
  ret <4 x float> %tmp7
}

declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32]
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  ret <8 x i8> %tmp9
}

;Check for a post-increment updating load.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8_update:
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]!
  %A = load i8*, i8** %ptr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  %tmp10 = getelementptr i8, i8* %A, i32 4
  store i8* %tmp10, i8** %ptr
  ret <8 x i8> %tmp9
}

define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld4lanei16:
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
  %tmp7 = add <4 x i16> %tmp3, %tmp4
  %tmp8 = add <4 x i16> %tmp5, %tmp6
  %tmp9 = add <4 x i16> %tmp7, %tmp8
  ret <4 x i16> %tmp9
}

define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld4lanei32:
;Check the alignment value. An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
  %tmp7 = add <2 x i32> %tmp3, %tmp4
  %tmp8 = add <2 x i32> %tmp5, %tmp6
  %tmp9 = add <2 x i32> %tmp7, %tmp8
  ret <2 x i32> %tmp9
}

define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld4lanef:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
  %tmp7 = fadd <2 x float> %tmp3, %tmp4
  %tmp8 = fadd <2 x float> %tmp5, %tmp6
  %tmp9 = fadd <2 x float> %tmp7, %tmp8
  ret <2 x float> %tmp9
}

define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld4laneQi16:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
  %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
  %tmp7 = add <8 x i16> %tmp3, %tmp4
  %tmp8 = add <8 x i16> %tmp5, %tmp6
  %tmp9 = add <8 x i16> %tmp7, %tmp8
  ret <8 x i16> %tmp9
}

define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld4laneQi32:
;Check the (default) alignment.
;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
  %tmp7 = add <4 x i32> %tmp3, %tmp4
  %tmp8 = add <4 x i32> %tmp5, %tmp6
  %tmp9 = add <4 x i32> %tmp7, %tmp8
  ret <4 x i32> %tmp9
}

define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld4laneQf:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
  %tmp7 = fadd <4 x float> %tmp3, %tmp4
  %tmp8 = fadd <4 x float> %tmp5, %tmp6
  %tmp9 = fadd <4 x float> %tmp7, %tmp8
  ret <4 x float> %tmp9
}

declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass. (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
;CHECK-LABEL: test_qqqq_regsequence_subreg:
;CHECK: vld3.16
  %tmp63 = extractvalue [6 x i64] %b, 5
  %tmp64 = zext i64 %tmp63 to i128
  %tmp65 = shl i128 %tmp64, 64
  %ins67 = or i128 %tmp65, 0
  %tmp78 = bitcast i128 %ins67 to <8 x i16>
  %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

declare void @llvm.trap() nounwind