; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s

; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic %s -o - \
; RUN:   | FileCheck %s

; Single-lane VLD1 tests: each function loads a vector from %B, loads one
; scalar from %A and inserts it into a fixed lane, which is expected to be
; selected as a single "vld1.<size> {dN[lane]}" instruction.

define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld1lanei8:
; Byte lane load: the address operand is expected without an alignment
; qualifier even though the scalar load is marked align 8.
;CHECK: vld1.8 {d16[3]}, [r0]
  %vec = load <8 x i8>, <8 x i8>* %B
  %elt = load i8, i8* %A, align 8
  %res = insertelement <8 x i8> %vec, i8 %elt, i32 3
  ret <8 x i8> %res
}

define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld1lanei16:
; The scalar load is align 8, but this instruction accepts at most a 16-bit
; alignment, so :16 is what should be printed.
;CHECK: vld1.16 {d16[2]}, [r0:16]
  %vec = load <4 x i16>, <4 x i16>* %B
  %elt = load i16, i16* %A, align 8
  %res = insertelement <4 x i16> %vec, i16 %elt, i32 2
  ret <4 x i16> %res
}

define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld1lanei32:
; The scalar load is align 8, but this instruction accepts at most a 32-bit
; alignment, so :32 is what should be printed.
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %vec = load <2 x i32>, <2 x i32>* %B
  %elt = load i32, i32* %A, align 8
  %res = insertelement <2 x i32> %vec, i32 %elt, i32 1
  ret <2 x i32> %res
}

define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld1lanei32a32:
; With align 4 on the scalar load the only legal qualifiers are none or :32;
; :32 is expected.
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %vec = load <2 x i32>, <2 x i32>* %B
  %elt = load i32, i32* %A, align 4
  %res = insertelement <2 x i32> %vec, i32 %elt, i32 1
  ret <2 x i32> %res
}

define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld1lanef:
; Float flavour of the 32-bit lane load (scalar load is align 4).
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %vec = load <2 x float>, <2 x float>* %B
  %elt = load float, float* %A, align 4
  %res = insertelement <2 x float> %vec, float %elt, i32 1
  ret <2 x float> %res
}

define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: vld1laneQi8:
; Element 9 of the 128-bit vector is expected as lane 1 of d17.
;CHECK: vld1.8 {d17[1]}, [r0]
  %vec = load <16 x i8>, <16 x i8>* %B
  %elt = load i8, i8* %A, align 8
  %res = insertelement <16 x i8> %vec, i8 %elt, i32 9
  ret <16 x i8> %res
}

define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld1laneQi16:
; Element 5 of the 128-bit vector is expected as lane 1 of d17.
;CHECK: vld1.16 {d17[1]}, [r0:16]
  %vec = load <8 x i16>, <8 x i16>* %B
  %elt = load i16, i16* %A, align 8
  %res = insertelement <8 x i16> %vec, i16 %elt, i32 5
  ret <8 x i16> %res
}

define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld1laneQi32:
; Element 3 of the 128-bit vector is expected as lane 1 of d17.
;CHECK: vld1.32 {d17[1]}, [r0:32]
  %vec = load <4 x i32>, <4 x i32>* %B
  %elt = load i32, i32* %A, align 8
  %res = insertelement <4 x i32> %vec, i32 %elt, i32 3
  ret <4 x i32> %res
}

define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld1laneQf:
; The scalar load carries no explicit alignment here; element 0 is expected
; as lane 0 of d16 with a :32 qualifier.
;CHECK: vld1.32 {d16[0]}, [r0:32]
  %vec = load <4 x float>, <4 x float>* %B
  %elt = load float, float* %A
  %res = insertelement <4 x float> %vec, float %elt, i32 0
  ret <4 x float> %res
}

; Result aggregates of the two-register lane loads on 64-bit vectors.
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

; Result aggregates of the two-register lane loads on 128-bit vectors.
%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld2lanei8:
; The intrinsic is given an alignment of 4, but this instruction accepts at
; most a 16-bit alignment, so :16 is what should be printed.
;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16]
  %vec = load <8 x i8>, <8 x i8>* %B
  %ld = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> %vec, <8 x i8> %vec, i32 1, i32 4)
  %v0 = extractvalue %struct.__neon_int8x8x2_t %ld, 0
  %v1 = extractvalue %struct.__neon_int8x8x2_t %ld, 1
  ; Add both results so that neither loaded register is dead.
  %sum = add <8 x i8> %v0, %v1
  ret <8 x i8> %sum
}

define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld2lanei16:
; The intrinsic is given an alignment of 8, but this instruction accepts at
; most a 32-bit alignment, so :32 is what should be printed.
;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32]
  %addr = bitcast i16* %A to i8*
  %vec = load <4 x i16>, <4 x i16>* %B
  %ld = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %addr, <4 x i16> %vec, <4 x i16> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int16x4x2_t %ld, 0
  %v1 = extractvalue %struct.__neon_int16x4x2_t %ld, 1
  %sum = add <4 x i16> %v0, %v1
  ret <4 x i16> %sum
}

define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32:
; Only the opcode is verified for this variant (alignment argument is 1).
;CHECK: vld2.32
  %addr = bitcast i32* %A to i8*
  %vec = load <2 x i32>, <2 x i32>* %B
  %ld = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %addr, <2 x i32> %vec, <2 x i32> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_int32x2x2_t %ld, 0
  %v1 = extractvalue %struct.__neon_int32x2x2_t %ld, 1
  %sum = add <2 x i32> %v0, %v1
  ret <2 x i32> %sum
}

;Check for a post-increment updating load.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32_update:
; The pointer written back to %ptr is advanced by two i32 elements, and the
; fixed-increment writeback form ("!") is expected.
;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
  %base = load i32*, i32** %ptr
  %addr = bitcast i32* %base to i8*
  %vec = load <2 x i32>, <2 x i32>* %B
  %ld = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %addr, <2 x i32> %vec, <2 x i32> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_int32x2x2_t %ld, 0
  %v1 = extractvalue %struct.__neon_int32x2x2_t %ld, 1
  %sum = add <2 x i32> %v0, %v1
  %next = getelementptr i32, i32* %base, i32 2
  store i32* %next, i32** %ptr
  ret <2 x i32> %sum
}

define <2 x i32> @vld2lanei32_odd_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32_odd_update:
; Here the pointer is advanced by three i32 elements (12 bytes), so the
; increment is expected to be materialised in a register and used as the
; post-index operand.
;CHECK: mov [[INC:r[0-9]+]], #12
;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}], [[INC]]
  %base = load i32*, i32** %ptr
  %addr = bitcast i32* %base to i8*
  %vec = load <2 x i32>, <2 x i32>* %B
  %ld = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %addr, <2 x i32> %vec, <2 x i32> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_int32x2x2_t %ld, 0
  %v1 = extractvalue %struct.__neon_int32x2x2_t %ld, 1
  %sum = add <2 x i32> %v0, %v1
  %next = getelementptr i32, i32* %base, i32 3
  store i32* %next, i32** %ptr
  ret <2 x i32> %sum
}

define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld2lanef:
; Float flavour; only the opcode is verified.
;CHECK: vld2.32
  %addr = bitcast float* %A to i8*
  %vec = load <2 x float>, <2 x float>* %B
  %ld = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* %addr, <2 x float> %vec, <2 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x2x2_t %ld, 0
  %v1 = extractvalue %struct.__neon_float32x2x2_t %ld, 1
  %sum = fadd <2 x float> %v0, %v1
  ret <2 x float> %sum
}

define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld2laneQi16:
; Alignment argument is 1, so no alignment qualifier is expected. Element 5
; of each 128-bit operand is expected as lane 1 of d17 / d19.
;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
  %addr = bitcast i16* %A to i8*
  %vec = load <8 x i16>, <8 x i16>* %B
  %ld = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %addr, <8 x i16> %vec, <8 x i16> %vec, i32 5, i32 1)
  %v0 = extractvalue %struct.__neon_int16x8x2_t %ld, 0
  %v1 = extractvalue %struct.__neon_int16x8x2_t %ld, 1
  %sum = add <8 x i16> %v0, %v1
  ret <8 x i16> %sum
}

define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld2laneQi32:
; The intrinsic is given an alignment of 16, but this instruction accepts at
; most a 64-bit alignment, so :64 is what should be printed.
;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64]
  %addr = bitcast i32* %A to i8*
  %vec = load <4 x i32>, <4 x i32>* %B
  %ld = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* %addr, <4 x i32> %vec, <4 x i32> %vec, i32 2, i32 16)
  %v0 = extractvalue %struct.__neon_int32x4x2_t %ld, 0
  %v1 = extractvalue %struct.__neon_int32x4x2_t %ld, 1
  %sum = add <4 x i32> %v0, %v1
  ret <4 x i32> %sum
}

define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld2laneQf:
; Float flavour on 128-bit vectors; only the opcode is verified.
;CHECK: vld2.32
  %addr = bitcast float* %A to i8*
  %vec = load <4 x float>, <4 x float>* %B
  %ld = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* %addr, <4 x float> %vec, <4 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x4x2_t %ld, 0
  %v1 = extractvalue %struct.__neon_float32x4x2_t %ld, 1
  %sum = fadd <4 x float> %v0, %v1
  ret <4 x float> %sum
}

; Two-register lane-load intrinsics. Operands as used by the tests above:
; address, the two input vectors, lane index, alignment.
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Result aggregates of the three-register lane loads.
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld3lanei8:
; Only the opcode is verified for this variant.
;CHECK: vld3.8
  %vec = load <8 x i8>, <8 x i8>* %B
  %ld = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_int8x8x3_t %ld, 0
  %v1 = extractvalue %struct.__neon_int8x8x3_t %ld, 1
  %v2 = extractvalue %struct.__neon_int8x8x3_t %ld, 2
  ; Fold all three results together so every loaded register is used.
  %sum01 = add <8 x i8> %v0, %v1
  %sum = add <8 x i8> %v2, %sum01
  ret <8 x i8> %sum
}

define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld3lanei16:
; VLD3 has no alignment qualifier, so the alignment of 8 passed to the
; intrinsic must not show up in the address operand.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %addr = bitcast i16* %A to i8*
  %vec = load <4 x i16>, <4 x i16>* %B
  %ld = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %addr, <4 x i16> %vec, <4 x i16> %vec, <4 x i16> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int16x4x3_t %ld, 0
  %v1 = extractvalue %struct.__neon_int16x4x3_t %ld, 1
  %v2 = extractvalue %struct.__neon_int16x4x3_t %ld, 2
  %sum01 = add <4 x i16> %v0, %v1
  %sum = add <4 x i16> %v2, %sum01
  ret <4 x i16> %sum
}

define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld3lanei32:
; Only the opcode is verified for this variant.
;CHECK: vld3.32
  %addr = bitcast i32* %A to i8*
  %vec = load <2 x i32>, <2 x i32>* %B
  %ld = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* %addr, <2 x i32> %vec, <2 x i32> %vec, <2 x i32> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_int32x2x3_t %ld, 0
  %v1 = extractvalue %struct.__neon_int32x2x3_t %ld, 1
  %v2 = extractvalue %struct.__neon_int32x2x3_t %ld, 2
  %sum01 = add <2 x i32> %v0, %v1
  %sum = add <2 x i32> %v2, %sum01
  ret <2 x i32> %sum
}

define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld3lanef:
; Float flavour; only the opcode is verified.
;CHECK: vld3.32
  %addr = bitcast float* %A to i8*
  %vec = load <2 x float>, <2 x float>* %B
  %ld = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* %addr, <2 x float> %vec, <2 x float> %vec, <2 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x2x3_t %ld, 0
  %v1 = extractvalue %struct.__neon_float32x2x3_t %ld, 1
  %v2 = extractvalue %struct.__neon_float32x2x3_t %ld, 2
  %sum01 = fadd <2 x float> %v0, %v1
  %sum = fadd <2 x float> %v2, %sum01
  ret <2 x float> %sum
}

define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld3laneQi16:
; VLD3 has no alignment qualifier, so the alignment of 8 passed to the
; intrinsic must not show up in the address operand.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %addr = bitcast i16* %A to i8*
  %vec = load <8 x i16>, <8 x i16>* %B
  %ld = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %addr, <8 x i16> %vec, <8 x i16> %vec, <8 x i16> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int16x8x3_t %ld, 0
  %v1 = extractvalue %struct.__neon_int16x8x3_t %ld, 1
  %v2 = extractvalue %struct.__neon_int16x8x3_t %ld, 2
  %sum01 = add <8 x i16> %v0, %v1
  %sum = add <8 x i16> %v2, %sum01
  ret <8 x i16> %sum
}

;Check for a post-increment updating load with register increment.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
;CHECK-LABEL: vld3laneQi16_update:
; The pointer is advanced by the run-time value %inc, so a register
; post-index operand is expected after the address.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+|lr}}], {{r[0-9]+}}
  %base = load i16*, i16** %ptr
  %addr = bitcast i16* %base to i8*
  %vec = load <8 x i16>, <8 x i16>* %B
  %ld = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %addr, <8 x i16> %vec, <8 x i16> %vec, <8 x i16> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int16x8x3_t %ld, 0
  %v1 = extractvalue %struct.__neon_int16x8x3_t %ld, 1
  %v2 = extractvalue %struct.__neon_int16x8x3_t %ld, 2
  %sum01 = add <8 x i16> %v0, %v1
  %sum = add <8 x i16> %v2, %sum01
  %next = getelementptr i16, i16* %base, i32 %inc
  store i16* %next, i16** %ptr
  ret <8 x i16> %sum
}

define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld3laneQi32:
; Lane index 3 on 128-bit vectors; only the opcode is verified.
;CHECK: vld3.32
  %addr = bitcast i32* %A to i8*
  %vec = load <4 x i32>, <4 x i32>* %B
  %ld = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* %addr, <4 x i32> %vec, <4 x i32> %vec, <4 x i32> %vec, i32 3, i32 1)
  %v0 = extractvalue %struct.__neon_int32x4x3_t %ld, 0
  %v1 = extractvalue %struct.__neon_int32x4x3_t %ld, 1
  %v2 = extractvalue %struct.__neon_int32x4x3_t %ld, 2
  %sum01 = add <4 x i32> %v0, %v1
  %sum = add <4 x i32> %v2, %sum01
  ret <4 x i32> %sum
}

define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld3laneQf:
; Float flavour on 128-bit vectors; only the opcode is verified.
;CHECK: vld3.32
  %addr = bitcast float* %A to i8*
  %vec = load <4 x float>, <4 x float>* %B
  %ld = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* %addr, <4 x float> %vec, <4 x float> %vec, <4 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x4x3_t %ld, 0
  %v1 = extractvalue %struct.__neon_float32x4x3_t %ld, 1
  %v2 = extractvalue %struct.__neon_float32x4x3_t %ld, 2
  %sum01 = fadd <4 x float> %v0, %v1
  %sum = fadd <4 x float> %v2, %sum01
  ret <4 x float> %sum
}

; Three-register lane-load intrinsics. Operands as used by the tests above:
; address, the three input vectors, lane index, alignment.
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Result aggregates of the four-register lane loads.
%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8:
; The intrinsic is given an alignment of 8, but this instruction accepts at
; most a 32-bit alignment, so :32 is what should be printed.
;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32]
  %vec = load <8 x i8>, <8 x i8>* %B
  %ld = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int8x8x4_t %ld, 0
  %v1 = extractvalue %struct.__neon_int8x8x4_t %ld, 1
  %v2 = extractvalue %struct.__neon_int8x8x4_t %ld, 2
  %v3 = extractvalue %struct.__neon_int8x8x4_t %ld, 3
  ; Fold all four results together so every loaded register is used.
  %sum01 = add <8 x i8> %v0, %v1
  %sum23 = add <8 x i8> %v2, %v3
  %sum = add <8 x i8> %sum01, %sum23
  ret <8 x i8> %sum
}

;Check for a post-increment updating load.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8_update:
; The pointer written back to %ptr is advanced by four bytes, and the
; fixed-increment writeback form ("!") is expected.
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]!
  %base = load i8*, i8** %ptr
  %vec = load <8 x i8>, <8 x i8>* %B
  %ld = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %base, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, <8 x i8> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int8x8x4_t %ld, 0
  %v1 = extractvalue %struct.__neon_int8x8x4_t %ld, 1
  %v2 = extractvalue %struct.__neon_int8x8x4_t %ld, 2
  %v3 = extractvalue %struct.__neon_int8x8x4_t %ld, 3
  %sum01 = add <8 x i8> %v0, %v1
  %sum23 = add <8 x i8> %v2, %v3
  %sum = add <8 x i8> %sum01, %sum23
  %next = getelementptr i8, i8* %base, i32 4
  store i8* %next, i8** %ptr
  ret <8 x i8> %sum
}

define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld4lanei16:
; An alignment of 4 is a power of two but smaller than the total amount of
; memory this instruction loads, so it is expected to be dropped.
;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
  %addr = bitcast i16* %A to i8*
  %vec = load <4 x i16>, <4 x i16>* %B
  %ld = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %addr, <4 x i16> %vec, <4 x i16> %vec, <4 x i16> %vec, <4 x i16> %vec, i32 1, i32 4)
  %v0 = extractvalue %struct.__neon_int16x4x4_t %ld, 0
  %v1 = extractvalue %struct.__neon_int16x4x4_t %ld, 1
  %v2 = extractvalue %struct.__neon_int16x4x4_t %ld, 2
  %v3 = extractvalue %struct.__neon_int16x4x4_t %ld, 3
  %sum01 = add <4 x i16> %v0, %v1
  %sum23 = add <4 x i16> %v2, %v3
  %sum = add <4 x i16> %sum01, %sum23
  ret <4 x i16> %sum
}

define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld4lanei32:
; An 8-byte alignment is accepted for this form even though it is smaller
; than the total amount of memory loaded; :64 is expected.
;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64]
  %addr = bitcast i32* %A to i8*
  %vec = load <2 x i32>, <2 x i32>* %B
  %ld = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %addr, <2 x i32> %vec, <2 x i32> %vec, <2 x i32> %vec, <2 x i32> %vec, i32 1, i32 8)
  %v0 = extractvalue %struct.__neon_int32x2x4_t %ld, 0
  %v1 = extractvalue %struct.__neon_int32x2x4_t %ld, 1
  %v2 = extractvalue %struct.__neon_int32x2x4_t %ld, 2
  %v3 = extractvalue %struct.__neon_int32x2x4_t %ld, 3
  %sum01 = add <2 x i32> %v0, %v1
  %sum23 = add <2 x i32> %v2, %v3
  %sum = add <2 x i32> %sum01, %sum23
  ret <2 x i32> %sum
}

define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld4lanef:
; Float flavour; only the opcode is verified.
;CHECK: vld4.32
  %addr = bitcast float* %A to i8*
  %vec = load <2 x float>, <2 x float>* %B
  %ld = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* %addr, <2 x float> %vec, <2 x float> %vec, <2 x float> %vec, <2 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x2x4_t %ld, 0
  %v1 = extractvalue %struct.__neon_float32x2x4_t %ld, 1
  %v2 = extractvalue %struct.__neon_float32x2x4_t %ld, 2
  %v3 = extractvalue %struct.__neon_float32x2x4_t %ld, 3
  %sum01 = fadd <2 x float> %v0, %v1
  %sum23 = fadd <2 x float> %v2, %v3
  %sum = fadd <2 x float> %sum01, %sum23
  ret <2 x float> %sum
}

define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld4laneQi16:
; The intrinsic is given an alignment of 16, but this instruction accepts at
; most a 64-bit alignment, so :64 is what should be printed.
;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64]
  %addr = bitcast i16* %A to i8*
  %vec = load <8 x i16>, <8 x i16>* %B
  %ld = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %addr, <8 x i16> %vec, <8 x i16> %vec, <8 x i16> %vec, <8 x i16> %vec, i32 1, i32 16)
  %v0 = extractvalue %struct.__neon_int16x8x4_t %ld, 0
  %v1 = extractvalue %struct.__neon_int16x8x4_t %ld, 1
  %v2 = extractvalue %struct.__neon_int16x8x4_t %ld, 2
  %v3 = extractvalue %struct.__neon_int16x8x4_t %ld, 3
  %sum01 = add <8 x i16> %v0, %v1
  %sum23 = add <8 x i16> %v2, %v3
  %sum = add <8 x i16> %sum01, %sum23
  ret <8 x i16> %sum
}

define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld4laneQi32:
; Alignment argument is 1, so no alignment qualifier is expected. Element 2
; of each 128-bit operand is expected as lane 0 of the odd D registers.
;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
  %addr = bitcast i32* %A to i8*
  %vec = load <4 x i32>, <4 x i32>* %B
  %ld = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* %addr, <4 x i32> %vec, <4 x i32> %vec, <4 x i32> %vec, <4 x i32> %vec, i32 2, i32 1)
  %v0 = extractvalue %struct.__neon_int32x4x4_t %ld, 0
  %v1 = extractvalue %struct.__neon_int32x4x4_t %ld, 1
  %v2 = extractvalue %struct.__neon_int32x4x4_t %ld, 2
  %v3 = extractvalue %struct.__neon_int32x4x4_t %ld, 3
  %sum01 = add <4 x i32> %v0, %v1
  %sum23 = add <4 x i32> %v2, %v3
  %sum = add <4 x i32> %sum01, %sum23
  ret <4 x i32> %sum
}

define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld4laneQf:
; Float flavour on 128-bit vectors; only the opcode is verified.
;CHECK: vld4.32
  %addr = bitcast float* %A to i8*
  %vec = load <4 x float>, <4 x float>* %B
  %ld = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* %addr, <4 x float> %vec, <4 x float> %vec, <4 x float> %vec, <4 x float> %vec, i32 1, i32 1)
  %v0 = extractvalue %struct.__neon_float32x4x4_t %ld, 0
  %v1 = extractvalue %struct.__neon_float32x4x4_t %ld, 1
  %v2 = extractvalue %struct.__neon_float32x4x4_t %ld, 2
  %v3 = extractvalue %struct.__neon_float32x4x4_t %ld, 3
  %sum01 = fadd <4 x float> %v0, %v1
  %sum23 = fadd <4 x float> %v2, %v3
  %sum = fadd <4 x float> %sum01, %sum23
  ret <4 x float> %sum
}

; Four-register lane-load intrinsics. Operands as used by the tests above:
; address, the four input vectors, lane index, alignment.
declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass. (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
;CHECK-LABEL: test_qqqq_regsequence_subreg:
; Only requires that a vld3.16 is still selected for the lane load below.
;CHECK: vld3.16
  ; Build a 128-bit value whose upper 64 bits are the last element of %b and
  ; whose lower 64 bits are zero, then view it as <8 x i16>.
  %hi64 = extractvalue [6 x i64] %b, 5
  %wide = zext i64 %hi64 to i128
  %shifted = shl i128 %wide, 64
  %ins67 = or i128 %shifted, 0
  %vec = bitcast i128 %ins67 to <8 x i16>
  ; Only the third vector operand is defined; the address and the first two
  ; vector operands are undef.
  %ld = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %vec, i32 1, i32 2)
  %v0 = extractvalue %struct.__neon_int16x8x3_t %ld, 0
  %v1 = extractvalue %struct.__neon_int16x8x3_t %ld, 1
  %v2 = extractvalue %struct.__neon_int16x8x3_t %ld, 2
  %sum01 = add <8 x i16> %v0, %v1
  %sum = add <8 x i16> %v2, %sum01
  ret <8 x i16> %sum
}

declare void @llvm.trap() nounwind