; RUN: llc -mtriple=arm-eabi -mattr=+neon -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON
; RUN: llc -mtriple=arm-eabi -mattr=-neon -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NONEON

; Each function below pairs a wide vector load with strided shufflevectors
; (de-interleaving), or an interleaving shufflevector with a wide store.
; The +neon run expects the pattern to be selected as vldN/vstN; the -neon
; run expects that no such instruction is emitted.

; Factor 2 load: the even and odd lanes of a <16 x i8> load are extracted.
; NEON-LABEL: load_factor2:
; NEON: vld2.8 {d16, d17}, [r0]
; NONEON-LABEL: load_factor2:
; NONEON-NOT: vld2
define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add nsw <8 x i8> %strided.v0, %strided.v1
  ret <8 x i8> %add
}

; Factor 3 load: only the strided vectors starting at lanes 2 and 1 are used.
; NEON-LABEL: load_factor3:
; NEON: vld3.32 {d16, d17, d18}, [r0]
; NONEON-LABEL: load_factor3:
; NONEON-NOT: vld3
define <2 x i32> @load_factor3(i32* %ptr) {
  %base = bitcast i32* %ptr to <6 x i32>*
  %wide.vec = load <6 x i32>, <6 x i32>* %base, align 4
  %strided.v2 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
  %strided.v1 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
  %add = add nsw <2 x i32> %strided.v2, %strided.v1
  ret <2 x i32> %add
}

; Factor 4 load of <16 x i32>: only the strided vectors starting at lanes 0
; and 2 are used. Two vld4 instructions are expected, the first with
; address writeback.
; NEON-LABEL: load_factor4:
; NEON: vld4.32 {d16, d18, d20, d22}, [r0]!
; NEON: vld4.32 {d17, d19, d21, d23}, [r0]
; NONEON-LABEL: load_factor4:
; NONEON-NOT: vld4
define <4 x i32> @load_factor4(i32* %ptr) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %add = add nsw <4 x i32> %strided.v0, %strided.v2
  ret <4 x i32> %add
}

; Factor 2 store: %v0 and %v1 are interleaved lane by lane and stored.
; NEON-LABEL: store_factor2:
; NEON: vst2.8 {d16, d17}, [r0]
; NONEON-LABEL: store_factor2:
; NONEON-NOT: vst2
define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
  ret void
}

; Factor 3 store: %v2 is widened with undef lanes so that it can be shuffled
; together with the concatenation of %v0 and %v1.
; NEON-LABEL: store_factor3:
; NEON: vst3.32 {d16, d18, d20}, [r0]!
; NEON: vst3.32 {d17, d19, d21}, [r0]
; NONEON-LABEL: store_factor3:
; NONEON-NOT: vst3
define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
  %base = bitcast i32* %ptr to <12 x i32>*
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
  ret void
}

; Factor 4 store: the four inputs are concatenated pairwise and then
; interleaved by a single shuffle feeding the store.
; NEON-LABEL: store_factor4:
; NEON: vst4.32 {d16, d18, d20, d22}, [r0]!
; NEON: vst4.32 {d17, d19, d21, d23}, [r0]
; NONEON-LABEL: store_factor4:
; NONEON-NOT: vst4
define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
  ret void
}

; The following cases test that interleaved accesses of pointer vectors can be
; matched to vldN/vstN instructions.

; Factor 2 load of pointers: only the even lanes are used.
; NEON-LABEL: load_ptrvec_factor2:
; NEON: vld2.32 {d16, d17}, [r0]
; NONEON-LABEL: load_ptrvec_factor2:
; NONEON-NOT: vld2
define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
  %base = bitcast i32** %ptr to <4 x i32*>*
  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x i32*> %strided.v0
}

; Factor 3 load of pointers: the strided vectors starting at lanes 2 and 1
; are stored to %ptr1 and %ptr2 respectively.
; NEON-LABEL: load_ptrvec_factor3:
; NEON: vld3.32 {d16, d17, d18}, [r0]
; NONEON-LABEL: load_ptrvec_factor3:
; NONEON-NOT: vld3
define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
  %base = bitcast i32** %ptr to <6 x i32*>*
  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
  ret void
}

; Factor 4 load of pointers: the strided vectors starting at lanes 1 and 3
; are stored to %ptr1 and %ptr2 respectively.
; NEON-LABEL: load_ptrvec_factor4:
; NEON: vld4.32 {d16, d17, d18, d19}, [r0]
; NONEON-LABEL: load_ptrvec_factor4:
; NONEON-NOT: vld4
define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
  %base = bitcast i32** %ptr to <8 x i32*>*
  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
  ret void
}

; Factor 2 store of two pointer vectors.
; NEON-LABEL: store_ptrvec_factor2:
; NEON: vst2.32 {d16, d17}, [r0]
; NONEON-LABEL: store_ptrvec_factor2:
; NONEON-NOT: vst2
define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
  %base = bitcast i32** %ptr to <4 x i32*>*
  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
  ret void
}

; Factor 3 store of three pointer vectors; %v2 is widened with undef lanes
; before the interleaving shuffle.
; NEON-LABEL: store_ptrvec_factor3:
; NEON: vst3.32 {d16, d17, d18}, [r0]
; NONEON-LABEL: store_ptrvec_factor3:
; NONEON-NOT: vst3
define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
  %base = bitcast i32** %ptr to <6 x i32*>*
  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
  ret void
}

; Factor 4 store of four pointer vectors. Note that %ptr is declared as i32*
; here and bitcast directly to the wide pointer-vector type.
; NEON-LABEL: store_ptrvec_factor4:
; NEON: vst4.32 {d16, d17, d18, d19}, [r0]
; NONEON-LABEL: store_ptrvec_factor4:
; NONEON-NOT: vst4
define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
  %base = bitcast i32* %ptr to <8 x i32*>*
  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
  ret void
}

; The following cases check that shuffle masks with undef indices can be
; matched to vldN/vstN instructions.

; Factor 2 load where lanes 0 and 2 of both strided masks are undef.
; NEON-LABEL: load_undef_mask_factor2:
; NEON: vld2.32 {d16, d17, d18, d19}, [r0]
; NONEON-LABEL: load_undef_mask_factor2:
; NONEON-NOT: vld2
define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
  %base = bitcast i32* %ptr to <8 x i32>*
  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
  %add = add nsw <4 x i32> %strided.v0, %strided.v1
  ret <4 x i32> %add
}

; Factor 3 load where one strided mask defines only its first index.
; NEON-LABEL: load_undef_mask_factor3:
; NEON: vld3.32 {d16, d18, d20}, [r0]!
; NEON: vld3.32 {d17, d19, d21}, [r0]
; NONEON-LABEL: load_undef_mask_factor3:
; NONEON-NOT: vld3
define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
  %base = bitcast i32* %ptr to <12 x i32>*
  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %add = add nsw <4 x i32> %strided.v2, %strided.v1
  ret <4 x i32> %add
}

; Factor 4 load where the last two indices of both strided masks are undef.
; NEON-LABEL: load_undef_mask_factor4:
; NEON: vld4.32 {d16, d18, d20, d22}, [r0]!
; NEON: vld4.32 {d17, d19, d21, d23}, [r0]
; NONEON-LABEL: load_undef_mask_factor4:
; NONEON-NOT: vld4
define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
  %add = add nsw <4 x i32> %strided.v0, %strided.v2
  ret <4 x i32> %add
}

; Factor 2 store where the first four indices of the interleave mask are undef.
; NEON-LABEL: store_undef_mask_factor2:
; NEON: vst2.32 {d16, d17, d18, d19}, [r0]
; NONEON-LABEL: store_undef_mask_factor2:
; NONEON-NOT: vst2
define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
  %base = bitcast i32* %ptr to <8 x i32>*
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
  ret void
}

; Factor 3 store where two indices of the interleave mask are undef.
; NEON-LABEL: store_undef_mask_factor3:
; NEON: vst3.32 {d16, d18, d20}, [r0]!
; NEON: vst3.32 {d17, d19, d21}, [r0]
; NONEON-LABEL: store_undef_mask_factor3:
; NONEON-NOT: vst3
define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
  %base = bitcast i32* %ptr to <12 x i32>*
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
  ret void
}

; Factor 4 store where two adjacent indices of the interleave mask are undef.
; NEON-LABEL: store_undef_mask_factor4:
; NEON: vst4.32 {d16, d18, d20, d22}, [r0]!
; NEON: vst4.32 {d17, d19, d21, d23}, [r0]
; NONEON-LABEL: store_undef_mask_factor4:
; NONEON-NOT: vst4
define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
  ret void
}

; The following test cases check that address spaces are properly handled.

; The wide load reads from addrspace(1); the <0, 3> mask has a stride of 3.
; NEON-LABEL: load_address_space:
; NEON: vld3.32
; NONEON-LABEL: load_address_space:
; NONEON-NOT: vld3
define void @load_address_space(<4 x i32> addrspace(1)* %A, <2 x i32>* %B) {
  %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %A
  %interleaved = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 3>
  store <2 x i32> %interleaved, <2 x i32>* %B
  ret void
}

; The interleaved store writes to addrspace(1).
; NEON-LABEL: store_address_space:
; NEON: vst2.32
; NONEON-LABEL: store_address_space:
; NONEON-NOT: vst2
define void @store_address_space(<2 x i32>* %A, <2 x i32>* %B, <4 x i32> addrspace(1)* %C) {
  %tmp0 = load <2 x i32>, <2 x i32>* %A
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %interleaved = shufflevector <2 x i32> %tmp0, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i32> %interleaved, <4 x i32> addrspace(1)* %C
  ret void
}

; Check that we do something sane with illegal types.

; <3 x float> load followed by a stride-2 shuffle: the exact instruction
; sequence is pinned for both runs, and it contains no vld2.
; NEON-LABEL: load_illegal_factor2:
; NEON: BB#0:
; NEON-NEXT: vld1.64 {d16, d17}, [r0:128]
; NEON-NEXT: vuzp.32 q8, {{.*}}
; NEON-NEXT: vmov r0, r1, d16
; NEON-NEXT: vmov r2, r3, {{.*}}
; NEON-NEXT: mov pc, lr
; NONEON-LABEL: load_illegal_factor2:
; NONEON: BB#0:
; NONEON-NEXT: ldr [[ELT0:r[0-9]+]], [r0]
; NONEON-NEXT: ldr r1, [r0, #8]
; NONEON-NEXT: mov r0, [[ELT0]]
; NONEON-NEXT: mov pc, lr
define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind {
  %tmp1 = load <3 x float>, <3 x float>* %p, align 16
  %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
  ret <3 x float> %tmp2
}

; This lowering isn't great, but it's at least correct.

; <3 x float> stride-2 shuffle followed by a store: the exact instruction
; sequence is pinned for both runs, and it contains no vst2.
; NEON-LABEL: store_illegal_factor2:
; NEON: BB#0:
; NEON-NEXT: vldr d17, [sp]
; NEON-NEXT: vmov d16, r2, r3
; NEON-NEXT: vuzp.32 q8, {{.*}}
; NEON-NEXT: vstr d16, [r0]
; NEON-NEXT: mov pc, lr
; NONEON-LABEL: store_illegal_factor2:
; NONEON: BB#0:
; NONEON-NEXT: stm r0, {r1, r3}
; NONEON-NEXT: mov pc, lr
define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
  %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
  store <3 x float> %tmp1, <3 x float>* %p, align 16
  ret void
}

; The wide load has an extractelement user in addition to the strided
; shuffle; a vld2 is still expected, with the extracted element read from
; its result.
; NEON-LABEL: load_factor2_with_extract_user:
; NEON: vld2.32 {d16, d17, d18, d19}, [r0:64]
; NEON: vmov.32 r0, d16[1]
; NONEON-LABEL: load_factor2_with_extract_user:
; NONEON-NOT: vld2
define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
  %1 = load <8 x i32>, <8 x i32>* %a, align 8
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %3 = extractelement <8 x i32> %1, i32 2
  ret i32 %3
}