; The explicit -mtriple pins the target so the generated assembly does not
; depend on the default triple of the llc binary under test.
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s

; Splat of a scalar argument built with a chain of insertelement instructions
; that writes the same value into every lane. Each function is expected to
; produce a dup of the arrangement matching its result type.

; 64-bit vector of eight i8 lanes.
define <8 x i8> @v_dup8(i8 %A) nounwind {
; CHECK-LABEL: v_dup8:
; CHECK: dup.8b
  %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
  %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
  %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
  %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
  %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
  %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
  %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
  %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
  ret <8 x i8> %tmp8
}

; 64-bit vector of four i16 lanes.
define <4 x i16> @v_dup16(i16 %A) nounwind {
; CHECK-LABEL: v_dup16:
; CHECK: dup.4h
  %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
  %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
  %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
  %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
  ret <4 x i16> %tmp4
}

; 64-bit vector of two i32 lanes.
define <2 x i32> @v_dup32(i32 %A) nounwind {
; CHECK-LABEL: v_dup32:
; CHECK: dup.2s
  %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
  %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
  ret <2 x i32> %tmp2
}

; 64-bit vector of two float lanes.
define <2 x float> @v_dupfloat(float %A) nounwind {
; CHECK-LABEL: v_dupfloat:
; CHECK: dup.2s
  %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
  %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
  ret <2 x float> %tmp2
}

; 128-bit vector of sixteen i8 lanes.
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_dupQ8:
; CHECK: dup.16b
  %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
  %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
  %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
  %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
  %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
  %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
  %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
  %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
  %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
  %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
  %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
  %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
  %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
  %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
  %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
  %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
  ret <16 x i8> %tmp16
}

; 128-bit vector of eight i16 lanes.
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_dupQ16:
; CHECK: dup.8h
  %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
  %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
  %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
  %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
  %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
  %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
  %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
  %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
  ret <8 x i16> %tmp8
}

; 128-bit vector of four i32 lanes.
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_dupQ32:
; CHECK: dup.4s
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
  %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
  %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
  %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
  ret <4 x i32> %tmp4
}

; 128-bit vector of four float lanes.
define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-LABEL: v_dupQfloat:
; CHECK: dup.4s
  %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
  %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
  %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
  %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
  ret <4 x float> %tmp4
}

; Check to make sure it works with shuffles, too.

; Splat expressed as "insert into lane 0, then shufflevector with an all-zero
; mask". The expected instruction is the same dup as for the insertelement
; chains of the matching type.

define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledup8:
; CHECK: dup.8b
  %seed = insertelement <8 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <8 x i8> %seed, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %splat
}

define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledup16:
; CHECK: dup.4h
  %seed = insertelement <4 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <4 x i16> %seed, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}

define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledup32:
; CHECK: dup.2s
  %seed = insertelement <2 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <2 x i32> %seed, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %splat
}

define <2 x float> @v_shuffledupfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupfloat:
; CHECK: dup.2s
  %seed = insertelement <2 x float> undef, float %A, i32 0
  %splat = shufflevector <2 x float> %seed, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %splat
}

define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ8:
; CHECK: dup.16b
  %seed = insertelement <16 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <16 x i8> %seed, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %splat
}

define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ16:
; CHECK: dup.8h
  %seed = insertelement <8 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <8 x i16> %seed, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ32:
; CHECK: dup.4s
  %seed = insertelement <4 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

; Last of the insert-then-shuffle splats: four float lanes.
define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupQfloat:
; CHECK: dup.4s
  %seed = insertelement <4 x float> undef, float %A, i32 0
  %splat = shufflevector <4 x float> %seed, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}

; Splat of lane 1 of a vector loaded from memory; the result has the same
; width as the loaded vector.

define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK: dup.8b
  %vec = load <8 x i8>, <8 x i8>* %A
  %splat = shufflevector <8 x i8> %vec, <8 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i8> %splat
}

define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK: dup.4h
  %vec = load <4 x i16>, <4 x i16>* %A
  %splat = shufflevector <4 x i16> %vec, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i16> %splat
}

define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK: dup.2s
  %vec = load <2 x i32>, <2 x i32>* %A
  %splat = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i32> %splat
}

define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK: dup.2s
  %vec = load <2 x float>, <2 x float>* %A
  %splat = shufflevector <2 x float> %vec, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x float> %splat
}

; Same lane-1 splat, but the shuffle result has twice as many lanes as the
; loaded vector.

define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK: dup.16b
  %vec = load <8 x i8>, <8 x i8>* %A
  %splat = shufflevector <8 x i8> %vec, <8 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %splat
}

define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK: dup.8h
  %vec = load <4 x i16>, <4 x i16>* %A
  %splat = shufflevector <4 x i16> %vec, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i16> %splat
}

define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK: dup.4s
  %vec = load <2 x i32>, <2 x i32>* %A
  %splat = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %splat
}

define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK: dup.4s
  %vec = load <2 x float>, <2 x float>* %A
  %splat = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %splat
}

; Two-lane 64-bit element splats of a vector argument: lane 1 (foo, baz) and
; lane 0 (bar, qux), for both i64 and double elements.

define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: foo:
; CHECK: dup.2d
entry:
  %splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %splat
}

define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK: dup.2d
entry:
  %splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %splat
}

define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: baz:
; CHECK: dup.2d
entry:
  %splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %splat
}

define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: qux:
; CHECK: dup.2d
entry:
  %splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %splat
}

; Vectors assembled from two different scalars. The expected sequence is an
; fmov for lane 0 followed by one ins per remaining lane, with no dup.

define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: f:
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ins.s v0[1], w1
; CHECK-NEXT: ret
  %ins0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %ins1 = insertelement <2 x i32> %ins0, i32 %b, i32 1
  ret <2 x i32> %ins1
}

define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: g:
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ins.s v0[1], w1
; CHECK-NEXT: ins.s v0[2], w1
; CHECK-NEXT: ins.s v0[3], w0
; CHECK-NEXT: ret
  %ins0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 %b, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %b, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 %a, i32 3
  ret <4 x i32> %ins3
}

define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
; CHECK-LABEL: h:
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ins.d v0[1], x1
; CHECK-NEXT: ret
  %ins0 = insertelement <2 x i64> undef, i64 %a, i32 0
  %ins1 = insertelement <2 x i64> %ins0, i64 %b, i32 1
  ret <2 x i64> %ins1
}

; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
; the single value needed was of the same type as the vector. This is false if
; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
;
; *However*, it is a dup vD.4h, vN.h[2*idx].
define <4 x i16> @test_build_illegal(<4 x i32> %in) {
; CHECK-LABEL: test_build_illegal:
; CHECK: dup.4h v0, v0[6]
  %elt = extractelement <4 x i32> %in, i32 3
  %narrow = trunc i32 %elt to i16
  %res = insertelement <4 x i16> undef, i16 %narrow, i32 3

  ret <4 x i16> %res
}

; We used to inherit an already extract_subvectored v4i16 from
; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
; the formation of an indexed-by-7 MLS.
define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
; CHECK-LABEL: test_high_splat:
; CHECK: mls.4h v0, v1, v2[7]
; NOTE(review): attribute group #0 has no definition in the visible part of
; this file -- confirm that this is intentional.
entry:
  ; Splat of lane 7 of %v, then %a - (splat * %b).
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  %mul = mul <4 x i16> %shuffle, %b
  %sub = sub <4 x i16> %a, %mul
  ret <4 x i16> %sub
}

; Also test the DUP path in the PerfectShuffle generator.
;
; Every mask below is <0, 0, 4, 5>: lane 0 of %a twice, followed by the two low
; lanes of %b. All four variants pin the complete function body, including the
; final ret, so that no extra instruction can slip in unnoticed.

; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
; CHECK-NEXT: dup.4h v0, v0[0]
; CHECK-NEXT: ext.8b v0, v0, v1, #4
; CHECK-NEXT: ret
define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
  %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x i16> %r
}

; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
; CHECK-NEXT: dup.4h v0, v0[0]
; CHECK-NEXT: ext.8b v0, v0, v1, #4
; CHECK-NEXT: ret
define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
  %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x half> %r
}

; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ext.16b v0, v0, v1, #8
; CHECK-NEXT: ret
define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x i32> %r
}

; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ext.16b v0, v0, v1, #8
; CHECK-NEXT: ret
define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
  %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x float> %r
}