; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
; Code generation test for the AArch64 NEON multiply family, printed with the
; Apple NEON syntax. Every function below pairs a small IR pattern with the
; instruction mnemonic that its check lines expect in the llc output.

; Signed widening multiply intrinsics.
define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: smull8h:
;CHECK: smull.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: smull4s:
;CHECK: smull.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: smull2d:
;CHECK: smull.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

declare <8 x i16>  @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

; Unsigned widening multiply intrinsics.
define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: umull8h:
;CHECK: umull.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: umull4s:
;CHECK: umull.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: umull2d:
;CHECK: umull.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

declare <8 x i16>  @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

; Saturating doubling widening multiply. The "2" variants take their operands
; from the upper half of a 128-bit vector via shufflevector.
define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull4s:
;CHECK: sqdmull.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull2d:
;CHECK: sqdmull.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull2_4s:
;CHECK: sqdmull2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull2_2d:
;CHECK: sqdmull2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}


declare <4 x i32>  @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64>  @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

; Polynomial widening multiply.
define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: pmull8h:
;CHECK: pmull.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone

; Saturating doubling multiply returning high half, vector and scalar forms.
define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmulh_4h:
;CHECK: sqdmulh.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmulh_8h:
;CHECK: sqdmulh.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmulh_2s:
;CHECK: sqdmulh.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmulh_4s:
;CHECK: sqdmulh.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind {
;CHECK-LABEL: sqdmulh_1s:
;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
  %tmp1 = load i32, i32* %A
  %tmp2 = load i32, i32* %B
  %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
  ret i32 %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone

; Saturating rounding doubling multiply returning high half.
define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_4h:
;CHECK: sqrdmulh.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_8h:
;CHECK: sqrdmulh.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_2s:
;CHECK: sqrdmulh.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_4s:
;CHECK: sqrdmulh.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind {
;CHECK-LABEL: sqrdmulh_1s:
;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
  %tmp1 = load i32, i32* %A
  %tmp2 = load i32, i32* %B
  %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
  ret i32 %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone

; Floating-point fmulx intrinsic.
define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmulx_2s:
;CHECK: fmulx.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fmulx_4s:
;CHECK: fmulx.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fmulx_2d:
;CHECK: fmulx.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone

; A signed widening multiply whose result feeds an add or sub; the check lines
; expect the combined smlal / smlsl instruction.
define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: smlal4s:
;CHECK: smlal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: smlal2d:
;CHECK: smlal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: smlsl4s:
;CHECK: smlsl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = sub <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: smlsl2d:
;CHECK: smlsl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = sub <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)

; sqdmull whose result feeds sqadd / sqsub; the check lines expect the combined
; sqdmlal / sqdmlsl instruction (and the "2" forms for upper-half operands).
define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlal4s:
;CHECK: sqdmlal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlal2d:
;CHECK: sqdmlal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlal2_4s:
;CHECK: sqdmlal2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlal2_2d:
;CHECK: sqdmlal2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlsl4s:
;CHECK: sqdmlsl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2d:
;CHECK: sqdmlsl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2_4s:
;CHECK: sqdmlsl2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2_2d:
;CHECK: sqdmlsl2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

; An unsigned widening multiply whose result feeds an add or sub; the check
; lines expect the combined umlal / umlsl instruction.
define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: umlal4s:
;CHECK: umlal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: umlal2d:
;CHECK: umlal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: umlsl4s:
;CHECK: umlsl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = sub <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: umlsl2d:
;CHECK: umlsl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = sub <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; llvm.fma on vectors; the check lines expect fmla.
define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
;CHECK-LABEL: fmla_2s:
;CHECK: fmla.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = load <2 x float>, <2 x float>* %C
  %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
  ret <2 x float> %tmp4
}

define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
;CHECK-LABEL: fmla_4s:
;CHECK: fmla.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = load <4 x float>, <4 x float>* %C
  %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
  ret <4 x float> %tmp4
}

define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
;CHECK-LABEL: fmla_2d:
;CHECK: fmla.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = load <2 x double>, <2 x double>* %C
  %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
  ret <2 x double> %tmp4
}

declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

; llvm.fma with one multiplicand negated (written as fsub from -0.0); the check
; lines expect fmls. The negated value is the second fma operand here.
define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
;CHECK-LABEL: fmls_2s:
;CHECK: fmls.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = load <2 x float>, <2 x float>* %C
  %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
  %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3)
  ret <2 x float> %tmp5
}

define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
;CHECK-LABEL: fmls_4s:
;CHECK: fmls.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = load <4 x float>, <4 x float>* %C
  %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
  %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3)
  ret <4 x float> %tmp5
}

define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
;CHECK-LABEL: fmls_2d:
;CHECK: fmls.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = load <2 x double>, <2 x double>* %C
  %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
  %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3)
  ret <2 x double> %tmp5
}

; Same as the fmls tests above, but the negated value is the first fma operand.
define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
;CHECK-LABEL: fmls_commuted_neg_2s:
;CHECK: fmls.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = load <2 x float>, <2 x float>* %C
  %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
  %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3)
  ret <2 x float> %tmp5
}

define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
;CHECK-LABEL: fmls_commuted_neg_4s:
;CHECK: fmls.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = load <4 x float>, <4 x float>* %C
  %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
  %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3)
  ret <4 x float> %tmp5
}

define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
;CHECK-LABEL: fmls_commuted_neg_2d:
;CHECK: fmls.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = load <2 x double>, <2 x double>* %C
  %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
  %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3)
  ret <2 x double> %tmp5
}

; fma of a negated vector with a lane-0 splat of %b; the check lines expect fmls.
define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
;CHECK-LABEL: fmls_indexed_2s:
;CHECK: fmls.2s
entry:
  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
  %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
  ret <2 x float> %fmls1
}

define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
;CHECK-LABEL: fmls_indexed_4s:
;CHECK: fmls.4s
entry:
  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
  %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
  ret <4 x float> %fmls1
}

define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
;CHECK-LABEL: fmls_indexed_2d:
;CHECK: fmls.2d
entry:
  %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
  %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
  ret <2 x double> %fmls1
}

; fma where one multiplicand is a scalar inserted into every lane; the check
; lines expect a single fmla immediately followed by ret.
define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fmla_indexed_scalar_2s:
; CHECK-NEXT: fmla.2s
; CHECK-NEXT: ret
  %v1 = insertelement <2 x float> undef, float %c, i32 0
  %v2 = insertelement <2 x float> %v1, float %c, i32 1
  ; Use the fully populated splat (%v2), as the 4s and 2d variants do with
  ; their final insertelement; %v1 has only lane 0 defined.
  %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a) nounwind
  ret <2 x float> %fmla1
}

define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fmla_indexed_scalar_4s:
; CHECK-NEXT: fmla.4s
; CHECK-NEXT: ret
  %v1 = insertelement <4 x float> undef, float %c, i32 0
  %v2 = insertelement <4 x float> %v1, float %c, i32 1
  %v3 = insertelement <4 x float> %v2, float %c, i32 2
  %v4 = insertelement <4 x float> %v3, float %c, i32 3
  %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %fmla1
}

define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_2d:
; CHECK-NEXT: fmla.2d
; CHECK-NEXT: ret
entry:
  %v1 = insertelement <2 x double> undef, double %c, i32 0
  %v2 = insertelement <2 x double> %v1, double %c, i32 1
  %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %fmla1
}

; Integer multiply by a lane-1 splat; the check lines require that no separate
; dup is emitted before the mul.
define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: mul_4h:
;CHECK-NOT: dup
;CHECK: mul.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = mul <4 x i16> %tmp1, %tmp3
  ret <4 x i16> %tmp4
}

define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: mul_8h:
;CHECK-NOT: dup
;CHECK: mul.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp4 = mul <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: mul_2s:
;CHECK-NOT: dup
;CHECK: mul.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp4 = mul <2 x i32> %tmp1, %tmp3
  ret <2 x i32> %tmp4
}

define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: mul_4s:
;CHECK-NOT: dup
;CHECK: mul.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = mul <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

; <2 x i64> multiply; the check lines expect two mul instructions.
define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
; CHECK-LABEL: mul_2d:
; CHECK: mul
; CHECK: mul
  %tmp1 = mul <2 x i64> %A, %B
  ret <2 x i64> %tmp1
}

; Floating-point multiply by a splat lane or an extracted element; the check
; lines require that no separate dup is emitted.
define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmul_lane_2s:
;CHECK-NOT: dup
;CHECK: fmul.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp4 = fmul <2 x float> %tmp1, %tmp3
  ret <2 x float> %tmp4
}

define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fmul_lane_4s:
;CHECK-NOT: dup
;CHECK: fmul.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = fmul <4 x float> %tmp1, %tmp3
  ret <4 x float> %tmp4
}

define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fmul_lane_2d:
;CHECK-NOT: dup
;CHECK: fmul.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp4 = fmul <2 x double> %tmp1, %tmp3
  ret <2 x double> %tmp4
}

define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
;CHECK-LABEL: fmul_lane_s:
;CHECK-NOT: dup
;CHECK: fmul.s s0, s0, v1[3]
  %B = extractelement <4 x float> %vec, i32 3
  %res = fmul float %A, %B
  ret float %res
}

define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
;CHECK-LABEL: fmul_lane_d:
;CHECK-NOT: dup
;CHECK: fmul.d d0, d0, v1[1]
  %B = extractelement <2 x double> %vec, i32 1
  %res = fmul double %A, %B
  ret double %res
}



; fmulx by a lane-1 splat, again with no separate dup expected.
define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmulx_lane_2s:
;CHECK-NOT: dup
;CHECK: fmulx.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
  ret <2 x float> %tmp4
}

define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fmulx_lane_4s:
;CHECK-NOT: dup
;CHECK: fmulx.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
  ret <4 x float> %tmp4
}

define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fmulx_lane_2d:
;CHECK-NOT: dup
;CHECK: fmulx.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
  ret <2 x double> %tmp4
}

; sqdmulh by a lane-1 splat or extracted element, with no separate dup expected.
define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_4h:
;CHECK-NOT: dup
;CHECK: sqdmulh.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
  ret <4 x i16> %tmp4
}

define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_8h:
;CHECK-NOT: dup
;CHECK: sqdmulh.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
  ret <8 x i16> %tmp4
}

define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_2s:
;CHECK-NOT: dup
;CHECK: sqdmulh.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
  ret <2 x i32> %tmp4
}

define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmulh.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
  ret <4 x i32> %tmp4
}

define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_1s:
;CHECK-NOT: dup
;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
  %tmp1 = extractelement <4 x i32> %B, i32 1
  %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
  ret i32 %tmp2
}

; sqrdmulh by a lane-1 splat or extracted element, with no separate dup expected.
define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_4h:
;CHECK-NOT: dup
;CHECK: sqrdmulh.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
  ret <4 x i16> %tmp4
}

define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_8h:
;CHECK-NOT: dup
;CHECK: sqrdmulh.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
  ret <8 x i16> %tmp4
}

define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_2s:
;CHECK-NOT: dup
;CHECK: sqrdmulh.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
  ret <2 x i32> %tmp4
}

define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_4s:
;CHECK-NOT: dup
;CHECK: sqrdmulh.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
  ret <4 x i32> %tmp4
}

define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_1s:
;CHECK-NOT: dup
;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
  %tmp1 = extractelement <4 x i32> %B, i32 1
  %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
  ret i32 %tmp2
}

; sqdmull / sqdmull2 by a lane-1 splat, with no separate dup expected.
define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmull.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
  ret <4 x i32> %tmp4
}

define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmull.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
  ret <2 x i64> %tmp4
}

define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull2_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmull2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp4
}

define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull2_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmull2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp4
}

define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: umull_lane_4s:
;CHECK-NOT: dup
;CHECK: umull.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) 906 ret <4 x i32> %tmp4 907} 908 909define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { 910;CHECK-LABEL: umull_lane_2d: 911;CHECK-NOT: dup 912;CHECK: umull.2d 913 %tmp1 = load <2 x i32>, <2 x i32>* %A 914 %tmp2 = load <2 x i32>, <2 x i32>* %B 915 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> 916 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) 917 ret <2 x i64> %tmp4 918} 919 920define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { 921;CHECK-LABEL: smull_lane_4s: 922;CHECK-NOT: dup 923;CHECK: smull.4s 924 %tmp1 = load <4 x i16>, <4 x i16>* %A 925 %tmp2 = load <4 x i16>, <4 x i16>* %B 926 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 927 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) 928 ret <4 x i32> %tmp4 929} 930 931define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { 932;CHECK-LABEL: smull_lane_2d: 933;CHECK-NOT: dup 934;CHECK: smull.2d 935 %tmp1 = load <2 x i32>, <2 x i32>* %A 936 %tmp2 = load <2 x i32>, <2 x i32>* %B 937 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> 938 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) 939 ret <2 x i64> %tmp4 940} 941 942define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 943;CHECK-LABEL: smlal_lane_4s: 944;CHECK-NOT: dup 945;CHECK: smlal.4s 946 %tmp1 = load <4 x i16>, <4 x i16>* %A 947 %tmp2 = load <4 x i16>, <4 x i16>* %B 948 %tmp3 = load <4 x i32>, <4 x i32>* %C 949 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 950 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) 951 %tmp6 = add <4 x i32> %tmp3, %tmp5 952 ret <4 x i32> %tmp6 
953} 954 955define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 956;CHECK-LABEL: smlal_lane_2d: 957;CHECK-NOT: dup 958;CHECK: smlal.2d 959 %tmp1 = load <2 x i32>, <2 x i32>* %A 960 %tmp2 = load <2 x i32>, <2 x i32>* %B 961 %tmp3 = load <2 x i64>, <2 x i64>* %C 962 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> 963 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) 964 %tmp6 = add <2 x i64> %tmp3, %tmp5 965 ret <2 x i64> %tmp6 966} 967 968define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 969;CHECK-LABEL: sqdmlal_lane_4s: 970;CHECK-NOT: dup 971;CHECK: sqdmlal.4s 972 %tmp1 = load <4 x i16>, <4 x i16>* %A 973 %tmp2 = load <4 x i16>, <4 x i16>* %B 974 %tmp3 = load <4 x i32>, <4 x i32>* %C 975 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 976 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) 977 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) 978 ret <4 x i32> %tmp6 979} 980 981define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 982;CHECK-LABEL: sqdmlal_lane_2d: 983;CHECK-NOT: dup 984;CHECK: sqdmlal.2d 985 %tmp1 = load <2 x i32>, <2 x i32>* %A 986 %tmp2 = load <2 x i32>, <2 x i32>* %B 987 %tmp3 = load <2 x i64>, <2 x i64>* %C 988 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> 989 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) 990 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) 991 ret <2 x i64> %tmp6 992} 993 994define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { 995;CHECK-LABEL: sqdmlal2_lane_4s: 996;CHECK-NOT: dup 997;CHECK: sqdmlal2.4s 998 %load1 = load <8 x i16>, <8 x i16>* %A 999 %load2 = load <8 x i16>, 
<8 x i16>* %B 1000 %tmp3 = load <4 x i32>, <4 x i32>* %C 1001 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1002 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1003 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 1004 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) 1005 ret <4 x i32> %tmp6 1006} 1007 1008define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { 1009;CHECK-LABEL: sqdmlal2_lane_2d: 1010;CHECK-NOT: dup 1011;CHECK: sqdmlal2.2d 1012 %load1 = load <4 x i32>, <4 x i32>* %A 1013 %load2 = load <4 x i32>, <4 x i32>* %B 1014 %tmp3 = load <2 x i64>, <2 x i64>* %C 1015 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1016 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1> 1017 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 1018 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) 1019 ret <2 x i64> %tmp6 1020} 1021 1022define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { 1023;CHECK-LABEL: sqdmlal_lane_1s: 1024;CHECK: sqdmlal.4s 1025 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0 1026 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 1027 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) 1028 %prod = extractelement <4 x i32> %prod.vec, i32 0 1029 %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod) 1030 ret i32 %res 1031} 1032declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) 1033 1034define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { 1035;CHECK-LABEL: sqdmlsl_lane_1s: 1036;CHECK: sqdmlsl.4s 1037 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0 1038 %rhs = 
shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 1039 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) 1040 %prod = extractelement <4 x i32> %prod.vec, i32 0 1041 %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod) 1042 ret i32 %res 1043} 1044declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) 1045 1046define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { 1047;CHECK-LABEL: sqdmlal_lane_1d: 1048;CHECK: sqdmlal.s 1049 %rhs = extractelement <2 x i32> %C, i32 1 1050 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) 1051 %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod) 1052 ret i64 %res 1053} 1054declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32) 1055declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) 1056 1057define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { 1058;CHECK-LABEL: sqdmlsl_lane_1d: 1059;CHECK: sqdmlsl.s 1060 %rhs = extractelement <2 x i32> %C, i32 1 1061 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) 1062 %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod) 1063 ret i64 %res 1064} 1065declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) 1066 1067 1068define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 1069;CHECK-LABEL: umlal_lane_4s: 1070;CHECK-NOT: dup 1071;CHECK: umlal.4s 1072 %tmp1 = load <4 x i16>, <4 x i16>* %A 1073 %tmp2 = load <4 x i16>, <4 x i16>* %B 1074 %tmp3 = load <4 x i32>, <4 x i32>* %C 1075 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1076 %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) 1077 %tmp6 = add <4 x i32> %tmp3, %tmp5 1078 ret <4 x i32> %tmp6 1079} 1080 1081define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 1082;CHECK-LABEL: umlal_lane_2d: 1083;CHECK-NOT: dup 1084;CHECK: 
umlal.2d 1085 %tmp1 = load <2 x i32>, <2 x i32>* %A 1086 %tmp2 = load <2 x i32>, <2 x i32>* %B 1087 %tmp3 = load <2 x i64>, <2 x i64>* %C 1088 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> 1089 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) 1090 %tmp6 = add <2 x i64> %tmp3, %tmp5 1091 ret <2 x i64> %tmp6 1092} 1093 1094 1095define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 1096;CHECK-LABEL: smlsl_lane_4s: 1097;CHECK-NOT: dup 1098;CHECK: smlsl.4s 1099 %tmp1 = load <4 x i16>, <4 x i16>* %A 1100 %tmp2 = load <4 x i16>, <4 x i16>* %B 1101 %tmp3 = load <4 x i32>, <4 x i32>* %C 1102 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1103 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) 1104 %tmp6 = sub <4 x i32> %tmp3, %tmp5 1105 ret <4 x i32> %tmp6 1106} 1107 1108define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 1109;CHECK-LABEL: smlsl_lane_2d: 1110;CHECK-NOT: dup 1111;CHECK: smlsl.2d 1112 %tmp1 = load <2 x i32>, <2 x i32>* %A 1113 %tmp2 = load <2 x i32>, <2 x i32>* %B 1114 %tmp3 = load <2 x i64>, <2 x i64>* %C 1115 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> 1116 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) 1117 %tmp6 = sub <2 x i64> %tmp3, %tmp5 1118 ret <2 x i64> %tmp6 1119} 1120 1121define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 1122;CHECK-LABEL: sqdmlsl_lane_4s: 1123;CHECK-NOT: dup 1124;CHECK: sqdmlsl.4s 1125 %tmp1 = load <4 x i16>, <4 x i16>* %A 1126 %tmp2 = load <4 x i16>, <4 x i16>* %B 1127 %tmp3 = load <4 x i32>, <4 x i32>* %C 1128 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1129 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x 
i16> %tmp4) 1130 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) 1131 ret <4 x i32> %tmp6 1132} 1133 1134define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 1135;CHECK-LABEL: sqdmlsl_lane_2d: 1136;CHECK-NOT: dup 1137;CHECK: sqdmlsl.2d 1138 %tmp1 = load <2 x i32>, <2 x i32>* %A 1139 %tmp2 = load <2 x i32>, <2 x i32>* %B 1140 %tmp3 = load <2 x i64>, <2 x i64>* %C 1141 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> 1142 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) 1143 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) 1144 ret <2 x i64> %tmp6 1145} 1146 1147define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { 1148;CHECK-LABEL: sqdmlsl2_lane_4s: 1149;CHECK-NOT: dup 1150;CHECK: sqdmlsl2.4s 1151 %load1 = load <8 x i16>, <8 x i16>* %A 1152 %load2 = load <8 x i16>, <8 x i16>* %B 1153 %tmp3 = load <4 x i32>, <4 x i32>* %C 1154 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1155 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1156 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 1157 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) 1158 ret <4 x i32> %tmp6 1159} 1160 1161define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { 1162;CHECK-LABEL: sqdmlsl2_lane_2d: 1163;CHECK-NOT: dup 1164;CHECK: sqdmlsl2.2d 1165 %load1 = load <4 x i32>, <4 x i32>* %A 1166 %load2 = load <4 x i32>, <4 x i32>* %B 1167 %tmp3 = load <2 x i64>, <2 x i64>* %C 1168 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1169 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1> 1170 %tmp5 = call <2 x i64> 
@llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
  ret <2 x i64> %tmp6
}

; Unsigned widening multiply of %A by lane 1 of %B (splat with shufflevector),
; subtracted from the accumulator loaded from %C. The directives require a
; umlsl.4s with no separate dup of the lane.
define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: umlsl_lane_4s:
;CHECK-NOT: dup
;CHECK: umlsl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
  %tmp6 = sub <4 x i32> %tmp3, %tmp5
  ret <4 x i32> %tmp6
}

; Same pattern as umlsl_lane_4s with <2 x i32> inputs and a <2 x i64>
; accumulator; expects umlsl.2d with no dup.
define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: umlsl_lane_2d:
;CHECK-NOT: dup
;CHECK: umlsl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
  %tmp6 = sub <2 x i64> %tmp3, %tmp5
  ret <2 x i64> %tmp6
}

; Scalar FMULX
; The four tests below call the scalar fmulx intrinsic and expect exactly one
; fmulx instruction right after the function label, immediately followed by ret.

; float x float, both operands passed as scalar arguments.
define float @fmulxs(float %a, float %b) nounwind {
; CHECK-LABEL: fmulxs:
; CHECK-NEXT: fmulx s0, s0, s1
  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
; CHECK-NEXT: ret
  ret float %fmulx.i
}

; double x double, both operands passed as scalar arguments.
define double @fmulxd(double %a, double %b) nounwind {
; CHECK-LABEL: fmulxd:
; CHECK-NEXT: fmulx d0, d0, d1
  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
; CHECK-NEXT: ret
  ret double %fmulx.i
}

; Second operand is lane 3 extracted from %vec; expects the by-element form
; rather than a separate lane move.
define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
; CHECK-LABEL: fmulxs_lane:
; CHECK-NEXT: fmulx.s s0, s0, v1[3]
  %b = extractelement <4 x float> %vec, i32 3
  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
; CHECK-NEXT: ret
  ret float %fmulx.i
}

; Second operand is lane 1 extracted from %vec. With the Apple NEON syntax
; selected on the RUN line the by-element mnemonic carries the .d suffix,
; mirroring the .s suffix in fmulxs_lane.
define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
; CHECK-LABEL: fmulxd_lane:
; CHECK-NEXT: fmulx.d d0, d0, v1[1]
  %b = extractelement <2 x double> %vec, i32 1
  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
; CHECK-NEXT: ret
  ret double %fmulx.i
}

declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone


; Signed widening multiply of the high halves (lanes 8-15) of %a and %b,
; selected with plain shufflevectors; expects smull2.8h directly followed by ret.
define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: smull2_8h_simple:
; CHECK-NEXT: smull2.8h v0, v0, v1
; CHECK-NEXT: ret
  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
  ret <8 x i16> %3
}

; Same high-half smull, but each high half is taken by bitcasting to <2 x i64>,
; shuffling out element 1 and bitcasting back to <8 x i8>; expects smull2.8h.
define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: foo0:
; CHECK: smull2.8h v0, v0, v1
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
  ret <8 x i16> %vmull.i.i
}

define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
; 
CHECK-LABEL: foo1: 1265; CHECK: smull2.4s v0, v0, v1 1266 %tmp = bitcast <8 x i16> %a to <2 x i64> 1267 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1268 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 1269 %tmp2 = bitcast <8 x i16> %b to <2 x i64> 1270 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1271 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> 1272 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 1273 ret <4 x i32> %vmull2.i.i 1274} 1275 1276define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind { 1277; CHECK-LABEL: foo2: 1278; CHECK: smull2.2d v0, v0, v1 1279 %tmp = bitcast <4 x i32> %a to <2 x i64> 1280 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1281 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1282 %tmp2 = bitcast <4 x i32> %b to <2 x i64> 1283 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1284 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> 1285 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 1286 ret <2 x i64> %vmull2.i.i 1287} 1288 1289define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind { 1290; CHECK-LABEL: foo3: 1291; CHECK: umull2.8h v0, v0, v1 1292 %tmp = bitcast <16 x i8> %a to <2 x i64> 1293 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1294 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8> 1295 %tmp2 = bitcast <16 x i8> %b to <2 x i64> 1296 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1297 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8> 1298 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind 1299 ret <8 x i16> %vmull.i.i 1300} 1301 1302define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind { 1303; CHECK-LABEL: foo4: 1304; 
CHECK: umull2.4s v0, v0, v1 1305 %tmp = bitcast <8 x i16> %a to <2 x i64> 1306 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1307 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 1308 %tmp2 = bitcast <8 x i16> %b to <2 x i64> 1309 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1310 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> 1311 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 1312 ret <4 x i32> %vmull2.i.i 1313} 1314 1315define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind { 1316; CHECK-LABEL: foo5: 1317; CHECK: umull2.2d v0, v0, v1 1318 %tmp = bitcast <4 x i32> %a to <2 x i64> 1319 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1320 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1321 %tmp2 = bitcast <4 x i32> %b to <2 x i64> 1322 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1323 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> 1324 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 1325 ret <2 x i64> %vmull2.i.i 1326} 1327 1328define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { 1329; CHECK-LABEL: foo6: 1330; CHECK-NEXT: smull2.4s v0, v1, v2[1] 1331; CHECK-NEXT: ret 1332entry: 1333 %0 = bitcast <8 x i16> %b to <2 x i64> 1334 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1335 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> 1336 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1337 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind 1338 ret <4 x i32> %vmull2.i 1339} 1340 1341define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { 1342; CHECK-LABEL: foo7: 1343; CHECK-NEXT: 
smull2.2d v0, v1, v2[1] 1344; CHECK-NEXT: ret 1345entry: 1346 %0 = bitcast <4 x i32> %b to <2 x i64> 1347 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1348 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> 1349 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> 1350 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind 1351 ret <2 x i64> %vmull2.i 1352} 1353 1354define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { 1355; CHECK-LABEL: foo8: 1356; CHECK-NEXT: umull2.4s v0, v1, v2[1] 1357; CHECK-NEXT: ret 1358entry: 1359 %0 = bitcast <8 x i16> %b to <2 x i64> 1360 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1361 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> 1362 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1363 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind 1364 ret <4 x i32> %vmull2.i 1365} 1366 1367define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { 1368; CHECK-LABEL: foo9: 1369; CHECK-NEXT: umull2.2d v0, v1, v2[1] 1370; CHECK-NEXT: ret 1371entry: 1372 %0 = bitcast <4 x i32> %b to <2 x i64> 1373 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1374 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> 1375 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> 1376 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind 1377 ret <2 x i64> %vmull2.i 1378} 1379 1380define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { 1381; CHECK-LABEL: bar0: 1382; CHECK: smlal2.8h v0, v1, v2 1383; CHECK-NEXT: ret 1384 1385 %tmp = bitcast <16 x i8> %b to <2 x i64> 1386 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 
x i32> <i32 1> 1387 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> 1388 %tmp2 = bitcast <16 x i8> %c to <2 x i64> 1389 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1390 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8> 1391 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind 1392 %add.i = add <8 x i16> %vmull.i.i.i, %a 1393 ret <8 x i16> %add.i 1394} 1395 1396define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { 1397; CHECK-LABEL: bar1: 1398; CHECK: smlal2.4s v0, v1, v2 1399; CHECK-NEXT: ret 1400 1401 %tmp = bitcast <8 x i16> %b to <2 x i64> 1402 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1403 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16> 1404 %tmp2 = bitcast <8 x i16> %c to <2 x i64> 1405 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1406 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16> 1407 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 1408 %add.i = add <4 x i32> %vmull2.i.i.i, %a 1409 ret <4 x i32> %add.i 1410} 1411 1412define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { 1413; CHECK-LABEL: bar2: 1414; CHECK: smlal2.2d v0, v1, v2 1415; CHECK-NEXT: ret 1416 1417 %tmp = bitcast <4 x i32> %b to <2 x i64> 1418 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1419 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> 1420 %tmp2 = bitcast <4 x i32> %c to <2 x i64> 1421 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1422 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32> 1423 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 1424 %add.i = add <2 x i64> %vmull2.i.i.i, %a 1425 ret <2 x i64> %add.i 1426} 1427 1428define <8 x i16> @bar3(<8 x 
i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { 1429; CHECK-LABEL: bar3: 1430; CHECK: umlal2.8h v0, v1, v2 1431; CHECK-NEXT: ret 1432 1433 %tmp = bitcast <16 x i8> %b to <2 x i64> 1434 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1435 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> 1436 %tmp2 = bitcast <16 x i8> %c to <2 x i64> 1437 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1438 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8> 1439 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind 1440 %add.i = add <8 x i16> %vmull.i.i.i, %a 1441 ret <8 x i16> %add.i 1442} 1443 1444define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { 1445; CHECK-LABEL: bar4: 1446; CHECK: umlal2.4s v0, v1, v2 1447; CHECK-NEXT: ret 1448 1449 %tmp = bitcast <8 x i16> %b to <2 x i64> 1450 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1451 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16> 1452 %tmp2 = bitcast <8 x i16> %c to <2 x i64> 1453 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1454 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16> 1455 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 1456 %add.i = add <4 x i32> %vmull2.i.i.i, %a 1457 ret <4 x i32> %add.i 1458} 1459 1460define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { 1461; CHECK-LABEL: bar5: 1462; CHECK: umlal2.2d v0, v1, v2 1463; CHECK-NEXT: ret 1464 1465 %tmp = bitcast <4 x i32> %b to <2 x i64> 1466 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1467 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> 1468 %tmp2 = bitcast <4 x i32> %c to <2 x i64> 1469 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1470 %tmp3 = bitcast <1 x i64> 
%shuffle.i3.i.i to <2 x i32> 1471 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 1472 %add.i = add <2 x i64> %vmull2.i.i.i, %a 1473 ret <2 x i64> %add.i 1474} 1475 1476define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { 1477; CHECK-LABEL: mlal2_1: 1478; CHECK: smlal2.4s v0, v1, v2[3] 1479; CHECK-NEXT: ret 1480 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> 1481 %tmp = bitcast <8 x i16> %b to <2 x i64> 1482 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1483 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 1484 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64> 1485 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1486 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> 1487 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 1488 %add = add <4 x i32> %vmull2.i.i, %a 1489 ret <4 x i32> %add 1490} 1491 1492define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { 1493; CHECK-LABEL: mlal2_2: 1494; CHECK: smlal2.2d v0, v1, v2[1] 1495; CHECK-NEXT: ret 1496 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1497 %tmp = bitcast <4 x i32> %b to <2 x i64> 1498 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1499 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1500 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64> 1501 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1502 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> 1503 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 1504 %add = add <2 x i64> %vmull2.i.i, %a 1505 ret <2 x i64> %add 1506} 1507 1508define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x 
i16> %b, <4 x i16> %c) nounwind { 1509; CHECK-LABEL: mlal2_4: 1510; CHECK: umlal2.4s v0, v1, v2[2] 1511; CHECK-NEXT: ret 1512 1513 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 1514 %tmp = bitcast <8 x i16> %b to <2 x i64> 1515 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1516 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 1517 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64> 1518 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1519 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> 1520 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 1521 %add = add <4 x i32> %vmull2.i.i, %a 1522 ret <4 x i32> %add 1523} 1524 1525define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { 1526; CHECK-LABEL: mlal2_5: 1527; CHECK: umlal2.2d v0, v1, v2[0] 1528; CHECK-NEXT: ret 1529 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer 1530 %tmp = bitcast <4 x i32> %b to <2 x i64> 1531 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1532 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1533 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64> 1534 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1535 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> 1536 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 1537 %add = add <2 x i64> %vmull2.i.i, %a 1538 ret <2 x i64> %add 1539} 1540 1541; rdar://12328502 1542define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp { 1543entry: 1544; CHECK-LABEL: vmulq_n_f64: 1545; CHECK-NOT: dup.2d 1546; CHECK: fmul.2d v0, v0, v1[0] 1547 %vecinit.i = insertelement <2 x double> undef, double %y, i32 0 1548 %vecinit1.i = insertelement <2 x double> 
%vecinit.i, double %y, i32 1
  %mul.i = fmul <2 x double> %vecinit1.i, %x
  ret <2 x double> %mul.i
}

; Multiplies %x by a vector built by inserting scalar %y into all four lanes.
; The directives require the by-element fmul.4s form with no dup.4s.
define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmulq_n_f32:
; CHECK-NOT: dup.4s
; CHECK: fmul.4s v0, v0, v1[0]
  %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
  %mul.i = fmul <4 x float> %vecinit3.i, %x
  ret <4 x float> %mul.i
}

; Two-lane variant of vmulq_n_f32; expects fmul.2s by element with no dup.2s.
define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmul_n_f32:
; CHECK-NOT: dup.2s
; CHECK: fmul.2s v0, v0, v1[0]
  %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
  %mul.i = fmul <2 x float> %vecinit1.i, %x
  ret <2 x float> %mul.i
}

; Multiplies %b by a splat of lane 6 of the 8-lane vector %c and adds %a.
; Expects mla.4h indexing v2[6] directly, with no ext to move the lane first.
define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmla_laneq_s16_test:
; CHECK-NOT: ext
; CHECK: mla.4h v0, v1, v2[6]
; CHECK-NEXT: ret
  %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %mul = mul <4 x i16> %shuffle, %b
  %add = add <4 x i16> %mul, %a
  ret <4 x i16> %add
}

; Same as vmla_laneq_s16_test with 32-bit elements and lane 3 of %c;
; expects mla.2s indexing v2[3] with no ext.
define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmla_laneq_s32_test:
; CHECK-NOT: ext
; CHECK: mla.2s v0, v1, v2[3]
; CHECK-NEXT: ret
  %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
  %mul = mul <2 x i32> %shuffle, %b
  %add = add <2 x i32> %mul, %a
  ret <2 x i32> %add
}

; Takes the high half of %c (lanes 4-7), then splats lane 1 of that half to
; eight lanes, i.e. lane 5 of %c. Expects the two shuffles to fold into a
; single mla.8h indexing v2[5] with no ext.
define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: not_really_vmlaq_laneq_s16_test:
; CHECK-NOT: ext
; CHECK: mla.8h v0, v1, v2[5]
; CHECK-NEXT: ret
  %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %mul = mul <8 x i16> %shuffle2, %b
  %add = add <8 x i16> %mul, %a
  ret <8 x i16> %add
}

; 32-bit variant: lane 1 of the high half of %c (lanes 2-3) is lane 3 of %c;
; expects mla.4s indexing v2[3] with no ext.
define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: not_really_vmlaq_laneq_s32_test:
; CHECK-NOT: ext
; CHECK: mla.4s v0, v1, v2[3]
; CHECK-NEXT: ret
  %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %mul = mul <4 x i32> %shuffle2, %b
  %add = add <4 x i32> %mul, %a
  ret <4 x i32> %add
}

; Signed widening multiply of %a by a splat of lane 6 of the 8-lane vector %b;
; expects smull.4s indexing v1[6] with no ext.
define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmull_laneq_s16_test:
; CHECK-NOT: ext
; CHECK: smull.4s v0, v0, v1[6]
; CHECK-NEXT: ret
  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
  ret <4 x i32> %vmull2.i
}

; Signed widening multiply of %a by a splat of lane 2 of the 4-lane vector %b;
; expects smull.2d indexing v1[2] with no ext.
define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmull_laneq_s32_test:
; CHECK-NOT: ext
; CHECK: smull.2d v0, v0, v1[2]
; CHECK-NEXT: ret
  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
  ret <2 x i64> %vmull2.i
}
; Unsigned counterpart of vmull_laneq_s16_test; expects umull.4s indexing v1[6].
define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmull_laneq_u16_test:
; CHECK-NOT: ext
; CHECK: umull.4s v0, v0, v1[6]
; CHECK-NEXT: ret
  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
  ret <4 x i32> %vmull2.i
}

; Unsigned counterpart of vmull_laneq_s32_test; expects umull.2d indexing v1[2].
define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmull_laneq_u32_test:
; CHECK-NOT: ext
; CHECK: umull.2d v0, v0, v1[2]
; CHECK-NEXT: ret
  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
  ret <2 x i64> %vmull2.i
}

; Signed widening multiply of the high half of %b (element 1 of the <2 x i64>
; view) by a vector holding %d truncated to i16 in all four lanes. Expects a
; smull2.4s followed by ret, with no ext to extract the high half.
define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; CHECK-LABEL: vmull_high_n_s16_test:
; CHECK-NOT: ext
; CHECK: smull2.4s
; CHECK-NEXT: ret
  %conv = trunc i32 %d to i16
  %0 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
  ret <4 x i32> %vmull2.i.i
}

define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; 
CHECK: vmull_high_n_s32_test 1691; CHECK-NOT: ext 1692; CHECK: smull2.2d 1693; CHECK-NEXT: ret 1694 %0 = bitcast <4 x i32> %b to <2 x i64> 1695 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1696 %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1697 %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0 1698 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1 1699 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind 1700 ret <2 x i64> %vmull2.i.i 1701} 1702 1703define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { 1704entry: 1705; CHECK: vmull_high_n_u16_test 1706; CHECK-NOT: ext 1707; CHECK: umull2.4s 1708; CHECK-NEXT: ret 1709 %conv = trunc i32 %d to i16 1710 %0 = bitcast <8 x i16> %b to <2 x i64> 1711 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1712 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 1713 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0 1714 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1 1715 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2 1716 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3 1717 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind 1718 ret <4 x i32> %vmull2.i.i 1719} 1720 1721define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp { 1722entry: 1723; CHECK: vmull_high_n_u32_test 1724; CHECK-NOT: ext 1725; CHECK: umull2.2d 1726; CHECK-NEXT: ret 1727 %0 = bitcast <4 x i32> %b to <2 x i64> 1728 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1729 %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1730 %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0 1731 %vecinit1.i = insertelement <2 x i32> %vecinit.i, 
i32 %d, i32 1 1732 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind 1733 ret <2 x i64> %vmull2.i.i 1734} 1735 1736define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) { 1737; CHECK-LABEL: vmul_built_dup_test: 1738; CHECK-NOT: ins 1739; CHECK-NOT: dup 1740; CHECK: mul.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[1] 1741 %vget_lane = extractelement <4 x i32> %b, i32 1 1742 %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0 1743 %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1 1744 %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2 1745 %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3 1746 %prod = mul <4 x i32> %a, %vecinit3.i 1747 ret <4 x i32> %prod 1748} 1749 1750define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) { 1751; CHECK-LABEL: vmul_built_dup_fromsmall_test: 1752; CHECK-NOT: ins 1753; CHECK-NOT: dup 1754; CHECK: mul.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[3] 1755 %vget_lane = extractelement <4 x i16> %b, i32 3 1756 %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 1757 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 1758 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 1759 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 1760 %prod = mul <4 x i16> %a, %vecinit3.i 1761 ret <4 x i16> %prod 1762} 1763 1764define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) { 1765; CHECK-LABEL: vmulq_built_dup_fromsmall_test: 1766; CHECK-NOT: ins 1767; CHECK-NOT: dup 1768; CHECK: mul.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] 1769 %vget_lane = extractelement <4 x i16> %b, i32 0 1770 %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0 1771 %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1 1772 %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 
%vget_lane, i32 2 1773 %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3 1774 %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4 1775 %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5 1776 %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6 1777 %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7 1778 %prod = mul <8 x i16> %a, %vecinit7.i 1779 ret <8 x i16> %prod 1780} 1781 1782define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) { 1783; CHECK-LABEL: mull_from_two_extracts: 1784; CHECK-NOT: ext 1785; CHECK: sqdmull2.2d 1786 1787 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1788 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1789 1790 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1791 ret <2 x i64> %res 1792} 1793 1794define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { 1795; CHECK-LABEL: mlal_from_two_extracts: 1796; CHECK-NOT: ext 1797; CHECK: sqdmlal2.2d 1798 1799 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1800 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1801 1802 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1803 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res) 1804 ret <2 x i64> %sum 1805} 1806 1807define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 1808; CHECK-LABEL: mull_from_extract_dup: 1809; CHECK-NOT: ext 1810; CHECK: sqdmull2.2d 1811 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1812 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1813 1814 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 
1815 1816 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 1817 ret <2 x i64> %res 1818} 1819 1820define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) { 1821; CHECK-LABEL: pmull_from_extract_dup: 1822; CHECK-NOT: ext 1823; CHECK: pmull2.8h 1824 %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 1825 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 1826 1827 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1828 1829 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind 1830 ret <8 x i16> %res 1831} 1832 1833define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) { 1834; CHECK-LABEL: pmull_from_extract_duplane: 1835; CHECK-NOT: ext 1836; CHECK: pmull2.8h 1837 1838 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1839 %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 1840 1841 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind 1842 ret <8 x i16> %res 1843} 1844 1845define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) { 1846; CHECK-LABEL: sqdmull_from_extract_duplane: 1847; CHECK-NOT: ext 1848; CHECK: sqdmull2.2d 1849 1850 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1851 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> 1852 1853 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1854 ret <2 x i64> %res 1855} 1856 1857define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, 
<4 x i32> %rhs) { 1858; CHECK-LABEL: sqdmlal_from_extract_duplane: 1859; CHECK-NOT: ext 1860; CHECK: sqdmlal2.2d 1861 1862 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1863 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> 1864 1865 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1866 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res) 1867 ret <2 x i64> %sum 1868} 1869 1870define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { 1871; CHECK-LABEL: umlal_from_extract_duplane: 1872; CHECK-NOT: ext 1873; CHECK: umlal2.2d 1874 1875 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1876 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> 1877 1878 %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1879 %sum = add <2 x i64> %accum, %res 1880 ret <2 x i64> %sum 1881} 1882 1883define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) { 1884; CHECK-LABEL: scalar_fmla_from_extract_v4f32: 1885; CHECK: fmla.s s0, s1, v2[3] 1886 %rhs = extractelement <4 x float> %rvec, i32 3 1887 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) 1888 ret float %res 1889} 1890 1891define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) { 1892; CHECK-LABEL: scalar_fmla_from_extract_v2f32: 1893; CHECK: fmla.s s0, s1, v2[1] 1894 %rhs = extractelement <2 x float> %rvec, i32 1 1895 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) 1896 ret float %res 1897} 1898 1899define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) { 1900; CHECK-LABEL: scalar_fmls_from_extract_v4f32: 1901; CHECK: fmls.s s0, s1, v2[3] 1902 %rhs.scal = 
extractelement <4 x float> %rvec, i32 3 1903 %rhs = fsub float -0.0, %rhs.scal 1904 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) 1905 ret float %res 1906} 1907 1908define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) { 1909; CHECK-LABEL: scalar_fmls_from_extract_v2f32: 1910; CHECK: fmls.s s0, s1, v2[1] 1911 %rhs.scal = extractelement <2 x float> %rvec, i32 1 1912 %rhs = fsub float -0.0, %rhs.scal 1913 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) 1914 ret float %res 1915} 1916 1917declare float @llvm.fma.f32(float, float, float) 1918 1919define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) { 1920; CHECK-LABEL: scalar_fmla_from_extract_v2f64: 1921; CHECK: fmla.d d0, d1, v2[1] 1922 %rhs = extractelement <2 x double> %rvec, i32 1 1923 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum) 1924 ret double %res 1925} 1926 1927define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) { 1928; CHECK-LABEL: scalar_fmls_from_extract_v2f64: 1929; CHECK: fmls.d d0, d1, v2[1] 1930 %rhs.scal = extractelement <2 x double> %rvec, i32 1 1931 %rhs = fsub double -0.0, %rhs.scal 1932 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum) 1933 ret double %res 1934} 1935 1936declare double @llvm.fma.f64(double, double, double) 1937 1938define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) { 1939; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32: 1940; CHECK: fmls.2s v0, v1, v2[3] 1941 %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs 1942 %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3> 1943 %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum) 1944 ret <2 x float> %res 1945} 1946 1947define <2 x float> 
@fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) { 1948; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1: 1949; CHECK: fmls.2s v0, v1, v2[1] 1950 %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs 1951 %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1> 1952 %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum) 1953 ret <2 x float> %res 1954} 1955 1956define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) { 1957; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32: 1958; CHECK: fmls.4s v0, v1, v2[3] 1959 %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs 1960 %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1961 %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum) 1962 ret <4 x float> %res 1963} 1964 1965define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) { 1966; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1: 1967; CHECK: fmls.4s v0, v1, v2[1] 1968 %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs 1969 %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1970 %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum) 1971 ret <4 x float> %res 1972} 1973 1974define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) { 1975; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64: 1976; CHECK: fmls.2d v0, v1, v2[1] 1977 %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs 1978 %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1> 1979 %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, 
<2 x double> %splat, <2 x double> %accum) 1980 ret <2 x double> %res 1981} 1982 1983define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind { 1984; CHECK-LABEL: test_fmul_v1f64: 1985; CHECK: fmul 1986 %prod = fmul <1 x double> %L, %R 1987 ret <1 x double> %prod 1988} 1989 1990define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind { 1991; CHECK-LABEL: test_fdiv_v1f64: 1992; CHECK-LABEL: fdiv 1993 %prod = fdiv <1 x double> %L, %R 1994 ret <1 x double> %prod 1995} 1996 1997define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind { 1998;CHECK-LABEL: sqdmlal_d: 1999;CHECK: sqdmlal 2000 %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B) 2001 %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4) 2002 ret i64 %tmp5 2003} 2004 2005define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind { 2006;CHECK-LABEL: sqdmlsl_d: 2007;CHECK: sqdmlsl 2008 %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B) 2009 %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4) 2010 ret i64 %tmp5 2011} 2012 2013define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind { 2014; CHECK-LABEL: test_pmull_64: 2015; CHECK: pmull.1q 2016 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r) 2017 ret <16 x i8> %val 2018} 2019 2020define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { 2021; CHECK-LABEL: test_pmull_high_64: 2022; CHECK: pmull2.1q 2023 %l_hi = extractelement <2 x i64> %l, i32 1 2024 %r_hi = extractelement <2 x i64> %r, i32 1 2025 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi) 2026 ret <16 x i8> %val 2027} 2028 2029declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) 2030 2031define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind { 2032; CHECK-LABEL: test_mul_v1i64: 2033; CHECK: mul 2034 %prod = mul <1 x i64> %lhs, %rhs 2035 ret <1 x i64> %prod 2036} 2037