; Codegen test: the llvm.arm.neon.vqdmulh / vqrdmulh / vqdmull intrinsics, and
; vqdmull combined with vqadds / vqsubs, are expected to be emitted as the
; matching NEON instructions, including the by-lane operand form.
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"

; ---------------------------------------------------------------------------
; vqdmulh: both operands loaded from memory, 64-bit and 128-bit vectors.
; ---------------------------------------------------------------------------

define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vqdmulhs16:
; CHECK: vqdmulh.s16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vqdmulhs32:
; CHECK: vqdmulh.s32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vqdmulhQs16:
; CHECK: vqdmulh.s16
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vqdmulhQs32:
; CHECK: vqdmulh.s32
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; vqdmulh by lane: the second operand is a shufflevector that repeats element 1
; of a 64-bit vector; the expected output uses the indexed operand (dN[1]).
; Function names are matched as labels (with the trailing colon) so that each
; instruction pattern can only match inside its own function.

define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmulhQ_lanes16:
; CHECK: vqdmulh.s16 q0, q0, d2[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0)
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmulhQ_lanes32:
; CHECK: vqdmulh.s32 q0, q0, d2[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0)
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmulh_lanes16:
; CHECK: vqdmulh.s16 d0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0)
  ret <4 x i16> %1
}

define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmulh_lanes32:
; CHECK: vqdmulh.s32 d0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0)
  ret <2 x i32> %1
}

declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; ---------------------------------------------------------------------------
; vqrdmulh: same shapes as the vqdmulh tests above.
; ---------------------------------------------------------------------------

define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vqrdmulhs16:
; CHECK: vqrdmulh.s16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vqrdmulhs32:
; CHECK: vqrdmulh.s32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vqrdmulhQs16:
; CHECK: vqrdmulh.s16
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vqrdmulhQs32:
; CHECK: vqrdmulh.s32
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; vqrdmulh by lane: element 1 of the second argument is repeated across the
; vector and the indexed operand form is expected.

define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqRdmulhQ_lanes16:
; CHECK: vqrdmulh.s16 q0, q0, d2[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0)
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqRdmulhQ_lanes32:
; CHECK: vqrdmulh.s32 q0, q0, d2[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0)
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqRdmulh_lanes16:
; CHECK: vqrdmulh.s16 d0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0)
  ret <4 x i16> %1
}

define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqRdmulh_lanes32:
; CHECK: vqrdmulh.s32 d0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0)
  ret <2 x i32> %1
}

declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; ---------------------------------------------------------------------------
; vqdmull: 64-bit inputs, result elements twice as wide as the inputs.
; ---------------------------------------------------------------------------

define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vqdmulls16:
; CHECK: vqdmull.s16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vqdmulls32:
; CHECK: vqdmull.s32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmull_lanes16:
; CHECK: vqdmull.s16 q0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0)
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmull_lanes32:
; CHECK: vqdmull.s32 q0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0)
  ret <2 x i64> %1
}

declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

; ---------------------------------------------------------------------------
; vqdmlal: a vqdmull result fed into vqadds is expected to produce vqdmlal.
; ---------------------------------------------------------------------------

define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: vqdmlals16_natural:
; CHECK: vqdmlal.s16
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
  %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: vqdmlals32_natural:
; CHECK: vqdmlal.s32
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
  %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmlal_lanes16_natural:
; CHECK: vqdmlal.s16 q0, d2, d3[1]
  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
  %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
  ret <4 x i32> %2
}

define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmlal_lanes32_natural:
; CHECK: vqdmlal.s32 q0, d2, d3[1]
  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
  %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
  ret <2 x i64> %2
}

declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

; ---------------------------------------------------------------------------
; vqdmlsl: a vqdmull result fed into vqsubs is expected to produce vqdmlsl.
; ---------------------------------------------------------------------------

define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: vqdmlsls16_natural:
; CHECK: vqdmlsl.s16
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
  %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: vqdmlsls32_natural:
; CHECK: vqdmlsl.s32
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
  %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmlsl_lanes16_natural:
; CHECK: vqdmlsl.s16 q0, d2, d3[1]
  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
  %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
  ret <4 x i32> %2
}

define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vqdmlsl_lanes32_natural:
; CHECK: vqdmlsl.s32 q0, d2, d3[1]
  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
  %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
  ret <2 x i64> %2
}

; External declarations of the llvm.arm.neon.vqsubs intrinsic for the two
; result types referenced by the vqdmlsl tests in this file.
declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone