; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s

define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn8b:
;CHECK: addhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn4h:
;CHECK: addhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2s:
;CHECK: addhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: addhn2_16b:
;CHECK: addhn.8b
;CHECK-NEXT: addhn2.16b
  %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: addhn2_8h:
;CHECK: addhn.4h
;CHECK-NEXT: addhn2.8h
  %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: addhn2_4s:
;CHECK: addhn.2s
;CHECK-NEXT: addhn2.4s
  %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

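; RADDHN / RADDHN2: rounding add returning the high (most significant) half of
; each sum, exercised via the @llvm.aarch64.neon.raddhn intrinsics.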
define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: raddhn8b:
;CHECK: raddhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: raddhn4h:
;CHECK: raddhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: raddhn2s:
;CHECK: raddhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: raddhn2_16b:
;CHECK: raddhn.8b
;CHECK-NEXT: raddhn2.16b
  %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: raddhn2_8h:
;CHECK: raddhn.4h
;CHECK-NEXT: raddhn2.8h
  %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: raddhn2_4s:
;CHECK: raddhn.2s
;CHECK-NEXT: raddhn2.4s
  %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

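; SADDL / SADDL2: signed add long. The expected IR pattern is a sign extension
; of each (low or high) half followed by a plain add.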
define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: saddl8h:
;CHECK: saddl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: saddl4s:
;CHECK: saddl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: saddl2d:
;CHECK: saddl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: saddl2_8h:
; CHECK-NEXT: saddl2.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
  %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
  %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <8 x i16> %add.i
}

define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: saddl2_4s:
; CHECK-NEXT: saddl2.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <8 x i16> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
  %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
  %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <4 x i32> %add.i
}

define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: saddl2_2d:
; CHECK-NEXT: saddl2.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <4 x i32> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
  %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
  %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <2 x i64> %add.i
}

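; UADDL / UADDL2: unsigned add long, formed from zero extensions of both
; operands followed by an add.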
define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uaddl8h:
;CHECK: uaddl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uaddl4s:
;CHECK: uaddl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uaddl2d:
;CHECK: uaddl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: uaddl2_8h:
; CHECK-NEXT: uaddl2.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
  %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
  %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <8 x i16> %add.i
}

define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: uaddl2_4s:
; CHECK-NEXT: uaddl2.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <8 x i16> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
  %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
  %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <4 x i32> %add.i
}

define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: uaddl2_2d:
; CHECK-NEXT: uaddl2.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <4 x i32> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
  %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
  %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <2 x i64> %add.i
}

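; UADDW / UADDW2: unsigned add wide. Only the narrower operand is
; zero-extended before the add.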
define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uaddw8h:
;CHECK: uaddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uaddw4s:
;CHECK: uaddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uaddw2d:
;CHECK: uaddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uaddw2_8h:
;CHECK: uaddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = zext <8 x i8> %high2 to <8 x i16>

  %res = add <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uaddw2_4s:
;CHECK: uaddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = zext <4 x i16> %high2 to <4 x i32>

  %res = add <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uaddw2_2d:
;CHECK: uaddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = zext <2 x i32> %high2 to <2 x i64>

  %res = add <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}

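; SADDW / SADDW2: signed add wide, the sign-extending counterpart of the
; uaddw tests above.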
define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: saddw8h:
;CHECK: saddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: saddw4s:
;CHECK: saddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: saddw2d:
;CHECK: saddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: saddw2_8h:
;CHECK: saddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = sext <8 x i8> %high2 to <8 x i16>

  %res = add <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: saddw2_4s:
;CHECK: saddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = sext <4 x i16> %high2 to <4 x i32>

  %res = add <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: saddw2_2d:
;CHECK: saddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = sext <2 x i32> %high2 to <2 x i64>

  %res = add <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}

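; SADDLP / UADDLP: signed and unsigned add-long-pairwise intrinsics, which add
; adjacent element pairs into elements twice as wide.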
define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp4h:
;CHECK: saddlp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp3
}

define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp2s:
;CHECK: saddlp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp3
}

define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp1d:
;CHECK: saddlp.1d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp3
}

define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp8h:
;CHECK: saddlp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp3
}

define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp4s:
;CHECK: saddlp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp3
}

define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp2d:
;CHECK: saddlp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone

define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp4h:
;CHECK: uaddlp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp3
}

define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp2s:
;CHECK: uaddlp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp3
}

define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp1d:
;CHECK: uaddlp.1d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp3
}

define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp8h:
;CHECK: uaddlp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp3
}

define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp4s:
;CHECK: uaddlp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp3
}

define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp2d:
;CHECK: uaddlp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone

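; SADALP / UADALP: pairwise add-long with accumulation, selected from an
; *addlp intrinsic whose result is added to the second operand.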
define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp4h:
;CHECK: sadalp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
  %tmp4 = load <4 x i16>, <4 x i16>* %B
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp2s:
;CHECK: sadalp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
  %tmp4 = load <2 x i32>, <2 x i32>* %B
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp8h:
;CHECK: sadalp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
  %tmp4 = load <8 x i16>, <8 x i16>* %B
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp4s:
;CHECK: sadalp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
  %tmp4 = load <4 x i32>, <4 x i32>* %B
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: sadalp2d:
;CHECK: sadalp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
  %tmp4 = load <2 x i64>, <2 x i64>* %B
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp4h:
;CHECK: uadalp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
  %tmp4 = load <4 x i16>, <4 x i16>* %B
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp2s:
;CHECK: uadalp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
  %tmp4 = load <2 x i32>, <2 x i32>* %B
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp8h:
;CHECK: uadalp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
  %tmp4 = load <8 x i16>, <8 x i16>* %B
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp4s:
;CHECK: uadalp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
  %tmp4 = load <4 x i32>, <4 x i32>* %B
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: uadalp2d:
;CHECK: uadalp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
  %tmp4 = load <2 x i64>, <2 x i64>* %B
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

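; ADDP / FADDP: integer and floating-point pairwise add intrinsics.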
define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: addp_8b:
;CHECK: addp.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: addp_16b:
;CHECK: addp.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: addp_4h:
;CHECK: addp.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addp_8h:
;CHECK: addp.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: addp_2s:
;CHECK: addp.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addp_4s:
;CHECK: addp.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addp_2d:
;CHECK: addp.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i64> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: faddp_2s:
;CHECK: faddp.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: faddp_4s:
;CHECK: faddp.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: faddp_2d:
;CHECK: faddp.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone

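; Widening adds and subs where one operand is a duplicated scalar: the dup
; should fold into the long-form instruction and no ext.16b should be needed
; to reach the high half.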
define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl_duprhs
; CHECK-NOT: ext.16b
; CHECK: uaddl.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl2_duprhs
; CHECK-NOT: ext.16b
; CHECK: uaddl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl_duplhs
; CHECK-NOT: ext.16b
; CHECK: saddl.2d
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl2_duplhs
; CHECK-NOT: ext.16b
; CHECK: saddl2.2d
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl_duprhs
; CHECK-NOT: ext.16b
; CHECK: usubl.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl2_duprhs
; CHECK-NOT: ext.16b
; CHECK: usubl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl.2d
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl2.2d
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

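; "Natural" ADDHN/SUBHN selection: an add or sub, a logical shift right by
; half the element width, and a trunc, with no intrinsic calls. The *2
; variants also concatenate the narrowed result with a low half via
; shufflevector.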
define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn8b_natural:
;CHECK: addhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %sum = add <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  ret <8 x i8> %narrowed
}

define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn4h_natural:
;CHECK: addhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %sum = add <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  ret <4 x i16> %narrowed
}

define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2s_natural:
;CHECK: addhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %sum = add <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  ret <2 x i32> %narrowed
}

define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn2_16b_natural:
;CHECK: addhn2.16b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %sum = add <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn2_8h_natural:
;CHECK: addhn2.8h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %sum = add <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2_4s_natural:
;CHECK: addhn2.4s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %sum = add <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn8b_natural:
;CHECK: subhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %diff = sub <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  ret <8 x i8> %narrowed
}

define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn4h_natural:
;CHECK: subhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %diff = sub <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  ret <4 x i16> %narrowed
}

define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2s_natural:
;CHECK: subhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %diff = sub <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  ret <2 x i32> %narrowed
}

define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn2_16b_natural:
;CHECK: subhn2.16b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %diff = sub <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn2_8h_natural:
;CHECK: subhn2.8h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %diff = sub <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2_4s_natural:
;CHECK: subhn2.4s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %diff = sub <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}