; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast \
; RUN: < %s -verify-machineinstrs -asm-verbose=false | FileCheck %s

; Instruction-selection tests for NEON "by scalar" operations.
;
; Integer cases: the upper half of a 128-bit vector (taken with a
; shufflevector) is combined with a scalar that has been broadcast to every
; lane. The expected assembly materialises the broadcast once at full width
; (dup from w0, or movi/mvni for a constant) and then uses the "2"
; (high-half) form of the long instruction.
;
; Floating-point cases: a broadcast scalar is multiplied / fused with a
; vector, and the expected assembly is the by-element form of fmul/fmla/fmls.

; ---------------------------------------------------------------------------
; Widening multiply: smull2 / umull2
; ---------------------------------------------------------------------------

define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  ; Lanes 4..7 of %a are the high half.
  %hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; Broadcast %b into all four lanes.
  %splat0 = insertelement <4 x i16> undef, i16 %b, i32 0
  %splat1 = insertelement <4 x i16> %splat0, i16 %b, i32 1
  %splat2 = insertelement <4 x i16> %splat1, i16 %b, i32 2
  %splat3 = insertelement <4 x i16> %splat2, i16 %b, i32 3
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %hi, <4 x i16> %splat3)
  ret <4 x i32> %prod
}

; Constant splat of 29: expected as a plain movi.
define <4 x i32> @test_vmull_high_n_s16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %hi, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  ret <4 x i32> %prod
}

define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  ; Lanes 2..3 of %a are the high half.
  %hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat0 = insertelement <2 x i32> undef, i32 %b, i32 0
  %splat1 = insertelement <2 x i32> %splat0, i32 %b, i32 1
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %hi, <2 x i32> %splat1)
  ret <2 x i64> %prod
}

; Constant splat of 511 (0x1FF): expected as movi #1 with a ones-filling
; shift (msl #8).
define <2 x i64> @test_vmull_high_n_s32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %hi, <2 x i32> <i32 511, i32 511>)
  ret <2 x i64> %prod
}

define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_u16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat0 = insertelement <4 x i16> undef, i16 %b, i32 0
  %splat1 = insertelement <4 x i16> %splat0, i16 %b, i32 1
  %splat2 = insertelement <4 x i16> %splat1, i16 %b, i32 2
  %splat3 = insertelement <4 x i16> %splat2, i16 %b, i32 3
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %hi, <4 x i16> %splat3)
  ret <4 x i32> %prod
}

; Constant splat of 4352 (0x1100): expected as movi #17 shifted left by 8.
define <4 x i32> @test_vmull_high_n_u16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_u16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %hi, <4 x i16> <i16 4352, i16 4352, i16 4352, i16 4352>)
  ret <4 x i32> %prod
}

define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_u32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat0 = insertelement <2 x i32> undef, i32 %b, i32 0
  %splat1 = insertelement <2 x i32> %splat0, i32 %b, i32 1
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %hi, <2 x i32> %splat1)
  ret <2 x i64> %prod
}

; Constant splat of 4294966784 (0xFFFFFE00, the bitwise inverse of 0x1FF):
; expected as the inverted-immediate form, mvni #1 with msl #8.
define <2 x i64> @test_vmull_high_n_u32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_u32_imm:
; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %hi, <2 x i32> <i32 4294966784, i32 4294966784>)
  ret <2 x i64> %prod
}

; ---------------------------------------------------------------------------
; Saturating doubling widening multiply: sqdmull2
; ---------------------------------------------------------------------------

define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat0 = insertelement <4 x i16> undef, i16 %b, i32 0
  %splat1 = insertelement <4 x i16> %splat0, i16 %b, i32 1
  %splat2 = insertelement <4 x i16> %splat1, i16 %b, i32 2
  %splat3 = insertelement <4 x i16> %splat2, i16 %b, i32 3
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi, <4 x i16> %splat3)
  ret <4 x i32> %prod
}

; Constant splat of 61183 (0xEEFF, the bitwise inverse of 0x1100):
; expected as mvni #17 shifted left by 8.
define <4 x i32> @test_vqdmull_high_n_s16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s16_imm:
; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi, <4 x i16> <i16 61183, i16 61183, i16 61183, i16 61183>)
  ret <4 x i32> %prod
}

define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat0 = insertelement <2 x i32> undef, i32 %b, i32 0
  %splat1 = insertelement <2 x i32> %splat0, i32 %b, i32 1
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi, <2 x i32> %splat1)
  ret <2 x i64> %prod
}

define <2 x i64> @test_vqdmull_high_n_s32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi, <2 x i32> <i32 29, i32 29>)
  ret <2 x i64> %prod
}

; ---------------------------------------------------------------------------
; Widening multiply-accumulate: smlal2 / umlal2
; The IR is a widening multiply followed by a plain vector add with %a; the
; expected output is the single fused instruction.
; ---------------------------------------------------------------------------

define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat0 = insertelement <4 x i16> undef, i16 %c, i32 0
  %splat1 = insertelement <4 x i16> %splat0, i16 %c, i32 1
  %splat2 = insertelement <4 x i16> %splat1, i16 %c, i32 2
  %splat3 = insertelement <4 x i16> %splat2, i16 %c, i32 3
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %hi, <4 x i16> %splat3)
  %sum = add <4 x i32> %prod, %a
  ret <4 x i32> %sum
}

define <4 x i32> @test_vmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %hi, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %sum = add <4 x i32> %prod, %a
  ret <4 x i32> %sum
}

define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat0 = insertelement <2 x i32> undef, i32 %c, i32 0
  %splat1 = insertelement <2 x i32> %splat0, i32 %c, i32 1
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %hi, <2 x i32> %splat1)
  %sum = add <2 x i64> %prod, %a
  ret <2 x i64> %sum
}

define <2 x i64> @test_vmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %hi, <2 x i32> <i32 29, i32 29>)
  %sum = add <2 x i64> %prod, %a
  ret <2 x i64> %sum
}

define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_u16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat0 = insertelement <4 x i16> undef, i16 %c, i32 0
  %splat1 = insertelement <4 x i16> %splat0, i16 %c, i32 1
  %splat2 = insertelement <4 x i16> %splat1, i16 %c, i32 2
  %splat3 = insertelement <4 x i16> %splat2, i16 %c, i32 3
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %hi, <4 x i16> %splat3)
  %sum = add <4 x i32> %prod, %a
  ret <4 x i32> %sum
}

define <4 x i32> @test_vmlal_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_u16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %hi, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %sum = add <4 x i32> %prod, %a
  ret <4 x i32> %sum
}

define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_u32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat0 = insertelement <2 x i32> undef, i32 %c, i32 0
  %splat1 = insertelement <2 x i32> %splat0, i32 %c, i32 1
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %hi, <2 x i32> %splat1)
  %sum = add <2 x i64> %prod, %a
  ret <2 x i64> %sum
}

define <2 x i64> @test_vmlal_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_u32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %hi, <2 x i32> <i32 29, i32 29>)
  %sum = add <2 x i64> %prod, %a
  ret <2 x i64> %sum
}

; ---------------------------------------------------------------------------
; Saturating doubling multiply-accumulate: sqdmlal2
; The IR is an sqdmull call followed by an sqadd call with %a as the first
; operand.
; ---------------------------------------------------------------------------

define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat0 = insertelement <4 x i16> undef, i16 %c, i32 0
  %splat1 = insertelement <4 x i16> %splat0, i16 %c, i32 1
  %splat2 = insertelement <4 x i16> %splat1, i16 %c, i32 2
  %splat3 = insertelement <4 x i16> %splat2, i16 %c, i32 3
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi, <4 x i16> %splat3)
  %sat = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %prod)
  ret <4 x i32> %sat
}

define <4 x i32> @test_vqdmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %sat = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %prod)
  ret <4 x i32> %sat
}

define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat0 = insertelement <2 x i32> undef, i32 %c, i32 0
  %splat1 = insertelement <2 x i32> %splat0, i32 %c, i32 1
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi, <2 x i32> %splat1)
  %sat = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %prod)
  ret <2 x i64> %sat
}

define <2 x i64> @test_vqdmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi, <2 x i32> <i32 29, i32 29>)
  %sat = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %prod)
  ret <2 x i64> %sat
}

; ---------------------------------------------------------------------------
; Widening multiply-subtract: smlsl2 / umlsl2
; The IR is a widening multiply whose result is subtracted from %a.
; ---------------------------------------------------------------------------

define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat0 = insertelement <4 x i16> undef, i16 %c, i32 0
  %splat1 = insertelement <4 x i16> %splat0, i16 %c, i32 1
  %splat2 = insertelement <4 x i16> %splat1, i16 %c, i32 2
  %splat3 = insertelement <4 x i16> %splat2, i16 %c, i32 3
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %hi, <4 x i16> %splat3)
  %diff = sub <4 x i32> %a, %prod
  ret <4 x i32> %diff
}

define <4 x i32> @test_vmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %hi, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %diff = sub <4 x i32> %a, %prod
  ret <4 x i32> %diff
}

define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat0 = insertelement <2 x i32> undef, i32 %c, i32 0
  %splat1 = insertelement <2 x i32> %splat0, i32 %c, i32 1
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %hi, <2 x i32> %splat1)
  %diff = sub <2 x i64> %a, %prod
  ret <2 x i64> %diff
}

define <2 x i64> @test_vmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %hi, <2 x i32> <i32 29, i32 29>)
  %diff = sub <2 x i64> %a, %prod
  ret <2 x i64> %diff
}

define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat0 = insertelement <4 x i16> undef, i16 %c, i32 0
  %splat1 = insertelement <4 x i16> %splat0, i16 %c, i32 1
  %splat2 = insertelement <4 x i16> %splat1, i16 %c, i32 2
  %splat3 = insertelement <4 x i16> %splat2, i16 %c, i32 3
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %hi, <4 x i16> %splat3)
  %diff = sub <4 x i32> %a, %prod
  ret <4 x i32> %diff
}

define <4 x i32> @test_vmlsl_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %hi, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %diff = sub <4 x i32> %a, %prod
  ret <4 x i32> %diff
}

define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat0 = insertelement <2 x i32> undef, i32 %c, i32 0
  %splat1 = insertelement <2 x i32> %splat0, i32 %c, i32 1
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %hi, <2 x i32> %splat1)
  %diff = sub <2 x i64> %a, %prod
  ret <2 x i64> %diff
}

define <2 x i64> @test_vmlsl_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %hi, <2 x i32> <i32 29, i32 29>)
  %diff = sub <2 x i64> %a, %prod
  ret <2 x i64> %diff
}

; ---------------------------------------------------------------------------
; Saturating doubling multiply-subtract: sqdmlsl2
; The IR is an sqdmull call followed by an sqsub call with %a as the first
; operand.
; ---------------------------------------------------------------------------

define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat0 = insertelement <4 x i16> undef, i16 %c, i32 0
  %splat1 = insertelement <4 x i16> %splat0, i16 %c, i32 1
  %splat2 = insertelement <4 x i16> %splat1, i16 %c, i32 2
  %splat3 = insertelement <4 x i16> %splat2, i16 %c, i32 3
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi, <4 x i16> %splat3)
  %sat = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %prod)
  ret <4 x i32> %sat
}

define <4 x i32> @test_vqdmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %sat = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %prod)
  ret <4 x i32> %sat
}

define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat0 = insertelement <2 x i32> undef, i32 %c, i32 0
  %splat1 = insertelement <2 x i32> %splat0, i32 %c, i32 1
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi, <2 x i32> %splat1)
  %sat = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %prod)
  ret <2 x i64> %sat
}

define <2 x i64> @test_vqdmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi, <2 x i32> <i32 29, i32 29>)
  %sat = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %prod)
  ret <2 x i64> %sat
}

; ---------------------------------------------------------------------------
; Floating-point multiply by a broadcast scalar.
; No separate dup is expected: the pattern requires the by-element fmul
; reading lane 0 directly after the label.
; ---------------------------------------------------------------------------

define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
; CHECK-LABEL: test_vmul_n_f32:
; CHECK-NEXT: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %splat0 = insertelement <2 x float> undef, float %b, i32 0
  %splat1 = insertelement <2 x float> %splat0, float %b, i32 1
  %prod = fmul <2 x float> %splat1, %a
  ret <2 x float> %prod
}

define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
; CHECK-LABEL: test_vmulq_n_f32:
; CHECK-NEXT: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %splat0 = insertelement <4 x float> undef, float %b, i32 0
  %splat1 = insertelement <4 x float> %splat0, float %b, i32 1
  %splat2 = insertelement <4 x float> %splat1, float %b, i32 2
  %splat3 = insertelement <4 x float> %splat2, float %b, i32 3
  %prod = fmul <4 x float> %splat3, %a
  ret <4 x float> %prod
}

define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
; CHECK-LABEL: test_vmulq_n_f64:
; CHECK-NEXT: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
  %splat0 = insertelement <2 x double> undef, double %b, i32 0
  %splat1 = insertelement <2 x double> %splat0, double %b, i32 1
  %prod = fmul <2 x double> %splat1, %a
  ret <2 x double> %prod
}

; ---------------------------------------------------------------------------
; Fused multiply-add / multiply-subtract with a broadcast scalar.
; The patterns below accept any lane index on the by-element operand.
; ---------------------------------------------------------------------------

define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfma_n_f32:
; CHECK-NEXT: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  %splat0 = insertelement <2 x float> undef, float %n, i32 0
  %splat1 = insertelement <2 x float> %splat0, float %n, i32 1
  %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %splat1, <2 x float> %a)
  ret <2 x float> %fused
}

define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfmaq_n_f32:
; CHECK-NEXT: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  %splat0 = insertelement <4 x float> undef, float %n, i32 0
  %splat1 = insertelement <4 x float> %splat0, float %n, i32 1
  %splat2 = insertelement <4 x float> %splat1, float %n, i32 2
  %splat3 = insertelement <4 x float> %splat2, float %n, i32 3
  %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %splat3, <4 x float> %a)
  ret <4 x float> %fused
}

define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfms_n_f32:
; CHECK-NEXT: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  %splat0 = insertelement <2 x float> undef, float %n, i32 0
  %splat1 = insertelement <2 x float> %splat0, float %n, i32 1
  ; Negate %b (written as a subtraction from -0.0) before the fma call.
  %neg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
  %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %neg, <2 x float> %splat1, <2 x float> %a)
  ret <2 x float> %fused
}

define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfmsq_n_f32:
; CHECK-NEXT: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  %splat0 = insertelement <4 x float> undef, float %n, i32 0
  %splat1 = insertelement <4 x float> %splat0, float %n, i32 1
  %splat2 = insertelement <4 x float> %splat1, float %n, i32 2
  %splat3 = insertelement <4 x float> %splat2, float %n, i32 3
  ; Negate %b (written as a subtraction from -0.0) before the fma call.
  %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
  %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %neg, <4 x float> %splat3, <4 x float> %a)
  ret <4 x float> %fused
}

; Attribute group shared by every test function above.
attributes #0 = { nounwind }

; Intrinsics referenced by the tests.
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)