; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

; Codegen tests for predicated multiply-accumulate reductions with +mve.
; Every function below has the same shape: build a lane mask with
; `icmp eq %b, zeroinitializer`, multiply %x by %y (optionally after sign- or
; zero-extending the inputs), replace the unselected lanes with zero through
; `select`, and sum the lanes with llvm.vector.reduce.add.
; The expected assembly that follows each `define` is generated output;
; regenerate it with the script named on the first line rather than editing it
; by hand.

; v4i32 inputs, i32 result, no extension: expected output is a vpt compare
; followed by a single predicated vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <4 x i32> %b, zeroinitializer
  %m = mul <4 x i32> %x, %y
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; v4i32 inputs zero-extended to v4i64, i64 result: expected output is the
; long form vmlalvt.u32 writing the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <4 x i32> %b, zeroinitializer
  %xx = zext <4 x i32> %x to <4 x i64>
  %yy = zext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
  ret i64 %z
}

; Sign-extending variant of the previous test: expected output uses
; vmlalvt.s32.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <4 x i32> %b, zeroinitializer
  %xx = sext <4 x i32> %x to <4 x i64>
  %yy = sext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
  ret i64 %z
}

; v2i32 inputs zero-extended to v2i64: no single reduction instruction is
; expected. The products come from vmullb.u32, the lane mask is rebuilt through
; scalar registers, and the two 64-bit lanes are summed with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmullb.u32 q3, q0, q1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov.32 q0[1], r0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vmov.32 q0[3], r0
; CHECK-NEXT: vand q0, q3, q0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <2 x i32> %b, zeroinitializer
  %xx = zext <2 x i32> %x to <2 x i64>
  %yy = zext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; Sign-extending variant of the previous test: same expected sequence with
; vmullb.s32 for the products.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmullb.s32 q3, q0, q1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov.32 q0[1], r0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vmov.32 q0[3], r0
; CHECK-NEXT: vand q0, q3, q0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <2 x i32> %b, zeroinitializer
  %xx = sext <2 x i32> %x to <2 x i64>
  %yy = sext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; v8i16 inputs zero-extended to v8i32, i32 result: expected output is a
; 16-bit vpt compare and a predicated vmlavt.u16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.u16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  ret i32 %z
}

; Sign-extending variant of the previous test: expected output uses
; vmlavt.s16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.s16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  ret i32 %z
}

; v4i16 inputs zero-extended to v4i32: the expected output first widens all
; three operands with vmovlb.u16 and then uses the 32-bit vpt/vmlavt.u32 pair.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmovlb.u16 q2, q2
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <4 x i16> %b, zeroinitializer
  %xx = zext <4 x i16> %x to <4 x i32>
  %yy = zext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; Sign-extending variant of the previous test: %x and %y are widened with
; vmovlb.s16 while the mask operand is still widened with vmovlb.u16.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmovlb.u16 q2, q2
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <4 x i16> %b, zeroinitializer
  %xx = sext <4 x i16> %x to <4 x i32>
  %yy = sext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; v8i16 inputs, i16 result, no extension: expected output is vmlavt.u16
; followed by uxth for the zeroext return value.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %m = mul <8 x i16> %x, %y
  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
  ret i16 %z
}

; v8i16 inputs zero-extended directly to v8i64, i64 result: expected output is
; the long form vmlalvt.u16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i64>
  %yy = zext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; Sign-extending variant of the previous test: expected output uses
; vmlalvt.s16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i64>
  %yy = sext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; Two-step extension: i16 -> i32 before the multiply, then the product is
; zero-extended to i64. The expected output is still a single vmlalvt.u16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = zext <8 x i32> %m to <8 x i64>
  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; Two-step extension with sign extension at both steps: expected output is a
; single vmlalvt.s16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = sext <8 x i32> %m to <8 x i64>
  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; Mixed case: %x is sign-extended and multiplied by itself (%y is unused), and
; the product is then zero-extended. Expected output is vmlalvt.s16 with q0 as
; both multiplicands.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q0
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i32>
  %m = mul <8 x i32> %xx, %xx
  %ma = zext <8 x i32> %m to <8 x i64>
  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; v2i16 inputs zero-extended to v2i64: the expected output masks the operands
; with 0xffff, forms each product with scalar umull, rebuilds the lane mask
; through scalar registers and sums the two lanes with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.i64 q3, #0xffff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q4, q0, q3
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r1, s16
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov.32 q0[1], r1
; CHECK-NEXT: vmov r1, s18
; CHECK-NEXT: vand q1, q2, q3
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov.32 q0[3], r1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov.32 q2[1], r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r0
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <2 x i16> %b, zeroinitializer
  %xx = zext <2 x i16> %x to <2 x i64>
  %yy = zext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; Sign-extending variant of the previous test: the expected output extends each
; element with sxth and forms the products with scalar smull.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q3, #0xffff
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vand q3, q2, q3
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: sxth r1, r1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov.32 q2[1], r0
; CHECK-NEXT: vmov r0, s14
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov.32 q3[0], r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov.32 q3[1], r1
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: sxth r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov.32 q3[2], r0
; CHECK-NEXT: vmov.32 q3[3], r1
; CHECK-NEXT: vand q0, q3, q2
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <2 x i16> %b, zeroinitializer
  %xx = sext <2 x i16> %x to <2 x i64>
  %yy = sext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; v16i8 inputs zero-extended to v16i32, i32 result: expected output is an
; 8-bit vpt compare and a predicated vmlavt.u8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = zext <16 x i8> %x to <16 x i32>
  %yy = zext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; Sign-extending variant of the previous test: expected output uses
; vmlavt.s8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i32>
  %yy = sext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; Two-step extension: i8 -> i16 before the multiply, then the product is
; zero-extended to i32. The expected output is still a single vmlavt.u8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %ma = zext <16 x i16> %m to <16 x i32>
  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; Two-step extension with sign extension at both steps: expected output is a
; single vmlavt.s8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %ma = sext <16 x i16> %m to <16 x i32>
  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; Mixed case: %x is sign-extended and multiplied by itself (%y is unused), and
; the product is then zero-extended. Expected output is vmlavt.s8 with q0 as
; both multiplicands.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.s8 r0, q0, q0
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i16>
  %m = mul <16 x i16> %xx, %xx
  %ma = zext <16 x i16> %m to <16 x i32>
  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; v4i8 inputs zero-extended to v4i32: the expected output masks all three
; operands with 0xff and then uses the 32-bit vpt/vmlavt.u32 pair.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <4 x i8> %b, zeroinitializer
  %xx = zext <4 x i8> %x to <4 x i32>
  %yy = zext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; Sign-extending variant of the previous test: %x and %y are widened with
; vmovlb.s8 then vmovlb.s16, while the mask operand is still masked with 0xff.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <4 x i8> %b, zeroinitializer
  %xx = sext <4 x i8> %x to <4 x i32>
  %yy = sext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; v16i8 inputs zero-extended to v16i16, zeroext i16 result: expected output is
; vmlavt.u8 followed by uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.u8 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
  ret i16 %z
}

; Sign-extending variant of the previous test with a signext i16 result:
; expected output is vmlavt.s8 followed by sxth.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.s8 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
  ret i16 %z
}

; v8i8 inputs zero-extended to v8i16: the expected output widens all three
; operands with vmovlb.u8, then uses vpt.i16/vmlavt.u16 and uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i8> %b, zeroinitializer
  %xx = zext <8 x i8> %x to <8 x i16>
  %yy = zext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
  ret i16 %z
}

; Sign-extending variant of the previous test: %x and %y are widened with
; vmovlb.s8, the mask operand with vmovlb.u8, and the result is finished with
; sxth for the signext return value.
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.u16 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <8 x i8> %b, zeroinitializer
  %xx = sext <8 x i8> %x to <8 x i16>
  %yy = sext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
  ret i16 %z
}

; v16i8 inputs, zeroext i8 result, no extension: expected output is vmlavt.u8
; followed by uxtb.
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.u8 r0, q0, q1
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %m = mul <16 x i8> %x, %y
  %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
  ret i8 %z
}

; v16i8 inputs zero-extended directly to v16i64: no single reduction
; instruction is expected. The generated sequence below spills to the stack,
; expands the predicate step by step (vpsel/vcmp, then vmrs of p0 and ubfx),
; forms the products two lanes at a time with scalar umull, and accumulates the
; 64-bit sum in a register pair with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #88
; CHECK-NEXT: sub sp, #88
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vpsel q5, q1, q0
; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vmov.u8 r0, q5[0]
; CHECK-NEXT: vmov.i64 q4, #0xff
; CHECK-NEXT: vmov.16 q2[0], r0
; CHECK-NEXT: vmov.u8 r0, q5[1]
; CHECK-NEXT: vmov.16 q2[1], r0
; CHECK-NEXT: vmov.u8 r0, q5[2]
; CHECK-NEXT: vmov.16 q2[2], r0
; CHECK-NEXT: vmov.u8 r0, q5[3]
; CHECK-NEXT: vmov.16 q2[3], r0
; CHECK-NEXT: vmov.u8 r0, q5[4]
; CHECK-NEXT: vmov.16 q2[4], r0
; CHECK-NEXT: vmov.u8 r0, q5[5]
; CHECK-NEXT: vmov.16 q2[5], r0
; CHECK-NEXT: vmov.u8 r0, q5[6]
; CHECK-NEXT: vmov.16 q2[6], r0
; CHECK-NEXT: vmov.u8 r0, q5[7]
; CHECK-NEXT: vmov.16 q2[7], r0
; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q6, q1, q0
; CHECK-NEXT: vmov.u16 r0, q6[0]
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov.u16 r0, q6[1]
; CHECK-NEXT: vmov.32 q2[1], r0
; CHECK-NEXT: vmov.u16 r0, q6[2]
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.u16 r0, q6[3]
; CHECK-NEXT: vmov.32 q2[3], r0
; CHECK-NEXT: vcmp.i32 ne, q2, zr
; CHECK-NEXT: vmrs r0, p0
; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vmov.u8 r2, q1[0]
; CHECK-NEXT: and r1, r0, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov.32 q7[0], r1
; CHECK-NEXT: vmov.32 q7[1], r1
; CHECK-NEXT: ubfx r1, r0, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov.32 q7[2], r1
; CHECK-NEXT: vmov.32 q7[3], r1
; CHECK-NEXT: vmov.u8 r1, q3[0]
; CHECK-NEXT: vmov.32 q0[0], r1
; CHECK-NEXT: vmov.u8 r1, q3[1]
; CHECK-NEXT: vmov.32 q0[2], r1
; CHECK-NEXT: vand q2, q0, q4
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[1]
; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: vmov.32 q0[2], r2
; CHECK-NEXT: vand q1, q0, q4
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: umull r1, r2, r2, r1
; CHECK-NEXT: vmov.32 q0[0], r1
; CHECK-NEXT: vmov r1, s10
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: umull r1, r2, r2, r1
; CHECK-NEXT: vmov.32 q0[2], r1
; CHECK-NEXT: vmov.32 q0[3], r2
; CHECK-NEXT: vand q0, q0, q7
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: adds r1, r1, r2
; CHECK-NEXT: adc.w r2, r3, r12
; CHECK-NEXT: ubfx r3, r0, #8, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: ubfx r0, r0, #12, #1
; CHECK-NEXT: vmov.32 q7[0], r3
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: vmov.32 q7[1], r3
; CHECK-NEXT: vmov.u8 r3, q2[2]
; CHECK-NEXT: vmov.32 q7[2], r0
; CHECK-NEXT: vmov.32 q1[0], r3
; CHECK-NEXT: vmov.32 q7[3], r0
; CHECK-NEXT: vmov.u8 r0, q3[2]
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov.u8 r0, q3[3]
; CHECK-NEXT: vmov.u8 r3, q2[3]
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vmov.32 q1[2], r3
; CHECK-NEXT: vand q0, q0, q4
; CHECK-NEXT: vand q1, q1, q4
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: umull r0, r3, r3, r0
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: umull r0, r3, r3, r0
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vand q0, q2, q7
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: adds r1, r1, r3
; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: adcs r2, r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: adds.w r12, r1, r0
; CHECK-NEXT: adc.w r1, r2, r3
; CHECK-NEXT: vmov.u16 r2, q6[4]
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.u16 r2, q6[5]
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov.u16 r2, q6[6]
; CHECK-NEXT: vmov.32 q0[2], r2
; CHECK-NEXT: vmov.u16 r2, q6[7]
; CHECK-NEXT: vmov.32 q0[3], r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vmrs lr, p0
; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill
; CHECK-NEXT: and r3, lr, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q6[0], r3
; CHECK-NEXT: vmov.32 q6[1], r3
; CHECK-NEXT: ubfx r3, lr, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q6[2], r3
; CHECK-NEXT: vmov.32 q6[3], r3
; CHECK-NEXT: vmov.u8 r3, q3[4]
; CHECK-NEXT: vmov.32 q0[0], r3
; CHECK-NEXT: vmov.u8 r3, q3[5]
; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vmov.32 q0[2], r3
; CHECK-NEXT: vand q0, q0, q4
; CHECK-NEXT: vmov.u8 r0, q3[4]
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: vmov.u8 r0, q3[5]
; CHECK-NEXT: vmov.32 q1[2], r0
; CHECK-NEXT: vmov q7, q3
; CHECK-NEXT: vand q1, q1, q4
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: umull r0, r3, r0, r3
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: umull r0, r3, r3, r0
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vand q0, q2, q6
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: adds.w r3, r3, r12
; CHECK-NEXT: adcs r1, r0
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: adds r3, r3, r2
; CHECK-NEXT: vmov.u8 r2, q3[6]
; CHECK-NEXT: adcs r1, r0
; CHECK-NEXT: ubfx r0, lr, #8, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: vmov.32 q6[0], r0
; CHECK-NEXT: vmov.32 q6[1], r0
; CHECK-NEXT: ubfx r0, lr, #12, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: vmov.32 q6[2], r0
; CHECK-NEXT: vmov.32 q6[3], r0
; CHECK-NEXT: vmov.u8 r0, q1[6]
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[7]
; CHECK-NEXT: vmov.32 q1[0], r2
; CHECK-NEXT: vmov.u8 r2, q3[7]
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vmov.32 q1[2], r2
; CHECK-NEXT: vand q0, q0, q4
; CHECK-NEXT: vand q1, q1, q4
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: umull r0, r2, r2, r0
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.32 q2[1], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: umull r0, r2, r2, r0
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vand q0, q2, q6
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: adcs r1, r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: adds.w r12, r2, r0
; CHECK-NEXT: vmov.u8 r2, q5[8]
; CHECK-NEXT: vmov.16 q6[0], r2
; CHECK-NEXT: vmov.u8 r2, q5[9]
; CHECK-NEXT: vmov.16 q6[1], r2
; CHECK-NEXT: vmov.u8 r2, q5[10]
; CHECK-NEXT: vmov.16 q6[2], r2
; CHECK-NEXT: vmov.u8 r2, q5[11]
; CHECK-NEXT: vmov.16 q6[3], r2
; CHECK-NEXT: vmov.u8 r2, q5[12]
; CHECK-NEXT: vmov.16 q6[4], r2
; CHECK-NEXT: vmov.u8 r2, q5[13]
; CHECK-NEXT: vmov.16 q6[5], r2
; CHECK-NEXT: vmov.u8 r2, q5[14]
; CHECK-NEXT: vmov.16 q6[6], r2
; CHECK-NEXT: vmov.u8 r2, q5[15]
; CHECK-NEXT: vmov.16 q6[7], r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vcmp.i16 ne, q6, zr
; CHECK-NEXT: vmov.u8 r0, q7[8]
; CHECK-NEXT: vpsel q3, q1, q0
; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: vmov.u16 r2, q3[0]
; CHECK-NEXT: vmov.u8 r0, q7[9]
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.u16 r2, q3[1]
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov.u16 r2, q3[2]
; CHECK-NEXT: vmov.32 q0[2], r2
; CHECK-NEXT: vmov.u16 r2, q3[3]
; CHECK-NEXT: vmov.32 q0[3], r2
; CHECK-NEXT: vmov.32 q1[2], r0
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vmrs lr, p0
; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload
; CHECK-NEXT: vand q1, q1, q5
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: and r3, lr, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.32 q4[1], r3
; CHECK-NEXT: ubfx r3, lr, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vmov.32 q4[3], r3
; CHECK-NEXT: vmov.u8 r3, q6[8]
; CHECK-NEXT: vmov.32 q0[0], r3
; CHECK-NEXT: vmov.u8 r3, q6[9]
; CHECK-NEXT: vmov.32 q0[2], r3
; CHECK-NEXT: vand q0, q0, q5
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r3, r0, r3
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: umull r0, r3, r3, r0
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vand q0, q2, q4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: adds.w r3, r3, r12
; CHECK-NEXT: adcs r1, r0
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: adds r3, r3, r2
; CHECK-NEXT: vmov.u8 r2, q7[10]
; CHECK-NEXT: vmov.32 q1[0], r2
; CHECK-NEXT: vmov.u8 r2, q7[11]
; CHECK-NEXT: vmov.32 q1[2], r2
; CHECK-NEXT: vand q1, q1, q5
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: adcs r1, r0
; CHECK-NEXT: ubfx r0, lr, #8, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: vmov.32 q4[0], r0
; CHECK-NEXT: vmov.32 q4[1], r0
; CHECK-NEXT: ubfx r0, lr, #12, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: vmov.32 q4[2], r0
; CHECK-NEXT: vmov.32 q4[3], r0
; CHECK-NEXT: vmov.u8 r0, q6[10]
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov.u8 r0, q6[11]
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vand q0, q0, q5
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: umull r0, r2, r2, r0
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.32 q2[1], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: umull r0, r2, r2, r0
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vand q0, q2, q4
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: adcs r1, r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: adds.w r12, r2, r0
; CHECK-NEXT: vmov.u16 r2, q3[4]
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.u16 r2, q3[5]
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov.u16 r2, q3[6]
; CHECK-NEXT: vmov.32 q0[2], r2
; CHECK-NEXT: vmov.u16 r2, q3[7]
; CHECK-NEXT: vmov.32 q0[3], r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vmov.u8 r0, q7[12]
; CHECK-NEXT: vmrs lr, p0
; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: vmov.u8 r0, q7[13]
; CHECK-NEXT: vmov.32 q1[2], r0
; CHECK-NEXT: vand q1, q1, q5
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: and r3, lr, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q3[0], r3
; CHECK-NEXT: vmov.32 q3[1], r3
; CHECK-NEXT: ubfx r3, lr, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q3[2], r3
; CHECK-NEXT: vmov.32 q3[3], r3
; CHECK-NEXT: vmov.u8 r3, q6[12]
; CHECK-NEXT: vmov.32 q0[0], r3
; CHECK-NEXT: vmov.u8 r3, q6[13]
; CHECK-NEXT: vmov.32 q0[2], r3
; CHECK-NEXT: vand q0, q0, q5
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r3, r0, r3
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: umull r0, r3, r3, r0
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vand q0, q2, q3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: adds.w r3, r3, r12
; CHECK-NEXT: adcs r1, r0
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: adds r3, r3, r2
; CHECK-NEXT: vmov.u8 r2, q7[14]
; CHECK-NEXT: vmov.32 q1[0], r2
; CHECK-NEXT: vmov.u8 r2, q7[15]
; CHECK-NEXT: vmov.32 q1[2], r2
; CHECK-NEXT: vand q1, q1, q5
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: adcs r1, r0
; CHECK-NEXT: ubfx r0, lr, #8, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: vmov.32 q3[0], r0
; CHECK-NEXT: vmov.32 q3[1], r0
; CHECK-NEXT: ubfx r0, lr, #12, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: vmov.32 q3[2], r0
; CHECK-NEXT: vmov.32 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q6[14]
; CHECK-NEXT: vmov.32 q0[0], r0
; CHECK-NEXT: vmov.u8 r0, q6[15]
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vand q0, q0, q5
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: umull r0, r2, r2, r0
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.32 q2[1], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: umull r0, r2, r2, r0
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vand q0, q2, q3
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: adcs r1, r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: add sp, #88
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r7, pc}
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = zext <16 x i8> %x to <16 x i64>
  %yy = zext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: vmov.u8 r2, q0[0]
; CHECK-NEXT: vpsel q4, q3, q2
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: vmov.u8 r0, q4[0]
; CHECK-NEXT: vmov.16 q5[0], r0
; CHECK-NEXT: vmov.u8 r0, q4[1]
; CHECK-NEXT: vmov.16 q5[1], r0
; CHECK-NEXT: vmov.u8 r0, q4[2]
; CHECK-NEXT: vmov.16 q5[2], r0
; CHECK-NEXT: vmov.u8 r0, q4[3]
; CHECK-NEXT: vmov.16 q5[3], r0
; CHECK-NEXT: vmov.u8 r0, q4[4]
; CHECK-NEXT: vmov.16 q5[4], r0
; CHECK-NEXT: vmov.u8 r0, q4[5]
; CHECK-NEXT: vmov.16 q5[5], r0
; CHECK-NEXT: vmov.u8 r0, q4[6]
; CHECK-NEXT: vmov.16 q5[6], r0
; CHECK-NEXT: vmov.u8 r0, q4[7]
; CHECK-NEXT: vmov.16 q5[7], r0
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vpsel q5, q3, q2
; CHECK-NEXT: vmov.u16 r0, q5[0]
; CHECK-NEXT: vmov.32 q6[0], r0
; CHECK-NEXT: vmov.u16 r0, q5[1]
; CHECK-NEXT: vmov.32 q6[1], r0
; CHECK-NEXT: vmov.u16 r0, q5[2]
; CHECK-NEXT: vmov.32 q6[2], r0
; CHECK-NEXT: vmov.u16 r0, q5[3]
; CHECK-NEXT: vmov.32 q6[3], r0
; CHECK-NEXT: vcmp.i32 ne, q6, zr
; CHECK-NEXT: vmrs r0, p0
; CHECK-NEXT: and r1, r0, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov.32 q6[0], r1
; CHECK-NEXT: vmov.32 q6[1], r1
; CHECK-NEXT: ubfx r1, r0, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov.32 q6[2], r1
; CHECK-NEXT: vmov.32 q6[3], r1
; CHECK-NEXT: vmov.u8 r1, q1[0]
; CHECK-NEXT: sxtb r1, r1
; CHECK-NEXT: smull r1, r2, r2, r1
; CHECK-NEXT:
vmov.32 q7[0], r1 1068; CHECK-NEXT: vmov.u8 r1, q1[1] 1069; CHECK-NEXT: vmov.32 q7[1], r2 1070; CHECK-NEXT: vmov.u8 r2, q0[1] 1071; CHECK-NEXT: sxtb r1, r1 1072; CHECK-NEXT: sxtb r2, r2 1073; CHECK-NEXT: smull r1, r2, r2, r1 1074; CHECK-NEXT: vmov.32 q7[2], r1 1075; CHECK-NEXT: vmov.32 q7[3], r2 1076; CHECK-NEXT: vand q6, q7, q6 1077; CHECK-NEXT: vmov r1, s26 1078; CHECK-NEXT: vmov r2, s24 1079; CHECK-NEXT: vmov r12, s27 1080; CHECK-NEXT: vmov r3, s25 1081; CHECK-NEXT: adds r1, r1, r2 1082; CHECK-NEXT: adc.w r2, r3, r12 1083; CHECK-NEXT: ubfx r3, r0, #8, #1 1084; CHECK-NEXT: rsbs r3, r3, #0 1085; CHECK-NEXT: ubfx r0, r0, #12, #1 1086; CHECK-NEXT: vmov.32 q6[0], r3 1087; CHECK-NEXT: rsbs r0, r0, #0 1088; CHECK-NEXT: vmov.32 q6[1], r3 1089; CHECK-NEXT: vmov.u8 r3, q0[2] 1090; CHECK-NEXT: vmov.32 q6[2], r0 1091; CHECK-NEXT: sxtb r3, r3 1092; CHECK-NEXT: vmov.32 q6[3], r0 1093; CHECK-NEXT: vmov.u8 r0, q1[2] 1094; CHECK-NEXT: sxtb r0, r0 1095; CHECK-NEXT: smull r0, r3, r3, r0 1096; CHECK-NEXT: vmov.32 q7[0], r0 1097; CHECK-NEXT: vmov.u8 r0, q1[3] 1098; CHECK-NEXT: vmov.32 q7[1], r3 1099; CHECK-NEXT: vmov.u8 r3, q0[3] 1100; CHECK-NEXT: sxtb r0, r0 1101; CHECK-NEXT: sxtb r3, r3 1102; CHECK-NEXT: smull r0, r3, r3, r0 1103; CHECK-NEXT: vmov.32 q7[2], r0 1104; CHECK-NEXT: vmov.32 q7[3], r3 1105; CHECK-NEXT: vand q6, q7, q6 1106; CHECK-NEXT: vmov r3, s24 1107; CHECK-NEXT: vmov r0, s25 1108; CHECK-NEXT: adds r1, r1, r3 1109; CHECK-NEXT: vmov r3, s27 1110; CHECK-NEXT: adcs r2, r0 1111; CHECK-NEXT: vmov r0, s26 1112; CHECK-NEXT: adds.w r12, r1, r0 1113; CHECK-NEXT: vmov.u8 r0, q0[4] 1114; CHECK-NEXT: adc.w r1, r2, r3 1115; CHECK-NEXT: vmov.u16 r2, q5[4] 1116; CHECK-NEXT: vmov.32 q6[0], r2 1117; CHECK-NEXT: vmov.u16 r2, q5[5] 1118; CHECK-NEXT: vmov.32 q6[1], r2 1119; CHECK-NEXT: vmov.u16 r2, q5[6] 1120; CHECK-NEXT: vmov.32 q6[2], r2 1121; CHECK-NEXT: vmov.u16 r2, q5[7] 1122; CHECK-NEXT: vmov.32 q6[3], r2 1123; CHECK-NEXT: sxtb r0, r0 1124; CHECK-NEXT: vcmp.i32 ne, q6, zr 1125; 
CHECK-NEXT: vmrs r2, p0 1126; CHECK-NEXT: and r3, r2, #1 1127; CHECK-NEXT: rsbs r3, r3, #0 1128; CHECK-NEXT: vmov.32 q5[0], r3 1129; CHECK-NEXT: vmov.32 q5[1], r3 1130; CHECK-NEXT: ubfx r3, r2, #4, #1 1131; CHECK-NEXT: rsbs r3, r3, #0 1132; CHECK-NEXT: vmov.32 q5[2], r3 1133; CHECK-NEXT: vmov.32 q5[3], r3 1134; CHECK-NEXT: vmov.u8 r3, q1[4] 1135; CHECK-NEXT: sxtb r3, r3 1136; CHECK-NEXT: smull r0, r3, r0, r3 1137; CHECK-NEXT: vmov.32 q6[0], r0 1138; CHECK-NEXT: vmov.u8 r0, q1[5] 1139; CHECK-NEXT: vmov.32 q6[1], r3 1140; CHECK-NEXT: vmov.u8 r3, q0[5] 1141; CHECK-NEXT: sxtb r0, r0 1142; CHECK-NEXT: sxtb r3, r3 1143; CHECK-NEXT: smull r0, r3, r3, r0 1144; CHECK-NEXT: vmov.32 q6[2], r0 1145; CHECK-NEXT: vmov.32 q6[3], r3 1146; CHECK-NEXT: vand q5, q6, q5 1147; CHECK-NEXT: vmov r3, s20 1148; CHECK-NEXT: vmov r0, s21 1149; CHECK-NEXT: adds.w r3, r3, r12 1150; CHECK-NEXT: adc.w r12, r1, r0 1151; CHECK-NEXT: vmov r1, s22 1152; CHECK-NEXT: vmov r0, s23 1153; CHECK-NEXT: adds r3, r3, r1 1154; CHECK-NEXT: adc.w r1, r12, r0 1155; CHECK-NEXT: ubfx r0, r2, #8, #1 1156; CHECK-NEXT: rsbs r0, r0, #0 1157; CHECK-NEXT: vmov.32 q5[0], r0 1158; CHECK-NEXT: vmov.32 q5[1], r0 1159; CHECK-NEXT: ubfx r0, r2, #12, #1 1160; CHECK-NEXT: rsbs r0, r0, #0 1161; CHECK-NEXT: vmov.u8 r2, q0[6] 1162; CHECK-NEXT: vmov.32 q5[2], r0 1163; CHECK-NEXT: sxtb r2, r2 1164; CHECK-NEXT: vmov.32 q5[3], r0 1165; CHECK-NEXT: vmov.u8 r0, q1[6] 1166; CHECK-NEXT: sxtb r0, r0 1167; CHECK-NEXT: smull r0, r2, r2, r0 1168; CHECK-NEXT: vmov.32 q6[0], r0 1169; CHECK-NEXT: vmov.u8 r0, q1[7] 1170; CHECK-NEXT: vmov.32 q6[1], r2 1171; CHECK-NEXT: vmov.u8 r2, q0[7] 1172; CHECK-NEXT: sxtb r0, r0 1173; CHECK-NEXT: sxtb r2, r2 1174; CHECK-NEXT: smull r0, r2, r2, r0 1175; CHECK-NEXT: vmov.32 q6[2], r0 1176; CHECK-NEXT: vmov.32 q6[3], r2 1177; CHECK-NEXT: vand q5, q6, q5 1178; CHECK-NEXT: vmov r2, s20 1179; CHECK-NEXT: vmov r0, s21 1180; CHECK-NEXT: adds r2, r2, r3 1181; CHECK-NEXT: vmov r3, s23 1182; CHECK-NEXT: adcs r1, r0 1183; 
CHECK-NEXT: vmov r0, s22 1184; CHECK-NEXT: adds.w r12, r2, r0 1185; CHECK-NEXT: vmov.u8 r2, q4[8] 1186; CHECK-NEXT: vmov.16 q5[0], r2 1187; CHECK-NEXT: vmov.u8 r2, q4[9] 1188; CHECK-NEXT: vmov.16 q5[1], r2 1189; CHECK-NEXT: vmov.u8 r2, q4[10] 1190; CHECK-NEXT: vmov.16 q5[2], r2 1191; CHECK-NEXT: vmov.u8 r2, q4[11] 1192; CHECK-NEXT: vmov.16 q5[3], r2 1193; CHECK-NEXT: vmov.u8 r2, q4[12] 1194; CHECK-NEXT: vmov.16 q5[4], r2 1195; CHECK-NEXT: vmov.u8 r2, q4[13] 1196; CHECK-NEXT: vmov.16 q5[5], r2 1197; CHECK-NEXT: vmov.u8 r2, q4[14] 1198; CHECK-NEXT: vmov.16 q5[6], r2 1199; CHECK-NEXT: vmov.u8 r2, q4[15] 1200; CHECK-NEXT: vmov.16 q5[7], r2 1201; CHECK-NEXT: adcs r1, r3 1202; CHECK-NEXT: vcmp.i16 ne, q5, zr 1203; CHECK-NEXT: vmov.u8 r0, q0[8] 1204; CHECK-NEXT: vpsel q2, q3, q2 1205; CHECK-NEXT: sxtb r0, r0 1206; CHECK-NEXT: vmov.u16 r2, q2[0] 1207; CHECK-NEXT: vmov.32 q3[0], r2 1208; CHECK-NEXT: vmov.u16 r2, q2[1] 1209; CHECK-NEXT: vmov.32 q3[1], r2 1210; CHECK-NEXT: vmov.u16 r2, q2[2] 1211; CHECK-NEXT: vmov.32 q3[2], r2 1212; CHECK-NEXT: vmov.u16 r2, q2[3] 1213; CHECK-NEXT: vmov.32 q3[3], r2 1214; CHECK-NEXT: vcmp.i32 ne, q3, zr 1215; CHECK-NEXT: vmrs r2, p0 1216; CHECK-NEXT: and r3, r2, #1 1217; CHECK-NEXT: rsbs r3, r3, #0 1218; CHECK-NEXT: vmov.32 q3[0], r3 1219; CHECK-NEXT: vmov.32 q3[1], r3 1220; CHECK-NEXT: ubfx r3, r2, #4, #1 1221; CHECK-NEXT: rsbs r3, r3, #0 1222; CHECK-NEXT: vmov.32 q3[2], r3 1223; CHECK-NEXT: vmov.32 q3[3], r3 1224; CHECK-NEXT: vmov.u8 r3, q1[8] 1225; CHECK-NEXT: sxtb r3, r3 1226; CHECK-NEXT: smull r0, r3, r0, r3 1227; CHECK-NEXT: vmov.32 q4[0], r0 1228; CHECK-NEXT: vmov.u8 r0, q1[9] 1229; CHECK-NEXT: vmov.32 q4[1], r3 1230; CHECK-NEXT: vmov.u8 r3, q0[9] 1231; CHECK-NEXT: sxtb r0, r0 1232; CHECK-NEXT: sxtb r3, r3 1233; CHECK-NEXT: smull r0, r3, r3, r0 1234; CHECK-NEXT: vmov.32 q4[2], r0 1235; CHECK-NEXT: vmov.32 q4[3], r3 1236; CHECK-NEXT: vand q3, q4, q3 1237; CHECK-NEXT: vmov r3, s12 1238; CHECK-NEXT: vmov r0, s13 1239; CHECK-NEXT: adds.w 
r3, r3, r12 1240; CHECK-NEXT: adc.w r12, r1, r0 1241; CHECK-NEXT: vmov r1, s14 1242; CHECK-NEXT: vmov r0, s15 1243; CHECK-NEXT: adds r3, r3, r1 1244; CHECK-NEXT: adc.w r1, r12, r0 1245; CHECK-NEXT: ubfx r0, r2, #8, #1 1246; CHECK-NEXT: rsbs r0, r0, #0 1247; CHECK-NEXT: vmov.32 q3[0], r0 1248; CHECK-NEXT: vmov.32 q3[1], r0 1249; CHECK-NEXT: ubfx r0, r2, #12, #1 1250; CHECK-NEXT: rsbs r0, r0, #0 1251; CHECK-NEXT: vmov.u8 r2, q0[10] 1252; CHECK-NEXT: vmov.32 q3[2], r0 1253; CHECK-NEXT: sxtb r2, r2 1254; CHECK-NEXT: vmov.32 q3[3], r0 1255; CHECK-NEXT: vmov.u8 r0, q1[10] 1256; CHECK-NEXT: sxtb r0, r0 1257; CHECK-NEXT: smull r0, r2, r2, r0 1258; CHECK-NEXT: vmov.32 q4[0], r0 1259; CHECK-NEXT: vmov.u8 r0, q1[11] 1260; CHECK-NEXT: vmov.32 q4[1], r2 1261; CHECK-NEXT: vmov.u8 r2, q0[11] 1262; CHECK-NEXT: sxtb r0, r0 1263; CHECK-NEXT: sxtb r2, r2 1264; CHECK-NEXT: smull r0, r2, r2, r0 1265; CHECK-NEXT: vmov.32 q4[2], r0 1266; CHECK-NEXT: vmov.32 q4[3], r2 1267; CHECK-NEXT: vand q3, q4, q3 1268; CHECK-NEXT: vmov r2, s12 1269; CHECK-NEXT: vmov r0, s13 1270; CHECK-NEXT: adds r2, r2, r3 1271; CHECK-NEXT: vmov r3, s15 1272; CHECK-NEXT: adcs r1, r0 1273; CHECK-NEXT: vmov r0, s14 1274; CHECK-NEXT: adds.w r12, r2, r0 1275; CHECK-NEXT: vmov.u16 r2, q2[4] 1276; CHECK-NEXT: vmov.32 q3[0], r2 1277; CHECK-NEXT: vmov.u16 r2, q2[5] 1278; CHECK-NEXT: vmov.32 q3[1], r2 1279; CHECK-NEXT: vmov.u16 r2, q2[6] 1280; CHECK-NEXT: vmov.32 q3[2], r2 1281; CHECK-NEXT: vmov.u16 r2, q2[7] 1282; CHECK-NEXT: vmov.32 q3[3], r2 1283; CHECK-NEXT: adcs r1, r3 1284; CHECK-NEXT: vcmp.i32 ne, q3, zr 1285; CHECK-NEXT: vmov.u8 r0, q0[12] 1286; CHECK-NEXT: vmrs r2, p0 1287; CHECK-NEXT: sxtb r0, r0 1288; CHECK-NEXT: and r3, r2, #1 1289; CHECK-NEXT: rsbs r3, r3, #0 1290; CHECK-NEXT: vmov.32 q2[0], r3 1291; CHECK-NEXT: vmov.32 q2[1], r3 1292; CHECK-NEXT: ubfx r3, r2, #4, #1 1293; CHECK-NEXT: rsbs r3, r3, #0 1294; CHECK-NEXT: vmov.32 q2[2], r3 1295; CHECK-NEXT: vmov.32 q2[3], r3 1296; CHECK-NEXT: vmov.u8 r3, q1[12] 
1297; CHECK-NEXT: sxtb r3, r3 1298; CHECK-NEXT: smull r0, r3, r0, r3 1299; CHECK-NEXT: vmov.32 q3[0], r0 1300; CHECK-NEXT: vmov.u8 r0, q1[13] 1301; CHECK-NEXT: vmov.32 q3[1], r3 1302; CHECK-NEXT: vmov.u8 r3, q0[13] 1303; CHECK-NEXT: sxtb r0, r0 1304; CHECK-NEXT: sxtb r3, r3 1305; CHECK-NEXT: smull r0, r3, r3, r0 1306; CHECK-NEXT: vmov.32 q3[2], r0 1307; CHECK-NEXT: vmov.32 q3[3], r3 1308; CHECK-NEXT: vand q2, q3, q2 1309; CHECK-NEXT: vmov r3, s8 1310; CHECK-NEXT: vmov r0, s9 1311; CHECK-NEXT: adds.w r3, r3, r12 1312; CHECK-NEXT: adc.w r12, r1, r0 1313; CHECK-NEXT: vmov r1, s10 1314; CHECK-NEXT: vmov r0, s11 1315; CHECK-NEXT: adds r3, r3, r1 1316; CHECK-NEXT: adc.w r1, r12, r0 1317; CHECK-NEXT: ubfx r0, r2, #8, #1 1318; CHECK-NEXT: rsbs r0, r0, #0 1319; CHECK-NEXT: vmov.32 q2[0], r0 1320; CHECK-NEXT: vmov.32 q2[1], r0 1321; CHECK-NEXT: ubfx r0, r2, #12, #1 1322; CHECK-NEXT: rsbs r0, r0, #0 1323; CHECK-NEXT: vmov.u8 r2, q0[14] 1324; CHECK-NEXT: vmov.32 q2[2], r0 1325; CHECK-NEXT: sxtb r2, r2 1326; CHECK-NEXT: vmov.32 q2[3], r0 1327; CHECK-NEXT: vmov.u8 r0, q1[14] 1328; CHECK-NEXT: sxtb r0, r0 1329; CHECK-NEXT: smull r0, r2, r2, r0 1330; CHECK-NEXT: vmov.32 q3[0], r0 1331; CHECK-NEXT: vmov.u8 r0, q1[15] 1332; CHECK-NEXT: vmov.32 q3[1], r2 1333; CHECK-NEXT: vmov.u8 r2, q0[15] 1334; CHECK-NEXT: sxtb r0, r0 1335; CHECK-NEXT: sxtb r2, r2 1336; CHECK-NEXT: smull r0, r2, r2, r0 1337; CHECK-NEXT: vmov.32 q3[2], r0 1338; CHECK-NEXT: vmov.32 q3[3], r2 1339; CHECK-NEXT: vand q0, q3, q2 1340; CHECK-NEXT: vmov r2, s0 1341; CHECK-NEXT: vmov r0, s1 1342; CHECK-NEXT: adds r2, r2, r3 1343; CHECK-NEXT: vmov r3, s3 1344; CHECK-NEXT: adcs r1, r0 1345; CHECK-NEXT: vmov r0, s2 1346; CHECK-NEXT: adds r0, r0, r2 1347; CHECK-NEXT: adcs r1, r3 1348; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1349; CHECK-NEXT: bx lr 1350entry: 1351 %c = icmp eq <16 x i8> %b, zeroinitializer 1352 %xx = sext <16 x i8> %x to <16 x i64> 1353 %yy = sext <16 x i8> %y to <16 x i64> 1354 %m = mul <16 x 
i64> %xx, %yy
  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
  ret i64 %z
}

; Predicated multiply + add reduction: <2 x i8> inputs zero-extended to i64,
; lanes where %b is non-zero replaced by zero before the reduction.
; The expected output is scalar code: a umull per lane, a lane mask built with
; cset/csetm, a vand, then an adds/adcs pair to sum the two lanes.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov.i64 q3, #0xff
; CHECK-NEXT:    vand q1, q1, q3
; CHECK-NEXT:    vand q4, q0, q3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s16
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.32 q0[1], r1
; CHECK-NEXT:    vmov r1, s18
; CHECK-NEXT:    vand q1, q2, q3
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.32 q0[3], r1
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    vmov.32 q2[1], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q2[2], r0
; CHECK-NEXT:    vmov.32 q2[3], r0
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <2 x i8> %b, zeroinitializer
  %xx = zext <2 x i8> %x to <2 x i64>
  %yy = zext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; Sign-extending variant of the <2 x i8> -> i64 predicated multiply + add
; reduction. The expected output extends each lane with sxtb and multiplies
; with smull instead of umull.
define arm_aapcs_vfpcc i64
@add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q3, #0xff
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vand q3, q2, q3
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    sxtb r1, r1
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    vmov.32 q2[1], r0
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q2[2], r0
; CHECK-NEXT:    vmov.32 q2[3], r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.32 q3[1], r1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    sxtb r1, r1
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q3[2], r0
; CHECK-NEXT:    vmov.32 q3[3], r1
; CHECK-NEXT:    vand q0, q3, q2
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <2 x i8> %b, zeroinitializer
  %xx = sext <2 x i8> %x to <2 x i64>
  %yy = sext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; Predicated multiply + add reduction on native <2 x i64> operands (no
; extension step). The expected output expands each 64-bit multiply into one
; umull plus two mla, builds the lane mask with orrs/cset/csetm, applies it
; with vand and sums the lanes with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    umull r12, r3, r1, r0
; CHECK-NEXT:    mla r1, r1, r2, r3
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    vmov.32 q3[0], r12
; CHECK-NEXT:    mla r0, r2, r0, r1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov.32 q3[1], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    umull r2, r3, r1, r0
; CHECK-NEXT:    vmov.32 q3[2], r2
; CHECK-NEXT:    vmov r2, s7
; CHECK-NEXT:    mla r1, r1, r2, r3
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    mla r0, r2, r0, r1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmov.32 q3[3], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    orrs r0, r1
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    orrs r0, r1
; CHECK-NEXT:    cset r0, eq
; CHECK-NEXT:    tst.w r0, #1
; CHECK-NEXT:    csetm r0, ne
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov.32 q0[3], r0
; CHECK-NEXT:    vand q0, q3, q0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <2 x i64> %b, zeroinitializer
  %m = mul <2 x i64> %x, %y
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; Accumulating form: the predicated <4 x i32> multiply + add reduction is
; added to the scalar argument %a. The expected output is a vpt block holding
; a single vmlavat.u32.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i32> %b, zeroinitializer
  %m = mul <4 x i32> %x, %y
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  %r = add i32 %z, %a
  ret i32
%r
}

; Accumulating predicated reduction, <4 x i32> zero-extended to i64 and added
; to the i64 argument %a. The expected output is a vpt block holding a single
; vmlalvat.u32.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlalvat.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i32> %b, zeroinitializer
  %xx = zext <4 x i32> %x to <4 x i64>
  %yy = zext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Sign-extending variant of the accumulating <4 x i32> -> i64 reduction.
; The expected output is a vpt block holding a single vmlalvat.s32.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlalvat.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i32> %b, zeroinitializer
  %xx = sext <4 x i32> %x to <4 x i64>
  %yy = sext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Accumulating predicated reduction, <2 x i32> zero-extended to i64.
; The expected output uses vmullb.u32 for the widening multiply, builds the
; lane mask with cset/csetm, applies it with vand, then folds the two lanes
; and the accumulator with scalar adds/adc.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmullb.u32 q3, q0, q1
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    cset r2, eq
; CHECK-NEXT:    tst.w r2, #1
; CHECK-NEXT:    csetm r2, ne
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    cset r2, eq
; CHECK-NEXT:    tst.w r2, #1
; CHECK-NEXT:    csetm r2, ne
; CHECK-NEXT:    vmov.32 q0[2], r2
; CHECK-NEXT:    vmov.32 q0[3], r2
; CHECK-NEXT:    vand q0, q3, q0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r12, s3
; CHECK-NEXT:    vmov lr, s1
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, lr, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %c = icmp eq <2 x i32> %b, zeroinitializer
  %xx = zext <2 x i32> %x to <2 x i64>
  %yy = zext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Sign-extending variant of the accumulating <2 x i32> -> i64 reduction.
; The expected output matches the zero-extending test except that the
; widening multiply is vmullb.s32.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmullb.s32 q3, q0, q1
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    cset r2, eq
; CHECK-NEXT:    tst.w r2, #1
; CHECK-NEXT:    csetm r2, ne
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    cset r2, eq
; CHECK-NEXT:    tst.w r2, #1
; CHECK-NEXT:    csetm r2, ne
; CHECK-NEXT:    vmov.32 q0[2], r2
; CHECK-NEXT:    vmov.32 q0[3], r2
; CHECK-NEXT:    vand q0, q3, q0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r12, s3
; CHECK-NEXT:    vmov lr, s1
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, lr, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %c = icmp eq <2 x i32> %b, zeroinitializer
  %xx = sext <2 x i32> %x to <2 x i64>
  %yy = sext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2
x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Accumulating predicated reduction, <8 x i16> zero-extended to i32 and added
; to the i32 argument %a. The expected output is a vpt block holding a single
; vmlavat.u16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  %r = add i32 %z, %a
  ret i32 %r
}

; Sign-extending variant of the accumulating <8 x i16> -> i32 reduction.
; The expected output is a vpt block holding a single vmlavat.s16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavat.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  %r = add i32 %z, %a
  ret i32 %r
}

; Accumulating predicated reduction on <4 x i16> inputs zero-extended to i32.
; The expected output widens all three operands with vmovlb.u16 and then uses
; a vpt block holding vmlavat.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vmovlb.u16 q2, q2
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i16> %b, zeroinitializer
  %xx = zext <4 x i16> %x to <4 x i32>
  %yy = zext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  %r = add i32 %z, %a
  ret i32 %r
}

; Sign-extending variant of the accumulating <4 x i16> -> i32 reduction.
; The expected output widens %x and %y with vmovlb.s16 but the predicate
; operand with vmovlb.u16, and still uses vmlavat.u32 for the reduction.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmovlb.u16 q2, q2
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i16> %b, zeroinitializer
  %xx = sext <4 x i16> %x to <4 x i32>
  %yy = sext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  %r = add i32 %z, %a
  ret i32 %r
}

; Accumulating predicated reduction performed entirely in i16 with a
; zeroext i16 return. The expected output is a vpt block holding vmlavat.u16,
; followed by uxth on the result.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i16 %a) {
; CHECK-LABEL: add_v8i16_v8i16_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %m = mul <8 x i16> %x, %y
  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
  %r = add i16 %z, %a
  ret i16 %r
}

; Accumulating predicated reduction, <8 x i16> zero-extended directly to i64
; and added to the i64 argument %a. The expected output is a vpt block holding
; a single vmlalvat.u16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvat.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i64>
  %yy = zext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Sign-extending variant of the accumulating <8 x i16> -> i64 reduction.
; The expected output is a vpt block holding a single vmlalvat.s16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i64>
  %yy = sext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Two-step extension: inputs are zero-extended i16 -> i32, multiplied in i32,
; and the product is zero-extended to i64 before the predicated reduction.
; The expected output is a vpt block holding a single vmlalvat.u16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvat.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = zext <8 x i32> %m to <8 x i64>
  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Two-step extension, signed: inputs are sign-extended i16 -> i32, multiplied
; in i32, and the product is sign-extended to i64 before the predicated
; reduction. The expected output is a vpt block holding a single vmlalvat.s16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = sext <8 x i32> %m to <8 x i64>
  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Mixed extension: %x is sign-extended to i32 and multiplied by itself (%y is
; unused), then the product is zero-extended to i64. The expected output is a
; vpt block holding vmlalvat.s16 with q0 as both vector operands.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i32>
  %m = mul <8 x i32> %xx, %xx
  %ma = zext <8 x i32> %m to <8 x i64>
  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Accumulating predicated reduction, <2 x i16> zero-extended to i64.
; The expected output is scalar code: inputs masked with 0xffff, a umull per
; lane, a lane mask built with cset/csetm and applied with vand, then scalar
; adds/adc to fold the lanes and the accumulator.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov.i64 q3, #0xffff
; CHECK-NEXT:    vand q1, q1, q3
; CHECK-NEXT:    vand q4, q0, q3
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.32 q0[1], r3
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    vand q1, q2, q3
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q0[2], r2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov.32 q0[3], r3
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    cset r2, eq
; CHECK-NEXT:    tst.w r2, #1
; CHECK-NEXT:    csetm r2, ne
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.32 q2[1], r2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    cset r2, eq
; CHECK-NEXT:    tst.w r2, #1
; CHECK-NEXT:    csetm r2, ne
; CHECK-NEXT:    vmov.32 q2[2], r2
; CHECK-NEXT:    vmov.32 q2[3], r2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r12, s3
; CHECK-NEXT:    vmov lr, s1
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, lr, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop {r7, pc}
entry:
  %c = icmp eq <2 x i16> %b, zeroinitializer
  %xx = zext <2 x i16> %x to <2 x i64>
  %yy = zext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Sign-extending variant of the accumulating <2 x i16> -> i64 reduction.
; The expected output extends each lane with sxth and multiplies with smull;
; the mask and the final scalar adds/adc follow the zero-extending test.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q3, #0xffff
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vand q3, q2, q3
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    cset r2, eq
; CHECK-NEXT:    tst.w r2, #1
; CHECK-NEXT:    csetm r2, ne
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.32 q2[1], r2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    cset r2, eq
; CHECK-NEXT:    tst.w r2, #1
; CHECK-NEXT:    csetm r2, ne
; CHECK-NEXT:    vmov.32 q2[2], r2
; CHECK-NEXT:    vmov.32 q2[3], r2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q3[0], r2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.32 q3[1], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q3[2], r2
; CHECK-NEXT:    vmov.32 q3[3], r3
; CHECK-NEXT:    vand q0, q3, q2
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r12, s3
; CHECK-NEXT:    vmov lr, s1
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, lr, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %c = icmp eq <2 x i16> %b, zeroinitializer
  %xx = sext <2 x i16> %x to <2 x i64>
  %yy = sext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Accumulating predicated reduction, <16 x i8> zero-extended to i32 and added
; to the i32 argument %a. The expected output is a vpt block holding a single
; vmlavat.u8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = zext <16 x i8> %x to <16 x i32>
  %yy = zext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavat.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i32>
  %yy = sext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %s = select <16 x i1> %c,
<16 x i32> %m, <16 x i32> zeroinitializer 1966 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) 1967 %r = add i32 %z, %a 1968 ret i32 %r 1969} 1970 1971define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { 1972; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext: 1973; CHECK: @ %bb.0: @ %entry 1974; CHECK-NEXT: vpt.i8 eq, q2, zr 1975; CHECK-NEXT: vmlavat.u8 r0, q0, q1 1976; CHECK-NEXT: bx lr 1977entry: 1978 %c = icmp eq <16 x i8> %b, zeroinitializer 1979 %xx = zext <16 x i8> %x to <16 x i16> 1980 %yy = zext <16 x i8> %y to <16 x i16> 1981 %m = mul <16 x i16> %xx, %yy 1982 %ma = zext <16 x i16> %m to <16 x i32> 1983 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer 1984 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) 1985 %r = add i32 %z, %a 1986 ret i32 %r 1987} 1988 1989define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { 1990; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext: 1991; CHECK: @ %bb.0: @ %entry 1992; CHECK-NEXT: vpt.i8 eq, q2, zr 1993; CHECK-NEXT: vmlavat.s8 r0, q0, q1 1994; CHECK-NEXT: bx lr 1995entry: 1996 %c = icmp eq <16 x i8> %b, zeroinitializer 1997 %xx = sext <16 x i8> %x to <16 x i16> 1998 %yy = sext <16 x i8> %y to <16 x i16> 1999 %m = mul <16 x i16> %xx, %yy 2000 %ma = sext <16 x i16> %m to <16 x i32> 2001 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer 2002 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) 2003 %r = add i32 %z, %a 2004 ret i32 %r 2005} 2006 2007define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { 2008; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext: 2009; CHECK: @ %bb.0: @ %entry 2010; CHECK-NEXT: vpt.i8 eq, q2, zr 2011; CHECK-NEXT: vmlavat.s8 r0, q0, q0 2012; CHECK-NEXT: bx lr 2013entry: 2014 %c = icmp eq <16 x i8> %b, zeroinitializer 2015 %xx = sext <16 x i8> %x to <16 x i16> 
2016 %m = mul <16 x i16> %xx, %xx 2017 %ma = zext <16 x i16> %m to <16 x i32> 2018 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer 2019 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) 2020 %r = add i32 %z, %a 2021 ret i32 %r 2022} 2023 2024define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) { 2025; CHECK-LABEL: add_v4i8_v4i32_acc_zext: 2026; CHECK: @ %bb.0: @ %entry 2027; CHECK-NEXT: vmov.i32 q3, #0xff 2028; CHECK-NEXT: vand q1, q1, q3 2029; CHECK-NEXT: vand q0, q0, q3 2030; CHECK-NEXT: vand q2, q2, q3 2031; CHECK-NEXT: vpt.i32 eq, q2, zr 2032; CHECK-NEXT: vmlavat.u32 r0, q0, q1 2033; CHECK-NEXT: bx lr 2034entry: 2035 %c = icmp eq <4 x i8> %b, zeroinitializer 2036 %xx = zext <4 x i8> %x to <4 x i32> 2037 %yy = zext <4 x i8> %y to <4 x i32> 2038 %m = mul <4 x i32> %xx, %yy 2039 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer 2040 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) 2041 %r = add i32 %z, %a 2042 ret i32 %r 2043} 2044 2045define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) { 2046; CHECK-LABEL: add_v4i8_v4i32_acc_sext: 2047; CHECK: @ %bb.0: @ %entry 2048; CHECK-NEXT: vmovlb.s8 q1, q1 2049; CHECK-NEXT: vmovlb.s8 q0, q0 2050; CHECK-NEXT: vmov.i32 q3, #0xff 2051; CHECK-NEXT: vmovlb.s16 q1, q1 2052; CHECK-NEXT: vand q2, q2, q3 2053; CHECK-NEXT: vmovlb.s16 q0, q0 2054; CHECK-NEXT: vpt.i32 eq, q2, zr 2055; CHECK-NEXT: vmlavat.u32 r0, q0, q1 2056; CHECK-NEXT: bx lr 2057entry: 2058 %c = icmp eq <4 x i8> %b, zeroinitializer 2059 %xx = sext <4 x i8> %x to <4 x i32> 2060 %yy = sext <4 x i8> %y to <4 x i32> 2061 %m = mul <4 x i32> %xx, %yy 2062 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer 2063 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) 2064 %r = add i32 %z, %a 2065 ret i32 %r 2066} 2067 2068define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 
x i8> %b, i16 %a) { 2069; CHECK-LABEL: add_v16i8_v16i16_acc_zext: 2070; CHECK: @ %bb.0: @ %entry 2071; CHECK-NEXT: vpt.i8 eq, q2, zr 2072; CHECK-NEXT: vmlavat.u8 r0, q0, q1 2073; CHECK-NEXT: uxth r0, r0 2074; CHECK-NEXT: bx lr 2075entry: 2076 %c = icmp eq <16 x i8> %b, zeroinitializer 2077 %xx = zext <16 x i8> %x to <16 x i16> 2078 %yy = zext <16 x i8> %y to <16 x i16> 2079 %m = mul <16 x i16> %xx, %yy 2080 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer 2081 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) 2082 %r = add i16 %z, %a 2083 ret i16 %r 2084} 2085 2086define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) { 2087; CHECK-LABEL: add_v16i8_v16i16_acc_sext: 2088; CHECK: @ %bb.0: @ %entry 2089; CHECK-NEXT: vpt.i8 eq, q2, zr 2090; CHECK-NEXT: vmlavat.s8 r0, q0, q1 2091; CHECK-NEXT: sxth r0, r0 2092; CHECK-NEXT: bx lr 2093entry: 2094 %c = icmp eq <16 x i8> %b, zeroinitializer 2095 %xx = sext <16 x i8> %x to <16 x i16> 2096 %yy = sext <16 x i8> %y to <16 x i16> 2097 %m = mul <16 x i16> %xx, %yy 2098 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer 2099 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) 2100 %r = add i16 %z, %a 2101 ret i16 %r 2102} 2103 2104define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) { 2105; CHECK-LABEL: add_v8i8_v8i16_acc_zext: 2106; CHECK: @ %bb.0: @ %entry 2107; CHECK-NEXT: vmovlb.u8 q1, q1 2108; CHECK-NEXT: vmovlb.u8 q0, q0 2109; CHECK-NEXT: vmovlb.u8 q2, q2 2110; CHECK-NEXT: vpt.i16 eq, q2, zr 2111; CHECK-NEXT: vmlavat.u16 r0, q0, q1 2112; CHECK-NEXT: uxth r0, r0 2113; CHECK-NEXT: bx lr 2114entry: 2115 %c = icmp eq <8 x i8> %b, zeroinitializer 2116 %xx = zext <8 x i8> %x to <8 x i16> 2117 %yy = zext <8 x i8> %y to <8 x i16> 2118 %m = mul <8 x i16> %xx, %yy 2119 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer 2120 %z = call i16 
@llvm.vector.reduce.add.v8i16(<8 x i16> %s) 2121 %r = add i16 %z, %a 2122 ret i16 %r 2123} 2124 2125define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) { 2126; CHECK-LABEL: add_v8i8_v8i16_acc_sext: 2127; CHECK: @ %bb.0: @ %entry 2128; CHECK-NEXT: vmovlb.s8 q1, q1 2129; CHECK-NEXT: vmovlb.s8 q0, q0 2130; CHECK-NEXT: vmovlb.u8 q2, q2 2131; CHECK-NEXT: vpt.i16 eq, q2, zr 2132; CHECK-NEXT: vmlavat.u16 r0, q0, q1 2133; CHECK-NEXT: sxth r0, r0 2134; CHECK-NEXT: bx lr 2135entry: 2136 %c = icmp eq <8 x i8> %b, zeroinitializer 2137 %xx = sext <8 x i8> %x to <8 x i16> 2138 %yy = sext <8 x i8> %y to <8 x i16> 2139 %m = mul <8 x i16> %xx, %yy 2140 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer 2141 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) 2142 %r = add i16 %z, %a 2143 ret i16 %r 2144} 2145 2146define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i8 %a) { 2147; CHECK-LABEL: add_v16i8_v16i8_acc: 2148; CHECK: @ %bb.0: @ %entry 2149; CHECK-NEXT: vpt.i8 eq, q2, zr 2150; CHECK-NEXT: vmlavat.u8 r0, q0, q1 2151; CHECK-NEXT: uxtb r0, r0 2152; CHECK-NEXT: bx lr 2153entry: 2154 %c = icmp eq <16 x i8> %b, zeroinitializer 2155 %m = mul <16 x i8> %x, %y 2156 %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer 2157 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s) 2158 %r = add i8 %z, %a 2159 ret i8 %r 2160} 2161 2162define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { 2163; CHECK-LABEL: add_v16i8_v16i64_acc_zext: 2164; CHECK: @ %bb.0: @ %entry 2165; CHECK-NEXT: .save {r4, r5, r6, lr} 2166; CHECK-NEXT: push {r4, r5, r6, lr} 2167; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 2168; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 2169; CHECK-NEXT: .pad #80 2170; CHECK-NEXT: sub sp, #80 2171; CHECK-NEXT: vmov q3, q1 2172; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte 
Spill 2173; CHECK-NEXT: vmov.i8 q0, #0x0 2174; CHECK-NEXT: vmov.i8 q1, #0xff 2175; CHECK-NEXT: vcmp.i8 eq, q2, zr 2176; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill 2177; CHECK-NEXT: vpsel q5, q1, q0 2178; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill 2179; CHECK-NEXT: vmov.u8 r2, q5[0] 2180; CHECK-NEXT: vmov.i64 q4, #0xff 2181; CHECK-NEXT: vmov.16 q2[0], r2 2182; CHECK-NEXT: vmov.u8 r2, q5[1] 2183; CHECK-NEXT: vmov.16 q2[1], r2 2184; CHECK-NEXT: vmov.u8 r2, q5[2] 2185; CHECK-NEXT: vmov.16 q2[2], r2 2186; CHECK-NEXT: vmov.u8 r2, q5[3] 2187; CHECK-NEXT: vmov.16 q2[3], r2 2188; CHECK-NEXT: vmov.u8 r2, q5[4] 2189; CHECK-NEXT: vmov.16 q2[4], r2 2190; CHECK-NEXT: vmov.u8 r2, q5[5] 2191; CHECK-NEXT: vmov.16 q2[5], r2 2192; CHECK-NEXT: vmov.u8 r2, q5[6] 2193; CHECK-NEXT: vmov.16 q2[6], r2 2194; CHECK-NEXT: vmov.u8 r2, q5[7] 2195; CHECK-NEXT: vmov.16 q2[7], r2 2196; CHECK-NEXT: vcmp.i16 ne, q2, zr 2197; CHECK-NEXT: vpsel q6, q1, q0 2198; CHECK-NEXT: vmov.u16 r2, q6[0] 2199; CHECK-NEXT: vmov.32 q2[0], r2 2200; CHECK-NEXT: vmov.u16 r2, q6[1] 2201; CHECK-NEXT: vmov.32 q2[1], r2 2202; CHECK-NEXT: vmov.u16 r2, q6[2] 2203; CHECK-NEXT: vmov.32 q2[2], r2 2204; CHECK-NEXT: vmov.u16 r2, q6[3] 2205; CHECK-NEXT: vmov.32 q2[3], r2 2206; CHECK-NEXT: vcmp.i32 ne, q2, zr 2207; CHECK-NEXT: vmrs lr, p0 2208; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload 2209; CHECK-NEXT: and r3, lr, #1 2210; CHECK-NEXT: rsbs r3, r3, #0 2211; CHECK-NEXT: vmov.32 q7[0], r3 2212; CHECK-NEXT: vmov.32 q7[1], r3 2213; CHECK-NEXT: ubfx r3, lr, #4, #1 2214; CHECK-NEXT: rsbs r3, r3, #0 2215; CHECK-NEXT: vmov.32 q7[2], r3 2216; CHECK-NEXT: vmov.32 q7[3], r3 2217; CHECK-NEXT: vmov.u8 r3, q3[0] 2218; CHECK-NEXT: vmov.32 q0[0], r3 2219; CHECK-NEXT: vmov.u8 r3, q3[1] 2220; CHECK-NEXT: vmov.32 q0[2], r3 2221; CHECK-NEXT: vmov.u8 r3, q1[0] 2222; CHECK-NEXT: vand q2, q0, q4 2223; CHECK-NEXT: vmov.32 q0[0], r3 2224; CHECK-NEXT: vmov.u8 r3, q1[1] 2225; CHECK-NEXT: vmov r12, s8 2226; CHECK-NEXT: vmov.32 
q0[2], r3 2227; CHECK-NEXT: vand q1, q0, q4 2228; CHECK-NEXT: vmov r3, s4 2229; CHECK-NEXT: umull r3, r2, r3, r12 2230; CHECK-NEXT: vmov.32 q0[0], r3 2231; CHECK-NEXT: vmov r3, s6 2232; CHECK-NEXT: vmov.32 q0[1], r2 2233; CHECK-NEXT: vmov r2, s10 2234; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload 2235; CHECK-NEXT: umull r2, r3, r3, r2 2236; CHECK-NEXT: vmov.32 q0[2], r2 2237; CHECK-NEXT: vmov.32 q0[3], r3 2238; CHECK-NEXT: vand q0, q0, q7 2239; CHECK-NEXT: vmov r4, s2 2240; CHECK-NEXT: vmov r2, s0 2241; CHECK-NEXT: vmov r12, s3 2242; CHECK-NEXT: vmov r3, s1 2243; CHECK-NEXT: adds r4, r4, r2 2244; CHECK-NEXT: ubfx r2, lr, #8, #1 2245; CHECK-NEXT: rsb.w r2, r2, #0 2246; CHECK-NEXT: vmov.32 q7[0], r2 2247; CHECK-NEXT: adc.w r12, r12, r3 2248; CHECK-NEXT: vmov.32 q7[1], r2 2249; CHECK-NEXT: ubfx r2, lr, #12, #1 2250; CHECK-NEXT: rsbs r2, r2, #0 2251; CHECK-NEXT: vmov.u8 r3, q2[2] 2252; CHECK-NEXT: vmov.32 q7[2], r2 2253; CHECK-NEXT: vmov.32 q1[0], r3 2254; CHECK-NEXT: vmov.32 q7[3], r2 2255; CHECK-NEXT: vmov.u8 r2, q3[2] 2256; CHECK-NEXT: vmov.32 q0[0], r2 2257; CHECK-NEXT: vmov.u8 r2, q3[3] 2258; CHECK-NEXT: vmov.u8 r3, q2[3] 2259; CHECK-NEXT: vmov.32 q0[2], r2 2260; CHECK-NEXT: vmov.32 q1[2], r3 2261; CHECK-NEXT: vand q0, q0, q4 2262; CHECK-NEXT: vand q1, q1, q4 2263; CHECK-NEXT: vmov r2, s0 2264; CHECK-NEXT: vmov r3, s4 2265; CHECK-NEXT: umull r2, r3, r3, r2 2266; CHECK-NEXT: vmov.32 q2[0], r2 2267; CHECK-NEXT: vmov r2, s2 2268; CHECK-NEXT: vmov.32 q2[1], r3 2269; CHECK-NEXT: vmov r3, s6 2270; CHECK-NEXT: umull r2, r3, r3, r2 2271; CHECK-NEXT: vmov.32 q2[2], r2 2272; CHECK-NEXT: vmov.32 q2[3], r3 2273; CHECK-NEXT: vand q0, q2, q7 2274; CHECK-NEXT: vmov r3, s0 2275; CHECK-NEXT: vmov r2, s1 2276; CHECK-NEXT: adds r3, r3, r4 2277; CHECK-NEXT: vmov r4, s3 2278; CHECK-NEXT: adc.w lr, r12, r2 2279; CHECK-NEXT: vmov r2, s2 2280; CHECK-NEXT: adds.w r12, r3, r2 2281; CHECK-NEXT: vmov.u16 r2, q6[4] 2282; CHECK-NEXT: vmov.32 q0[0], r2 2283; CHECK-NEXT: vmov.u16 r2, 
q6[5] 2284; CHECK-NEXT: vmov.32 q0[1], r2 2285; CHECK-NEXT: vmov.u16 r2, q6[6] 2286; CHECK-NEXT: vmov.32 q0[2], r2 2287; CHECK-NEXT: vmov.u16 r2, q6[7] 2288; CHECK-NEXT: vmov.32 q0[3], r2 2289; CHECK-NEXT: adc.w lr, lr, r4 2290; CHECK-NEXT: vcmp.i32 ne, q0, zr 2291; CHECK-NEXT: vmrs r6, p0 2292; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill 2293; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill 2294; CHECK-NEXT: and r4, r6, #1 2295; CHECK-NEXT: rsbs r4, r4, #0 2296; CHECK-NEXT: vmov.32 q6[0], r4 2297; CHECK-NEXT: vmov.32 q6[1], r4 2298; CHECK-NEXT: ubfx r4, r6, #4, #1 2299; CHECK-NEXT: rsbs r4, r4, #0 2300; CHECK-NEXT: vmov.32 q6[2], r4 2301; CHECK-NEXT: vmov.32 q6[3], r4 2302; CHECK-NEXT: vmov.u8 r4, q3[4] 2303; CHECK-NEXT: vmov.32 q0[0], r4 2304; CHECK-NEXT: vmov.u8 r4, q3[5] 2305; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload 2306; CHECK-NEXT: vmov.32 q0[2], r4 2307; CHECK-NEXT: vand q0, q0, q4 2308; CHECK-NEXT: vmov.u8 r3, q3[4] 2309; CHECK-NEXT: vmov r4, s0 2310; CHECK-NEXT: vmov.32 q1[0], r3 2311; CHECK-NEXT: vmov.u8 r3, q3[5] 2312; CHECK-NEXT: vmov.32 q1[2], r3 2313; CHECK-NEXT: vmov q7, q3 2314; CHECK-NEXT: vand q1, q1, q4 2315; CHECK-NEXT: vmov r3, s4 2316; CHECK-NEXT: umull r3, r4, r3, r4 2317; CHECK-NEXT: vmov.32 q2[0], r3 2318; CHECK-NEXT: vmov r3, s2 2319; CHECK-NEXT: vmov.32 q2[1], r4 2320; CHECK-NEXT: vmov r4, s6 2321; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload 2322; CHECK-NEXT: umull r3, r4, r4, r3 2323; CHECK-NEXT: vmov.32 q2[2], r3 2324; CHECK-NEXT: vmov.32 q2[3], r4 2325; CHECK-NEXT: vand q0, q2, q6 2326; CHECK-NEXT: vmov r4, s0 2327; CHECK-NEXT: vmov r3, s1 2328; CHECK-NEXT: vmov r5, s2 2329; CHECK-NEXT: vmov r2, s3 2330; CHECK-NEXT: adds.w r4, r4, r12 2331; CHECK-NEXT: adc.w r3, r3, lr 2332; CHECK-NEXT: adds r4, r4, r5 2333; CHECK-NEXT: adc.w r12, r3, r2 2334; CHECK-NEXT: ubfx r2, r6, #8, #1 2335; CHECK-NEXT: rsbs r2, r2, #0 2336; CHECK-NEXT: vmov.u8 r3, q3[6] 2337; CHECK-NEXT: vmov.32 q6[0], r2 2338; CHECK-NEXT: vmov.32 
q6[1], r2 2339; CHECK-NEXT: ubfx r2, r6, #12, #1 2340; CHECK-NEXT: rsbs r2, r2, #0 2341; CHECK-NEXT: vmov.32 q6[2], r2 2342; CHECK-NEXT: vmov.32 q6[3], r2 2343; CHECK-NEXT: vmov.u8 r2, q1[6] 2344; CHECK-NEXT: vmov.32 q0[0], r2 2345; CHECK-NEXT: vmov.u8 r2, q1[7] 2346; CHECK-NEXT: vmov.32 q1[0], r3 2347; CHECK-NEXT: vmov.u8 r3, q3[7] 2348; CHECK-NEXT: vmov.32 q0[2], r2 2349; CHECK-NEXT: vmov.32 q1[2], r3 2350; CHECK-NEXT: vand q0, q0, q4 2351; CHECK-NEXT: vand q1, q1, q4 2352; CHECK-NEXT: vmov r2, s0 2353; CHECK-NEXT: vmov r3, s4 2354; CHECK-NEXT: umull r2, r3, r3, r2 2355; CHECK-NEXT: vmov.32 q2[0], r2 2356; CHECK-NEXT: vmov r2, s2 2357; CHECK-NEXT: vmov.32 q2[1], r3 2358; CHECK-NEXT: vmov r3, s6 2359; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload 2360; CHECK-NEXT: umull r2, r3, r3, r2 2361; CHECK-NEXT: vmov.32 q2[2], r2 2362; CHECK-NEXT: vmov.32 q2[3], r3 2363; CHECK-NEXT: vand q0, q2, q6 2364; CHECK-NEXT: vmov r3, s0 2365; CHECK-NEXT: vmov r2, s1 2366; CHECK-NEXT: vmov r5, s2 2367; CHECK-NEXT: vmov r6, s3 2368; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload 2369; CHECK-NEXT: adds r3, r3, r4 2370; CHECK-NEXT: adc.w r2, r2, r12 2371; CHECK-NEXT: adds.w r12, r3, r5 2372; CHECK-NEXT: vmov.u8 r5, q7[8] 2373; CHECK-NEXT: adc.w r3, r2, r6 2374; CHECK-NEXT: vmov.u8 r2, q5[8] 2375; CHECK-NEXT: vmov.16 q6[0], r2 2376; CHECK-NEXT: vmov.u8 r2, q5[9] 2377; CHECK-NEXT: vmov.16 q6[1], r2 2378; CHECK-NEXT: vmov.u8 r2, q5[10] 2379; CHECK-NEXT: vmov.16 q6[2], r2 2380; CHECK-NEXT: vmov.u8 r2, q5[11] 2381; CHECK-NEXT: vmov.16 q6[3], r2 2382; CHECK-NEXT: vmov.u8 r2, q5[12] 2383; CHECK-NEXT: vmov.16 q6[4], r2 2384; CHECK-NEXT: vmov.u8 r2, q5[13] 2385; CHECK-NEXT: vmov.16 q6[5], r2 2386; CHECK-NEXT: vmov.u8 r2, q5[14] 2387; CHECK-NEXT: vmov.16 q6[6], r2 2388; CHECK-NEXT: vmov.u8 r2, q5[15] 2389; CHECK-NEXT: vmov.16 q6[7], r2 2390; CHECK-NEXT: vcmp.i16 ne, q6, zr 2391; CHECK-NEXT: vpsel q3, q1, q0 2392; CHECK-NEXT: vmov.32 q1[0], r5 2393; CHECK-NEXT: vmov.u16 r2, q3[0] 
2394; CHECK-NEXT: vmov.u8 r5, q7[9] 2395; CHECK-NEXT: vmov.32 q0[0], r2 2396; CHECK-NEXT: vmov.u16 r2, q3[1] 2397; CHECK-NEXT: vmov.32 q0[1], r2 2398; CHECK-NEXT: vmov.u16 r2, q3[2] 2399; CHECK-NEXT: vmov.32 q0[2], r2 2400; CHECK-NEXT: vmov.u16 r2, q3[3] 2401; CHECK-NEXT: vmov.32 q0[3], r2 2402; CHECK-NEXT: vmov.32 q1[2], r5 2403; CHECK-NEXT: vcmp.i32 ne, q0, zr 2404; CHECK-NEXT: vmrs r2, p0 2405; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload 2406; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload 2407; CHECK-NEXT: vand q1, q1, q5 2408; CHECK-NEXT: vmov r5, s4 2409; CHECK-NEXT: and r6, r2, #1 2410; CHECK-NEXT: rsbs r6, r6, #0 2411; CHECK-NEXT: vmov.32 q4[0], r6 2412; CHECK-NEXT: vmov.32 q4[1], r6 2413; CHECK-NEXT: ubfx r6, r2, #4, #1 2414; CHECK-NEXT: rsbs r6, r6, #0 2415; CHECK-NEXT: vmov.32 q4[2], r6 2416; CHECK-NEXT: vmov.32 q4[3], r6 2417; CHECK-NEXT: vmov.u8 r6, q6[8] 2418; CHECK-NEXT: vmov.32 q0[0], r6 2419; CHECK-NEXT: vmov.u8 r6, q6[9] 2420; CHECK-NEXT: vmov.32 q0[2], r6 2421; CHECK-NEXT: vand q0, q0, q5 2422; CHECK-NEXT: vmov r6, s0 2423; CHECK-NEXT: umull r6, r5, r5, r6 2424; CHECK-NEXT: vmov.32 q2[0], r6 2425; CHECK-NEXT: vmov r6, s2 2426; CHECK-NEXT: vmov.32 q2[1], r5 2427; CHECK-NEXT: vmov r5, s6 2428; CHECK-NEXT: umull r6, r5, r5, r6 2429; CHECK-NEXT: vmov.32 q2[2], r6 2430; CHECK-NEXT: vmov.32 q2[3], r5 2431; CHECK-NEXT: vand q0, q2, q4 2432; CHECK-NEXT: vmov r5, s0 2433; CHECK-NEXT: vmov r6, s1 2434; CHECK-NEXT: vmov r4, s3 2435; CHECK-NEXT: adds.w r5, r5, r12 2436; CHECK-NEXT: adcs r6, r3 2437; CHECK-NEXT: vmov r3, s2 2438; CHECK-NEXT: adds r3, r3, r5 2439; CHECK-NEXT: adc.w r12, r6, r4 2440; CHECK-NEXT: ubfx r6, r2, #8, #1 2441; CHECK-NEXT: rsbs r6, r6, #0 2442; CHECK-NEXT: ubfx r2, r2, #12, #1 2443; CHECK-NEXT: vmov.32 q4[0], r6 2444; CHECK-NEXT: rsbs r2, r2, #0 2445; CHECK-NEXT: vmov.32 q4[1], r6 2446; CHECK-NEXT: vmov.u8 r6, q7[10] 2447; CHECK-NEXT: vmov.32 q4[2], r2 2448; CHECK-NEXT: vmov.32 q1[0], r6 2449; CHECK-NEXT: vmov.32 q4[3], r2 
2450; CHECK-NEXT: vmov.u8 r2, q6[10] 2451; CHECK-NEXT: vmov.32 q0[0], r2 2452; CHECK-NEXT: vmov.u8 r2, q6[11] 2453; CHECK-NEXT: vmov.u8 r6, q7[11] 2454; CHECK-NEXT: vmov.32 q0[2], r2 2455; CHECK-NEXT: vmov.32 q1[2], r6 2456; CHECK-NEXT: vand q0, q0, q5 2457; CHECK-NEXT: vand q1, q1, q5 2458; CHECK-NEXT: vmov r2, s0 2459; CHECK-NEXT: vmov r6, s4 2460; CHECK-NEXT: umull r2, r6, r6, r2 2461; CHECK-NEXT: vmov.32 q2[0], r2 2462; CHECK-NEXT: vmov r2, s2 2463; CHECK-NEXT: vmov.32 q2[1], r6 2464; CHECK-NEXT: vmov r6, s6 2465; CHECK-NEXT: umull r2, r6, r6, r2 2466; CHECK-NEXT: vmov.32 q2[2], r2 2467; CHECK-NEXT: vmov.32 q2[3], r6 2468; CHECK-NEXT: vand q0, q2, q4 2469; CHECK-NEXT: vmov r6, s0 2470; CHECK-NEXT: vmov r2, s1 2471; CHECK-NEXT: vmov r5, s2 2472; CHECK-NEXT: adds r3, r3, r6 2473; CHECK-NEXT: vmov r6, s3 2474; CHECK-NEXT: adc.w r2, r2, r12 2475; CHECK-NEXT: adds.w r12, r3, r5 2476; CHECK-NEXT: vmov.u8 r5, q7[12] 2477; CHECK-NEXT: vmov.32 q1[0], r5 2478; CHECK-NEXT: vmov.u8 r5, q7[13] 2479; CHECK-NEXT: vmov.32 q1[2], r5 2480; CHECK-NEXT: vand q1, q1, q5 2481; CHECK-NEXT: vmov r5, s4 2482; CHECK-NEXT: adc.w r3, r2, r6 2483; CHECK-NEXT: vmov.u16 r2, q3[4] 2484; CHECK-NEXT: vmov.32 q0[0], r2 2485; CHECK-NEXT: vmov.u16 r2, q3[5] 2486; CHECK-NEXT: vmov.32 q0[1], r2 2487; CHECK-NEXT: vmov.u16 r2, q3[6] 2488; CHECK-NEXT: vmov.32 q0[2], r2 2489; CHECK-NEXT: vmov.u16 r2, q3[7] 2490; CHECK-NEXT: vmov.32 q0[3], r2 2491; CHECK-NEXT: vcmp.i32 ne, q0, zr 2492; CHECK-NEXT: vmrs r2, p0 2493; CHECK-NEXT: and r6, r2, #1 2494; CHECK-NEXT: rsbs r6, r6, #0 2495; CHECK-NEXT: vmov.32 q3[0], r6 2496; CHECK-NEXT: vmov.32 q3[1], r6 2497; CHECK-NEXT: ubfx r6, r2, #4, #1 2498; CHECK-NEXT: rsbs r6, r6, #0 2499; CHECK-NEXT: vmov.32 q3[2], r6 2500; CHECK-NEXT: vmov.32 q3[3], r6 2501; CHECK-NEXT: vmov.u8 r6, q6[12] 2502; CHECK-NEXT: vmov.32 q0[0], r6 2503; CHECK-NEXT: vmov.u8 r6, q6[13] 2504; CHECK-NEXT: vmov.32 q0[2], r6 2505; CHECK-NEXT: vand q0, q0, q5 2506; CHECK-NEXT: vmov r6, s0 2507; 
CHECK-NEXT: umull r6, r5, r5, r6 2508; CHECK-NEXT: vmov.32 q2[0], r6 2509; CHECK-NEXT: vmov r6, s2 2510; CHECK-NEXT: vmov.32 q2[1], r5 2511; CHECK-NEXT: vmov r5, s6 2512; CHECK-NEXT: umull r6, r5, r5, r6 2513; CHECK-NEXT: vmov.32 q2[2], r6 2514; CHECK-NEXT: vmov.32 q2[3], r5 2515; CHECK-NEXT: vand q0, q2, q3 2516; CHECK-NEXT: vmov r5, s0 2517; CHECK-NEXT: vmov r6, s1 2518; CHECK-NEXT: vmov r4, s3 2519; CHECK-NEXT: adds.w r5, r5, r12 2520; CHECK-NEXT: adcs r6, r3 2521; CHECK-NEXT: vmov r3, s2 2522; CHECK-NEXT: adds r3, r3, r5 2523; CHECK-NEXT: adc.w r12, r6, r4 2524; CHECK-NEXT: ubfx r6, r2, #8, #1 2525; CHECK-NEXT: rsbs r6, r6, #0 2526; CHECK-NEXT: ubfx r2, r2, #12, #1 2527; CHECK-NEXT: vmov.32 q3[0], r6 2528; CHECK-NEXT: rsbs r2, r2, #0 2529; CHECK-NEXT: vmov.32 q3[1], r6 2530; CHECK-NEXT: vmov.u8 r6, q7[14] 2531; CHECK-NEXT: vmov.32 q3[2], r2 2532; CHECK-NEXT: vmov.32 q1[0], r6 2533; CHECK-NEXT: vmov.32 q3[3], r2 2534; CHECK-NEXT: vmov.u8 r2, q6[14] 2535; CHECK-NEXT: vmov.32 q0[0], r2 2536; CHECK-NEXT: vmov.u8 r2, q6[15] 2537; CHECK-NEXT: vmov.u8 r6, q7[15] 2538; CHECK-NEXT: vmov.32 q0[2], r2 2539; CHECK-NEXT: vmov.32 q1[2], r6 2540; CHECK-NEXT: vand q0, q0, q5 2541; CHECK-NEXT: vand q1, q1, q5 2542; CHECK-NEXT: vmov r2, s0 2543; CHECK-NEXT: vmov r6, s4 2544; CHECK-NEXT: umull r2, r6, r6, r2 2545; CHECK-NEXT: vmov.32 q2[0], r2 2546; CHECK-NEXT: vmov r2, s2 2547; CHECK-NEXT: vmov.32 q2[1], r6 2548; CHECK-NEXT: vmov r6, s6 2549; CHECK-NEXT: umull r2, r6, r6, r2 2550; CHECK-NEXT: vmov.32 q2[2], r2 2551; CHECK-NEXT: vmov.32 q2[3], r6 2552; CHECK-NEXT: vand q0, q2, q3 2553; CHECK-NEXT: vmov r6, s0 2554; CHECK-NEXT: vmov r2, s1 2555; CHECK-NEXT: vmov r5, s2 2556; CHECK-NEXT: adds r3, r3, r6 2557; CHECK-NEXT: vmov r6, s3 2558; CHECK-NEXT: adc.w r2, r2, r12 2559; CHECK-NEXT: adds r3, r3, r5 2560; CHECK-NEXT: adcs r2, r6 2561; CHECK-NEXT: adds r0, r0, r3 2562; CHECK-NEXT: adcs r1, r2 2563; CHECK-NEXT: add sp, #80 2564; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, 
d15} 2565; CHECK-NEXT: pop {r4, r5, r6, pc} 2566entry: 2567 %c = icmp eq <16 x i8> %b, zeroinitializer 2568 %xx = zext <16 x i8> %x to <16 x i64> 2569 %yy = zext <16 x i8> %y to <16 x i64> 2570 %m = mul <16 x i64> %xx, %yy 2571 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer 2572 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) 2573 %r = add i64 %z, %a 2574 ret i64 %r 2575} 2576 2577define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { 2578; CHECK-LABEL: add_v16i8_v16i64_acc_sext: 2579; CHECK: @ %bb.0: @ %entry 2580; CHECK-NEXT: .save {r4, r5, r7, lr} 2581; CHECK-NEXT: push {r4, r5, r7, lr} 2582; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 2583; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 2584; CHECK-NEXT: vcmp.i8 eq, q2, zr 2585; CHECK-NEXT: vmov.i8 q2, #0x0 2586; CHECK-NEXT: vmov.i8 q3, #0xff 2587; CHECK-NEXT: vpsel q4, q3, q2 2588; CHECK-NEXT: vmov.u8 r2, q4[0] 2589; CHECK-NEXT: vmov.16 q5[0], r2 2590; CHECK-NEXT: vmov.u8 r2, q4[1] 2591; CHECK-NEXT: vmov.16 q5[1], r2 2592; CHECK-NEXT: vmov.u8 r2, q4[2] 2593; CHECK-NEXT: vmov.16 q5[2], r2 2594; CHECK-NEXT: vmov.u8 r2, q4[3] 2595; CHECK-NEXT: vmov.16 q5[3], r2 2596; CHECK-NEXT: vmov.u8 r2, q4[4] 2597; CHECK-NEXT: vmov.16 q5[4], r2 2598; CHECK-NEXT: vmov.u8 r2, q4[5] 2599; CHECK-NEXT: vmov.16 q5[5], r2 2600; CHECK-NEXT: vmov.u8 r2, q4[6] 2601; CHECK-NEXT: vmov.16 q5[6], r2 2602; CHECK-NEXT: vmov.u8 r2, q4[7] 2603; CHECK-NEXT: vmov.16 q5[7], r2 2604; CHECK-NEXT: vcmp.i16 ne, q5, zr 2605; CHECK-NEXT: vpsel q5, q3, q2 2606; CHECK-NEXT: vmov.u16 r2, q5[0] 2607; CHECK-NEXT: vmov.32 q6[0], r2 2608; CHECK-NEXT: vmov.u16 r2, q5[1] 2609; CHECK-NEXT: vmov.32 q6[1], r2 2610; CHECK-NEXT: vmov.u16 r2, q5[2] 2611; CHECK-NEXT: vmov.32 q6[2], r2 2612; CHECK-NEXT: vmov.u16 r2, q5[3] 2613; CHECK-NEXT: vmov.32 q6[3], r2 2614; CHECK-NEXT: vcmp.i32 ne, q6, zr 2615; CHECK-NEXT: vmrs r2, p0 2616; CHECK-NEXT: and r3, r2, #1 2617; 
CHECK-NEXT: rsbs r3, r3, #0 2618; CHECK-NEXT: vmov.32 q6[0], r3 2619; CHECK-NEXT: vmov.32 q6[1], r3 2620; CHECK-NEXT: ubfx r3, r2, #4, #1 2621; CHECK-NEXT: rsbs r3, r3, #0 2622; CHECK-NEXT: vmov.32 q6[2], r3 2623; CHECK-NEXT: vmov.32 q6[3], r3 2624; CHECK-NEXT: vmov.u8 r3, q1[0] 2625; CHECK-NEXT: sxtb.w r12, r3 2626; CHECK-NEXT: vmov.u8 r3, q0[0] 2627; CHECK-NEXT: sxtb r3, r3 2628; CHECK-NEXT: smull r3, r12, r3, r12 2629; CHECK-NEXT: vmov.32 q7[0], r3 2630; CHECK-NEXT: vmov.u8 r3, q1[1] 2631; CHECK-NEXT: vmov.32 q7[1], r12 2632; CHECK-NEXT: sxtb.w r12, r3 2633; CHECK-NEXT: vmov.u8 r3, q0[1] 2634; CHECK-NEXT: sxtb r3, r3 2635; CHECK-NEXT: smull r3, r12, r3, r12 2636; CHECK-NEXT: vmov.32 q7[2], r3 2637; CHECK-NEXT: vmov.32 q7[3], r12 2638; CHECK-NEXT: vand q6, q7, q6 2639; CHECK-NEXT: vmov r3, s26 2640; CHECK-NEXT: vmov r4, s24 2641; CHECK-NEXT: vmov r12, s27 2642; CHECK-NEXT: vmov r5, s25 2643; CHECK-NEXT: adds.w lr, r4, r3 2644; CHECK-NEXT: ubfx r3, r2, #8, #1 2645; CHECK-NEXT: rsb.w r3, r3, #0 2646; CHECK-NEXT: ubfx r2, r2, #12, #1 2647; CHECK-NEXT: vmov.32 q6[0], r3 2648; CHECK-NEXT: rsb.w r2, r2, #0 2649; CHECK-NEXT: vmov.32 q6[1], r3 2650; CHECK-NEXT: vmov.u8 r3, q0[2] 2651; CHECK-NEXT: vmov.32 q6[2], r2 2652; CHECK-NEXT: sxtb r3, r3 2653; CHECK-NEXT: vmov.32 q6[3], r2 2654; CHECK-NEXT: vmov.u8 r2, q1[2] 2655; CHECK-NEXT: sxtb r2, r2 2656; CHECK-NEXT: adc.w r12, r12, r5 2657; CHECK-NEXT: smull r2, r3, r3, r2 2658; CHECK-NEXT: vmov.32 q7[0], r2 2659; CHECK-NEXT: vmov.u8 r2, q1[3] 2660; CHECK-NEXT: vmov.32 q7[1], r3 2661; CHECK-NEXT: vmov.u8 r3, q0[3] 2662; CHECK-NEXT: sxtb r2, r2 2663; CHECK-NEXT: sxtb r3, r3 2664; CHECK-NEXT: smull r2, r3, r3, r2 2665; CHECK-NEXT: vmov.32 q7[2], r2 2666; CHECK-NEXT: vmov.32 q7[3], r3 2667; CHECK-NEXT: vand q6, q7, q6 2668; CHECK-NEXT: vmov r3, s24 2669; CHECK-NEXT: vmov r2, s25 2670; CHECK-NEXT: vmov r4, s26 2671; CHECK-NEXT: vmov r5, s27 2672; CHECK-NEXT: adds.w r3, r3, lr 2673; CHECK-NEXT: adc.w r2, r2, r12 2674; CHECK-NEXT: 
adds.w r12, r3, r4 2675; CHECK-NEXT: vmov.u8 r4, q0[4] 2676; CHECK-NEXT: adc.w r3, r2, r5 2677; CHECK-NEXT: vmov.u16 r2, q5[4] 2678; CHECK-NEXT: vmov.32 q6[0], r2 2679; CHECK-NEXT: vmov.u16 r2, q5[5] 2680; CHECK-NEXT: vmov.32 q6[1], r2 2681; CHECK-NEXT: vmov.u16 r2, q5[6] 2682; CHECK-NEXT: vmov.32 q6[2], r2 2683; CHECK-NEXT: vmov.u16 r2, q5[7] 2684; CHECK-NEXT: vmov.32 q6[3], r2 2685; CHECK-NEXT: sxtb r4, r4 2686; CHECK-NEXT: vcmp.i32 ne, q6, zr 2687; CHECK-NEXT: vmrs r2, p0 2688; CHECK-NEXT: and r5, r2, #1 2689; CHECK-NEXT: rsbs r5, r5, #0 2690; CHECK-NEXT: vmov.32 q5[0], r5 2691; CHECK-NEXT: vmov.32 q5[1], r5 2692; CHECK-NEXT: ubfx r5, r2, #4, #1 2693; CHECK-NEXT: rsbs r5, r5, #0 2694; CHECK-NEXT: vmov.32 q5[2], r5 2695; CHECK-NEXT: vmov.32 q5[3], r5 2696; CHECK-NEXT: vmov.u8 r5, q1[4] 2697; CHECK-NEXT: sxtb r5, r5 2698; CHECK-NEXT: smull r5, r4, r4, r5 2699; CHECK-NEXT: vmov.32 q6[0], r5 2700; CHECK-NEXT: vmov.u8 r5, q1[5] 2701; CHECK-NEXT: vmov.32 q6[1], r4 2702; CHECK-NEXT: vmov.u8 r4, q0[5] 2703; CHECK-NEXT: sxtb r5, r5 2704; CHECK-NEXT: sxtb r4, r4 2705; CHECK-NEXT: smull r5, r4, r4, r5 2706; CHECK-NEXT: vmov.32 q6[2], r5 2707; CHECK-NEXT: vmov.32 q6[3], r4 2708; CHECK-NEXT: vand q5, q6, q5 2709; CHECK-NEXT: vmov r4, s20 2710; CHECK-NEXT: vmov r5, s21 2711; CHECK-NEXT: adds.w r12, r12, r4 2712; CHECK-NEXT: vmov r4, s22 2713; CHECK-NEXT: adcs r5, r3 2714; CHECK-NEXT: vmov r3, s23 2715; CHECK-NEXT: adds.w r4, r4, r12 2716; CHECK-NEXT: adc.w r12, r5, r3 2717; CHECK-NEXT: ubfx r3, r2, #8, #1 2718; CHECK-NEXT: rsbs r3, r3, #0 2719; CHECK-NEXT: ubfx r2, r2, #12, #1 2720; CHECK-NEXT: vmov.32 q5[0], r3 2721; CHECK-NEXT: rsbs r2, r2, #0 2722; CHECK-NEXT: vmov.32 q5[1], r3 2723; CHECK-NEXT: vmov.u8 r3, q0[6] 2724; CHECK-NEXT: vmov.32 q5[2], r2 2725; CHECK-NEXT: sxtb r3, r3 2726; CHECK-NEXT: vmov.32 q5[3], r2 2727; CHECK-NEXT: vmov.u8 r2, q1[6] 2728; CHECK-NEXT: sxtb r2, r2 2729; CHECK-NEXT: smull r2, r3, r3, r2 2730; CHECK-NEXT: vmov.32 q6[0], r2 2731; CHECK-NEXT: 
vmov.u8 r2, q1[7] 2732; CHECK-NEXT: vmov.32 q6[1], r3 2733; CHECK-NEXT: vmov.u8 r3, q0[7] 2734; CHECK-NEXT: sxtb r2, r2 2735; CHECK-NEXT: sxtb r3, r3 2736; CHECK-NEXT: smull r2, r3, r3, r2 2737; CHECK-NEXT: vmov.32 q6[2], r2 2738; CHECK-NEXT: vmov.32 q6[3], r3 2739; CHECK-NEXT: vand q5, q6, q5 2740; CHECK-NEXT: vmov r3, s20 2741; CHECK-NEXT: vmov r2, s21 2742; CHECK-NEXT: vmov r5, s23 2743; CHECK-NEXT: adds r3, r3, r4 2744; CHECK-NEXT: vmov r4, s22 2745; CHECK-NEXT: adc.w r2, r2, r12 2746; CHECK-NEXT: adds.w r12, r3, r4 2747; CHECK-NEXT: vmov.u8 r4, q0[8] 2748; CHECK-NEXT: adc.w r3, r2, r5 2749; CHECK-NEXT: vmov.u8 r2, q4[8] 2750; CHECK-NEXT: vmov.16 q5[0], r2 2751; CHECK-NEXT: vmov.u8 r2, q4[9] 2752; CHECK-NEXT: vmov.16 q5[1], r2 2753; CHECK-NEXT: vmov.u8 r2, q4[10] 2754; CHECK-NEXT: vmov.16 q5[2], r2 2755; CHECK-NEXT: vmov.u8 r2, q4[11] 2756; CHECK-NEXT: vmov.16 q5[3], r2 2757; CHECK-NEXT: vmov.u8 r2, q4[12] 2758; CHECK-NEXT: vmov.16 q5[4], r2 2759; CHECK-NEXT: vmov.u8 r2, q4[13] 2760; CHECK-NEXT: vmov.16 q5[5], r2 2761; CHECK-NEXT: vmov.u8 r2, q4[14] 2762; CHECK-NEXT: vmov.16 q5[6], r2 2763; CHECK-NEXT: vmov.u8 r2, q4[15] 2764; CHECK-NEXT: vmov.16 q5[7], r2 2765; CHECK-NEXT: sxtb r4, r4 2766; CHECK-NEXT: vcmp.i16 ne, q5, zr 2767; CHECK-NEXT: vpsel q2, q3, q2 2768; CHECK-NEXT: vmov.u16 r2, q2[0] 2769; CHECK-NEXT: vmov.32 q3[0], r2 2770; CHECK-NEXT: vmov.u16 r2, q2[1] 2771; CHECK-NEXT: vmov.32 q3[1], r2 2772; CHECK-NEXT: vmov.u16 r2, q2[2] 2773; CHECK-NEXT: vmov.32 q3[2], r2 2774; CHECK-NEXT: vmov.u16 r2, q2[3] 2775; CHECK-NEXT: vmov.32 q3[3], r2 2776; CHECK-NEXT: vcmp.i32 ne, q3, zr 2777; CHECK-NEXT: vmrs r2, p0 2778; CHECK-NEXT: and r5, r2, #1 2779; CHECK-NEXT: rsbs r5, r5, #0 2780; CHECK-NEXT: vmov.32 q3[0], r5 2781; CHECK-NEXT: vmov.32 q3[1], r5 2782; CHECK-NEXT: ubfx r5, r2, #4, #1 2783; CHECK-NEXT: rsbs r5, r5, #0 2784; CHECK-NEXT: vmov.32 q3[2], r5 2785; CHECK-NEXT: vmov.32 q3[3], r5 2786; CHECK-NEXT: vmov.u8 r5, q1[8] 2787; CHECK-NEXT: sxtb r5, r5 2788; 
CHECK-NEXT: smull r5, r4, r4, r5 2789; CHECK-NEXT: vmov.32 q4[0], r5 2790; CHECK-NEXT: vmov.u8 r5, q1[9] 2791; CHECK-NEXT: vmov.32 q4[1], r4 2792; CHECK-NEXT: vmov.u8 r4, q0[9] 2793; CHECK-NEXT: sxtb r5, r5 2794; CHECK-NEXT: sxtb r4, r4 2795; CHECK-NEXT: smull r5, r4, r4, r5 2796; CHECK-NEXT: vmov.32 q4[2], r5 2797; CHECK-NEXT: vmov.32 q4[3], r4 2798; CHECK-NEXT: vand q3, q4, q3 2799; CHECK-NEXT: vmov r4, s12 2800; CHECK-NEXT: vmov r5, s13 2801; CHECK-NEXT: adds.w r12, r12, r4 2802; CHECK-NEXT: vmov r4, s14 2803; CHECK-NEXT: adcs r5, r3 2804; CHECK-NEXT: vmov r3, s15 2805; CHECK-NEXT: adds.w r4, r4, r12 2806; CHECK-NEXT: adc.w r12, r5, r3 2807; CHECK-NEXT: ubfx r3, r2, #8, #1 2808; CHECK-NEXT: rsbs r3, r3, #0 2809; CHECK-NEXT: ubfx r2, r2, #12, #1 2810; CHECK-NEXT: vmov.32 q3[0], r3 2811; CHECK-NEXT: rsbs r2, r2, #0 2812; CHECK-NEXT: vmov.32 q3[1], r3 2813; CHECK-NEXT: vmov.u8 r3, q0[10] 2814; CHECK-NEXT: vmov.32 q3[2], r2 2815; CHECK-NEXT: sxtb r3, r3 2816; CHECK-NEXT: vmov.32 q3[3], r2 2817; CHECK-NEXT: vmov.u8 r2, q1[10] 2818; CHECK-NEXT: sxtb r2, r2 2819; CHECK-NEXT: smull r2, r3, r3, r2 2820; CHECK-NEXT: vmov.32 q4[0], r2 2821; CHECK-NEXT: vmov.u8 r2, q1[11] 2822; CHECK-NEXT: vmov.32 q4[1], r3 2823; CHECK-NEXT: vmov.u8 r3, q0[11] 2824; CHECK-NEXT: sxtb r2, r2 2825; CHECK-NEXT: sxtb r3, r3 2826; CHECK-NEXT: smull r2, r3, r3, r2 2827; CHECK-NEXT: vmov.32 q4[2], r2 2828; CHECK-NEXT: vmov.32 q4[3], r3 2829; CHECK-NEXT: vand q3, q4, q3 2830; CHECK-NEXT: vmov r3, s12 2831; CHECK-NEXT: vmov r2, s13 2832; CHECK-NEXT: vmov r5, s15 2833; CHECK-NEXT: adds r3, r3, r4 2834; CHECK-NEXT: vmov r4, s14 2835; CHECK-NEXT: adc.w r2, r2, r12 2836; CHECK-NEXT: adds.w r12, r3, r4 2837; CHECK-NEXT: vmov.u8 r4, q0[12] 2838; CHECK-NEXT: adc.w r3, r2, r5 2839; CHECK-NEXT: vmov.u16 r2, q2[4] 2840; CHECK-NEXT: vmov.32 q3[0], r2 2841; CHECK-NEXT: vmov.u16 r2, q2[5] 2842; CHECK-NEXT: vmov.32 q3[1], r2 2843; CHECK-NEXT: vmov.u16 r2, q2[6] 2844; CHECK-NEXT: vmov.32 q3[2], r2 2845; CHECK-NEXT: 
vmov.u16 r2, q2[7] 2846; CHECK-NEXT: vmov.32 q3[3], r2 2847; CHECK-NEXT: sxtb r4, r4 2848; CHECK-NEXT: vcmp.i32 ne, q3, zr 2849; CHECK-NEXT: vmrs r2, p0 2850; CHECK-NEXT: and r5, r2, #1 2851; CHECK-NEXT: rsbs r5, r5, #0 2852; CHECK-NEXT: vmov.32 q2[0], r5 2853; CHECK-NEXT: vmov.32 q2[1], r5 2854; CHECK-NEXT: ubfx r5, r2, #4, #1 2855; CHECK-NEXT: rsbs r5, r5, #0 2856; CHECK-NEXT: vmov.32 q2[2], r5 2857; CHECK-NEXT: vmov.32 q2[3], r5 2858; CHECK-NEXT: vmov.u8 r5, q1[12] 2859; CHECK-NEXT: sxtb r5, r5 2860; CHECK-NEXT: smull r5, r4, r4, r5 2861; CHECK-NEXT: vmov.32 q3[0], r5 2862; CHECK-NEXT: vmov.u8 r5, q1[13] 2863; CHECK-NEXT: vmov.32 q3[1], r4 2864; CHECK-NEXT: vmov.u8 r4, q0[13] 2865; CHECK-NEXT: sxtb r5, r5 2866; CHECK-NEXT: sxtb r4, r4 2867; CHECK-NEXT: smull r5, r4, r4, r5 2868; CHECK-NEXT: vmov.32 q3[2], r5 2869; CHECK-NEXT: vmov.32 q3[3], r4 2870; CHECK-NEXT: vand q2, q3, q2 2871; CHECK-NEXT: vmov r4, s8 2872; CHECK-NEXT: vmov r5, s9 2873; CHECK-NEXT: adds.w r12, r12, r4 2874; CHECK-NEXT: vmov r4, s10 2875; CHECK-NEXT: adcs r5, r3 2876; CHECK-NEXT: vmov r3, s11 2877; CHECK-NEXT: adds.w r4, r4, r12 2878; CHECK-NEXT: adc.w r12, r5, r3 2879; CHECK-NEXT: ubfx r3, r2, #8, #1 2880; CHECK-NEXT: rsbs r3, r3, #0 2881; CHECK-NEXT: ubfx r2, r2, #12, #1 2882; CHECK-NEXT: vmov.32 q2[0], r3 2883; CHECK-NEXT: rsbs r2, r2, #0 2884; CHECK-NEXT: vmov.32 q2[1], r3 2885; CHECK-NEXT: vmov.u8 r3, q0[14] 2886; CHECK-NEXT: vmov.32 q2[2], r2 2887; CHECK-NEXT: sxtb r3, r3 2888; CHECK-NEXT: vmov.32 q2[3], r2 2889; CHECK-NEXT: vmov.u8 r2, q1[14] 2890; CHECK-NEXT: sxtb r2, r2 2891; CHECK-NEXT: smull r2, r3, r3, r2 2892; CHECK-NEXT: vmov.32 q3[0], r2 2893; CHECK-NEXT: vmov.u8 r2, q1[15] 2894; CHECK-NEXT: vmov.32 q3[1], r3 2895; CHECK-NEXT: vmov.u8 r3, q0[15] 2896; CHECK-NEXT: sxtb r2, r2 2897; CHECK-NEXT: sxtb r3, r3 2898; CHECK-NEXT: smull r2, r3, r3, r2 2899; CHECK-NEXT: vmov.32 q3[2], r2 2900; CHECK-NEXT: vmov.32 q3[3], r3 2901; CHECK-NEXT: vand q0, q3, q2 2902; CHECK-NEXT: vmov r3, s0 
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: vmov r5, s3
; CHECK-NEXT: adds r3, r3, r4
; CHECK-NEXT: vmov r4, s2
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r3, r3, r4
; CHECK-NEXT: adcs r2, r5
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i64>
  %yy = sext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Predicated multiply-accumulate reduction: <2 x i8> inputs, zero-extended.
;   %x, %y : multiplicands, zero-extended to <2 x i64> before the multiply.
;   %b     : predicate source; a lane contributes its product only when the
;            corresponding %b lane equals zero, otherwise it contributes 0.
;   %a     : i64 accumulator added to the reduced sum.
; Returns reduce.add(select(%b == 0, zext(%x) * zext(%y), 0)) + %a.
; The expected output that follows does the two multiplies with scalar umull
; and applies the lane mask with vand before the final adds/adc chain.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.i64 q3, #0xff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q4, q0, q3
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov.32 q0[1], r3
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: vand q1, q2, q3
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q0[2], r2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov.32 q0[3], r3
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.32 q2[1], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, lr, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r7, pc}
entry:
  %c = icmp eq <2 x i8> %b, zeroinitializer
  %xx = zext <2 x i8> %x to <2 x i64>
  %yy = zext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Predicated multiply-accumulate reduction: <2 x i8> inputs, sign-extended.
;   %x, %y : multiplicands, sign-extended to <2 x i64> before the multiply.
;   %b     : predicate source; a lane contributes its product only when the
;            corresponding %b lane equals zero, otherwise it contributes 0.
;   %a     : i64 accumulator added to the reduced sum.
; Returns reduce.add(select(%b == 0, sext(%x) * sext(%y), 0)) + %a.
; The expected output that follows sign-extends each byte with sxtb,
; multiplies with scalar smull, and applies the lane mask with vand.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vand q3, q2, q3
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.32 q2[1], r2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov.32 q3[1], r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.32 q3[3], r3
; CHECK-NEXT: vand q0, q3, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, lr, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
  %c = icmp eq <2 x i8> %b, zeroinitializer
  %xx = sext <2 x i8> %x to <2 x i64>
  %yy = sext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Predicated multiply-accumulate reduction on native <2 x i64> inputs (no
; extension step).
;   %x, %y : <2 x i64> multiplicands.
;   %b     : predicate source; a lane contributes its product only when the
;            corresponding 64-bit %b lane equals zero, otherwise 0.
;   %a     : i64 accumulator added to the reduced sum.
; Returns reduce.add(select(%b == 0, %x * %y, 0)) + %a.
; The expected output that follows builds each 64-bit product from one umull
; plus two mla instructions, and tests each 64-bit %b lane for zero by
; or-ing its two 32-bit halves (orrs) before cset eq.
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r4, s5
; CHECK-NEXT: umull r12, lr, r3, r2
; CHECK-NEXT: mla r3, r3, r4, lr
; CHECK-NEXT: vmov r4, s1
; CHECK-NEXT: vmov.32 q3[0], r12
; CHECK-NEXT: mla r2, r4, r2, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov.32 q3[1], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: umull r4, r12, r3, r2
; CHECK-NEXT: vmov.32 q3[2], r4
; CHECK-NEXT: vmov r4, s7
; CHECK-NEXT: mla r3, r3, r4, r12
; CHECK-NEXT: vmov r4, s3
; CHECK-NEXT: mla r2, r4, r2, r3
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov.32 q3[3], r2
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.32 q0[1], r2
; CHECK-NEXT: vmov r2, s11
; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q0[2], r2
; CHECK-NEXT: vmov.32 q0[3], r2
; CHECK-NEXT: vand q0, q3, q0
; CHECK-NEXT: vmov r4, s2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: adds r2, r2, r4
; CHECK-NEXT: adc.w r3, r3, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r4, pc}
entry:
  %c = icmp eq <2 x i64> %b, zeroinitializer
  %m = mul <2 x i64> %x, %y
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  %r = add i64 %z, %a
  ret i64 %r
}

; Declarations of the llvm.vector.reduce.add intrinsic, one per vector type.
; Each takes a single integer vector and returns a scalar of its element type.
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)