; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

define arm_aapcs_vfpcc <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i8 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <16 x i8> %b, %a
  ret <16 x i8> %0
}

define arm_aapcs_vfpcc <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <8 x i16> %b, %a
  ret <8 x i16> %0
}

define arm_aapcs_vfpcc <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i32 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <4 x i32> %b, %a
  ret <4 x i32> %0
}

define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = fmul <4 x float> %a, %b
  ret <4 x float> %0
}

define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i8 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
  ret <16 x i8> %2
}

declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2

declare <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2

define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
  ret <8 x i16> %2
}

declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2

declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2

define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i32 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
  ret <4 x i32> %2
}

declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2

declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
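; Predicated (merging) multiply on <8 x half>: the predicate %p is moved into P0
; with VMSR, VPST opens a one-instruction VPT block, and the predicated VMULT.F16
; updates only the active lanes, so inactive lanes keep the value of %inactive
; (which arrives in q0, the destination register).
; Roughly the C source this corresponds to (illustrative sketch only, assuming the
; ACLE MVE intrinsics from <arm_mve.h>):
;   float16x8_t f(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p) {
;     return vmulq_m_f16(inactive, a, b, p);
;   }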
define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.f16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
  ret <8 x half> %2
}

declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2

define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
  ret <4 x i32> %2
}

define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.f32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
  ret <4 x float> %2
}

declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2

define arm_aapcs_vfpcc <16 x i8> @test_vmulq_n_u8(<16 x i8> %a, i8 zeroext %b) {
; CHECK-LABEL: test_vmulq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i8 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = mul <16 x i8> %.splat, %a
  ret <16 x i8> %0
}
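; The vmulq_n forms splat the scalar with insertelement/shufflevector in the IR;
; instruction selection folds the splat into the vector-by-scalar encoding
; (VMUL.<size> qd, qn, rm), so no separate VDUP is emitted.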
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) {
; CHECK-LABEL: test_vmulq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i16 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = mul <8 x i16> %.splat, %a
  ret <8 x i16> %0
}

define arm_aapcs_vfpcc <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) {
; CHECK-LABEL: test_vmulq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i32 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = mul <4 x i32> %.splat, %a
  ret <4 x i32> %0
}

define arm_aapcs_vfpcc <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
; CHECK-LABEL: test_vmulq_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmul.f32 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = fmul <4 x float> %.splat, %a
  ret <4 x float> %0
}

define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive)
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive)
  ret <4 x i32> %2
}
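; Predicated vmulq_m_n with a half scalar: the half value arrives packed in float
; %b.coerce, is narrowed back to half via bitcast and trunc, splatted, and then
; multiplied inside the VPT block with the vector-by-scalar VMULT.F16 after the
; scalar is moved from s8 into a core register.
; Roughly the C source this corresponds to (illustrative sketch only, assuming the
; ACLE MVE intrinsics from <arm_mve.h>):
;   float16x8_t g(float16x8_t inactive, float16x8_t a, float16_t b, mve_pred16_t p) {
;     return vmulq_m_n_f16(inactive, a, b, p);
;   }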
define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.f16 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %b.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = zext i16 %p to i32
  %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
  %4 = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> %inactive)
  ret <8 x half> %4
}

define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i8 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> undef)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i16 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef)
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i32 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> undef)
  ret <4 x i32> %2
}
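; vmulq_x_n with a float scalar: the "x" form passes undef as the inactive operand,
; so only the predicated instruction sequence matters; the scalar is moved from s4
; to r1 and used by the vector-by-scalar VMULT.F32 inside the VPT block.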
define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.f32 q0, q0, r1
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> undef)
  ret <4 x float> %2
}