; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

define arm_aapcs_vfpcc i32 @vqdmulh_i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l8 = icmp slt <16 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l9 = select <16 x i1> %l8, <16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %l9)
  ret i32 %l10
}

define arm_aapcs_vfpcc <16 x i8> @vqdmulh_i8_b(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l8 = icmp slt <16 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l9 = select <16 x i1> %l8, <16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l10 = trunc <16 x i32> %l9 to <16 x i8>
  ret <16 x i8> %l10
}

define arm_aapcs_vfpcc i32 @vqdmulh_i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l10 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l9)
  ret i32 %l10
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_b(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_i16_c:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmov.u16 r0, q0[0]
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.u16 r0, q2[1]
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    vmov.u16 r0, q2[2]
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov.u16 r0, q2[3]
; CHECK-NEXT:    vmov.32 q0[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[0]
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[1]
; CHECK-NEXT:    vmov.32 q3[1], r0
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.32 q3[2], r0
; CHECK-NEXT:    vmov.u16 r0, q1[3]
; CHECK-NEXT:    vmov.32 q3[3], r0
; CHECK-NEXT:    vmullb.s16 q0, q3, q0
; CHECK-NEXT:    vmov.i32 q3, #0x7fff
; CHECK-NEXT:    vshl.i32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #15
; CHECK-NEXT:    vmin.s32 q4, q0, q3
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.16 q0[1], r0
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.u16 r0, q2[4]
; CHECK-NEXT:    vmov.32 q4[0], r0
; CHECK-NEXT:    vmov.u16 r0, q2[5]
; CHECK-NEXT:    vmov.32 q4[1], r0
; CHECK-NEXT:    vmov.u16 r0, q2[6]
; CHECK-NEXT:    vmov.32 q4[2], r0
; CHECK-NEXT:    vmov.u16 r0, q2[7]
; CHECK-NEXT:    vmov.32 q4[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[4]
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[5]
; CHECK-NEXT:    vmov.32 q2[1], r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.32 q2[2], r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.32 q2[3], r0
; CHECK-NEXT:    vmullb.s16 q1, q2, q4
; CHECK-NEXT:    vshl.i32 q1, q1, #10
; CHECK-NEXT:    vshr.s32 q1, q1, #10
; CHECK-NEXT:    vshr.s32 q1, q1, #15
; CHECK-NEXT:    vmin.s32 q1, q1, q3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.16 q0[7], r0
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i22>
  %l5 = sext <8 x i16> %s1 to <8 x i22>
  %l6 = mul nsw <8 x i22> %l5, %l2
  %l7 = ashr <8 x i22> %l6, <i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15>
  %l8 = icmp slt <8 x i22> %l7, <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>
  %l9 = select <8 x i1> %l8, <8 x i22> %l7, <8 x i22> <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>
  %l10 = trunc <8 x i22> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

define arm_aapcs_vfpcc i64 @vqdmulh_i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l8 = icmp slt <4 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l9 = select <4 x i1> %l8, <4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l10 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %l9)
  ret i64 %l10
}

define arm_aapcs_vfpcc <4 x i32> @vqdmulh_i32_b(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l8 = icmp slt <4 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l9 = select <4 x i1> %l8, <4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l10 = trunc <4 x i64> %l9 to <4 x i32>
  ret <4 x i32> %l10
}



define void @vqdmulh_loop_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) local_unnamed_addr #0 {
; CHECK-LABEL: vqdmulh_loop_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load26 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load26 to <16 x i32>
  %6 = mul nsw <16 x i32> %5, %2
  %7 = ashr <16 x i32> %6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %8 = icmp slt <16 x i32> %7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %9 = select <16 x i1> %8, <16 x i32> %7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %10 = trunc <16 x i32> %9 to <16 x i8>
  %11 = getelementptr inbounds i8, i8* %z, i32 %index
  %12 = bitcast i8* %11 to <16 x i8>*
  store <16 x i8> %10, <16 x i8>* %12, align 1
  %index.next = add i32 %index, 16
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load30 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load30 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = ashr <8 x i32> %6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %8 = icmp slt <8 x i32> %7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %9 = select <8 x i1> %8, <8 x i32> %7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %10 = trunc <8 x i32> %9 to <8 x i16>
  %11 = getelementptr inbounds i16, i16* %z, i32 %index
  %12 = bitcast i16* %11 to <8 x i16>*
  store <8 x i16> %10, <8 x i16>* %12, align 2
  %index.next = add i32 %index, 8
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB9_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB9_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load30 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load30 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = ashr <4 x i64> %6, <i64 31, i64 31, i64 31, i64 31>
  %8 = icmp slt <4 x i64> %7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %9 = select <4 x i1> %8, <4 x i64> %7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %10 = trunc <4 x i64> %9 to <4 x i32>
  %11 = getelementptr inbounds i32, i32* %z, i32 %index
  %12 = bitcast i32* %11 to <4 x i32>*
  store <4 x i32> %10, <4 x i32>* %12, align 4
  %index.next = add i32 %index, 4
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)