1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW 7 8define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 9; SSE2-LABEL: _Z10test_shortPsS_i_128: 10; SSE2: # %bb.0: # %entry 11; SSE2-NEXT: movl %edx, %eax 12; SSE2-NEXT: pxor %xmm0, %xmm0 13; SSE2-NEXT: xorl %ecx, %ecx 14; SSE2-NEXT: .p2align 4, 0x90 15; SSE2-NEXT: .LBB0_1: # %vector.body 16; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 17; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 18; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 19; SSE2-NEXT: movdqa %xmm2, %xmm3 20; SSE2-NEXT: pmulhw %xmm1, %xmm3 21; SSE2-NEXT: pmullw %xmm1, %xmm2 22; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 23; SSE2-NEXT: paddd %xmm2, %xmm0 24; SSE2-NEXT: addq $8, %rcx 25; SSE2-NEXT: cmpq %rcx, %rax 26; SSE2-NEXT: jne .LBB0_1 27; SSE2-NEXT: # %bb.2: # %middle.block 28; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 29; SSE2-NEXT: paddd %xmm0, %xmm1 30; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 31; SSE2-NEXT: paddd %xmm1, %xmm0 32; SSE2-NEXT: movd %xmm0, %eax 33; SSE2-NEXT: retq 34; 35; AVX-LABEL: _Z10test_shortPsS_i_128: 36; AVX: # %bb.0: # %entry 37; AVX-NEXT: movl %edx, %eax 38; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 39; AVX-NEXT: xorl %ecx, %ecx 40; AVX-NEXT: .p2align 4, 0x90 41; AVX-NEXT: .LBB0_1: # %vector.body 42; AVX-NEXT: # =>This Inner Loop Header: Depth=1 43; AVX-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm1 44; AVX-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm2 45; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1 46; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 47; AVX-NEXT: addq $8, %rcx 48; AVX-NEXT: cmpq %rcx, %rax 49; AVX-NEXT: jne .LBB0_1 50; AVX-NEXT: # %bb.2: # %middle.block 51; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 52; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 53; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 54; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 55; AVX-NEXT: vmovd %xmm0, %eax 56; AVX-NEXT: retq 57entry: 58 %3 = zext i32 %2 to i64 59 br label %vector.body 60 61vector.body: 62 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 63 %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 64 %4 = getelementptr inbounds i16, i16* %0, i64 %index 65 %5 = bitcast i16* %4 to <4 x i16>* 66 %wide.load = load <4 x i16>, <4 x i16>* %5, align 2 67 %6 = sext <4 x i16> %wide.load to <4 x i32> 68 %7 = getelementptr inbounds i16, i16* %1, i64 %index 69 %8 = bitcast i16* %7 to <4 x i16>* 70 %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2 71 %9 = sext <4 x i16> %wide.load14 to <4 x i32> 72 %10 = mul nsw <4 x i32> %9, %6 73 %11 = add nsw <4 x i32> %10, %vec.phi 74 %index.next = add i64 %index, 8 75 %12 = icmp eq i64 %index.next, %3 76 br i1 %12, label %middle.block, label %vector.body 77 78middle.block: 79 %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 80 %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15 81 
%rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 82 %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17 83 %13 = extractelement <4 x i32> %bin.rdx18, i32 0 84 ret i32 %13 85} 86 87define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 88; SSE2-LABEL: _Z10test_shortPsS_i_256: 89; SSE2: # %bb.0: # %entry 90; SSE2-NEXT: movl %edx, %eax 91; SSE2-NEXT: pxor %xmm0, %xmm0 92; SSE2-NEXT: xorl %ecx, %ecx 93; SSE2-NEXT: pxor %xmm1, %xmm1 94; SSE2-NEXT: .p2align 4, 0x90 95; SSE2-NEXT: .LBB1_1: # %vector.body 96; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 97; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2 98; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3 99; SSE2-NEXT: pmaddwd %xmm2, %xmm3 100; SSE2-NEXT: paddd %xmm3, %xmm1 101; SSE2-NEXT: addq $8, %rcx 102; SSE2-NEXT: cmpq %rcx, %rax 103; SSE2-NEXT: jne .LBB1_1 104; SSE2-NEXT: # %bb.2: # %middle.block 105; SSE2-NEXT: paddd %xmm0, %xmm1 106; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 107; SSE2-NEXT: paddd %xmm1, %xmm0 108; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 109; SSE2-NEXT: paddd %xmm0, %xmm1 110; SSE2-NEXT: movd %xmm1, %eax 111; SSE2-NEXT: retq 112; 113; AVX1-LABEL: _Z10test_shortPsS_i_256: 114; AVX1: # %bb.0: # %entry 115; AVX1-NEXT: movl %edx, %eax 116; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 117; AVX1-NEXT: xorl %ecx, %ecx 118; AVX1-NEXT: .p2align 4, 0x90 119; AVX1-NEXT: .LBB1_1: # %vector.body 120; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 121; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 122; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 123; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 124; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 125; AVX1-NEXT: addq $8, %rcx 126; AVX1-NEXT: cmpq %rcx, %rax 127; AVX1-NEXT: jne .LBB1_1 128; AVX1-NEXT: # %bb.2: # %middle.block 129; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 130; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 131; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 132; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 133; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 134; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 135; AVX1-NEXT: vmovd %xmm0, %eax 136; AVX1-NEXT: vzeroupper 137; AVX1-NEXT: retq 138; 139; AVX256-LABEL: _Z10test_shortPsS_i_256: 140; AVX256: # %bb.0: # %entry 141; AVX256-NEXT: movl %edx, %eax 142; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 143; AVX256-NEXT: xorl %ecx, %ecx 144; AVX256-NEXT: .p2align 4, 0x90 145; AVX256-NEXT: .LBB1_1: # %vector.body 146; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 147; AVX256-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 148; AVX256-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 149; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 150; AVX256-NEXT: addq $8, %rcx 151; AVX256-NEXT: cmpq %rcx, %rax 152; AVX256-NEXT: jne .LBB1_1 153; AVX256-NEXT: # %bb.2: # %middle.block 154; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 155; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 156; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 157; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 158; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 159; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 160; AVX256-NEXT: vmovd %xmm0, %eax 161; AVX256-NEXT: vzeroupper 162; AVX256-NEXT: retq 163entry: 164 %3 = zext i32 %2 to i64 165 br label %vector.body 166 167vector.body: 168 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 169 %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 170 %4 = getelementptr inbounds i16, i16* %0, i64 %index 171 %5 = bitcast i16* %4 to 
<8 x i16>* 172 %wide.load = load <8 x i16>, <8 x i16>* %5, align 2 173 %6 = sext <8 x i16> %wide.load to <8 x i32> 174 %7 = getelementptr inbounds i16, i16* %1, i64 %index 175 %8 = bitcast i16* %7 to <8 x i16>* 176 %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2 177 %9 = sext <8 x i16> %wide.load14 to <8 x i32> 178 %10 = mul nsw <8 x i32> %9, %6 179 %11 = add nsw <8 x i32> %10, %vec.phi 180 %index.next = add i64 %index, 8 181 %12 = icmp eq i64 %index.next, %3 182 br i1 %12, label %middle.block, label %vector.body 183 184middle.block: 185 %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 186 %bin.rdx = add <8 x i32> %11, %rdx.shuf 187 %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 188 %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15 189 %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 190 %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17 191 %13 = extractelement <8 x i32> %bin.rdx18, i32 0 192 ret i32 %13 193} 194 195define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 196; SSE2-LABEL: _Z10test_shortPsS_i_512: 197; SSE2: # %bb.0: # %entry 198; SSE2-NEXT: movl %edx, %eax 199; SSE2-NEXT: pxor %xmm0, %xmm0 200; SSE2-NEXT: xorl %ecx, %ecx 201; SSE2-NEXT: pxor %xmm2, %xmm2 202; SSE2-NEXT: pxor %xmm1, %xmm1 203; SSE2-NEXT: .p2align 4, 0x90 204; SSE2-NEXT: .LBB2_1: # %vector.body 205; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 206; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm3 207; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm4 208; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5 209; SSE2-NEXT: pmaddwd %xmm3, %xmm5 210; SSE2-NEXT: paddd %xmm5, %xmm2 211; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3 212; SSE2-NEXT: pmaddwd %xmm4, %xmm3 213; SSE2-NEXT: paddd %xmm3, %xmm1 214; SSE2-NEXT: addq $16, %rcx 215; SSE2-NEXT: cmpq %rcx, %rax 216; SSE2-NEXT: jne .LBB2_1 217; SSE2-NEXT: # %bb.2: # %middle.block 218; SSE2-NEXT: paddd %xmm0, %xmm2 219; SSE2-NEXT: paddd %xmm0, %xmm1 220; SSE2-NEXT: paddd %xmm2, %xmm1 221; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 222; SSE2-NEXT: paddd %xmm1, %xmm0 223; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 224; SSE2-NEXT: paddd %xmm0, %xmm1 225; SSE2-NEXT: movd %xmm1, %eax 226; SSE2-NEXT: retq 227; 228; AVX1-LABEL: _Z10test_shortPsS_i_512: 229; AVX1: # %bb.0: # %entry 230; AVX1-NEXT: movl %edx, %eax 231; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 232; AVX1-NEXT: xorl %ecx, %ecx 233; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 234; AVX1-NEXT: .p2align 4, 0x90 235; AVX1-NEXT: .LBB2_1: # %vector.body 236; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 237; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2 238; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3 239; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 240; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3 241; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 242; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 243; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 244; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 245; AVX1-NEXT: addq $16, %rcx 246; AVX1-NEXT: cmpq %rcx, %rax 247; AVX1-NEXT: jne .LBB2_1 248; AVX1-NEXT: # %bb.2: # %middle.block 249; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 250; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 251; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 252; AVX1-NEXT: vpaddd %xmm2, %xmm0, 
%xmm0 253; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 254; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 255; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 256; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 257; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 258; AVX1-NEXT: vmovd %xmm0, %eax 259; AVX1-NEXT: vzeroupper 260; AVX1-NEXT: retq 261; 262; AVX2-LABEL: _Z10test_shortPsS_i_512: 263; AVX2: # %bb.0: # %entry 264; AVX2-NEXT: movl %edx, %eax 265; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 266; AVX2-NEXT: xorl %ecx, %ecx 267; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 268; AVX2-NEXT: .p2align 4, 0x90 269; AVX2-NEXT: .LBB2_1: # %vector.body 270; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 271; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2 272; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2 273; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 274; AVX2-NEXT: addq $16, %rcx 275; AVX2-NEXT: cmpq %rcx, %rax 276; AVX2-NEXT: jne .LBB2_1 277; AVX2-NEXT: # %bb.2: # %middle.block 278; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 279; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 280; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 281; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 282; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 283; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 284; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 285; AVX2-NEXT: vmovd %xmm0, %eax 286; AVX2-NEXT: vzeroupper 287; AVX2-NEXT: retq 288; 289; AVX512-LABEL: _Z10test_shortPsS_i_512: 290; AVX512: # %bb.0: # %entry 291; AVX512-NEXT: movl %edx, %eax 292; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 293; AVX512-NEXT: xorl %ecx, %ecx 294; AVX512-NEXT: .p2align 4, 0x90 295; AVX512-NEXT: .LBB2_1: # %vector.body 296; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 297; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %ymm1 298; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm1, %ymm1 299; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 300; AVX512-NEXT: addq $16, %rcx 301; AVX512-NEXT: cmpq %rcx, %rax 302; AVX512-NEXT: jne .LBB2_1 303; AVX512-NEXT: # %bb.2: # %middle.block 304; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 305; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 306; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 307; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 308; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 309; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 310; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 311; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 312; AVX512-NEXT: vmovd %xmm0, %eax 313; AVX512-NEXT: vzeroupper 314; AVX512-NEXT: retq 315entry: 316 %3 = zext i32 %2 to i64 317 br label %vector.body 318 319vector.body: 320 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 321 %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 322 %4 = getelementptr inbounds i16, i16* %0, i64 %index 323 %5 = bitcast i16* %4 to <16 x i16>* 324 %wide.load = load <16 x i16>, <16 x i16>* %5, align 2 325 %6 = sext <16 x i16> %wide.load to <16 x i32> 326 %7 = getelementptr inbounds i16, i16* %1, i64 %index 327 %8 = bitcast i16* %7 to <16 x i16>* 328 %wide.load14 = load <16 x i16>, <16 x i16>* %8, align 2 329 %9 = sext <16 x i16> %wide.load14 to <16 x i32> 330 %10 = mul nsw <16 x i32> %9, %6 331 %11 = add nsw <16 x i32> %10, %vec.phi 332 %index.next = add i64 %index, 16 333 %12 = icmp eq i64 %index.next, %3 334 br i1 %12, label %middle.block, label %vector.body 335 336middle.block: 337 %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 338 
%bin.rdx1 = add <16 x i32> %11, %rdx.shuf1 339 %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 340 %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf 341 %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 342 %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15 343 %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 344 %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17 345 %13 = extractelement <16 x i32> %bin.rdx18, i32 0 346 ret i32 %13 347} 348 349define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 350; SSE2-LABEL: _Z10test_shortPsS_i_1024: 351; SSE2: # %bb.0: # %entry 352; SSE2-NEXT: movl %edx, %eax 353; SSE2-NEXT: pxor %xmm8, %xmm8 354; SSE2-NEXT: xorl %ecx, %ecx 355; SSE2-NEXT: pxor %xmm2, %xmm2 356; SSE2-NEXT: pxor %xmm4, %xmm4 357; SSE2-NEXT: pxor %xmm1, %xmm1 358; SSE2-NEXT: pxor %xmm3, %xmm3 359; SSE2-NEXT: .p2align 4, 0x90 360; SSE2-NEXT: .LBB3_1: # %vector.body 361; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 362; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm5 363; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6 364; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7 365; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm9 366; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm0 367; SSE2-NEXT: pmaddwd %xmm5, %xmm0 368; SSE2-NEXT: paddd %xmm0, %xmm2 369; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm0 370; SSE2-NEXT: pmaddwd %xmm6, %xmm0 371; SSE2-NEXT: paddd %xmm0, %xmm4 372; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm0 373; SSE2-NEXT: pmaddwd %xmm7, %xmm0 374; SSE2-NEXT: paddd %xmm0, %xmm1 375; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm0 376; SSE2-NEXT: pmaddwd %xmm9, %xmm0 377; SSE2-NEXT: paddd %xmm0, %xmm3 378; SSE2-NEXT: addq $16, %rcx 379; SSE2-NEXT: cmpq %rcx, %rax 380; SSE2-NEXT: jne .LBB3_1 381; SSE2-NEXT: # %bb.2: # %middle.block 382; SSE2-NEXT: paddd %xmm8, %xmm4 383; SSE2-NEXT: paddd %xmm8, %xmm3 384; SSE2-NEXT: paddd %xmm4, %xmm3 385; SSE2-NEXT: paddd %xmm8, %xmm2 386; SSE2-NEXT: paddd %xmm8, %xmm1 387; SSE2-NEXT: paddd %xmm3, %xmm1 388; SSE2-NEXT: paddd %xmm2, %xmm1 389; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 390; SSE2-NEXT: paddd %xmm1, %xmm0 391; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 392; SSE2-NEXT: paddd %xmm0, %xmm1 393; SSE2-NEXT: movd %xmm1, %eax 394; SSE2-NEXT: retq 395; 396; AVX1-LABEL: _Z10test_shortPsS_i_1024: 397; AVX1: # %bb.0: # %entry 398; AVX1-NEXT: movl %edx, %eax 399; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 400; AVX1-NEXT: xorl %ecx, %ecx 401; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 402; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 403; AVX1-NEXT: .p2align 4, 0x90 404; AVX1-NEXT: .LBB3_1: # %vector.body 405; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 406; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm3 407; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4 408; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5 409; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6 410; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3 411; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 412; AVX1-NEXT: vpmaddwd 
32(%rdi,%rcx,2), %xmm5, %xmm5 413; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6 414; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 415; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 416; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 417; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 418; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 419; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 420; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 421; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 422; AVX1-NEXT: addq $16, %rcx 423; AVX1-NEXT: cmpq %rcx, %rax 424; AVX1-NEXT: jne .LBB3_1 425; AVX1-NEXT: # %bb.2: # %middle.block 426; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3 427; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 428; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 429; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 430; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 431; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 432; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 433; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 434; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 435; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 436; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 437; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 438; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 439; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 440; AVX1-NEXT: vmovd %xmm0, %eax 441; AVX1-NEXT: vzeroupper 442; AVX1-NEXT: retq 443; 444; AVX2-LABEL: _Z10test_shortPsS_i_1024: 445; AVX2: # %bb.0: # %entry 446; AVX2-NEXT: movl %edx, %eax 447; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 448; AVX2-NEXT: xorl %ecx, %ecx 449; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 450; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 451; AVX2-NEXT: .p2align 4, 0x90 452; AVX2-NEXT: .LBB3_1: # %vector.body 453; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 454; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3 455; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4 456; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3 457; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 458; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3 459; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 460; AVX2-NEXT: addq $16, %rcx 461; AVX2-NEXT: cmpq %rcx, %rax 462; AVX2-NEXT: jne .LBB3_1 463; AVX2-NEXT: # %bb.2: # %middle.block 464; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1 465; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 466; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 467; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 468; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 469; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 470; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 471; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 472; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 473; AVX2-NEXT: vmovd %xmm0, %eax 474; AVX2-NEXT: vzeroupper 475; AVX2-NEXT: retq 476; 477; AVX512F-LABEL: _Z10test_shortPsS_i_1024: 478; AVX512F: # %bb.0: # %entry 479; AVX512F-NEXT: movl %edx, %eax 480; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 481; AVX512F-NEXT: xorl %ecx, %ecx 482; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 483; AVX512F-NEXT: .p2align 4, 0x90 484; AVX512F-NEXT: .LBB3_1: # %vector.body 485; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 486; AVX512F-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2 487; AVX512F-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm3 488; AVX512F-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm3, %ymm3 489; AVX512F-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2 490; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 491; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1 492; AVX512F-NEXT: addq $16, %rcx 493; AVX512F-NEXT: cmpq %rcx, %rax 494; AVX512F-NEXT: jne .LBB3_1 495; AVX512F-NEXT: # %bb.2: # %middle.block 496; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0 497; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 498; AVX512F-NEXT: 
vpaddd %zmm1, %zmm0, %zmm0 499; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 500; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 501; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 502; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 503; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 504; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 505; AVX512F-NEXT: vmovd %xmm0, %eax 506; AVX512F-NEXT: vzeroupper 507; AVX512F-NEXT: retq 508; 509; AVX512BW-LABEL: _Z10test_shortPsS_i_1024: 510; AVX512BW: # %bb.0: # %entry 511; AVX512BW-NEXT: movl %edx, %eax 512; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 513; AVX512BW-NEXT: xorl %ecx, %ecx 514; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 515; AVX512BW-NEXT: .p2align 4, 0x90 516; AVX512BW-NEXT: .LBB3_1: # %vector.body 517; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 518; AVX512BW-NEXT: vmovdqu64 (%rsi,%rcx,2), %zmm2 519; AVX512BW-NEXT: vpmaddwd (%rdi,%rcx,2), %zmm2, %zmm2 520; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 521; AVX512BW-NEXT: addq $16, %rcx 522; AVX512BW-NEXT: cmpq %rcx, %rax 523; AVX512BW-NEXT: jne .LBB3_1 524; AVX512BW-NEXT: # %bb.2: # %middle.block 525; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 526; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 527; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 528; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 529; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 530; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 531; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 532; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 533; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 534; AVX512BW-NEXT: vmovd %xmm0, %eax 535; AVX512BW-NEXT: vzeroupper 536; AVX512BW-NEXT: retq 537entry: 538 %3 = zext i32 %2 to i64 539 br label %vector.body 540 541vector.body: 542 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 543 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 544 %4 = getelementptr inbounds i16, i16* %0, i64 %index 545 %5 = bitcast i16* %4 to <32 x i16>* 546 %wide.load = load <32 x i16>, <32 x i16>* %5, align 2 547 %6 = sext <32 x i16> %wide.load to <32 x i32> 548 %7 = getelementptr inbounds i16, i16* %1, i64 %index 549 %8 = bitcast i16* %7 to <32 x i16>* 550 %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2 551 %9 = sext <32 x i16> %wide.load14 to <32 x i32> 552 %10 = mul nsw <32 x i32> %9, %6 553 %11 = add nsw <32 x i32> %10, %vec.phi 554 %index.next = add i64 %index, 16 555 %12 = icmp eq i64 %index.next, %3 556 br i1 %12, label %middle.block, label %vector.body 557 558middle.block: 559 %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 560 %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2 561 %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 562 %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1 563 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 564 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf 565 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 566 %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15 567 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 568 %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17 569 %13 = extractelement <32 x i32> %bin.rdx18, i32 0 570 ret i32 %13 571} 572 573define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 { 574; SSE2-LABEL: _Z9test_charPcS_i_128: 575; SSE2: # %bb.0: # %entry 576; SSE2-NEXT: movl %edx, %eax 577; SSE2-NEXT: pxor %xmm0, %xmm0 578; SSE2-NEXT: xorl %ecx, %ecx 579; SSE2-NEXT: .p2align 4, 0x90 580; SSE2-NEXT: .LBB4_1: # %vector.body 581; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 582; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 583; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 584; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 585; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 586; SSE2-NEXT: psraw $8, %xmm1 587; SSE2-NEXT: psraw $8, %xmm2 588; SSE2-NEXT: pmullw %xmm1, %xmm2 589; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 590; SSE2-NEXT: psrad $16, %xmm1 591; SSE2-NEXT: paddd %xmm1, %xmm0 592; SSE2-NEXT: addq $16, %rcx 593; SSE2-NEXT: cmpq %rcx, %rax 594; SSE2-NEXT: jne .LBB4_1 595; SSE2-NEXT: # %bb.2: # %middle.block 596; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 597; SSE2-NEXT: paddd %xmm0, %xmm1 598; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 599; SSE2-NEXT: paddd %xmm1, %xmm0 600; SSE2-NEXT: movd %xmm0, %eax 601; SSE2-NEXT: retq 602; 603; AVX-LABEL: _Z9test_charPcS_i_128: 604; AVX: # %bb.0: # %entry 605; AVX-NEXT: movl %edx, %eax 606; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 607; AVX-NEXT: xorl %ecx, %ecx 608; AVX-NEXT: .p2align 4, 0x90 609; AVX-NEXT: .LBB4_1: # %vector.body 610; AVX-NEXT: # =>This Inner Loop Header: Depth=1 611; AVX-NEXT: vpmovsxbd (%rdi,%rcx), %xmm1 612; AVX-NEXT: vpmovsxbd (%rsi,%rcx), %xmm2 613; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1 614; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 615; AVX-NEXT: addq $16, %rcx 616; AVX-NEXT: cmpq %rcx, %rax 617; AVX-NEXT: jne .LBB4_1 618; AVX-NEXT: # %bb.2: # %middle.block 619; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 620; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 621; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 622; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 623; AVX-NEXT: vmovd %xmm0, %eax 624; AVX-NEXT: retq 625entry: 
626 %3 = zext i32 %2 to i64 627 br label %vector.body 628 629vector.body: 630 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 631 %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 632 %4 = getelementptr inbounds i8, i8* %0, i64 %index 633 %5 = bitcast i8* %4 to <4 x i8>* 634 %wide.load = load <4 x i8>, <4 x i8>* %5, align 1 635 %6 = sext <4 x i8> %wide.load to <4 x i32> 636 %7 = getelementptr inbounds i8, i8* %1, i64 %index 637 %8 = bitcast i8* %7 to <4 x i8>* 638 %wide.load14 = load <4 x i8>, <4 x i8>* %8, align 1 639 %9 = sext <4 x i8> %wide.load14 to <4 x i32> 640 %10 = mul nsw <4 x i32> %9, %6 641 %11 = add nsw <4 x i32> %10, %vec.phi 642 %index.next = add i64 %index, 16 643 %12 = icmp eq i64 %index.next, %3 644 br i1 %12, label %middle.block, label %vector.body 645 646middle.block: 647 %rdx.shuf17 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 648 %bin.rdx18 = add <4 x i32> %11, %rdx.shuf17 649 %rdx.shuf19 = shufflevector <4 x i32> %bin.rdx18, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 650 %bin.rdx20 = add <4 x i32> %bin.rdx18, %rdx.shuf19 651 %13 = extractelement <4 x i32> %bin.rdx20, i32 0 652 ret i32 %13 653} 654 655define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 { 656; SSE2-LABEL: _Z9test_charPcS_i_256: 657; SSE2: # %bb.0: # %entry 658; SSE2-NEXT: movl %edx, %eax 659; SSE2-NEXT: pxor %xmm0, %xmm0 660; SSE2-NEXT: xorl %ecx, %ecx 661; SSE2-NEXT: pxor %xmm1, %xmm1 662; SSE2-NEXT: .p2align 4, 0x90 663; SSE2-NEXT: .LBB5_1: # %vector.body 664; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 665; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 666; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 667; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero 668; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 669; SSE2-NEXT: psraw $8, %xmm2 670; SSE2-NEXT: psraw $8, %xmm3 671; SSE2-NEXT: pmaddwd %xmm2, %xmm3 672; SSE2-NEXT: paddd %xmm3, %xmm1 673; SSE2-NEXT: addq $16, %rcx 674; SSE2-NEXT: cmpq %rcx, %rax 675; SSE2-NEXT: jne .LBB5_1 676; SSE2-NEXT: # %bb.2: # %middle.block 677; SSE2-NEXT: paddd %xmm0, %xmm1 678; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 679; SSE2-NEXT: paddd %xmm1, %xmm0 680; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 681; SSE2-NEXT: paddd %xmm0, %xmm1 682; SSE2-NEXT: movd %xmm1, %eax 683; SSE2-NEXT: retq 684; 685; AVX1-LABEL: _Z9test_charPcS_i_256: 686; AVX1: # %bb.0: # %entry 687; AVX1-NEXT: movl %edx, %eax 688; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 689; AVX1-NEXT: xorl %ecx, %ecx 690; AVX1-NEXT: .p2align 4, 0x90 691; AVX1-NEXT: .LBB5_1: # %vector.body 692; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 693; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1 694; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2 695; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 696; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 697; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 698; AVX1-NEXT: addq $16, %rcx 699; AVX1-NEXT: cmpq %rcx, %rax 700; AVX1-NEXT: jne .LBB5_1 701; AVX1-NEXT: # %bb.2: # %middle.block 702; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 703; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 704; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 705; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 706; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 707; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 708; AVX1-NEXT: vmovd %xmm0, %eax 709; AVX1-NEXT: vzeroupper 710; AVX1-NEXT: retq 711; 712; 
AVX256-LABEL: _Z9test_charPcS_i_256: 713; AVX256: # %bb.0: # %entry 714; AVX256-NEXT: movl %edx, %eax 715; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 716; AVX256-NEXT: xorl %ecx, %ecx 717; AVX256-NEXT: .p2align 4, 0x90 718; AVX256-NEXT: .LBB5_1: # %vector.body 719; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 720; AVX256-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1 721; AVX256-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2 722; AVX256-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 723; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 724; AVX256-NEXT: addq $16, %rcx 725; AVX256-NEXT: cmpq %rcx, %rax 726; AVX256-NEXT: jne .LBB5_1 727; AVX256-NEXT: # %bb.2: # %middle.block 728; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 729; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 730; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 731; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 732; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 733; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 734; AVX256-NEXT: vmovd %xmm0, %eax 735; AVX256-NEXT: vzeroupper 736; AVX256-NEXT: retq 737entry: 738 %3 = zext i32 %2 to i64 739 br label %vector.body 740 741vector.body: 742 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 743 %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 744 %4 = getelementptr inbounds i8, i8* %0, i64 %index 745 %5 = bitcast i8* %4 to <8 x i8>* 746 %wide.load = load <8 x i8>, <8 x i8>* %5, align 1 747 %6 = sext <8 x i8> %wide.load to <8 x i32> 748 %7 = getelementptr inbounds i8, i8* %1, i64 %index 749 %8 = bitcast i8* %7 to <8 x i8>* 750 %wide.load14 = load <8 x i8>, <8 x i8>* %8, align 1 751 %9 = sext <8 x i8> %wide.load14 to <8 x i32> 752 %10 = mul nsw <8 x i32> %9, %6 753 %11 = add nsw <8 x i32> %10, %vec.phi 754 %index.next = add i64 %index, 16 755 %12 = icmp eq i64 %index.next, %3 756 br i1 %12, label %middle.block, label %vector.body 757 758middle.block: 759 %rdx.shuf15 = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 760 %bin.rdx16 = add <8 x i32> %11, %rdx.shuf15 761 %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 762 %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17 763 %rdx.shuf19 = shufflevector <8 x i32> %bin.rdx18, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 764 %bin.rdx20 = add <8 x i32> %bin.rdx18, %rdx.shuf19 765 %13 = extractelement <8 x i32> %bin.rdx20, i32 0 766 ret i32 %13 767} 768 769define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 { 770; SSE2-LABEL: _Z9test_charPcS_i_512: 771; SSE2: # %bb.0: # %entry 772; SSE2-NEXT: movl %edx, %eax 773; SSE2-NEXT: pxor %xmm0, %xmm0 774; SSE2-NEXT: xorl %ecx, %ecx 775; SSE2-NEXT: pxor %xmm2, %xmm2 776; SSE2-NEXT: pxor %xmm1, %xmm1 777; SSE2-NEXT: .p2align 4, 0x90 778; SSE2-NEXT: .LBB6_1: # %vector.body 779; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 780; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm3 781; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm4 782; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 783; SSE2-NEXT: psraw $8, %xmm5 784; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] 785; SSE2-NEXT: psraw $8, %xmm6 786; 
SSE2-NEXT: pmaddwd %xmm5, %xmm6 787; SSE2-NEXT: paddd %xmm6, %xmm2 788; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 789; SSE2-NEXT: psraw $8, %xmm3 790; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 791; SSE2-NEXT: psraw $8, %xmm4 792; SSE2-NEXT: pmaddwd %xmm3, %xmm4 793; SSE2-NEXT: paddd %xmm4, %xmm1 794; SSE2-NEXT: addq $16, %rcx 795; SSE2-NEXT: cmpq %rcx, %rax 796; SSE2-NEXT: jne .LBB6_1 797; SSE2-NEXT: # %bb.2: # %middle.block 798; SSE2-NEXT: paddd %xmm0, %xmm2 799; SSE2-NEXT: paddd %xmm0, %xmm1 800; SSE2-NEXT: paddd %xmm2, %xmm1 801; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 802; SSE2-NEXT: paddd %xmm1, %xmm0 803; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 804; SSE2-NEXT: paddd %xmm0, %xmm1 805; SSE2-NEXT: movd %xmm1, %eax 806; SSE2-NEXT: retq 807; 808; AVX1-LABEL: _Z9test_charPcS_i_512: 809; AVX1: # %bb.0: # %entry 810; AVX1-NEXT: movl %edx, %eax 811; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 812; AVX1-NEXT: xorl %ecx, %ecx 813; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 814; AVX1-NEXT: .p2align 4, 0x90 815; AVX1-NEXT: .LBB6_1: # %vector.body 816; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 817; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2 818; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3 819; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4 820; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 821; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4 822; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 823; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 824; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 825; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 826; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 827; AVX1-NEXT: addq $16, %rcx 828; AVX1-NEXT: cmpq %rcx, %rax 829; AVX1-NEXT: jne .LBB6_1 830; AVX1-NEXT: # %bb.2: # %middle.block 831; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 832; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 833; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 834; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 835; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 836; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 837; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 838; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 839; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 840; AVX1-NEXT: vmovd %xmm0, %eax 841; AVX1-NEXT: vzeroupper 842; AVX1-NEXT: retq 843; 844; AVX2-LABEL: _Z9test_charPcS_i_512: 845; AVX2: # %bb.0: # %entry 846; AVX2-NEXT: movl %edx, %eax 847; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 848; AVX2-NEXT: xorl %ecx, %ecx 849; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 850; AVX2-NEXT: .p2align 4, 0x90 851; AVX2-NEXT: .LBB6_1: # %vector.body 852; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 853; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2 854; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 855; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 856; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 857; AVX2-NEXT: addq $16, %rcx 858; AVX2-NEXT: cmpq %rcx, %rax 859; AVX2-NEXT: jne .LBB6_1 860; AVX2-NEXT: # %bb.2: # %middle.block 861; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 862; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 863; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 864; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 865; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 866; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 867; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 868; AVX2-NEXT: vmovd %xmm0, %eax 869; AVX2-NEXT: vzeroupper 870; AVX2-NEXT: retq 871; 872; AVX512-LABEL: _Z9test_charPcS_i_512: 873; AVX512: # %bb.0: # %entry 874; AVX512-NEXT: movl %edx, %eax 875; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 876; AVX512-NEXT: xorl %ecx, %ecx 877; 
AVX512-NEXT: .p2align 4, 0x90 878; AVX512-NEXT: .LBB6_1: # %vector.body 879; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 880; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm1 881; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm2 882; AVX512-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 883; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 884; AVX512-NEXT: addq $16, %rcx 885; AVX512-NEXT: cmpq %rcx, %rax 886; AVX512-NEXT: jne .LBB6_1 887; AVX512-NEXT: # %bb.2: # %middle.block 888; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 889; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 890; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 891; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 892; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 893; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 894; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 895; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 896; AVX512-NEXT: vmovd %xmm0, %eax 897; AVX512-NEXT: vzeroupper 898; AVX512-NEXT: retq 899entry: 900 %3 = zext i32 %2 to i64 901 br label %vector.body 902 903vector.body: 904 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 905 %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 906 %4 = getelementptr inbounds i8, i8* %0, i64 %index 907 %5 = bitcast i8* %4 to <16 x i8>* 908 %wide.load = load <16 x i8>, <16 x i8>* %5, align 1 909 %6 = sext <16 x i8> %wide.load to <16 x i32> 910 %7 = getelementptr inbounds i8, i8* %1, i64 %index 911 %8 = bitcast i8* %7 to <16 x i8>* 912 %wide.load14 = load <16 x i8>, <16 x i8>* %8, align 1 913 %9 = sext <16 x i8> %wide.load14 to <16 x i32> 914 %10 = mul nsw <16 x i32> %9, %6 915 %11 = add nsw <16 x i32> %10, %vec.phi 916 %index.next = add i64 %index, 16 917 %12 = icmp eq i64 %index.next, %3 918 br i1 %12, label %middle.block, label %vector.body 919 920middle.block: 921 %rdx.shuf = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 922 %bin.rdx = add <16 x i32> %11, %rdx.shuf 923 %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 924 %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15 925 %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 926 %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17 927 %rdx.shuf19 = shufflevector <16 x i32> %bin.rdx18, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 928 %bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19 929 %13 = extractelement <16 x i32> %bin.rdx20, i32 0 930 ret i32 %13 931} 932 933define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 { 934; SSE2-LABEL: _Z9test_charPcS_i_1024: 935; SSE2: # %bb.0: # %entry 936; SSE2-NEXT: movl %edx, %eax 937; SSE2-NEXT: pxor %xmm8, %xmm8 938; SSE2-NEXT: xorl %ecx, %ecx 939; SSE2-NEXT: pxor %xmm2, %xmm2 940; SSE2-NEXT: pxor %xmm4, %xmm4 941; SSE2-NEXT: pxor %xmm1, %xmm1 942; SSE2-NEXT: pxor %xmm3, %xmm3 943; SSE2-NEXT: .p2align 4, 0x90 944; SSE2-NEXT: .LBB7_1: # %vector.body 
945; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 946; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7 947; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10 948; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm0 949; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9 950; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] 951; SSE2-NEXT: psraw $8, %xmm5 952; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 953; SSE2-NEXT: psraw $8, %xmm6 954; SSE2-NEXT: pmaddwd %xmm5, %xmm6 955; SSE2-NEXT: paddd %xmm6, %xmm2 956; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] 957; SSE2-NEXT: psraw $8, %xmm5 958; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 959; SSE2-NEXT: psraw $8, %xmm0 960; SSE2-NEXT: pmaddwd %xmm5, %xmm0 961; SSE2-NEXT: paddd %xmm0, %xmm4 962; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 963; SSE2-NEXT: psraw $8, %xmm0 964; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 965; SSE2-NEXT: psraw $8, %xmm5 966; SSE2-NEXT: pmaddwd %xmm0, %xmm5 967; SSE2-NEXT: paddd %xmm5, %xmm1 968; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] 969; SSE2-NEXT: psraw $8, %xmm0 970; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] 971; SSE2-NEXT: psraw $8, %xmm5 972; SSE2-NEXT: pmaddwd %xmm0, %xmm5 973; SSE2-NEXT: paddd %xmm5, %xmm3 974; SSE2-NEXT: addq $32, %rcx 975; SSE2-NEXT: cmpq %rcx, %rax 976; SSE2-NEXT: jne .LBB7_1 977; SSE2-NEXT: # %bb.2: # %middle.block 978; SSE2-NEXT: paddd %xmm8, %xmm4 979; SSE2-NEXT: paddd %xmm8, %xmm3 980; SSE2-NEXT: paddd %xmm4, %xmm3 981; SSE2-NEXT: paddd %xmm8, %xmm2 982; SSE2-NEXT: paddd %xmm8, %xmm1 983; SSE2-NEXT: paddd %xmm3, %xmm1 984; SSE2-NEXT: paddd %xmm2, %xmm1 985; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 986; SSE2-NEXT: paddd %xmm1, %xmm0 987; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 988; SSE2-NEXT: paddd %xmm0, %xmm1 989; SSE2-NEXT: movd %xmm1, %eax 990; SSE2-NEXT: retq 991; 992; AVX1-LABEL: _Z9test_charPcS_i_1024: 993; AVX1: # %bb.0: # %entry 994; AVX1-NEXT: movl %edx, %eax 995; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 996; AVX1-NEXT: xorl %ecx, %ecx 997; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 998; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 999; AVX1-NEXT: .p2align 4, 0x90 1000; AVX1-NEXT: .LBB7_1: # %vector.body 1001; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 1002; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3 1003; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4 1004; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 1005; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6 1006; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 1007; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 1008; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 1009; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 1010; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 1011; AVX1-NEXT: vpmaddwd 
%xmm5, %xmm7, %xmm5 1012; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 1013; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 1014; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 1015; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 1016; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 1017; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1018; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1019; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3 1020; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm1 1021; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1022; AVX1-NEXT: addq $32, %rcx 1023; AVX1-NEXT: cmpq %rcx, %rax 1024; AVX1-NEXT: jne .LBB7_1 1025; AVX1-NEXT: # %bb.2: # %middle.block 1026; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3 1027; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1028; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 1029; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1030; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 1031; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 1032; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 1033; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 1034; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 1035; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1036; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1037; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1038; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1039; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1040; AVX1-NEXT: vmovd %xmm0, %eax 1041; AVX1-NEXT: vzeroupper 1042; AVX1-NEXT: retq 1043; 1044; AVX2-LABEL: _Z9test_charPcS_i_1024: 1045; AVX2: # %bb.0: # %entry 1046; AVX2-NEXT: movl %edx, %eax 1047; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 1048; AVX2-NEXT: xorl %ecx, %ecx 1049; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1050; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1051; AVX2-NEXT: .p2align 4, 0x90 1052; AVX2-NEXT: .LBB7_1: # %vector.body 1053; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 1054; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 1055; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 1056; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 1057; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 1058; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 1059; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 1060; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 1061; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 1062; AVX2-NEXT: addq $32, %rcx 1063; AVX2-NEXT: cmpq %rcx, %rax 1064; AVX2-NEXT: jne .LBB7_1 1065; AVX2-NEXT: # %bb.2: # %middle.block 1066; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1 1067; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 1068; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 1069; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1070; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1071; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1072; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1073; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1074; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1075; AVX2-NEXT: vmovd %xmm0, %eax 1076; AVX2-NEXT: vzeroupper 1077; AVX2-NEXT: retq 1078; 1079; AVX512F-LABEL: _Z9test_charPcS_i_1024: 1080; AVX512F: # %bb.0: # %entry 1081; AVX512F-NEXT: movl %edx, %eax 1082; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 1083; AVX512F-NEXT: xorl %ecx, %ecx 1084; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1085; AVX512F-NEXT: .p2align 4, 0x90 1086; AVX512F-NEXT: .LBB7_1: # %vector.body 1087; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 1088; AVX512F-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2 1089; AVX512F-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 1090; AVX512F-NEXT: vpmovsxbw (%rsi,%rcx), %ymm4 1091; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 1092; AVX512F-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4 1093; AVX512F-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 1094; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 1095; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1 
1096; AVX512F-NEXT: addq $32, %rcx 1097; AVX512F-NEXT: cmpq %rcx, %rax 1098; AVX512F-NEXT: jne .LBB7_1 1099; AVX512F-NEXT: # %bb.2: # %middle.block 1100; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0 1101; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1102; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1103; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 1104; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1105; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1106; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1107; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1108; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1109; AVX512F-NEXT: vmovd %xmm0, %eax 1110; AVX512F-NEXT: vzeroupper 1111; AVX512F-NEXT: retq 1112; 1113; AVX512BW-LABEL: _Z9test_charPcS_i_1024: 1114; AVX512BW: # %bb.0: # %entry 1115; AVX512BW-NEXT: movl %edx, %eax 1116; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 1117; AVX512BW-NEXT: xorl %ecx, %ecx 1118; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1119; AVX512BW-NEXT: .p2align 4, 0x90 1120; AVX512BW-NEXT: .LBB7_1: # %vector.body 1121; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 1122; AVX512BW-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 1123; AVX512BW-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 1124; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 1125; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 1126; AVX512BW-NEXT: addq $32, %rcx 1127; AVX512BW-NEXT: cmpq %rcx, %rax 1128; AVX512BW-NEXT: jne .LBB7_1 1129; AVX512BW-NEXT: # %bb.2: # %middle.block 1130; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 1131; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1132; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1133; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1134; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1135; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1136; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1137; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1138; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1139; AVX512BW-NEXT: vmovd %xmm0, %eax 1140; AVX512BW-NEXT: vzeroupper 1141; AVX512BW-NEXT: retq 1142entry: 1143 %3 = zext i32 %2 to i64 1144 br label %vector.body 1145 1146vector.body: 1147 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1148 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1149 %4 = getelementptr inbounds i8, i8* %0, i64 %index 1150 %5 = bitcast i8* %4 to <32 x i8>* 1151 %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 1152 %6 = sext <32 x i8> %wide.load to <32 x i32> 1153 %7 = getelementptr inbounds i8, i8* %1, i64 %index 1154 %8 = bitcast i8* %7 to <32 x i8>* 1155 %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 1156 %9 = sext <32 x i8> %wide.load14 to <32 x i32> 1157 %10 = mul nsw <32 x i32> %9, %6 1158 %11 = add nsw <32 x i32> %10, %vec.phi 1159 %index.next = add i64 %index, 32 1160 %12 = icmp eq i64 %index.next, %3 1161 br i1 %12, label %middle.block, label %vector.body 1162 1163middle.block: 1164 %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1165 %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 1166 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1167 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf 1168 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1169 %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 1170 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1171 %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 1172 %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1173 %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 1174 %13 = extractelement <32 x i32> %bin.rdx20, i32 0 1175 ret i32 %13 1176} 1177 1178define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 1179; SSE2-LABEL: test_unsigned_short_128: 1180; SSE2: # %bb.0: # %entry 1181; SSE2-NEXT: movl %edx, %eax 1182; SSE2-NEXT: pxor %xmm0, %xmm0 1183; SSE2-NEXT: xorl %ecx, %ecx 1184; SSE2-NEXT: .p2align 4, 0x90 1185; SSE2-NEXT: .LBB8_1: # %vector.body 1186; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1187; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 1188; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 1189; SSE2-NEXT: movdqa %xmm2, %xmm3 1190; SSE2-NEXT: pmulhuw %xmm1, %xmm3 1191; SSE2-NEXT: pmullw %xmm1, %xmm2 1192; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 1193; SSE2-NEXT: paddd %xmm2, %xmm0 1194; SSE2-NEXT: addq $16, %rcx 1195; SSE2-NEXT: cmpq %rcx, %rax 1196; SSE2-NEXT: jne .LBB8_1 1197; SSE2-NEXT: # %bb.2: # %middle.block 1198; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1199; SSE2-NEXT: paddd %xmm0, %xmm1 1200; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1201; SSE2-NEXT: paddd %xmm1, %xmm0 1202; SSE2-NEXT: movd %xmm0, %eax 1203; SSE2-NEXT: retq 1204; 1205; AVX-LABEL: test_unsigned_short_128: 1206; AVX: # %bb.0: # %entry 1207; AVX-NEXT: movl %edx, %eax 1208; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 1209; AVX-NEXT: xorl %ecx, %ecx 1210; AVX-NEXT: .p2align 4, 0x90 1211; AVX-NEXT: .LBB8_1: # %vector.body 1212; AVX-NEXT: # =>This Inner Loop Header: Depth=1 1213; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1214; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1215; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1 1216; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1217; AVX-NEXT: addq $16, %rcx 1218; AVX-NEXT: cmpq %rcx, %rax 1219; AVX-NEXT: 
jne .LBB8_1 1220; AVX-NEXT: # %bb.2: # %middle.block 1221; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1222; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1223; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1224; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1225; AVX-NEXT: vmovd %xmm0, %eax 1226; AVX-NEXT: retq 1227entry: 1228 %3 = zext i32 %2 to i64 1229 br label %vector.body 1230 1231vector.body: 1232 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1233 %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1234 %4 = getelementptr inbounds i16, i16* %0, i64 %index 1235 %5 = bitcast i16* %4 to <4 x i16>* 1236 %wide.load = load <4 x i16>, <4 x i16>* %5, align 2 1237 %6 = zext <4 x i16> %wide.load to <4 x i32> 1238 %7 = getelementptr inbounds i16, i16* %1, i64 %index 1239 %8 = bitcast i16* %7 to <4 x i16>* 1240 %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2 1241 %9 = zext <4 x i16> %wide.load14 to <4 x i32> 1242 %10 = mul nsw <4 x i32> %9, %6 1243 %11 = add nsw <4 x i32> %10, %vec.phi 1244 %index.next = add i64 %index, 16 1245 %12 = icmp eq i64 %index.next, %3 1246 br i1 %12, label %middle.block, label %vector.body 1247 1248middle.block: 1249 %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 1250 %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15 1251 %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 1252 %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17 1253 %13 = extractelement <4 x i32> %bin.rdx18, i32 0 1254 ret i32 %13 1255} 1256 1257define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 1258; SSE2-LABEL: test_unsigned_short_256: 1259; SSE2: # %bb.0: # %entry 1260; SSE2-NEXT: movl %edx, %eax 1261; SSE2-NEXT: pxor %xmm0, %xmm0 1262; SSE2-NEXT: xorl %ecx, %ecx 1263; SSE2-NEXT: pxor %xmm1, %xmm1 1264; SSE2-NEXT: .p2align 4, 0x90 1265; SSE2-NEXT: .LBB9_1: # %vector.body 1266; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1267; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2 1268; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3 1269; SSE2-NEXT: movdqa %xmm3, %xmm4 1270; SSE2-NEXT: pmulhuw %xmm2, %xmm4 1271; SSE2-NEXT: pmullw %xmm2, %xmm3 1272; SSE2-NEXT: movdqa %xmm3, %xmm2 1273; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1274; SSE2-NEXT: paddd %xmm2, %xmm0 1275; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1276; SSE2-NEXT: paddd %xmm3, %xmm1 1277; SSE2-NEXT: addq $16, %rcx 1278; SSE2-NEXT: cmpq %rcx, %rax 1279; SSE2-NEXT: jne .LBB9_1 1280; SSE2-NEXT: # %bb.2: # %middle.block 1281; SSE2-NEXT: paddd %xmm1, %xmm0 1282; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1283; SSE2-NEXT: paddd %xmm0, %xmm1 1284; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1285; SSE2-NEXT: paddd %xmm1, %xmm0 1286; SSE2-NEXT: movd %xmm0, %eax 1287; SSE2-NEXT: retq 1288; 1289; AVX1-LABEL: test_unsigned_short_256: 1290; AVX1: # %bb.0: # %entry 1291; AVX1-NEXT: movl %edx, %eax 1292; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 1293; AVX1-NEXT: xorl %ecx, %ecx 1294; AVX1-NEXT: .p2align 4, 0x90 1295; AVX1-NEXT: .LBB9_1: # %vector.body 1296; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 1297; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1298; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1299; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1300; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 1301; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1302; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 1303; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1304; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 1305; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 1306; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1307; AVX1-NEXT: addq $16, %rcx 1308; AVX1-NEXT: cmpq %rcx, %rax 1309; AVX1-NEXT: jne .LBB9_1 1310; AVX1-NEXT: # %bb.2: # %middle.block 1311; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1312; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1313; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1314; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1315; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1316; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1317; AVX1-NEXT: vmovd %xmm0, %eax 1318; AVX1-NEXT: vzeroupper 1319; AVX1-NEXT: retq 1320; 1321; AVX256-LABEL: test_unsigned_short_256: 1322; AVX256: # %bb.0: # %entry 1323; AVX256-NEXT: movl %edx, %eax 1324; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 1325; AVX256-NEXT: xorl %ecx, %ecx 1326; AVX256-NEXT: .p2align 4, 0x90 1327; AVX256-NEXT: .LBB9_1: # %vector.body 1328; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 1329; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1330; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1331; AVX256-NEXT: vpmulld %ymm1, %ymm2, %ymm1 1332; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 1333; AVX256-NEXT: addq $16, %rcx 1334; AVX256-NEXT: cmpq %rcx, %rax 1335; AVX256-NEXT: jne .LBB9_1 1336; AVX256-NEXT: # %bb.2: # %middle.block 1337; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 1338; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1339; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1340; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1341; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1342; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1343; AVX256-NEXT: vmovd %xmm0, %eax 1344; AVX256-NEXT: vzeroupper 1345; AVX256-NEXT: retq 1346entry: 1347 %3 = zext i32 %2 to i64 1348 br label %vector.body 1349 1350vector.body: 1351 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1352 %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1353 %4 = getelementptr inbounds i16, i16* %0, i64 %index 1354 %5 = bitcast i16* %4 to <8 x i16>* 1355 %wide.load = load <8 x i16>, <8 x i16>* %5, align 2 1356 %6 = zext <8 x i16> %wide.load to <8 x i32> 1357 %7 = getelementptr inbounds i16, i16* %1, i64 %index 1358 %8 = bitcast i16* %7 to <8 x i16>* 1359 %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2 1360 %9 = zext <8 x i16> %wide.load14 to <8 x i32> 1361 %10 = mul nsw <8 x i32> %9, %6 1362 %11 = add nsw <8 x i32> %10, %vec.phi 1363 %index.next = add i64 %index, 16 1364 %12 = icmp eq i64 %index.next, %3 1365 br i1 %12, label %middle.block, label %vector.body 1366 1367middle.block: 1368 %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 1369 %bin.rdx = add <8 x i32> %11, %rdx.shuf 1370 %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1371 %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15 1372 %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1373 %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17 1374 %13 = extractelement <8 x i32> %bin.rdx18, i32 0 1375 ret i32 %13 1376} 1377 1378define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 1379; SSE2-LABEL: test_unsigned_short_512: 1380; SSE2: # %bb.0: # %entry 1381; SSE2-NEXT: movl %edx, %eax 1382; SSE2-NEXT: pxor %xmm0, %xmm0 1383; SSE2-NEXT: xorl %ecx, %ecx 1384; SSE2-NEXT: pxor %xmm1, %xmm1 1385; SSE2-NEXT: pxor %xmm3, %xmm3 1386; SSE2-NEXT: pxor %xmm2, %xmm2 1387; SSE2-NEXT: .p2align 4, 0x90 1388; SSE2-NEXT: .LBB10_1: # %vector.body 1389; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1390; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4 1391; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm8 1392; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm6 1393; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm7 1394; SSE2-NEXT: movdqa %xmm6, %xmm5 1395; SSE2-NEXT: pmulhuw %xmm4, %xmm5 1396; SSE2-NEXT: pmullw %xmm4, %xmm6 1397; SSE2-NEXT: movdqa %xmm6, %xmm4 1398; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 1399; SSE2-NEXT: paddd %xmm4, %xmm0 1400; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 1401; SSE2-NEXT: paddd %xmm6, %xmm1 1402; SSE2-NEXT: movdqa %xmm7, %xmm4 1403; SSE2-NEXT: pmulhuw %xmm8, %xmm4 1404; SSE2-NEXT: pmullw %xmm8, %xmm7 1405; SSE2-NEXT: movdqa %xmm7, %xmm5 1406; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 1407; SSE2-NEXT: paddd %xmm5, %xmm3 1408; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 1409; SSE2-NEXT: paddd %xmm7, %xmm2 1410; SSE2-NEXT: addq $16, %rcx 1411; SSE2-NEXT: cmpq %rcx, %rax 1412; SSE2-NEXT: jne .LBB10_1 1413; SSE2-NEXT: # %bb.2: # %middle.block 1414; SSE2-NEXT: paddd %xmm3, %xmm0 1415; SSE2-NEXT: paddd %xmm2, %xmm1 1416; SSE2-NEXT: paddd %xmm0, %xmm1 1417; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 1418; SSE2-NEXT: paddd %xmm1, %xmm0 1419; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1420; SSE2-NEXT: paddd %xmm0, %xmm1 1421; SSE2-NEXT: movd %xmm1, %eax 1422; SSE2-NEXT: retq 1423; 1424; AVX1-LABEL: test_unsigned_short_512: 1425; AVX1: # %bb.0: # %entry 1426; AVX1-NEXT: movl %edx, %eax 1427; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 1428; AVX1-NEXT: xorl %ecx, %ecx 1429; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1430; AVX1-NEXT: .p2align 4, 0x90 1431; AVX1-NEXT: .LBB10_1: # %vector.body 1432; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 1433; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1434; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1435; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1436; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1437; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1438; AVX1-NEXT: vpmulld %xmm2, %xmm6, %xmm2 1439; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1440; AVX1-NEXT: vpmulld %xmm3, %xmm6, %xmm3 1441; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1442; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4 1443; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1444; AVX1-NEXT: vpmulld %xmm5, %xmm6, %xmm5 1445; 
AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 1446; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 1447; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 1448; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1449; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1450; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 1451; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 1452; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1453; AVX1-NEXT: addq $16, %rcx 1454; AVX1-NEXT: cmpq %rcx, %rax 1455; AVX1-NEXT: jne .LBB10_1 1456; AVX1-NEXT: # %bb.2: # %middle.block 1457; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1458; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1459; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1460; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1461; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1462; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1463; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1464; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1465; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1466; AVX1-NEXT: vmovd %xmm0, %eax 1467; AVX1-NEXT: vzeroupper 1468; AVX1-NEXT: retq 1469; 1470; AVX2-LABEL: test_unsigned_short_512: 1471; AVX2: # %bb.0: # %entry 1472; AVX2-NEXT: movl %edx, %eax 1473; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 1474; AVX2-NEXT: xorl %ecx, %ecx 1475; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1476; AVX2-NEXT: .p2align 4, 0x90 1477; AVX2-NEXT: .LBB10_1: # %vector.body 1478; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 1479; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1480; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1481; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1482; AVX2-NEXT: vpmulld %ymm2, %ymm4, %ymm2 1483; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 1484; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1485; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 1486; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 1487; AVX2-NEXT: addq $16, %rcx 1488; AVX2-NEXT: cmpq %rcx, %rax 1489; AVX2-NEXT: jne .LBB10_1 1490; AVX2-NEXT: # %bb.2: # %middle.block 1491; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1492; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1493; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1494; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1495; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1496; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1497; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1498; AVX2-NEXT: vmovd %xmm0, %eax 1499; AVX2-NEXT: vzeroupper 1500; AVX2-NEXT: retq 1501; 1502; AVX512-LABEL: test_unsigned_short_512: 1503; AVX512: # %bb.0: # %entry 1504; AVX512-NEXT: movl %edx, %eax 1505; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 1506; AVX512-NEXT: xorl %ecx, %ecx 1507; AVX512-NEXT: .p2align 4, 0x90 1508; AVX512-NEXT: .LBB10_1: # %vector.body 1509; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 1510; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1511; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1512; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 1513; AVX512-NEXT: 
vpaddd %zmm0, %zmm1, %zmm0 1514; AVX512-NEXT: addq $16, %rcx 1515; AVX512-NEXT: cmpq %rcx, %rax 1516; AVX512-NEXT: jne .LBB10_1 1517; AVX512-NEXT: # %bb.2: # %middle.block 1518; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1519; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1520; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1521; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1522; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1523; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1524; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1525; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1526; AVX512-NEXT: vmovd %xmm0, %eax 1527; AVX512-NEXT: vzeroupper 1528; AVX512-NEXT: retq 1529entry: 1530 %3 = zext i32 %2 to i64 1531 br label %vector.body 1532 1533vector.body: 1534 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1535 %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1536 %4 = getelementptr inbounds i16, i16* %0, i64 %index 1537 %5 = bitcast i16* %4 to <16 x i16>* 1538 %wide.load = load <16 x i16>, <16 x i16>* %5, align 2 1539 %6 = zext <16 x i16> %wide.load to <16 x i32> 1540 %7 = getelementptr inbounds i16, i16* %1, i64 %index 1541 %8 = bitcast i16* %7 to <16 x i16>* 1542 %wide.load14 = load <16 x i16>, <16 x i16>* %8, align 2 1543 %9 = zext <16 x i16> %wide.load14 to <16 x i32> 1544 %10 = mul nsw <16 x i32> %9, %6 1545 %11 = add nsw <16 x i32> %10, %vec.phi 1546 %index.next = add i64 %index, 16 1547 %12 = icmp eq i64 %index.next, %3 1548 br i1 %12, label %middle.block, label %vector.body 1549 1550middle.block: 1551 %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1552 %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1 1553 %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1554 %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf 1555 %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1556 %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15 1557 %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1558 %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17 1559 %13 = extractelement <16 x i32> %bin.rdx18, i32 0 1560 ret i32 %13 1561} 1562 1563define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { 1564; SSE2-LABEL: test_unsigned_short_1024: 1565; SSE2: # %bb.0: # %entry 1566; SSE2-NEXT: movl %edx, %eax 1567; SSE2-NEXT: pxor %xmm8, %xmm8 1568; SSE2-NEXT: xorl %ecx, %ecx 1569; SSE2-NEXT: pxor %xmm3, %xmm3 1570; SSE2-NEXT: pxor %xmm9, %xmm9 1571; SSE2-NEXT: pxor %xmm10, %xmm10 1572; SSE2-NEXT: pxor %xmm4, %xmm4 1573; SSE2-NEXT: pxor %xmm6, %xmm6 1574; SSE2-NEXT: pxor %xmm5, %xmm5 1575; SSE2-NEXT: pxor %xmm7, %xmm7 1576; SSE2-NEXT: .p2align 4, 0x90 1577; SSE2-NEXT: .LBB11_1: # %vector.body 1578; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1579; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm0 
1580; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm1 1581; SSE2-NEXT: movdqa %xmm1, %xmm2 1582; SSE2-NEXT: pmulhuw %xmm0, %xmm2 1583; SSE2-NEXT: pmullw %xmm0, %xmm1 1584; SSE2-NEXT: movdqa %xmm1, %xmm0 1585; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1586; SSE2-NEXT: paddd %xmm0, %xmm7 1587; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm0 1588; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1589; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm2 1590; SSE2-NEXT: paddd %xmm1, %xmm5 1591; SSE2-NEXT: movdqa %xmm2, %xmm1 1592; SSE2-NEXT: pmulhuw %xmm0, %xmm1 1593; SSE2-NEXT: pmullw %xmm0, %xmm2 1594; SSE2-NEXT: movdqa %xmm2, %xmm0 1595; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1596; SSE2-NEXT: paddd %xmm0, %xmm6 1597; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm0 1598; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1599; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm1 1600; SSE2-NEXT: paddd %xmm2, %xmm4 1601; SSE2-NEXT: movdqa %xmm1, %xmm2 1602; SSE2-NEXT: pmulhuw %xmm0, %xmm2 1603; SSE2-NEXT: pmullw %xmm0, %xmm1 1604; SSE2-NEXT: movdqa %xmm1, %xmm0 1605; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1606; SSE2-NEXT: paddd %xmm0, %xmm8 1607; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm0 1608; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1609; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm2 1610; SSE2-NEXT: paddd %xmm1, %xmm3 1611; SSE2-NEXT: movdqa %xmm2, %xmm1 1612; SSE2-NEXT: pmulhuw %xmm0, %xmm1 1613; SSE2-NEXT: pmullw %xmm0, %xmm2 1614; SSE2-NEXT: movdqa %xmm2, %xmm0 1615; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1616; SSE2-NEXT: paddd %xmm0, %xmm9 1617; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1618; SSE2-NEXT: paddd %xmm2, %xmm10 1619; SSE2-NEXT: addq $16, %rcx 1620; SSE2-NEXT: cmpq %rcx, %rax 1621; SSE2-NEXT: jne .LBB11_1 1622; SSE2-NEXT: # %bb.2: # %middle.block 1623; SSE2-NEXT: paddd %xmm6, %xmm3 1624; SSE2-NEXT: paddd %xmm7, %xmm10 1625; SSE2-NEXT: paddd %xmm3, %xmm10 1626; SSE2-NEXT: paddd %xmm4, %xmm8 1627; SSE2-NEXT: paddd %xmm5, %xmm9 1628; SSE2-NEXT: paddd %xmm10, %xmm9 1629; SSE2-NEXT: paddd %xmm8, %xmm9 1630; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] 1631; SSE2-NEXT: paddd %xmm9, %xmm0 1632; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1633; SSE2-NEXT: paddd %xmm0, %xmm1 1634; SSE2-NEXT: movd %xmm1, %eax 1635; SSE2-NEXT: retq 1636; 1637; AVX1-LABEL: test_unsigned_short_1024: 1638; AVX1: # %bb.0: # %entry 1639; AVX1-NEXT: movl %edx, %eax 1640; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 1641; AVX1-NEXT: xorl %ecx, %ecx 1642; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1643; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 1644; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1645; AVX1-NEXT: .p2align 4, 0x90 1646; AVX1-NEXT: .LBB11_1: # %vector.body 1647; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 1648; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1649; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1650; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1651; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1652; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1653; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1654; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1655; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1656; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1657; AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 1658; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1659; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 1660; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1661; AVX1-NEXT: vpmulld %xmm6, %xmm5, %xmm5 1662; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1663; AVX1-NEXT: vpmulld %xmm7, %xmm6, %xmm6 1664; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1665; AVX1-NEXT: vpmulld %xmm0, %xmm7, %xmm13 1666; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1667; AVX1-NEXT: vpmulld %xmm12, %xmm7, %xmm7 1668; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1669; AVX1-NEXT: vpmulld %xmm10, %xmm0, %xmm10 1670; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1671; AVX1-NEXT: vpmulld %xmm11, %xmm0, %xmm11 1672; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 1673; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1674; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm1 1675; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 1676; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm0 1677; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 1678; AVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm1 1679; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 1680; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0 1681; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm0 1682; AVX1-NEXT: vpaddd %xmm7, %xmm9, %xmm1 1683; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 1684; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 1685; AVX1-NEXT: vpaddd %xmm0, %xmm10, %xmm0 1686; AVX1-NEXT: vpaddd %xmm3, %xmm11, %xmm1 1687; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 1688; AVX1-NEXT: addq $16, %rcx 1689; AVX1-NEXT: cmpq %rcx, %rax 1690; AVX1-NEXT: jne .LBB11_1 1691; AVX1-NEXT: # %bb.2: # %middle.block 1692; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm0 1693; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 1694; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm4 1695; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1696; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1697; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1698; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 1699; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1700; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0 1701; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1702; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0 1703; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1704; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1705; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1706; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1707; AVX1-NEXT: vmovd %xmm0, %eax 1708; AVX1-NEXT: vzeroupper 1709; AVX1-NEXT: retq 1710; 1711; AVX2-LABEL: test_unsigned_short_1024: 1712; AVX2: # %bb.0: # %entry 1713; AVX2-NEXT: movl %edx, %eax 1714; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 1715; AVX2-NEXT: xorl %ecx, %ecx 1716; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1717; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1718; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 1719; AVX2-NEXT: .p2align 4, 0x90 1720; AVX2-NEXT: .LBB11_1: # %vector.body 1721; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 1722; AVX2-NEXT: 
vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1723; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1724; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1725; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1726; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1727; AVX2-NEXT: vpmulld %ymm4, %ymm8, %ymm4 1728; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 1729; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1730; AVX2-NEXT: vpmulld %ymm5, %ymm4, %ymm4 1731; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 1732; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1733; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4 1734; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0 1735; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1736; AVX2-NEXT: vpmulld %ymm7, %ymm4, %ymm4 1737; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3 1738; AVX2-NEXT: addq $16, %rcx 1739; AVX2-NEXT: cmpq %rcx, %rax 1740; AVX2-NEXT: jne .LBB11_1 1741; AVX2-NEXT: # %bb.2: # %middle.block 1742; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 1743; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 1744; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1745; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1746; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1747; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1748; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1749; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1750; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1751; AVX2-NEXT: vmovd %xmm0, %eax 1752; AVX2-NEXT: vzeroupper 1753; AVX2-NEXT: retq 1754; 1755; AVX512-LABEL: test_unsigned_short_1024: 1756; AVX512: # %bb.0: # %entry 1757; AVX512-NEXT: movl %edx, %eax 1758; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 1759; AVX512-NEXT: xorl %ecx, %ecx 1760; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 1761; AVX512-NEXT: .p2align 4, 0x90 1762; AVX512-NEXT: .LBB11_1: # %vector.body 1763; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 1764; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1765; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1766; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1767; AVX512-NEXT: vpmulld %zmm2, %zmm4, %zmm2 1768; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 1769; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1770; 
AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2 1771; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 1772; AVX512-NEXT: addq $16, %rcx 1773; AVX512-NEXT: cmpq %rcx, %rax 1774; AVX512-NEXT: jne .LBB11_1 1775; AVX512-NEXT: # %bb.2: # %middle.block 1776; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1777; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1778; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1779; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1780; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1781; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1782; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1783; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1784; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1785; AVX512-NEXT: vmovd %xmm0, %eax 1786; AVX512-NEXT: vzeroupper 1787; AVX512-NEXT: retq 1788entry: 1789 %3 = zext i32 %2 to i64 1790 br label %vector.body 1791 1792vector.body: 1793 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1794 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1795 %4 = getelementptr inbounds i16, i16* %0, i64 %index 1796 %5 = bitcast i16* %4 to <32 x i16>* 1797 %wide.load = load <32 x i16>, <32 x i16>* %5, align 2 1798 %6 = zext <32 x i16> %wide.load to <32 x i32> 1799 %7 = getelementptr inbounds i16, i16* %1, i64 %index 1800 %8 = bitcast i16* %7 to <32 x i16>* 1801 %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2 1802 %9 = zext <32 x i16> %wide.load14 to <32 x i32> 1803 %10 = mul nsw <32 x i32> %9, %6 1804 %11 = add nsw <32 x i32> %10, %vec.phi 1805 %index.next = add i64 %index, 16 1806 %12 = icmp eq i64 %index.next, %3 1807 br i1 %12, label %middle.block, label %vector.body 1808 1809middle.block: 1810 %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1811 %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2 1812 %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1813 %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1 1814 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1815 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf 1816 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1817 %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15 1818 %rdx.shuf17 = shufflevector <32 x i32> 
%bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1819 %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17 1820 %13 = extractelement <32 x i32> %bin.rdx18, i32 0 1821 ret i32 %13 1822} 1823 1824define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) { 1825; SSE2-LABEL: pmaddwd_8: 1826; SSE2: # %bb.0: 1827; SSE2-NEXT: pmaddwd %xmm1, %xmm0 1828; SSE2-NEXT: retq 1829; 1830; AVX-LABEL: pmaddwd_8: 1831; AVX: # %bb.0: 1832; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1833; AVX-NEXT: retq 1834 %a = sext <8 x i16> %A to <8 x i32> 1835 %b = sext <8 x i16> %B to <8 x i32> 1836 %m = mul nsw <8 x i32> %a, %b 1837 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1838 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1839 %ret = add <4 x i32> %odd, %even 1840 ret <4 x i32> %ret 1841} 1842 1843define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) { 1844; SSE2-LABEL: pmaddwd_8_swapped: 1845; SSE2: # %bb.0: 1846; SSE2-NEXT: pmaddwd %xmm1, %xmm0 1847; SSE2-NEXT: retq 1848; 1849; AVX-LABEL: pmaddwd_8_swapped: 1850; AVX: # %bb.0: 1851; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1852; AVX-NEXT: retq 1853 %a = sext <8 x i16> %A to <8 x i32> 1854 %b = sext <8 x i16> %B to <8 x i32> 1855 %m = mul nsw <8 x i32> %a, %b 1856 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1857 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1858 %ret = add <4 x i32> %even, %odd 1859 ret <4 x i32> %ret 1860} 1861 1862define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) { 1863; SSE2-LABEL: larger_mul: 1864; SSE2: # %bb.0: 1865; SSE2-NEXT: movdqa %xmm0, %xmm1 1866; SSE2-NEXT: pmulhw %xmm2, %xmm1 1867; SSE2-NEXT: pmullw %xmm2, %xmm0 1868; SSE2-NEXT: movdqa %xmm0, %xmm2 1869; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1870; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1871; SSE2-NEXT: movdqa %xmm0, %xmm1 1872; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1873; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] 1874; SSE2-NEXT: paddd %xmm1, %xmm0 1875; SSE2-NEXT: retq 1876; 1877; AVX1-LABEL: larger_mul: 1878; AVX1: # %bb.0: 1879; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1880; AVX1-NEXT: vzeroupper 1881; AVX1-NEXT: retq 1882; 1883; AVX2-LABEL: larger_mul: 1884; AVX2: # %bb.0: 1885; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1886; AVX2-NEXT: vzeroupper 1887; AVX2-NEXT: retq 1888; 1889; AVX512-LABEL: larger_mul: 1890; AVX512: # %bb.0: 1891; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 1892; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 1893; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 1894; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1895; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0 1896; AVX512-NEXT: vzeroupper 1897; AVX512-NEXT: retq 1898 %a = sext <16 x i16> %A to <16 x i32> 1899 %b = sext <16 x i16> %B to <16 x i32> 1900 %m = mul nsw <16 x i32> %a, %b 1901 %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1902 %even = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 1, i32 3, i32 
5, i32 7> 1903 %ret = add <4 x i32> %odd, %even 1904 ret <4 x i32> %ret 1905} 1906 1907define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) { 1908; SSE2-LABEL: pmaddwd_16: 1909; SSE2: # %bb.0: 1910; SSE2-NEXT: pmaddwd %xmm2, %xmm0 1911; SSE2-NEXT: pmaddwd %xmm3, %xmm1 1912; SSE2-NEXT: retq 1913; 1914; AVX1-LABEL: pmaddwd_16: 1915; AVX1: # %bb.0: 1916; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1917; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1918; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 1919; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1920; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1921; AVX1-NEXT: retq 1922; 1923; AVX256-LABEL: pmaddwd_16: 1924; AVX256: # %bb.0: 1925; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 1926; AVX256-NEXT: retq 1927 %a = sext <16 x i16> %A to <16 x i32> 1928 %b = sext <16 x i16> %B to <16 x i32> 1929 %m = mul nsw <16 x i32> %a, %b 1930 %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 1931 %even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 1932 %ret = add <8 x i32> %odd, %even 1933 ret <8 x i32> %ret 1934} 1935 1936define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) { 1937; SSE2-LABEL: pmaddwd_32: 1938; SSE2: # %bb.0: 1939; SSE2-NEXT: pmaddwd %xmm4, %xmm0 1940; SSE2-NEXT: pmaddwd %xmm5, %xmm1 1941; SSE2-NEXT: pmaddwd %xmm6, %xmm2 1942; SSE2-NEXT: pmaddwd %xmm7, %xmm3 1943; SSE2-NEXT: retq 1944; 1945; AVX1-LABEL: pmaddwd_32: 1946; AVX1: # %bb.0: 1947; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1948; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 1949; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 1950; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 1951; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1952; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 1953; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1954; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 1955; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1 1956; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1957; AVX1-NEXT: retq 1958; 1959; AVX2-LABEL: pmaddwd_32: 1960; AVX2: # %bb.0: 1961; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 1962; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 1963; AVX2-NEXT: retq 1964; 1965; AVX512F-LABEL: pmaddwd_32: 1966; AVX512F: # %bb.0: 1967; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 1968; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 1969; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 1970; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 1971; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 1972; AVX512F-NEXT: retq 1973; 1974; AVX512BW-LABEL: pmaddwd_32: 1975; AVX512BW: # %bb.0: 1976; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 1977; AVX512BW-NEXT: retq 1978 %a = sext <32 x i16> %A to <32 x i32> 1979 %b = sext <32 x i16> %B to <32 x i32> 1980 %m = mul nsw <32 x i32> %a, %b 1981 %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 1982 %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 1983 %ret = add <16 x i32> %odd, %even 1984 ret <16 x i32> %ret 1985} 1986 1987define <4 x i32> @pmaddwd_const(<8 x i16> %A) { 1988; SSE2-LABEL: pmaddwd_const: 1989; SSE2: # %bb.0: 1990; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm0 1991; SSE2-NEXT: retq 1992; 1993; AVX-LABEL: pmaddwd_const: 1994; AVX: # %bb.0: 1995; AVX-NEXT: vpmaddwd 
{{.*}}(%rip), %xmm0, %xmm0 1996; AVX-NEXT: retq 1997 %a = sext <8 x i16> %A to <8 x i32> 1998 %m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32> 1999 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2000 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2001 %ret = add <4 x i32> %odd, %even 2002 ret <4 x i32> %ret 2003} 2004 2005; Do not select unsigned i16 multiplication 2006define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) { 2007; SSE2-LABEL: pmaddwd_negative1: 2008; SSE2: # %bb.0: 2009; SSE2-NEXT: movdqa %xmm0, %xmm2 2010; SSE2-NEXT: pmulhuw %xmm1, %xmm2 2011; SSE2-NEXT: pmullw %xmm1, %xmm0 2012; SSE2-NEXT: movdqa %xmm0, %xmm1 2013; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2014; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2015; SSE2-NEXT: movdqa %xmm0, %xmm2 2016; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] 2017; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 2018; SSE2-NEXT: paddd %xmm2, %xmm0 2019; SSE2-NEXT: retq 2020; 2021; AVX1-LABEL: pmaddwd_negative1: 2022; AVX1: # %bb.0: 2023; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 2024; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 2025; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2026; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2027; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 2028; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 2029; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 2030; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 2031; AVX1-NEXT: retq 2032; 2033; AVX256-LABEL: pmaddwd_negative1: 2034; AVX256: # %bb.0: 2035; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2036; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2037; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 2038; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 2039; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 2040; AVX256-NEXT: vzeroupper 2041; AVX256-NEXT: retq 2042 %a = zext <8 x i16> %A to <8 x i32> 2043 %b = zext <8 x i16> %B to <8 x i32> 2044 %m = mul nuw <8 x i32> %a, %b 2045 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2046 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2047 %ret = add <4 x i32> %odd, %even 2048 ret <4 x i32> %ret 2049} 2050 2051; Do not select if constant is too large 2052define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) { 2053; SSE2-LABEL: pmaddwd_negative2: 2054; SSE2: # %bb.0: 2055; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2056; SSE2-NEXT: psrad $16, %xmm1 2057; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 2058; SSE2-NEXT: psrad $16, %xmm0 2059; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 2060; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,7,42,32] 2061; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 2062; SSE2-NEXT: pmuludq %xmm2, %xmm4 2063; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 2064; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,4294934528,0,0] 2065; 
SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] 2066; SSE2-NEXT: pmuludq %xmm2, %xmm6 2067; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2] 2068; SSE2-NEXT: pmuludq %xmm3, %xmm0 2069; SSE2-NEXT: pmuludq %xmm5, %xmm1 2070; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] 2071; SSE2-NEXT: paddd %xmm6, %xmm1 2072; SSE2-NEXT: movdqa %xmm1, %xmm0 2073; SSE2-NEXT: retq 2074; 2075; AVX1-LABEL: pmaddwd_negative2: 2076; AVX1: # %bb.0: 2077; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2078; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 2079; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 2080; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2081; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 2082; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 2083; AVX1-NEXT: retq 2084; 2085; AVX256-LABEL: pmaddwd_negative2: 2086; AVX256: # %bb.0: 2087; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 2088; AVX256-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 2089; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 2090; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 2091; AVX256-NEXT: vzeroupper 2092; AVX256-NEXT: retq 2093 %a = sext <8 x i16> %A to <8 x i32> 2094 %m = mul nsw <8 x i32> %a, <i32 32768, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32> 2095 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2096 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2097 %ret = add <4 x i32> %odd, %even 2098 ret <4 x i32> %ret 2099} 2100 2101define <4 x i32> @jumbled_indices4(<8 x i16> %A, <8 x i16> %B) { 2102; SSE2-LABEL: jumbled_indices4: 2103; SSE2: # %bb.0: 2104; SSE2-NEXT: pmaddwd %xmm1, %xmm0 2105; SSE2-NEXT: retq 2106; 2107; AVX-LABEL: jumbled_indices4: 2108; AVX: # %bb.0: 2109; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 2110; AVX-NEXT: retq 2111 %exta = sext <8 x i16> %A to <8 x i32> 2112 %extb = sext <8 x i16> %B to <8 x i32> 2113 %m = mul <8 x i32> %exta, %extb 2114 %sa = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 3, i32 1, i32 5, i32 6> 2115 %sb = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 2, i32 0, i32 4, i32 7> 2116 %a = add <4 x i32> %sa, %sb 2117 ret <4 x i32> %a 2118} 2119 2120define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) { 2121; SSE2-LABEL: jumbled_indices8: 2122; SSE2: # %bb.0: 2123; SSE2-NEXT: pmaddwd %xmm2, %xmm0 2124; SSE2-NEXT: pmaddwd %xmm3, %xmm1 2125; SSE2-NEXT: retq 2126; 2127; AVX1-LABEL: jumbled_indices8: 2128; AVX1: # %bb.0: 2129; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2130; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2131; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 2132; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 2133; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2134; AVX1-NEXT: retq 2135; 2136; AVX256-LABEL: jumbled_indices8: 2137; AVX256: # %bb.0: 2138; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 2139; AVX256-NEXT: retq 2140 %exta = sext <16 x i16> %A to <16 x i32> 2141 %extb = sext <16 x i16> %B to <16 x i32> 2142 %m = mul <16 x i32> %exta, %extb 2143 %sa = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 7, i32 4, i32 11, i32 8, i32 15, i32 12> 2144 %sb = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 6, i32 5, i32 10, i32 9, i32 14, i32 13> 2145 %a = add <8 x i32> %sa, %sb 2146 ret <8 x i32> %a 2147} 2148 2149define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) { 2150; SSE2-LABEL: jumbled_indices16: 2151; SSE2: # %bb.0: 2152; SSE2-NEXT: pmaddwd %xmm4, %xmm0 2153; SSE2-NEXT: pmaddwd %xmm5, %xmm1 2154; SSE2-NEXT: pmaddwd %xmm6, %xmm2 2155; 
SSE2-NEXT: pmaddwd %xmm7, %xmm3 2156; SSE2-NEXT: retq 2157; 2158; AVX1-LABEL: jumbled_indices16: 2159; AVX1: # %bb.0: 2160; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 2161; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 2162; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 2163; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 2164; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 2165; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 2166; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2167; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 2168; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1 2169; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2170; AVX1-NEXT: retq 2171; 2172; AVX2-LABEL: jumbled_indices16: 2173; AVX2: # %bb.0: 2174; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 2175; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 2176; AVX2-NEXT: retq 2177; 2178; AVX512F-LABEL: jumbled_indices16: 2179; AVX512F: # %bb.0: 2180; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 2181; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 2182; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 2183; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 2184; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 2185; AVX512F-NEXT: retq 2186; 2187; AVX512BW-LABEL: jumbled_indices16: 2188; AVX512BW: # %bb.0: 2189; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 2190; AVX512BW-NEXT: retq 2191 %exta = sext <32 x i16> %A to <32 x i32> 2192 %extb = sext <32 x i16> %B to <32 x i32> 2193 %m = mul <32 x i32> %exta, %extb 2194 %sa = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 2, i32 0, i32 5, i32 6, i32 11, i32 9, i32 15, i32 12, i32 17, i32 18, i32 20, i32 23, i32 27, i32 24, i32 31, i32 29> 2195 %sb = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 3, i32 1, i32 4, i32 7, i32 10, i32 8, i32 14, i32 13, i32 16, i32 19, i32 21, i32 22, i32 26, i32 25, i32 30, i32 28> 2196 %a = add <16 x i32> %sa, %sb 2197 ret <16 x i32> %a 2198} 2199 2200define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) { 2201; SSE2-LABEL: jumbled_indices32: 2202; SSE2: # %bb.0: 2203; SSE2-NEXT: movq %rdi, %rax 2204; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm0 2205; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm1 2206; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm2 2207; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm3 2208; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm4 2209; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm5 2210; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm6 2211; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm7 2212; SSE2-NEXT: movdqa %xmm7, 112(%rdi) 2213; SSE2-NEXT: movdqa %xmm6, 96(%rdi) 2214; SSE2-NEXT: movdqa %xmm5, 80(%rdi) 2215; SSE2-NEXT: movdqa %xmm4, 64(%rdi) 2216; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 2217; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 2218; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 2219; SSE2-NEXT: movdqa %xmm0, (%rdi) 2220; SSE2-NEXT: retq 2221; 2222; AVX1-LABEL: jumbled_indices32: 2223; AVX1: # %bb.0: 2224; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 2225; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 2226; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8 2227; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0 2228; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 2229; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8 2230; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2231; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4 2232; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 2233; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 2234; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4 2235; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 2236; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 2237; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2 2238; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, 
%ymm2 2239; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 2240; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 2241; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 2242; AVX1-NEXT: vpmaddwd %xmm7, %xmm3, %xmm3 2243; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 2244; AVX1-NEXT: retq 2245; 2246; AVX2-LABEL: jumbled_indices32: 2247; AVX2: # %bb.0: 2248; AVX2-NEXT: vpmaddwd %ymm4, %ymm0, %ymm0 2249; AVX2-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1 2250; AVX2-NEXT: vpmaddwd %ymm6, %ymm2, %ymm2 2251; AVX2-NEXT: vpmaddwd %ymm7, %ymm3, %ymm3 2252; AVX2-NEXT: retq 2253; 2254; AVX512F-LABEL: jumbled_indices32: 2255; AVX512F: # %bb.0: 2256; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 2257; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 2258; AVX512F-NEXT: vpmaddwd %ymm4, %ymm5, %ymm4 2259; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 2260; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 2261; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2 2262; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 2263; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 2264; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 2265; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 2266; AVX512F-NEXT: retq 2267; 2268; AVX512BW-LABEL: jumbled_indices32: 2269; AVX512BW: # %bb.0: 2270; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm0, %zmm0 2271; AVX512BW-NEXT: vpmaddwd %zmm3, %zmm1, %zmm1 2272; AVX512BW-NEXT: retq 2273 %exta = sext <64 x i16> %A to <64 x i32> 2274 %extb = sext <64 x i16> %B to <64 x i32> 2275 %m = mul <64 x i32> %exta, %extb 2276 %sa = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 1, i32 2, i32 6, i32 5, i32 10, i32 8, i32 14, i32 12, i32 19, i32 17, i32 22, i32 20, i32 25, i32 27, i32 30, i32 28, i32 32, i32 34, i32 37, i32 38, i32 41, i32 43, i32 45, i32 47, i32 50, i32 48, i32 52, i32 54, i32 59, i32 56, i32 61, i32 63> 2277 %sb = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 0, i32 3, i32 7, i32 4, i32 11, i32 9, i32 15, i32 13, i32 18, i32 16, i32 23, i32 21, i32 24, i32 26, i32 31, i32 29, i32 33, i32 35, i32 36, i32 39, i32 40, i32 42, i32 44, i32 46, i32 51, i32 49, i32 53, i32 55, i32 58, i32 57, i32 60, i32 62> 2278 %a = add <32 x i32> %sa, %sb 2279 ret <32 x i32> %a 2280} 2281 2282; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through. 2283; This would require the combine to recreate the concat_vectors. 
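; Illustrative note (comment only, not checked output): the pmaddwd_* tests
; below build the PMADDWD pattern explicitly from loaded <N x i16> vectors,
; i.e. for each 32-bit result lane i:
;   result[i] = sext(A[2*i]) * sext(B[2*i]) + sext(A[2*i+1]) * sext(B[2*i+1])
; which is exactly what a single (v)pmaddwd of A and B computes, so the
; even/odd shuffle + sext + mul + add sequence should fold to one instruction.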
2284define <4 x i32> @pmaddwd_128(<8 x i16>* %Aptr, <8 x i16>* %Bptr) { 2285; SSE2-LABEL: pmaddwd_128: 2286; SSE2: # %bb.0: 2287; SSE2-NEXT: movdqa (%rdi), %xmm0 2288; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2289; SSE2-NEXT: retq 2290; 2291; AVX-LABEL: pmaddwd_128: 2292; AVX: # %bb.0: 2293; AVX-NEXT: vmovdqa (%rdi), %xmm0 2294; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2295; AVX-NEXT: retq 2296 %A = load <8 x i16>, <8 x i16>* %Aptr 2297 %B = load <8 x i16>, <8 x i16>* %Bptr 2298 %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2299 %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2300 %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2301 %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2302 %A_even_ext = sext <4 x i16> %A_even to <4 x i32> 2303 %B_even_ext = sext <4 x i16> %B_even to <4 x i32> 2304 %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32> 2305 %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32> 2306 %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext 2307 %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext 2308 %add = add <4 x i32> %even_mul, %odd_mul 2309 ret <4 x i32> %add 2310} 2311 2312define <8 x i32> @pmaddwd_256(<16 x i16>* %Aptr, <16 x i16>* %Bptr) { 2313; SSE2-LABEL: pmaddwd_256: 2314; SSE2: # %bb.0: 2315; SSE2-NEXT: movdqa (%rdi), %xmm0 2316; SSE2-NEXT: movdqa 16(%rdi), %xmm1 2317; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2318; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 2319; SSE2-NEXT: retq 2320; 2321; AVX1-LABEL: pmaddwd_256: 2322; AVX1: # %bb.0: 2323; AVX1-NEXT: vmovdqa (%rdi), %xmm0 2324; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 2325; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 2326; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2327; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2328; AVX1-NEXT: retq 2329; 2330; AVX256-LABEL: pmaddwd_256: 2331; AVX256: # %bb.0: 2332; AVX256-NEXT: vmovdqa (%rdi), %ymm0 2333; AVX256-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2334; AVX256-NEXT: retq 2335 %A = load <16 x i16>, <16 x i16>* %Aptr 2336 %B = load <16 x i16>, <16 x i16>* %Bptr 2337 %A_even = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2338 %A_odd = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2339 %B_even = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2340 %B_odd = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2341 %A_even_ext = sext <8 x i16> %A_even to <8 x i32> 2342 %B_even_ext = sext <8 x i16> %B_even to <8 x i32> 2343 %A_odd_ext = sext <8 x i16> %A_odd to <8 x i32> 2344 %B_odd_ext = sext <8 x i16> %B_odd to <8 x i32> 2345 %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext 2346 %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext 2347 %add = add <8 x i32> %even_mul, %odd_mul 2348 ret <8 x i32> %add 2349} 2350 2351define <16 x i32> @pmaddwd_512(<32 x i16>* %Aptr, <32 x i16>* %Bptr) { 2352; SSE2-LABEL: pmaddwd_512: 2353; SSE2: # %bb.0: 2354; SSE2-NEXT: movdqa (%rdi), %xmm0 2355; SSE2-NEXT: movdqa 16(%rdi), %xmm1 2356; SSE2-NEXT: movdqa 32(%rdi), %xmm2 2357; SSE2-NEXT: movdqa 48(%rdi), %xmm3 2358; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2359; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 2360; SSE2-NEXT: pmaddwd 32(%rsi), %xmm2 2361; SSE2-NEXT: pmaddwd 48(%rsi), %xmm3 2362; SSE2-NEXT: retq 
2363; 2364; AVX1-LABEL: pmaddwd_512: 2365; AVX1: # %bb.0: 2366; AVX1-NEXT: vmovdqa (%rdi), %xmm0 2367; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 2368; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 2369; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 2370; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 2371; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2372; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2373; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1 2374; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2 2375; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2376; AVX1-NEXT: retq 2377; 2378; AVX2-LABEL: pmaddwd_512: 2379; AVX2: # %bb.0: 2380; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2381; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2382; AVX2-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2383; AVX2-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2384; AVX2-NEXT: retq 2385; 2386; AVX512F-LABEL: pmaddwd_512: 2387; AVX512F: # %bb.0: 2388; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2389; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 2390; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2391; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2392; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2393; AVX512F-NEXT: retq 2394; 2395; AVX512BW-LABEL: pmaddwd_512: 2396; AVX512BW: # %bb.0: 2397; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2398; AVX512BW-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 2399; AVX512BW-NEXT: retq 2400 %A = load <32 x i16>, <32 x i16>* %Aptr 2401 %B = load <32 x i16>, <32 x i16>* %Bptr 2402 %A_even = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 2403 %A_odd = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 2404 %B_even = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 2405 %B_odd = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 2406 %A_even_ext = sext <16 x i16> %A_even to <16 x i32> 2407 %B_even_ext = sext <16 x i16> %B_even to <16 x i32> 2408 %A_odd_ext = sext <16 x i16> %A_odd to <16 x i32> 2409 %B_odd_ext = sext <16 x i16> %B_odd to <16 x i32> 2410 %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext 2411 %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext 2412 %add = add <16 x i32> %even_mul, %odd_mul 2413 ret <16 x i32> %add 2414} 2415 2416define <32 x i32> @pmaddwd_1024(<64 x i16>* %Aptr, <64 x i16>* %Bptr) { 2417; SSE2-LABEL: pmaddwd_1024: 2418; SSE2: # %bb.0: 2419; SSE2-NEXT: movq %rdi, %rax 2420; SSE2-NEXT: movdqa (%rsi), %xmm0 2421; SSE2-NEXT: movdqa 16(%rsi), %xmm1 2422; SSE2-NEXT: movdqa 32(%rsi), %xmm2 2423; SSE2-NEXT: movdqa 48(%rsi), %xmm3 2424; SSE2-NEXT: pmaddwd (%rdx), %xmm0 2425; SSE2-NEXT: pmaddwd 16(%rdx), %xmm1 2426; SSE2-NEXT: pmaddwd 32(%rdx), %xmm2 2427; SSE2-NEXT: pmaddwd 48(%rdx), %xmm3 2428; SSE2-NEXT: movdqa 64(%rsi), %xmm4 2429; SSE2-NEXT: pmaddwd 64(%rdx), %xmm4 2430; SSE2-NEXT: movdqa 80(%rsi), %xmm5 2431; SSE2-NEXT: pmaddwd 80(%rdx), %xmm5 2432; SSE2-NEXT: movdqa 96(%rsi), %xmm6 2433; SSE2-NEXT: pmaddwd 96(%rdx), %xmm6 2434; SSE2-NEXT: movdqa 112(%rsi), %xmm7 2435; SSE2-NEXT: pmaddwd 112(%rdx), %xmm7 2436; SSE2-NEXT: movdqa %xmm7, 112(%rdi) 2437; SSE2-NEXT: movdqa %xmm6, 96(%rdi) 2438; SSE2-NEXT: movdqa %xmm5, 80(%rdi) 2439; 
SSE2-NEXT: movdqa %xmm4, 64(%rdi)
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; AVX1-LABEL: pmaddwd_1024:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1
; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX1-NEXT: vpmaddwd 80(%rsi), %xmm2, %xmm2
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX1-NEXT: vpmaddwd 64(%rsi), %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm3
; AVX1-NEXT: vpmaddwd 112(%rsi), %xmm3, %xmm3
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm4
; AVX1-NEXT: vpmaddwd 96(%rsi), %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: retq
;
; AVX2-LABEL: pmaddwd_1024:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2
; AVX2-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm3
; AVX2-NEXT: retq
;
; AVX512F-LABEL: pmaddwd_1024:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm1
; AVX512F-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pmaddwd_1024:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vpmaddwd 64(%rsi), %zmm1, %zmm1
; AVX512BW-NEXT: retq
  %A = load <64 x i16>, <64 x i16>* %Aptr
  %B = load <64 x i16>, <64 x i16>* %Bptr
  %A_even = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  %A_odd = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  %B_even = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  %B_odd = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  %A_even_ext = sext <32 x i16> %A_even to <32 x i32>
  %B_even_ext = sext <32 x i16> %B_even to <32 x i32>
  %A_odd_ext = sext <32 x i16> %A_odd to <32 x i32>
  %B_odd_ext = sext <32 x i16> %B_odd to <32 x i32>
  %even_mul = mul <32 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <32 x i32> %A_odd_ext, %B_odd_ext
  %add = add <32 x i32> %even_mul, %odd_mul
  ret <32 x i32> %add
}

define <4 x i32> @pmaddwd_commuted_mul(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_commuted_mul:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pmaddwd (%rsi), %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pmaddwd_commuted_mul:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %B_odd_ext, %A_odd_ext ; Different order than previous mul
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

define <4 x i32> @pmaddwd_swapped_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_swapped_indices:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pmaddwd (%rsi), %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pmaddwd_swapped_indices:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; indices aren't all even
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; indices aren't all odd
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; same indices as A
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; same indices as A
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

; Negative test where indices aren't paired properly
define <4 x i32> @pmaddwd_bad_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_bad_indices:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa (%rsi), %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pmulhw %xmm3, %xmm4
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pmulhw %xmm1, %xmm3
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pmaddwd_bad_indices:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpmovsxwd %xmm2, %xmm2
; AVX-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX-NEXT: vpmulld %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vpmovsxwd %xmm1, %xmm1
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; different indices than A
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; different indices than A
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

; This test contains two multiplies joined by an add. The result of that add is then reduced to a single element.
; SelectionDAGBuilder should tag the joining add as a vector reduction. We need to recognize that both sides can use pmaddwd.
define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %arg2, <8 x i16>* %arg3) {
; SSE2-LABEL: madd_double_reduction:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm1
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
; SSE2-NEXT: movdqu (%rdx), %xmm0
; SSE2-NEXT: movdqu (%rcx), %xmm2
; SSE2-NEXT: pmaddwd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: madd_double_reduction:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %tmp = load <8 x i16>, <8 x i16>* %arg, align 1
  %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1
  %tmp7 = sext <8 x i16> %tmp to <8 x i32>
  %tmp17 = sext <8 x i16> %tmp6 to <8 x i32>
  %tmp19 = mul nsw <8 x i32> %tmp7, %tmp17
  %tmp20 = load <8 x i16>, <8 x i16>* %arg2, align 1
  %tmp21 = load <8 x i16>, <8 x i16>* %arg3, align 1
  %tmp22 = sext <8 x i16> %tmp20 to <8 x i32>
  %tmp23 = sext <8 x i16> %tmp21 to <8 x i32>
  %tmp25 = mul nsw <8 x i32> %tmp22, %tmp23
  %tmp26 = add nuw nsw <8 x i32> %tmp25, %tmp19
  %tmp29 = shufflevector <8 x i32> %tmp26, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp30 = add <8 x i32> %tmp26, %tmp29
  %tmp31 = shufflevector <8 x i32> %tmp30, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp32 = add <8 x i32> %tmp30, %tmp31
  %tmp33 = shufflevector <8 x i32> %tmp32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp34 = add <8 x i32> %tmp32, %tmp33
  %tmp35 = extractelement <8 x i32> %tmp34, i64 0
  ret i32 %tmp35
}

define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %arg2, <8 x i16>* %arg3, <8 x i16>* %arg4, <8 x i16>* %arg5, <8 x i16>* %arg6, <8 x i16>* %arg7) {
; SSE2-LABEL: madd_quad_reduction:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm1
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
; SSE2-NEXT: movdqu (%rdx), %xmm0
; SSE2-NEXT: movdqu (%rcx), %xmm2
; SSE2-NEXT: pmaddwd %xmm0, %xmm2
; SSE2-NEXT: movdqu (%r8), %xmm0
; SSE2-NEXT: movdqu (%r9), %xmm3
; SSE2-NEXT: pmaddwd %xmm0, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: movdqu (%rax), %xmm0
; SSE2-NEXT: movdqu (%r10), %xmm1
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: madd_quad_reduction:
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
; AVX-NEXT: vmovdqu (%r8), %xmm2
; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rax), %xmm2
; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %tmp = load <8 x i16>, <8 x i16>* %arg, align 1
  %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1
  %tmp7 = sext <8 x i16> %tmp to <8 x i32>
  %tmp17 = sext <8 x i16> %tmp6 to <8 x i32>
  %tmp19 = mul nsw <8 x i32> %tmp7, %tmp17
  %tmp20 = load <8 x i16>, <8 x i16>* %arg2, align 1
  %tmp21 = load <8 x i16>, <8 x i16>* %arg3, align 1
  %tmp22 = sext <8 x i16> %tmp20 to <8 x i32>
  %tmp23 = sext <8 x i16> %tmp21 to <8 x i32>
  %tmp25 = mul nsw <8 x i32> %tmp22, %tmp23
  %tmp26 = add nuw nsw <8 x i32> %tmp25, %tmp19

  %tmp40 = load <8 x i16>, <8 x i16>* %arg4, align 1
  %tmp41 = load <8 x i16>, <8 x i16>* %arg5, align 1
  %tmp42 = sext <8 x i16> %tmp40 to <8 x i32>
  %tmp43 = sext <8 x i16> %tmp41 to <8 x i32>
  %tmp45 = mul nsw <8 x i32> %tmp42, %tmp43
  %tmp56 = add nuw nsw <8 x i32> %tmp26, %tmp45

  %tmp50 = load <8 x i16>, <8 x i16>* %arg6, align 1
  %tmp51 = load <8 x i16>, <8 x i16>* %arg7, align 1
  %tmp52 = sext <8 x i16> %tmp50 to <8 x i32>
  %tmp53 = sext <8 x i16> %tmp51 to <8 x i32>
  %tmp55 = mul nsw <8 x i32> %tmp52, %tmp53
  %tmp57 = add nuw nsw <8 x i32> %tmp55, %tmp56

  %tmp29 = shufflevector <8 x i32> %tmp57, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp30 = add <8 x i32> %tmp57, %tmp29
  %tmp31 = shufflevector <8 x i32> %tmp30, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp32 = add <8 x i32> %tmp30, %tmp31
  %tmp33 = shufflevector <8 x i32> %tmp32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp34 = add <8 x i32> %tmp32, %tmp33
  %tmp35 = extractelement <8 x i32> %tmp34, i64 0
  ret i32 %tmp35
}

define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
; SSE2-LABEL: sum_and_sum_of_squares:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %esi, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB33_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: pmaddwd %xmm4, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: addq $-8, %rax
; SSE2-NEXT: jne .LBB33_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: movd %xmm2, %ecx
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: shlq $32, %rcx
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sum_and_sum_of_squares:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %esi, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB33_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm4
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpmaddwd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpmaddwd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: addq $-8, %rax
; AVX1-NEXT: jne .LBB33_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovd %xmm1, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: shlq $32, %rcx
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX256-LABEL: sum_and_sum_of_squares:
; AVX256: # %bb.0: # %entry
; AVX256-NEXT: movl %esi, %eax
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX256-NEXT: .p2align 4, 0x90
; AVX256-NEXT: .LBB33_1: # %vector.body
; AVX256-NEXT: # =>This Inner Loop Header: Depth=1
; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX256-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX256-NEXT: vpmaddwd %ymm2, %ymm2, %ymm2
; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX256-NEXT: addq $8, %rdi
; AVX256-NEXT: addq $-8, %rax
; AVX256-NEXT: jne .LBB33_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX256-NEXT: vmovd %xmm1, %ecx
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: shlq $32, %rcx
; AVX256-NEXT: orq %rcx, %rax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
entry:
  %0 = zext i32 %n to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <8 x i32> [ %6, %vector.body ], [ zeroinitializer, %entry ]
  %sum.phi = phi <8 x i32> [ %4, %vector.body ], [ zeroinitializer, %entry ]
  %1 = getelementptr inbounds i8, i8* %a, i64 %index
  %2 = bitcast i8* %1 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %2, align 1
  %3 = zext <8 x i8> %wide.load to <8 x i32>
  %4 = add nsw <8 x i32> %3, %sum.phi
  %5 = mul nsw <8 x i32> %3, %3
  %6 = add nsw <8 x i32> %5, %vec.phi
  %index.next = add i64 %index, 8
  %7 = icmp eq i64 %index.next, %0
  br i1 %7, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf35 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx36 = add <8 x i32> %4, %rdx.shuf35
  %rdx.shuf37 = shufflevector <8 x i32> %bin.rdx36, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx38 = add <8 x i32> %bin.rdx36, %rdx.shuf37
  %rdx.shuf39 = shufflevector <8 x i32> %bin.rdx38, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx40 = add <8 x i32> %bin.rdx38, %rdx.shuf39
  %8 = extractelement <8 x i32> %bin.rdx40, i32 0
  %rdx.shuf = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %6, %rdx.shuf
  %rdx.shuf31 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <8 x i32> %bin.rdx, %rdx.shuf31
  %rdx.shuf33 = shufflevector <8 x i32> %bin.rdx32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx34 = add <8 x i32> %bin.rdx32, %rdx.shuf33
  %9 = extractelement <8 x i32> %bin.rdx34, i32 0
  %tmp = zext i32 %8 to i64
  %tmp28 = shl nuw i64 %tmp, 32
  %tmp29 = zext i32 %9 to i64
  %tmp30 = or i64 %tmp28, %tmp29
  ret i64 %tmp30
}

define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
; SSE2-LABEL: sum_of_square_differences:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB34_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: psubw %xmm2, %xmm3
; SSE2-NEXT: pmaddwd %xmm3, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $8, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB34_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sum_of_square_differences:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB34_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $8, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB34_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX256-LABEL: sum_of_square_differences:
; AVX256: # %bb.0: # %entry
; AVX256-NEXT: movl %edx, %eax
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT: xorl %ecx, %ecx
; AVX256-NEXT: .p2align 4, 0x90
; AVX256-NEXT: .LBB34_1: # %vector.body
; AVX256-NEXT: # =>This Inner Loop Header: Depth=1
; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX256-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX256-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1
; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX256-NEXT: addq $8, %rcx
; AVX256-NEXT: cmpq %rcx, %rax
; AVX256-NEXT: jne .LBB34_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
entry:
  %0 = zext i32 %n to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <8 x i32> [ %9, %vector.body ], [ zeroinitializer, %entry ]
  %1 = getelementptr inbounds i8, i8* %a, i64 %index
  %2 = bitcast i8* %1 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %2, align 1
  %3 = zext <8 x i8> %wide.load to <8 x i32>
  %4 = getelementptr inbounds i8, i8* %b, i64 %index
  %5 = bitcast i8* %4 to <8 x i8>*
  %wide.load2 = load <8 x i8>, <8 x i8>* %5, align 1
  %6 = zext <8 x i8> %wide.load2 to <8 x i32>
  %7 = sub <8 x i32> %6, %3
  %8 = mul <8 x i32> %7, %7
  %9 = add nsw <8 x i32> %8, %vec.phi
  %index.next = add i64 %index, 8
  %10 = icmp eq i64 %index.next, %0
  br i1 %10, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %9, %rdx.shuf
  %rdx.shuf31 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <8 x i32> %bin.rdx, %rdx.shuf31
  %rdx.shuf33 = shufflevector <8 x i32> %bin.rdx32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx34 = add <8 x i32> %bin.rdx32, %rdx.shuf33
  %11 = extractelement <8 x i32> %bin.rdx34, i32 0
  ret i32 %11
}
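
; The sum_of_square_differences test above models a sum-of-squared-differences
; loop over two byte buffers. As a hand-written illustration only (this is not
; the source these checks were generated from, and the names ssd/a/b/n are
; invented for the sketch), the equivalent C is roughly:
;
;   int ssd(const unsigned char *a, const unsigned char *b, unsigned n) {
;     int sum = 0;
;     for (unsigned i = 0; i != n; ++i) {
;       int d = (int)b[i] - (int)a[i]; // zero-extend both bytes, then subtract
;       sum += d * d;                  // |d| < 256, so d*d fits pmaddwd's
;     }                                // 16x16->32-bit multiply-and-add form
;     return sum;
;   }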