• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW
7
8define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
9; SSE2-LABEL: _Z10test_shortPsS_i_128:
10; SSE2:       # %bb.0: # %entry
11; SSE2-NEXT:    movl %edx, %eax
12; SSE2-NEXT:    pxor %xmm0, %xmm0
13; SSE2-NEXT:    xorl %ecx, %ecx
14; SSE2-NEXT:    .p2align 4, 0x90
15; SSE2-NEXT:  .LBB0_1: # %vector.body
16; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
17; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
18; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
19; SSE2-NEXT:    movdqa %xmm2, %xmm3
20; SSE2-NEXT:    pmulhw %xmm1, %xmm3
21; SSE2-NEXT:    pmullw %xmm1, %xmm2
22; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
23; SSE2-NEXT:    paddd %xmm2, %xmm0
24; SSE2-NEXT:    addq $8, %rcx
25; SSE2-NEXT:    cmpq %rcx, %rax
26; SSE2-NEXT:    jne .LBB0_1
27; SSE2-NEXT:  # %bb.2: # %middle.block
28; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
29; SSE2-NEXT:    paddd %xmm0, %xmm1
30; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
31; SSE2-NEXT:    paddd %xmm1, %xmm0
32; SSE2-NEXT:    movd %xmm0, %eax
33; SSE2-NEXT:    retq
34;
35; AVX-LABEL: _Z10test_shortPsS_i_128:
36; AVX:       # %bb.0: # %entry
37; AVX-NEXT:    movl %edx, %eax
38; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
39; AVX-NEXT:    xorl %ecx, %ecx
40; AVX-NEXT:    .p2align 4, 0x90
41; AVX-NEXT:  .LBB0_1: # %vector.body
42; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
43; AVX-NEXT:    vpmovsxwd (%rdi,%rcx,2), %xmm1
44; AVX-NEXT:    vpmovsxwd (%rsi,%rcx,2), %xmm2
45; AVX-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
46; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
47; AVX-NEXT:    addq $8, %rcx
48; AVX-NEXT:    cmpq %rcx, %rax
49; AVX-NEXT:    jne .LBB0_1
50; AVX-NEXT:  # %bb.2: # %middle.block
51; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
52; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
53; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
54; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
55; AVX-NEXT:    vmovd %xmm0, %eax
56; AVX-NEXT:    retq
57entry:
58  %3 = zext i32 %2 to i64
59  br label %vector.body
60
61vector.body:
62  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
63  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
64  %4 = getelementptr inbounds i16, i16* %0, i64 %index
65  %5 = bitcast i16* %4 to <4 x i16>*
66  %wide.load = load <4 x i16>, <4 x i16>* %5, align 2
67  %6 = sext <4 x i16> %wide.load to <4 x i32>
68  %7 = getelementptr inbounds i16, i16* %1, i64 %index
69  %8 = bitcast i16* %7 to <4 x i16>*
70  %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2
71  %9 = sext <4 x i16> %wide.load14 to <4 x i32>
72  %10 = mul nsw <4 x i32> %9, %6
73  %11 = add nsw <4 x i32> %10, %vec.phi
74  %index.next = add i64 %index, 8
75  %12 = icmp eq i64 %index.next, %3
76  br i1 %12, label %middle.block, label %vector.body
77
78middle.block:
79  %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
80  %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
81  %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
82  %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
83  %13 = extractelement <4 x i32> %bin.rdx18, i32 0
84  ret i32 %13
85}
86
87define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
88; SSE2-LABEL: _Z10test_shortPsS_i_256:
89; SSE2:       # %bb.0: # %entry
90; SSE2-NEXT:    movl %edx, %eax
91; SSE2-NEXT:    pxor %xmm0, %xmm0
92; SSE2-NEXT:    xorl %ecx, %ecx
93; SSE2-NEXT:    pxor %xmm1, %xmm1
94; SSE2-NEXT:    .p2align 4, 0x90
95; SSE2-NEXT:  .LBB1_1: # %vector.body
96; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
97; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm2
98; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm3
99; SSE2-NEXT:    pmaddwd %xmm2, %xmm3
100; SSE2-NEXT:    paddd %xmm3, %xmm1
101; SSE2-NEXT:    addq $8, %rcx
102; SSE2-NEXT:    cmpq %rcx, %rax
103; SSE2-NEXT:    jne .LBB1_1
104; SSE2-NEXT:  # %bb.2: # %middle.block
105; SSE2-NEXT:    paddd %xmm0, %xmm1
106; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
107; SSE2-NEXT:    paddd %xmm1, %xmm0
108; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
109; SSE2-NEXT:    paddd %xmm0, %xmm1
110; SSE2-NEXT:    movd %xmm1, %eax
111; SSE2-NEXT:    retq
112;
113; AVX1-LABEL: _Z10test_shortPsS_i_256:
114; AVX1:       # %bb.0: # %entry
115; AVX1-NEXT:    movl %edx, %eax
116; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
117; AVX1-NEXT:    xorl %ecx, %ecx
118; AVX1-NEXT:    .p2align 4, 0x90
119; AVX1-NEXT:  .LBB1_1: # %vector.body
120; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
121; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
122; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
123; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
124; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
125; AVX1-NEXT:    addq $8, %rcx
126; AVX1-NEXT:    cmpq %rcx, %rax
127; AVX1-NEXT:    jne .LBB1_1
128; AVX1-NEXT:  # %bb.2: # %middle.block
129; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
130; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
131; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
132; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
133; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
134; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
135; AVX1-NEXT:    vmovd %xmm0, %eax
136; AVX1-NEXT:    vzeroupper
137; AVX1-NEXT:    retq
138;
139; AVX256-LABEL: _Z10test_shortPsS_i_256:
140; AVX256:       # %bb.0: # %entry
141; AVX256-NEXT:    movl %edx, %eax
142; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
143; AVX256-NEXT:    xorl %ecx, %ecx
144; AVX256-NEXT:    .p2align 4, 0x90
145; AVX256-NEXT:  .LBB1_1: # %vector.body
146; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
147; AVX256-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
148; AVX256-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
149; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
150; AVX256-NEXT:    addq $8, %rcx
151; AVX256-NEXT:    cmpq %rcx, %rax
152; AVX256-NEXT:    jne .LBB1_1
153; AVX256-NEXT:  # %bb.2: # %middle.block
154; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
155; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
156; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
157; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
158; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
159; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
160; AVX256-NEXT:    vmovd %xmm0, %eax
161; AVX256-NEXT:    vzeroupper
162; AVX256-NEXT:    retq
163entry:
164  %3 = zext i32 %2 to i64
165  br label %vector.body
166
167vector.body:
168  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
169  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
170  %4 = getelementptr inbounds i16, i16* %0, i64 %index
171  %5 = bitcast i16* %4 to <8 x i16>*
172  %wide.load = load <8 x i16>, <8 x i16>* %5, align 2
173  %6 = sext <8 x i16> %wide.load to <8 x i32>
174  %7 = getelementptr inbounds i16, i16* %1, i64 %index
175  %8 = bitcast i16* %7 to <8 x i16>*
176  %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2
177  %9 = sext <8 x i16> %wide.load14 to <8 x i32>
178  %10 = mul nsw <8 x i32> %9, %6
179  %11 = add nsw <8 x i32> %10, %vec.phi
180  %index.next = add i64 %index, 8
181  %12 = icmp eq i64 %index.next, %3
182  br i1 %12, label %middle.block, label %vector.body
183
184middle.block:
185  %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
186  %bin.rdx = add <8 x i32> %11, %rdx.shuf
187  %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
188  %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
189  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
190  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
191  %13 = extractelement <8 x i32> %bin.rdx18, i32 0
192  ret i32 %13
193}
194
195define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
196; SSE2-LABEL: _Z10test_shortPsS_i_512:
197; SSE2:       # %bb.0: # %entry
198; SSE2-NEXT:    movl %edx, %eax
199; SSE2-NEXT:    pxor %xmm0, %xmm0
200; SSE2-NEXT:    xorl %ecx, %ecx
201; SSE2-NEXT:    pxor %xmm2, %xmm2
202; SSE2-NEXT:    pxor %xmm1, %xmm1
203; SSE2-NEXT:    .p2align 4, 0x90
204; SSE2-NEXT:  .LBB2_1: # %vector.body
205; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
206; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm3
207; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm4
208; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm5
209; SSE2-NEXT:    pmaddwd %xmm3, %xmm5
210; SSE2-NEXT:    paddd %xmm5, %xmm2
211; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm3
212; SSE2-NEXT:    pmaddwd %xmm4, %xmm3
213; SSE2-NEXT:    paddd %xmm3, %xmm1
214; SSE2-NEXT:    addq $16, %rcx
215; SSE2-NEXT:    cmpq %rcx, %rax
216; SSE2-NEXT:    jne .LBB2_1
217; SSE2-NEXT:  # %bb.2: # %middle.block
218; SSE2-NEXT:    paddd %xmm0, %xmm2
219; SSE2-NEXT:    paddd %xmm0, %xmm1
220; SSE2-NEXT:    paddd %xmm2, %xmm1
221; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
222; SSE2-NEXT:    paddd %xmm1, %xmm0
223; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
224; SSE2-NEXT:    paddd %xmm0, %xmm1
225; SSE2-NEXT:    movd %xmm1, %eax
226; SSE2-NEXT:    retq
227;
228; AVX1-LABEL: _Z10test_shortPsS_i_512:
229; AVX1:       # %bb.0: # %entry
230; AVX1-NEXT:    movl %edx, %eax
231; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
232; AVX1-NEXT:    xorl %ecx, %ecx
233; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
234; AVX1-NEXT:    .p2align 4, 0x90
235; AVX1-NEXT:  .LBB2_1: # %vector.body
236; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
237; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm2
238; AVX1-NEXT:    vmovdqu 16(%rsi,%rcx,2), %xmm3
239; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
240; AVX1-NEXT:    vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3
241; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
242; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
243; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
244; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
245; AVX1-NEXT:    addq $16, %rcx
246; AVX1-NEXT:    cmpq %rcx, %rax
247; AVX1-NEXT:    jne .LBB2_1
248; AVX1-NEXT:  # %bb.2: # %middle.block
249; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
250; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
251; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
252; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
253; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
254; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
255; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
256; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
257; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
258; AVX1-NEXT:    vmovd %xmm0, %eax
259; AVX1-NEXT:    vzeroupper
260; AVX1-NEXT:    retq
261;
262; AVX2-LABEL: _Z10test_shortPsS_i_512:
263; AVX2:       # %bb.0: # %entry
264; AVX2-NEXT:    movl %edx, %eax
265; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
266; AVX2-NEXT:    xorl %ecx, %ecx
267; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
268; AVX2-NEXT:    .p2align 4, 0x90
269; AVX2-NEXT:  .LBB2_1: # %vector.body
270; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
271; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm2
272; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
273; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
274; AVX2-NEXT:    addq $16, %rcx
275; AVX2-NEXT:    cmpq %rcx, %rax
276; AVX2-NEXT:    jne .LBB2_1
277; AVX2-NEXT:  # %bb.2: # %middle.block
278; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
279; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
280; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
281; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
282; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
283; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
284; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
285; AVX2-NEXT:    vmovd %xmm0, %eax
286; AVX2-NEXT:    vzeroupper
287; AVX2-NEXT:    retq
288;
289; AVX512-LABEL: _Z10test_shortPsS_i_512:
290; AVX512:       # %bb.0: # %entry
291; AVX512-NEXT:    movl %edx, %eax
292; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
293; AVX512-NEXT:    xorl %ecx, %ecx
294; AVX512-NEXT:    .p2align 4, 0x90
295; AVX512-NEXT:  .LBB2_1: # %vector.body
296; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
297; AVX512-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm1
298; AVX512-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm1, %ymm1
299; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
300; AVX512-NEXT:    addq $16, %rcx
301; AVX512-NEXT:    cmpq %rcx, %rax
302; AVX512-NEXT:    jne .LBB2_1
303; AVX512-NEXT:  # %bb.2: # %middle.block
304; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
305; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
306; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
307; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
308; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
309; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
310; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
311; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
312; AVX512-NEXT:    vmovd %xmm0, %eax
313; AVX512-NEXT:    vzeroupper
314; AVX512-NEXT:    retq
315entry:
316  %3 = zext i32 %2 to i64
317  br label %vector.body
318
319vector.body:
320  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
321  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
322  %4 = getelementptr inbounds i16, i16* %0, i64 %index
323  %5 = bitcast i16* %4 to <16 x i16>*
324  %wide.load = load <16 x i16>, <16 x i16>* %5, align 2
325  %6 = sext <16 x i16> %wide.load to <16 x i32>
326  %7 = getelementptr inbounds i16, i16* %1, i64 %index
327  %8 = bitcast i16* %7 to <16 x i16>*
328  %wide.load14 = load <16 x i16>, <16 x i16>* %8, align 2
329  %9 = sext <16 x i16> %wide.load14 to <16 x i32>
330  %10 = mul nsw <16 x i32> %9, %6
331  %11 = add nsw <16 x i32> %10, %vec.phi
332  %index.next = add i64 %index, 16
333  %12 = icmp eq i64 %index.next, %3
334  br i1 %12, label %middle.block, label %vector.body
335
336middle.block:
337  %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
338  %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1
339  %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
340  %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf
341  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
342  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
343  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
344  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
345  %13 = extractelement <16 x i32> %bin.rdx18, i32 0
346  ret i32 %13
347}
348
349define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
350; SSE2-LABEL: _Z10test_shortPsS_i_1024:
351; SSE2:       # %bb.0: # %entry
352; SSE2-NEXT:    movl %edx, %eax
353; SSE2-NEXT:    pxor %xmm8, %xmm8
354; SSE2-NEXT:    xorl %ecx, %ecx
355; SSE2-NEXT:    pxor %xmm2, %xmm2
356; SSE2-NEXT:    pxor %xmm4, %xmm4
357; SSE2-NEXT:    pxor %xmm1, %xmm1
358; SSE2-NEXT:    pxor %xmm3, %xmm3
359; SSE2-NEXT:    .p2align 4, 0x90
360; SSE2-NEXT:  .LBB3_1: # %vector.body
361; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
362; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm5
363; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm6
364; SSE2-NEXT:    movdqu 32(%rdi,%rcx,2), %xmm7
365; SSE2-NEXT:    movdqu 48(%rdi,%rcx,2), %xmm9
366; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm0
367; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
368; SSE2-NEXT:    paddd %xmm0, %xmm2
369; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm0
370; SSE2-NEXT:    pmaddwd %xmm6, %xmm0
371; SSE2-NEXT:    paddd %xmm0, %xmm4
372; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm0
373; SSE2-NEXT:    pmaddwd %xmm7, %xmm0
374; SSE2-NEXT:    paddd %xmm0, %xmm1
375; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm0
376; SSE2-NEXT:    pmaddwd %xmm9, %xmm0
377; SSE2-NEXT:    paddd %xmm0, %xmm3
378; SSE2-NEXT:    addq $16, %rcx
379; SSE2-NEXT:    cmpq %rcx, %rax
380; SSE2-NEXT:    jne .LBB3_1
381; SSE2-NEXT:  # %bb.2: # %middle.block
382; SSE2-NEXT:    paddd %xmm8, %xmm4
383; SSE2-NEXT:    paddd %xmm8, %xmm3
384; SSE2-NEXT:    paddd %xmm4, %xmm3
385; SSE2-NEXT:    paddd %xmm8, %xmm2
386; SSE2-NEXT:    paddd %xmm8, %xmm1
387; SSE2-NEXT:    paddd %xmm3, %xmm1
388; SSE2-NEXT:    paddd %xmm2, %xmm1
389; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
390; SSE2-NEXT:    paddd %xmm1, %xmm0
391; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
392; SSE2-NEXT:    paddd %xmm0, %xmm1
393; SSE2-NEXT:    movd %xmm1, %eax
394; SSE2-NEXT:    retq
395;
396; AVX1-LABEL: _Z10test_shortPsS_i_1024:
397; AVX1:       # %bb.0: # %entry
398; AVX1-NEXT:    movl %edx, %eax
399; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
400; AVX1-NEXT:    xorl %ecx, %ecx
401; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
402; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
403; AVX1-NEXT:    .p2align 4, 0x90
404; AVX1-NEXT:  .LBB3_1: # %vector.body
405; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
406; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm3
407; AVX1-NEXT:    vmovdqu 16(%rsi,%rcx,2), %xmm4
408; AVX1-NEXT:    vmovdqu 32(%rsi,%rcx,2), %xmm5
409; AVX1-NEXT:    vmovdqu 48(%rsi,%rcx,2), %xmm6
410; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
411; AVX1-NEXT:    vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
412; AVX1-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
413; AVX1-NEXT:    vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6
414; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
415; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
416; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
417; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
418; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
419; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
420; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
421; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
422; AVX1-NEXT:    addq $16, %rcx
423; AVX1-NEXT:    cmpq %rcx, %rax
424; AVX1-NEXT:    jne .LBB3_1
425; AVX1-NEXT:  # %bb.2: # %middle.block
426; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm3
427; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
428; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
429; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
430; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
431; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
432; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
433; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
434; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
435; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
436; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
437; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
438; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
439; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
440; AVX1-NEXT:    vmovd %xmm0, %eax
441; AVX1-NEXT:    vzeroupper
442; AVX1-NEXT:    retq
443;
444; AVX2-LABEL: _Z10test_shortPsS_i_1024:
445; AVX2:       # %bb.0: # %entry
446; AVX2-NEXT:    movl %edx, %eax
447; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
448; AVX2-NEXT:    xorl %ecx, %ecx
449; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
450; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
451; AVX2-NEXT:    .p2align 4, 0x90
452; AVX2-NEXT:  .LBB3_1: # %vector.body
453; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
454; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm3
455; AVX2-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm4
456; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
457; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
458; AVX2-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
459; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
460; AVX2-NEXT:    addq $16, %rcx
461; AVX2-NEXT:    cmpq %rcx, %rax
462; AVX2-NEXT:    jne .LBB3_1
463; AVX2-NEXT:  # %bb.2: # %middle.block
464; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
465; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
466; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
467; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
468; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
469; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
470; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
471; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
472; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
473; AVX2-NEXT:    vmovd %xmm0, %eax
474; AVX2-NEXT:    vzeroupper
475; AVX2-NEXT:    retq
476;
477; AVX512F-LABEL: _Z10test_shortPsS_i_1024:
478; AVX512F:       # %bb.0: # %entry
479; AVX512F-NEXT:    movl %edx, %eax
480; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
481; AVX512F-NEXT:    xorl %ecx, %ecx
482; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
483; AVX512F-NEXT:    .p2align 4, 0x90
484; AVX512F-NEXT:  .LBB3_1: # %vector.body
485; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
486; AVX512F-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm2
487; AVX512F-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm3
488; AVX512F-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm3, %ymm3
489; AVX512F-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
490; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
491; AVX512F-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
492; AVX512F-NEXT:    addq $16, %rcx
493; AVX512F-NEXT:    cmpq %rcx, %rax
494; AVX512F-NEXT:    jne .LBB3_1
495; AVX512F-NEXT:  # %bb.2: # %middle.block
496; AVX512F-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
497; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
498; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
499; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
500; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
501; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
502; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
503; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
504; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
505; AVX512F-NEXT:    vmovd %xmm0, %eax
506; AVX512F-NEXT:    vzeroupper
507; AVX512F-NEXT:    retq
508;
509; AVX512BW-LABEL: _Z10test_shortPsS_i_1024:
510; AVX512BW:       # %bb.0: # %entry
511; AVX512BW-NEXT:    movl %edx, %eax
512; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
513; AVX512BW-NEXT:    xorl %ecx, %ecx
514; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
515; AVX512BW-NEXT:    .p2align 4, 0x90
516; AVX512BW-NEXT:  .LBB3_1: # %vector.body
517; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
518; AVX512BW-NEXT:    vmovdqu64 (%rsi,%rcx,2), %zmm2
519; AVX512BW-NEXT:    vpmaddwd (%rdi,%rcx,2), %zmm2, %zmm2
520; AVX512BW-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
521; AVX512BW-NEXT:    addq $16, %rcx
522; AVX512BW-NEXT:    cmpq %rcx, %rax
523; AVX512BW-NEXT:    jne .LBB3_1
524; AVX512BW-NEXT:  # %bb.2: # %middle.block
525; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
526; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
527; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
528; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
529; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
530; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
531; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
532; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
533; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
534; AVX512BW-NEXT:    vmovd %xmm0, %eax
535; AVX512BW-NEXT:    vzeroupper
536; AVX512BW-NEXT:    retq
537entry:
538  %3 = zext i32 %2 to i64
539  br label %vector.body
540
541vector.body:
542  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
543  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
544  %4 = getelementptr inbounds i16, i16* %0, i64 %index
545  %5 = bitcast i16* %4 to <32 x i16>*
546  %wide.load = load <32 x i16>, <32 x i16>* %5, align 2
547  %6 = sext <32 x i16> %wide.load to <32 x i32>
548  %7 = getelementptr inbounds i16, i16* %1, i64 %index
549  %8 = bitcast i16* %7 to <32 x i16>*
550  %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2
551  %9 = sext <32 x i16> %wide.load14 to <32 x i32>
552  %10 = mul nsw <32 x i32> %9, %6
553  %11 = add nsw <32 x i32> %10, %vec.phi
554  %index.next = add i64 %index, 16
555  %12 = icmp eq i64 %index.next, %3
556  br i1 %12, label %middle.block, label %vector.body
557
558middle.block:
559  %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
560  %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2
561  %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
562  %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1
563  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
564  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
565  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
566  %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15
567  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
568  %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17
569  %13 = extractelement <32 x i32> %bin.rdx18, i32 0
570  ret i32 %13
571}
572
573define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
574; SSE2-LABEL: _Z9test_charPcS_i_128:
575; SSE2:       # %bb.0: # %entry
576; SSE2-NEXT:    movl %edx, %eax
577; SSE2-NEXT:    pxor %xmm0, %xmm0
578; SSE2-NEXT:    xorl %ecx, %ecx
579; SSE2-NEXT:    .p2align 4, 0x90
580; SSE2-NEXT:  .LBB4_1: # %vector.body
581; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
582; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
583; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
584; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
585; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
586; SSE2-NEXT:    psraw $8, %xmm1
587; SSE2-NEXT:    psraw $8, %xmm2
588; SSE2-NEXT:    pmullw %xmm1, %xmm2
589; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
590; SSE2-NEXT:    psrad $16, %xmm1
591; SSE2-NEXT:    paddd %xmm1, %xmm0
592; SSE2-NEXT:    addq $16, %rcx
593; SSE2-NEXT:    cmpq %rcx, %rax
594; SSE2-NEXT:    jne .LBB4_1
595; SSE2-NEXT:  # %bb.2: # %middle.block
596; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
597; SSE2-NEXT:    paddd %xmm0, %xmm1
598; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
599; SSE2-NEXT:    paddd %xmm1, %xmm0
600; SSE2-NEXT:    movd %xmm0, %eax
601; SSE2-NEXT:    retq
602;
603; AVX-LABEL: _Z9test_charPcS_i_128:
604; AVX:       # %bb.0: # %entry
605; AVX-NEXT:    movl %edx, %eax
606; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
607; AVX-NEXT:    xorl %ecx, %ecx
608; AVX-NEXT:    .p2align 4, 0x90
609; AVX-NEXT:  .LBB4_1: # %vector.body
610; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
611; AVX-NEXT:    vpmovsxbd (%rdi,%rcx), %xmm1
612; AVX-NEXT:    vpmovsxbd (%rsi,%rcx), %xmm2
613; AVX-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
614; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
615; AVX-NEXT:    addq $16, %rcx
616; AVX-NEXT:    cmpq %rcx, %rax
617; AVX-NEXT:    jne .LBB4_1
618; AVX-NEXT:  # %bb.2: # %middle.block
619; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
620; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
621; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
622; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
623; AVX-NEXT:    vmovd %xmm0, %eax
624; AVX-NEXT:    retq
625entry:
626  %3 = zext i32 %2 to i64
627  br label %vector.body
628
629vector.body:
630  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
631  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
632  %4 = getelementptr inbounds i8, i8* %0, i64 %index
633  %5 = bitcast i8* %4 to <4 x i8>*
634  %wide.load = load <4 x i8>, <4 x i8>* %5, align 1
635  %6 = sext <4 x i8> %wide.load to <4 x i32>
636  %7 = getelementptr inbounds i8, i8* %1, i64 %index
637  %8 = bitcast i8* %7 to <4 x i8>*
638  %wide.load14 = load <4 x i8>, <4 x i8>* %8, align 1
639  %9 = sext <4 x i8> %wide.load14 to <4 x i32>
640  %10 = mul nsw <4 x i32> %9, %6
641  %11 = add nsw <4 x i32> %10, %vec.phi
642  %index.next = add i64 %index, 16
643  %12 = icmp eq i64 %index.next, %3
644  br i1 %12, label %middle.block, label %vector.body
645
646middle.block:
647  %rdx.shuf17 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
648  %bin.rdx18 = add <4 x i32> %11, %rdx.shuf17
649  %rdx.shuf19 = shufflevector <4 x i32> %bin.rdx18, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
650  %bin.rdx20 = add <4 x i32> %bin.rdx18, %rdx.shuf19
651  %13 = extractelement <4 x i32> %bin.rdx20, i32 0
652  ret i32 %13
653}
654
655define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
656; SSE2-LABEL: _Z9test_charPcS_i_256:
657; SSE2:       # %bb.0: # %entry
658; SSE2-NEXT:    movl %edx, %eax
659; SSE2-NEXT:    pxor %xmm0, %xmm0
660; SSE2-NEXT:    xorl %ecx, %ecx
661; SSE2-NEXT:    pxor %xmm1, %xmm1
662; SSE2-NEXT:    .p2align 4, 0x90
663; SSE2-NEXT:  .LBB5_1: # %vector.body
664; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
665; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
666; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
667; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
668; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
669; SSE2-NEXT:    psraw $8, %xmm2
670; SSE2-NEXT:    psraw $8, %xmm3
671; SSE2-NEXT:    pmaddwd %xmm2, %xmm3
672; SSE2-NEXT:    paddd %xmm3, %xmm1
673; SSE2-NEXT:    addq $16, %rcx
674; SSE2-NEXT:    cmpq %rcx, %rax
675; SSE2-NEXT:    jne .LBB5_1
676; SSE2-NEXT:  # %bb.2: # %middle.block
677; SSE2-NEXT:    paddd %xmm0, %xmm1
678; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
679; SSE2-NEXT:    paddd %xmm1, %xmm0
680; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
681; SSE2-NEXT:    paddd %xmm0, %xmm1
682; SSE2-NEXT:    movd %xmm1, %eax
683; SSE2-NEXT:    retq
684;
685; AVX1-LABEL: _Z9test_charPcS_i_256:
686; AVX1:       # %bb.0: # %entry
687; AVX1-NEXT:    movl %edx, %eax
688; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
689; AVX1-NEXT:    xorl %ecx, %ecx
690; AVX1-NEXT:    .p2align 4, 0x90
691; AVX1-NEXT:  .LBB5_1: # %vector.body
692; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
693; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm1
694; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm2
695; AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
696; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
697; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
698; AVX1-NEXT:    addq $16, %rcx
699; AVX1-NEXT:    cmpq %rcx, %rax
700; AVX1-NEXT:    jne .LBB5_1
701; AVX1-NEXT:  # %bb.2: # %middle.block
702; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
703; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
704; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
705; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
706; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
707; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
708; AVX1-NEXT:    vmovd %xmm0, %eax
709; AVX1-NEXT:    vzeroupper
710; AVX1-NEXT:    retq
711;
712; AVX256-LABEL: _Z9test_charPcS_i_256:
713; AVX256:       # %bb.0: # %entry
714; AVX256-NEXT:    movl %edx, %eax
715; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
716; AVX256-NEXT:    xorl %ecx, %ecx
717; AVX256-NEXT:    .p2align 4, 0x90
718; AVX256-NEXT:  .LBB5_1: # %vector.body
719; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
720; AVX256-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm1
721; AVX256-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm2
722; AVX256-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
723; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
724; AVX256-NEXT:    addq $16, %rcx
725; AVX256-NEXT:    cmpq %rcx, %rax
726; AVX256-NEXT:    jne .LBB5_1
727; AVX256-NEXT:  # %bb.2: # %middle.block
728; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
729; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
730; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
731; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
732; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
733; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
734; AVX256-NEXT:    vmovd %xmm0, %eax
735; AVX256-NEXT:    vzeroupper
736; AVX256-NEXT:    retq
737entry:
738  %3 = zext i32 %2 to i64
739  br label %vector.body
740
741vector.body:
742  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
743  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
744  %4 = getelementptr inbounds i8, i8* %0, i64 %index
745  %5 = bitcast i8* %4 to <8 x i8>*
746  %wide.load = load <8 x i8>, <8 x i8>* %5, align 1
747  %6 = sext <8 x i8> %wide.load to <8 x i32>
748  %7 = getelementptr inbounds i8, i8* %1, i64 %index
749  %8 = bitcast i8* %7 to <8 x i8>*
750  %wide.load14 = load <8 x i8>, <8 x i8>* %8, align 1
751  %9 = sext <8 x i8> %wide.load14 to <8 x i32>
752  %10 = mul nsw <8 x i32> %9, %6
753  %11 = add nsw <8 x i32> %10, %vec.phi
754  %index.next = add i64 %index, 16
755  %12 = icmp eq i64 %index.next, %3
756  br i1 %12, label %middle.block, label %vector.body
757
758middle.block:
759  %rdx.shuf15 = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
760  %bin.rdx16 = add <8 x i32> %11, %rdx.shuf15
761  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
762  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
763  %rdx.shuf19 = shufflevector <8 x i32> %bin.rdx18, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
764  %bin.rdx20 = add <8 x i32> %bin.rdx18, %rdx.shuf19
765  %13 = extractelement <8 x i32> %bin.rdx20, i32 0
766  ret i32 %13
767}
768
769define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
770; SSE2-LABEL: _Z9test_charPcS_i_512:
771; SSE2:       # %bb.0: # %entry
772; SSE2-NEXT:    movl %edx, %eax
773; SSE2-NEXT:    pxor %xmm0, %xmm0
774; SSE2-NEXT:    xorl %ecx, %ecx
775; SSE2-NEXT:    pxor %xmm2, %xmm2
776; SSE2-NEXT:    pxor %xmm1, %xmm1
777; SSE2-NEXT:    .p2align 4, 0x90
778; SSE2-NEXT:  .LBB6_1: # %vector.body
779; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
780; SSE2-NEXT:    movdqu (%rdi,%rcx), %xmm3
781; SSE2-NEXT:    movdqu (%rsi,%rcx), %xmm4
782; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
783; SSE2-NEXT:    psraw $8, %xmm5
784; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
785; SSE2-NEXT:    psraw $8, %xmm6
786; SSE2-NEXT:    pmaddwd %xmm5, %xmm6
787; SSE2-NEXT:    paddd %xmm6, %xmm2
788; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
789; SSE2-NEXT:    psraw $8, %xmm3
790; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
791; SSE2-NEXT:    psraw $8, %xmm4
792; SSE2-NEXT:    pmaddwd %xmm3, %xmm4
793; SSE2-NEXT:    paddd %xmm4, %xmm1
794; SSE2-NEXT:    addq $16, %rcx
795; SSE2-NEXT:    cmpq %rcx, %rax
796; SSE2-NEXT:    jne .LBB6_1
797; SSE2-NEXT:  # %bb.2: # %middle.block
798; SSE2-NEXT:    paddd %xmm0, %xmm2
799; SSE2-NEXT:    paddd %xmm0, %xmm1
800; SSE2-NEXT:    paddd %xmm2, %xmm1
801; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
802; SSE2-NEXT:    paddd %xmm1, %xmm0
803; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
804; SSE2-NEXT:    paddd %xmm0, %xmm1
805; SSE2-NEXT:    movd %xmm1, %eax
806; SSE2-NEXT:    retq
807;
808; AVX1-LABEL: _Z9test_charPcS_i_512:
809; AVX1:       # %bb.0: # %entry
810; AVX1-NEXT:    movl %edx, %eax
811; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
812; AVX1-NEXT:    xorl %ecx, %ecx
813; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
814; AVX1-NEXT:    .p2align 4, 0x90
815; AVX1-NEXT:  .LBB6_1: # %vector.body
816; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
817; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm2
818; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm3
819; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm4
820; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
821; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm4
822; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
823; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
824; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
825; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
826; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
827; AVX1-NEXT:    addq $16, %rcx
828; AVX1-NEXT:    cmpq %rcx, %rax
829; AVX1-NEXT:    jne .LBB6_1
830; AVX1-NEXT:  # %bb.2: # %middle.block
831; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
832; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
833; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
834; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
835; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
836; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
837; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
838; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
839; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
840; AVX1-NEXT:    vmovd %xmm0, %eax
841; AVX1-NEXT:    vzeroupper
842; AVX1-NEXT:    retq
843;
844; AVX2-LABEL: _Z9test_charPcS_i_512:
845; AVX2:       # %bb.0: # %entry
846; AVX2-NEXT:    movl %edx, %eax
847; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
848; AVX2-NEXT:    xorl %ecx, %ecx
849; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
850; AVX2-NEXT:    .p2align 4, 0x90
851; AVX2-NEXT:  .LBB6_1: # %vector.body
852; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
853; AVX2-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm2
854; AVX2-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
855; AVX2-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
856; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
857; AVX2-NEXT:    addq $16, %rcx
858; AVX2-NEXT:    cmpq %rcx, %rax
859; AVX2-NEXT:    jne .LBB6_1
860; AVX2-NEXT:  # %bb.2: # %middle.block
861; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
862; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
863; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
864; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
865; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
866; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
867; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
868; AVX2-NEXT:    vmovd %xmm0, %eax
869; AVX2-NEXT:    vzeroupper
870; AVX2-NEXT:    retq
871;
872; AVX512-LABEL: _Z9test_charPcS_i_512:
873; AVX512:       # %bb.0: # %entry
874; AVX512-NEXT:    movl %edx, %eax
875; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
876; AVX512-NEXT:    xorl %ecx, %ecx
877; AVX512-NEXT:    .p2align 4, 0x90
878; AVX512-NEXT:  .LBB6_1: # %vector.body
879; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
880; AVX512-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm1
881; AVX512-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm2
882; AVX512-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
883; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
884; AVX512-NEXT:    addq $16, %rcx
885; AVX512-NEXT:    cmpq %rcx, %rax
886; AVX512-NEXT:    jne .LBB6_1
887; AVX512-NEXT:  # %bb.2: # %middle.block
888; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
889; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
890; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
891; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
892; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
893; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
894; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
895; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
896; AVX512-NEXT:    vmovd %xmm0, %eax
897; AVX512-NEXT:    vzeroupper
898; AVX512-NEXT:    retq
899entry:
900  %3 = zext i32 %2 to i64
901  br label %vector.body
902
903vector.body:
904  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
905  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
906  %4 = getelementptr inbounds i8, i8* %0, i64 %index
907  %5 = bitcast i8* %4 to <16 x i8>*
908  %wide.load = load <16 x i8>, <16 x i8>* %5, align 1
909  %6 = sext <16 x i8> %wide.load to <16 x i32>
910  %7 = getelementptr inbounds i8, i8* %1, i64 %index
911  %8 = bitcast i8* %7 to <16 x i8>*
912  %wide.load14 = load <16 x i8>, <16 x i8>* %8, align 1
913  %9 = sext <16 x i8> %wide.load14 to <16 x i32>
914  %10 = mul nsw <16 x i32> %9, %6
915  %11 = add nsw <16 x i32> %10, %vec.phi
916  %index.next = add i64 %index, 16
917  %12 = icmp eq i64 %index.next, %3
918  br i1 %12, label %middle.block, label %vector.body
919
920middle.block:
921  %rdx.shuf = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
922  %bin.rdx = add <16 x i32> %11, %rdx.shuf
923  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
924  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
925  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
926  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
927  %rdx.shuf19 = shufflevector <16 x i32> %bin.rdx18, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
928  %bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19
929  %13 = extractelement <16 x i32> %bin.rdx20, i32 0
930  ret i32 %13
931}
932
933define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
934; SSE2-LABEL: _Z9test_charPcS_i_1024:
935; SSE2:       # %bb.0: # %entry
936; SSE2-NEXT:    movl %edx, %eax
937; SSE2-NEXT:    pxor %xmm8, %xmm8
938; SSE2-NEXT:    xorl %ecx, %ecx
939; SSE2-NEXT:    pxor %xmm2, %xmm2
940; SSE2-NEXT:    pxor %xmm4, %xmm4
941; SSE2-NEXT:    pxor %xmm1, %xmm1
942; SSE2-NEXT:    pxor %xmm3, %xmm3
943; SSE2-NEXT:    .p2align 4, 0x90
944; SSE2-NEXT:  .LBB7_1: # %vector.body
945; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
946; SSE2-NEXT:    movdqu (%rdi,%rcx), %xmm7
947; SSE2-NEXT:    movdqu 16(%rdi,%rcx), %xmm10
948; SSE2-NEXT:    movdqu (%rsi,%rcx), %xmm0
949; SSE2-NEXT:    movdqu 16(%rsi,%rcx), %xmm9
950; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
951; SSE2-NEXT:    psraw $8, %xmm5
952; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
953; SSE2-NEXT:    psraw $8, %xmm6
954; SSE2-NEXT:    pmaddwd %xmm5, %xmm6
955; SSE2-NEXT:    paddd %xmm6, %xmm2
956; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
957; SSE2-NEXT:    psraw $8, %xmm5
958; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
959; SSE2-NEXT:    psraw $8, %xmm0
960; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
961; SSE2-NEXT:    paddd %xmm0, %xmm4
962; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
963; SSE2-NEXT:    psraw $8, %xmm0
964; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
965; SSE2-NEXT:    psraw $8, %xmm5
966; SSE2-NEXT:    pmaddwd %xmm0, %xmm5
967; SSE2-NEXT:    paddd %xmm5, %xmm1
968; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
969; SSE2-NEXT:    psraw $8, %xmm0
970; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
971; SSE2-NEXT:    psraw $8, %xmm5
972; SSE2-NEXT:    pmaddwd %xmm0, %xmm5
973; SSE2-NEXT:    paddd %xmm5, %xmm3
974; SSE2-NEXT:    addq $32, %rcx
975; SSE2-NEXT:    cmpq %rcx, %rax
976; SSE2-NEXT:    jne .LBB7_1
977; SSE2-NEXT:  # %bb.2: # %middle.block
978; SSE2-NEXT:    paddd %xmm8, %xmm4
979; SSE2-NEXT:    paddd %xmm8, %xmm3
980; SSE2-NEXT:    paddd %xmm4, %xmm3
981; SSE2-NEXT:    paddd %xmm8, %xmm2
982; SSE2-NEXT:    paddd %xmm8, %xmm1
983; SSE2-NEXT:    paddd %xmm3, %xmm1
984; SSE2-NEXT:    paddd %xmm2, %xmm1
985; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
986; SSE2-NEXT:    paddd %xmm1, %xmm0
987; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
988; SSE2-NEXT:    paddd %xmm0, %xmm1
989; SSE2-NEXT:    movd %xmm1, %eax
990; SSE2-NEXT:    retq
991;
992; AVX1-LABEL: _Z9test_charPcS_i_1024:
993; AVX1:       # %bb.0: # %entry
994; AVX1-NEXT:    movl %edx, %eax
995; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
996; AVX1-NEXT:    xorl %ecx, %ecx
997; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
998; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
999; AVX1-NEXT:    .p2align 4, 0x90
1000; AVX1-NEXT:  .LBB7_1: # %vector.body
1001; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1002; AVX1-NEXT:    vpmovsxbw 24(%rdi,%rcx), %xmm3
1003; AVX1-NEXT:    vpmovsxbw 16(%rdi,%rcx), %xmm4
1004; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm5
1005; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm6
1006; AVX1-NEXT:    vpmovsxbw 24(%rsi,%rcx), %xmm7
1007; AVX1-NEXT:    vpmaddwd %xmm3, %xmm7, %xmm3
1008; AVX1-NEXT:    vpmovsxbw 16(%rsi,%rcx), %xmm7
1009; AVX1-NEXT:    vpmaddwd %xmm4, %xmm7, %xmm4
1010; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm7
1011; AVX1-NEXT:    vpmaddwd %xmm5, %xmm7, %xmm5
1012; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm7
1013; AVX1-NEXT:    vpmaddwd %xmm6, %xmm7, %xmm6
1014; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
1015; AVX1-NEXT:    vpaddd %xmm7, %xmm3, %xmm3
1016; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
1017; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
1018; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1019; AVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
1020; AVX1-NEXT:    vpaddd %xmm1, %xmm6, %xmm1
1021; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1022; AVX1-NEXT:    addq $32, %rcx
1023; AVX1-NEXT:    cmpq %rcx, %rax
1024; AVX1-NEXT:    jne .LBB7_1
1025; AVX1-NEXT:  # %bb.2: # %middle.block
1026; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm3
1027; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
1028; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
1029; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1030; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
1031; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
1032; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
1033; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
1034; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
1035; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
1036; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1037; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1038; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1039; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1040; AVX1-NEXT:    vmovd %xmm0, %eax
1041; AVX1-NEXT:    vzeroupper
1042; AVX1-NEXT:    retq
1043;
1044; AVX2-LABEL: _Z9test_charPcS_i_1024:
1045; AVX2:       # %bb.0: # %entry
1046; AVX2-NEXT:    movl %edx, %eax
1047; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1048; AVX2-NEXT:    xorl %ecx, %ecx
1049; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1050; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1051; AVX2-NEXT:    .p2align 4, 0x90
1052; AVX2-NEXT:  .LBB7_1: # %vector.body
1053; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
1054; AVX2-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm3
1055; AVX2-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm4
1056; AVX2-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm5
1057; AVX2-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
1058; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
1059; AVX2-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
1060; AVX2-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
1061; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
1062; AVX2-NEXT:    addq $32, %rcx
1063; AVX2-NEXT:    cmpq %rcx, %rax
1064; AVX2-NEXT:    jne .LBB7_1
1065; AVX2-NEXT:  # %bb.2: # %middle.block
1066; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
1067; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
1068; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
1069; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1070; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1071; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1072; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1073; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1074; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1075; AVX2-NEXT:    vmovd %xmm0, %eax
1076; AVX2-NEXT:    vzeroupper
1077; AVX2-NEXT:    retq
1078;
1079; AVX512F-LABEL: _Z9test_charPcS_i_1024:
1080; AVX512F:       # %bb.0: # %entry
1081; AVX512F-NEXT:    movl %edx, %eax
1082; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1083; AVX512F-NEXT:    xorl %ecx, %ecx
1084; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1085; AVX512F-NEXT:    .p2align 4, 0x90
1086; AVX512F-NEXT:  .LBB7_1: # %vector.body
1087; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
1088; AVX512F-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm2
1089; AVX512F-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm3
1090; AVX512F-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm4
1091; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm4, %ymm2
1092; AVX512F-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm4
1093; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm4, %ymm3
1094; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
1095; AVX512F-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
1096; AVX512F-NEXT:    addq $32, %rcx
1097; AVX512F-NEXT:    cmpq %rcx, %rax
1098; AVX512F-NEXT:    jne .LBB7_1
1099; AVX512F-NEXT:  # %bb.2: # %middle.block
1100; AVX512F-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
1101; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1102; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1103; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
1104; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1105; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1106; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1107; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1108; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1109; AVX512F-NEXT:    vmovd %xmm0, %eax
1110; AVX512F-NEXT:    vzeroupper
1111; AVX512F-NEXT:    retq
1112;
1113; AVX512BW-LABEL: _Z9test_charPcS_i_1024:
1114; AVX512BW:       # %bb.0: # %entry
1115; AVX512BW-NEXT:    movl %edx, %eax
1116; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1117; AVX512BW-NEXT:    xorl %ecx, %ecx
1118; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1119; AVX512BW-NEXT:    .p2align 4, 0x90
1120; AVX512BW-NEXT:  .LBB7_1: # %vector.body
1121; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
1122; AVX512BW-NEXT:    vpmovsxbw (%rdi,%rcx), %zmm2
1123; AVX512BW-NEXT:    vpmovsxbw (%rsi,%rcx), %zmm3
1124; AVX512BW-NEXT:    vpmaddwd %zmm2, %zmm3, %zmm2
1125; AVX512BW-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
1126; AVX512BW-NEXT:    addq $32, %rcx
1127; AVX512BW-NEXT:    cmpq %rcx, %rax
1128; AVX512BW-NEXT:    jne .LBB7_1
1129; AVX512BW-NEXT:  # %bb.2: # %middle.block
1130; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
1131; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1132; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1133; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1134; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1135; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1136; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1137; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1138; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1139; AVX512BW-NEXT:    vmovd %xmm0, %eax
1140; AVX512BW-NEXT:    vzeroupper
1141; AVX512BW-NEXT:    retq
1142entry:
1143  %3 = zext i32 %2 to i64
1144  br label %vector.body
1145
1146vector.body:
1147  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1148  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1149  %4 = getelementptr inbounds i8, i8* %0, i64 %index
1150  %5 = bitcast i8* %4 to <32 x i8>*
1151  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
1152  %6 = sext <32 x i8> %wide.load to <32 x i32>
1153  %7 = getelementptr inbounds i8, i8* %1, i64 %index
1154  %8 = bitcast i8* %7 to <32 x i8>*
1155  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
1156  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
1157  %10 = mul nsw <32 x i32> %9, %6
1158  %11 = add nsw <32 x i32> %10, %vec.phi
1159  %index.next = add i64 %index, 32
1160  %12 = icmp eq i64 %index.next, %3
1161  br i1 %12, label %middle.block, label %vector.body
1162
1163middle.block:
1164  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1165  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
1166  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1167  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
1168  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1169  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
1170  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1171  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
1172  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1173  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
1174  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
1175  ret i32 %13
1176}
1177
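; The test_unsigned_short_* functions below zero-extend their i16 inputs instead of
; sign-extending them. PMADDWD multiplies signed words, so none of these should be
; matched to (v)pmaddwd; the CHECK lines keep the widening multiply instead
; (pmulhuw/pmullw on SSE2, pmovzxwd+pmulld on AVX).
; A rough C equivalent of these functions (a reconstruction for readability only;
; the name, parameter names and exact types are assumptions, not taken from the
; original source):
;
;   int test_unsigned_short(unsigned short *a, unsigned short *b, int n) {
;     unsigned sum = 0;
;     for (int i = 0; i < n; ++i)
;       sum += (unsigned)a[i] * b[i];
;     return sum;
;   }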
1178define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
1179; SSE2-LABEL: test_unsigned_short_128:
1180; SSE2:       # %bb.0: # %entry
1181; SSE2-NEXT:    movl %edx, %eax
1182; SSE2-NEXT:    pxor %xmm0, %xmm0
1183; SSE2-NEXT:    xorl %ecx, %ecx
1184; SSE2-NEXT:    .p2align 4, 0x90
1185; SSE2-NEXT:  .LBB8_1: # %vector.body
1186; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
1187; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
1188; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
1189; SSE2-NEXT:    movdqa %xmm2, %xmm3
1190; SSE2-NEXT:    pmulhuw %xmm1, %xmm3
1191; SSE2-NEXT:    pmullw %xmm1, %xmm2
1192; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1193; SSE2-NEXT:    paddd %xmm2, %xmm0
1194; SSE2-NEXT:    addq $16, %rcx
1195; SSE2-NEXT:    cmpq %rcx, %rax
1196; SSE2-NEXT:    jne .LBB8_1
1197; SSE2-NEXT:  # %bb.2: # %middle.block
1198; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1199; SSE2-NEXT:    paddd %xmm0, %xmm1
1200; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1201; SSE2-NEXT:    paddd %xmm1, %xmm0
1202; SSE2-NEXT:    movd %xmm0, %eax
1203; SSE2-NEXT:    retq
1204;
1205; AVX-LABEL: test_unsigned_short_128:
1206; AVX:       # %bb.0: # %entry
1207; AVX-NEXT:    movl %edx, %eax
1208; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1209; AVX-NEXT:    xorl %ecx, %ecx
1210; AVX-NEXT:    .p2align 4, 0x90
1211; AVX-NEXT:  .LBB8_1: # %vector.body
1212; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
1213; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1214; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1215; AVX-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
1216; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
1217; AVX-NEXT:    addq $16, %rcx
1218; AVX-NEXT:    cmpq %rcx, %rax
1219; AVX-NEXT:    jne .LBB8_1
1220; AVX-NEXT:  # %bb.2: # %middle.block
1221; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1222; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1223; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1224; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1225; AVX-NEXT:    vmovd %xmm0, %eax
1226; AVX-NEXT:    retq
1227entry:
1228  %3 = zext i32 %2 to i64
1229  br label %vector.body
1230
1231vector.body:
1232  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1233  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1234  %4 = getelementptr inbounds i16, i16* %0, i64 %index
1235  %5 = bitcast i16* %4 to <4 x i16>*
1236  %wide.load = load <4 x i16>, <4 x i16>* %5, align 2
1237  %6 = zext <4 x i16> %wide.load to <4 x i32>
1238  %7 = getelementptr inbounds i16, i16* %1, i64 %index
1239  %8 = bitcast i16* %7 to <4 x i16>*
1240  %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2
1241  %9 = zext <4 x i16> %wide.load14 to <4 x i32>
1242  %10 = mul nsw <4 x i32> %9, %6
1243  %11 = add nsw <4 x i32> %10, %vec.phi
1244  %index.next = add i64 %index, 16
1245  %12 = icmp eq i64 %index.next, %3
1246  br i1 %12, label %middle.block, label %vector.body
1247
1248middle.block:
1249  %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
1250  %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
1251  %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1252  %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
1253  %13 = extractelement <4 x i32> %bin.rdx18, i32 0
1254  ret i32 %13
1255}
1256
1257define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
1258; SSE2-LABEL: test_unsigned_short_256:
1259; SSE2:       # %bb.0: # %entry
1260; SSE2-NEXT:    movl %edx, %eax
1261; SSE2-NEXT:    pxor %xmm0, %xmm0
1262; SSE2-NEXT:    xorl %ecx, %ecx
1263; SSE2-NEXT:    pxor %xmm1, %xmm1
1264; SSE2-NEXT:    .p2align 4, 0x90
1265; SSE2-NEXT:  .LBB9_1: # %vector.body
1266; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
1267; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm2
1268; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm3
1269; SSE2-NEXT:    movdqa %xmm3, %xmm4
1270; SSE2-NEXT:    pmulhuw %xmm2, %xmm4
1271; SSE2-NEXT:    pmullw %xmm2, %xmm3
1272; SSE2-NEXT:    movdqa %xmm3, %xmm2
1273; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1274; SSE2-NEXT:    paddd %xmm2, %xmm0
1275; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1276; SSE2-NEXT:    paddd %xmm3, %xmm1
1277; SSE2-NEXT:    addq $16, %rcx
1278; SSE2-NEXT:    cmpq %rcx, %rax
1279; SSE2-NEXT:    jne .LBB9_1
1280; SSE2-NEXT:  # %bb.2: # %middle.block
1281; SSE2-NEXT:    paddd %xmm1, %xmm0
1282; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1283; SSE2-NEXT:    paddd %xmm0, %xmm1
1284; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1285; SSE2-NEXT:    paddd %xmm1, %xmm0
1286; SSE2-NEXT:    movd %xmm0, %eax
1287; SSE2-NEXT:    retq
1288;
1289; AVX1-LABEL: test_unsigned_short_256:
1290; AVX1:       # %bb.0: # %entry
1291; AVX1-NEXT:    movl %edx, %eax
1292; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1293; AVX1-NEXT:    xorl %ecx, %ecx
1294; AVX1-NEXT:    .p2align 4, 0x90
1295; AVX1-NEXT:  .LBB9_1: # %vector.body
1296; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1297; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1298; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1299; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1300; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
1301; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1302; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
1303; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1304; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
1305; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
1306; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1307; AVX1-NEXT:    addq $16, %rcx
1308; AVX1-NEXT:    cmpq %rcx, %rax
1309; AVX1-NEXT:    jne .LBB9_1
1310; AVX1-NEXT:  # %bb.2: # %middle.block
1311; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1312; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1313; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1314; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1315; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1316; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1317; AVX1-NEXT:    vmovd %xmm0, %eax
1318; AVX1-NEXT:    vzeroupper
1319; AVX1-NEXT:    retq
1320;
1321; AVX256-LABEL: test_unsigned_short_256:
1322; AVX256:       # %bb.0: # %entry
1323; AVX256-NEXT:    movl %edx, %eax
1324; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1325; AVX256-NEXT:    xorl %ecx, %ecx
1326; AVX256-NEXT:    .p2align 4, 0x90
1327; AVX256-NEXT:  .LBB9_1: # %vector.body
1328; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
1329; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1330; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1331; AVX256-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
1332; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
1333; AVX256-NEXT:    addq $16, %rcx
1334; AVX256-NEXT:    cmpq %rcx, %rax
1335; AVX256-NEXT:    jne .LBB9_1
1336; AVX256-NEXT:  # %bb.2: # %middle.block
1337; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
1338; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1339; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1340; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1341; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1342; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1343; AVX256-NEXT:    vmovd %xmm0, %eax
1344; AVX256-NEXT:    vzeroupper
1345; AVX256-NEXT:    retq
1346entry:
1347  %3 = zext i32 %2 to i64
1348  br label %vector.body
1349
1350vector.body:
1351  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1352  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1353  %4 = getelementptr inbounds i16, i16* %0, i64 %index
1354  %5 = bitcast i16* %4 to <8 x i16>*
1355  %wide.load = load <8 x i16>, <8 x i16>* %5, align 2
1356  %6 = zext <8 x i16> %wide.load to <8 x i32>
1357  %7 = getelementptr inbounds i16, i16* %1, i64 %index
1358  %8 = bitcast i16* %7 to <8 x i16>*
1359  %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2
1360  %9 = zext <8 x i16> %wide.load14 to <8 x i32>
1361  %10 = mul nsw <8 x i32> %9, %6
1362  %11 = add nsw <8 x i32> %10, %vec.phi
1363  %index.next = add i64 %index, 16
1364  %12 = icmp eq i64 %index.next, %3
1365  br i1 %12, label %middle.block, label %vector.body
1366
1367middle.block:
1368  %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
1369  %bin.rdx = add <8 x i32> %11, %rdx.shuf
1370  %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1371  %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
1372  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1373  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
1374  %13 = extractelement <8 x i32> %bin.rdx18, i32 0
1375  ret i32 %13
1376}
1377
1378define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
1379; SSE2-LABEL: test_unsigned_short_512:
1380; SSE2:       # %bb.0: # %entry
1381; SSE2-NEXT:    movl %edx, %eax
1382; SSE2-NEXT:    pxor %xmm0, %xmm0
1383; SSE2-NEXT:    xorl %ecx, %ecx
1384; SSE2-NEXT:    pxor %xmm1, %xmm1
1385; SSE2-NEXT:    pxor %xmm3, %xmm3
1386; SSE2-NEXT:    pxor %xmm2, %xmm2
1387; SSE2-NEXT:    .p2align 4, 0x90
1388; SSE2-NEXT:  .LBB10_1: # %vector.body
1389; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
1390; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm4
1391; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm8
1392; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm6
1393; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm7
1394; SSE2-NEXT:    movdqa %xmm6, %xmm5
1395; SSE2-NEXT:    pmulhuw %xmm4, %xmm5
1396; SSE2-NEXT:    pmullw %xmm4, %xmm6
1397; SSE2-NEXT:    movdqa %xmm6, %xmm4
1398; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1399; SSE2-NEXT:    paddd %xmm4, %xmm0
1400; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1401; SSE2-NEXT:    paddd %xmm6, %xmm1
1402; SSE2-NEXT:    movdqa %xmm7, %xmm4
1403; SSE2-NEXT:    pmulhuw %xmm8, %xmm4
1404; SSE2-NEXT:    pmullw %xmm8, %xmm7
1405; SSE2-NEXT:    movdqa %xmm7, %xmm5
1406; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1407; SSE2-NEXT:    paddd %xmm5, %xmm3
1408; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
1409; SSE2-NEXT:    paddd %xmm7, %xmm2
1410; SSE2-NEXT:    addq $16, %rcx
1411; SSE2-NEXT:    cmpq %rcx, %rax
1412; SSE2-NEXT:    jne .LBB10_1
1413; SSE2-NEXT:  # %bb.2: # %middle.block
1414; SSE2-NEXT:    paddd %xmm3, %xmm0
1415; SSE2-NEXT:    paddd %xmm2, %xmm1
1416; SSE2-NEXT:    paddd %xmm0, %xmm1
1417; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1418; SSE2-NEXT:    paddd %xmm1, %xmm0
1419; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1420; SSE2-NEXT:    paddd %xmm0, %xmm1
1421; SSE2-NEXT:    movd %xmm1, %eax
1422; SSE2-NEXT:    retq
1423;
1424; AVX1-LABEL: test_unsigned_short_512:
1425; AVX1:       # %bb.0: # %entry
1426; AVX1-NEXT:    movl %edx, %eax
1427; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1428; AVX1-NEXT:    xorl %ecx, %ecx
1429; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1430; AVX1-NEXT:    .p2align 4, 0x90
1431; AVX1-NEXT:  .LBB10_1: # %vector.body
1432; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1433; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1434; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1435; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1436; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1437; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1438; AVX1-NEXT:    vpmulld %xmm2, %xmm6, %xmm2
1439; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1440; AVX1-NEXT:    vpmulld %xmm3, %xmm6, %xmm3
1441; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1442; AVX1-NEXT:    vpmulld %xmm4, %xmm6, %xmm4
1443; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1444; AVX1-NEXT:    vpmulld %xmm5, %xmm6, %xmm5
1445; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
1446; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
1447; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
1448; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1449; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1450; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
1451; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
1452; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1453; AVX1-NEXT:    addq $16, %rcx
1454; AVX1-NEXT:    cmpq %rcx, %rax
1455; AVX1-NEXT:    jne .LBB10_1
1456; AVX1-NEXT:  # %bb.2: # %middle.block
1457; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1458; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1459; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1460; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
1461; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1462; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1463; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1464; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1465; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1466; AVX1-NEXT:    vmovd %xmm0, %eax
1467; AVX1-NEXT:    vzeroupper
1468; AVX1-NEXT:    retq
1469;
1470; AVX2-LABEL: test_unsigned_short_512:
1471; AVX2:       # %bb.0: # %entry
1472; AVX2-NEXT:    movl %edx, %eax
1473; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1474; AVX2-NEXT:    xorl %ecx, %ecx
1475; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1476; AVX2-NEXT:    .p2align 4, 0x90
1477; AVX2-NEXT:  .LBB10_1: # %vector.body
1478; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
1479; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1480; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1481; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1482; AVX2-NEXT:    vpmulld %ymm2, %ymm4, %ymm2
1483; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
1484; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1485; AVX2-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
1486; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
1487; AVX2-NEXT:    addq $16, %rcx
1488; AVX2-NEXT:    cmpq %rcx, %rax
1489; AVX2-NEXT:    jne .LBB10_1
1490; AVX2-NEXT:  # %bb.2: # %middle.block
1491; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1492; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1493; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1494; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1495; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1496; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1497; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1498; AVX2-NEXT:    vmovd %xmm0, %eax
1499; AVX2-NEXT:    vzeroupper
1500; AVX2-NEXT:    retq
1501;
1502; AVX512-LABEL: test_unsigned_short_512:
1503; AVX512:       # %bb.0: # %entry
1504; AVX512-NEXT:    movl %edx, %eax
1505; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1506; AVX512-NEXT:    xorl %ecx, %ecx
1507; AVX512-NEXT:    .p2align 4, 0x90
1508; AVX512-NEXT:  .LBB10_1: # %vector.body
1509; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
1510; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1511; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1512; AVX512-NEXT:    vpmulld %zmm1, %zmm2, %zmm1
1513; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
1514; AVX512-NEXT:    addq $16, %rcx
1515; AVX512-NEXT:    cmpq %rcx, %rax
1516; AVX512-NEXT:    jne .LBB10_1
1517; AVX512-NEXT:  # %bb.2: # %middle.block
1518; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1519; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1520; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
1521; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1522; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1523; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1524; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1525; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1526; AVX512-NEXT:    vmovd %xmm0, %eax
1527; AVX512-NEXT:    vzeroupper
1528; AVX512-NEXT:    retq
1529entry:
1530  %3 = zext i32 %2 to i64
1531  br label %vector.body
1532
1533vector.body:
1534  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1535  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1536  %4 = getelementptr inbounds i16, i16* %0, i64 %index
1537  %5 = bitcast i16* %4 to <16 x i16>*
1538  %wide.load = load <16 x i16>, <16 x i16>* %5, align 2
1539  %6 = zext <16 x i16> %wide.load to <16 x i32>
1540  %7 = getelementptr inbounds i16, i16* %1, i64 %index
1541  %8 = bitcast i16* %7 to <16 x i16>*
1542  %wide.load14 = load <16 x i16>, <16 x i16>* %8, align 2
1543  %9 = zext <16 x i16> %wide.load14 to <16 x i32>
1544  %10 = mul nsw <16 x i32> %9, %6
1545  %11 = add nsw <16 x i32> %10, %vec.phi
1546  %index.next = add i64 %index, 16
1547  %12 = icmp eq i64 %index.next, %3
1548  br i1 %12, label %middle.block, label %vector.body
1549
1550middle.block:
1551  %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1552  %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1
1553  %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1554  %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf
1555  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1556  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
1557  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1558  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
1559  %13 = extractelement <16 x i32> %bin.rdx18, i32 0
1560  ret i32 %13
1561}
1562
1563define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
1564; SSE2-LABEL: test_unsigned_short_1024:
1565; SSE2:       # %bb.0: # %entry
1566; SSE2-NEXT:    movl %edx, %eax
1567; SSE2-NEXT:    pxor %xmm8, %xmm8
1568; SSE2-NEXT:    xorl %ecx, %ecx
1569; SSE2-NEXT:    pxor %xmm3, %xmm3
1570; SSE2-NEXT:    pxor %xmm9, %xmm9
1571; SSE2-NEXT:    pxor %xmm10, %xmm10
1572; SSE2-NEXT:    pxor %xmm4, %xmm4
1573; SSE2-NEXT:    pxor %xmm6, %xmm6
1574; SSE2-NEXT:    pxor %xmm5, %xmm5
1575; SSE2-NEXT:    pxor %xmm7, %xmm7
1576; SSE2-NEXT:    .p2align 4, 0x90
1577; SSE2-NEXT:  .LBB11_1: # %vector.body
1578; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
1579; SSE2-NEXT:    movdqu 48(%rdi,%rcx,2), %xmm0
1580; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm1
1581; SSE2-NEXT:    movdqa %xmm1, %xmm2
1582; SSE2-NEXT:    pmulhuw %xmm0, %xmm2
1583; SSE2-NEXT:    pmullw %xmm0, %xmm1
1584; SSE2-NEXT:    movdqa %xmm1, %xmm0
1585; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1586; SSE2-NEXT:    paddd %xmm0, %xmm7
1587; SSE2-NEXT:    movdqu 32(%rdi,%rcx,2), %xmm0
1588; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1589; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm2
1590; SSE2-NEXT:    paddd %xmm1, %xmm5
1591; SSE2-NEXT:    movdqa %xmm2, %xmm1
1592; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
1593; SSE2-NEXT:    pmullw %xmm0, %xmm2
1594; SSE2-NEXT:    movdqa %xmm2, %xmm0
1595; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1596; SSE2-NEXT:    paddd %xmm0, %xmm6
1597; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm0
1598; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1599; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm1
1600; SSE2-NEXT:    paddd %xmm2, %xmm4
1601; SSE2-NEXT:    movdqa %xmm1, %xmm2
1602; SSE2-NEXT:    pmulhuw %xmm0, %xmm2
1603; SSE2-NEXT:    pmullw %xmm0, %xmm1
1604; SSE2-NEXT:    movdqa %xmm1, %xmm0
1605; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1606; SSE2-NEXT:    paddd %xmm0, %xmm8
1607; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm0
1608; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1609; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm2
1610; SSE2-NEXT:    paddd %xmm1, %xmm3
1611; SSE2-NEXT:    movdqa %xmm2, %xmm1
1612; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
1613; SSE2-NEXT:    pmullw %xmm0, %xmm2
1614; SSE2-NEXT:    movdqa %xmm2, %xmm0
1615; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1616; SSE2-NEXT:    paddd %xmm0, %xmm9
1617; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1618; SSE2-NEXT:    paddd %xmm2, %xmm10
1619; SSE2-NEXT:    addq $16, %rcx
1620; SSE2-NEXT:    cmpq %rcx, %rax
1621; SSE2-NEXT:    jne .LBB11_1
1622; SSE2-NEXT:  # %bb.2: # %middle.block
1623; SSE2-NEXT:    paddd %xmm6, %xmm3
1624; SSE2-NEXT:    paddd %xmm7, %xmm10
1625; SSE2-NEXT:    paddd %xmm3, %xmm10
1626; SSE2-NEXT:    paddd %xmm4, %xmm8
1627; SSE2-NEXT:    paddd %xmm5, %xmm9
1628; SSE2-NEXT:    paddd %xmm10, %xmm9
1629; SSE2-NEXT:    paddd %xmm8, %xmm9
1630; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
1631; SSE2-NEXT:    paddd %xmm9, %xmm0
1632; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1633; SSE2-NEXT:    paddd %xmm0, %xmm1
1634; SSE2-NEXT:    movd %xmm1, %eax
1635; SSE2-NEXT:    retq
1636;
1637; AVX1-LABEL: test_unsigned_short_1024:
1638; AVX1:       # %bb.0: # %entry
1639; AVX1-NEXT:    movl %edx, %eax
1640; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
1641; AVX1-NEXT:    xorl %ecx, %ecx
1642; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1643; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
1644; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1645; AVX1-NEXT:    .p2align 4, 0x90
1646; AVX1-NEXT:  .LBB11_1: # %vector.body
1647; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1648; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1649; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1650; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1651; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1652; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1653; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1654; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm10 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1655; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1656; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1657; AVX1-NEXT:    vpmulld %xmm4, %xmm1, %xmm1
1658; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1659; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
1660; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1661; AVX1-NEXT:    vpmulld %xmm6, %xmm5, %xmm5
1662; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1663; AVX1-NEXT:    vpmulld %xmm7, %xmm6, %xmm6
1664; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1665; AVX1-NEXT:    vpmulld %xmm0, %xmm7, %xmm13
1666; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1667; AVX1-NEXT:    vpmulld %xmm12, %xmm7, %xmm7
1668; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1669; AVX1-NEXT:    vpmulld %xmm10, %xmm0, %xmm10
1670; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1671; AVX1-NEXT:    vpmulld %xmm11, %xmm0, %xmm11
1672; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
1673; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
1674; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm1
1675; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm2
1676; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm0
1677; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
1678; AVX1-NEXT:    vpaddd %xmm6, %xmm8, %xmm1
1679; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm8
1680; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm0
1681; AVX1-NEXT:    vpaddd %xmm0, %xmm13, %xmm0
1682; AVX1-NEXT:    vpaddd %xmm7, %xmm9, %xmm1
1683; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm9
1684; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
1685; AVX1-NEXT:    vpaddd %xmm0, %xmm10, %xmm0
1686; AVX1-NEXT:    vpaddd %xmm3, %xmm11, %xmm1
1687; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm3
1688; AVX1-NEXT:    addq $16, %rcx
1689; AVX1-NEXT:    cmpq %rcx, %rax
1690; AVX1-NEXT:    jne .LBB11_1
1691; AVX1-NEXT:  # %bb.2: # %middle.block
1692; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm0
1693; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
1694; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm4
1695; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1696; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1697; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1698; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
1699; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
1700; AVX1-NEXT:    vpaddd %xmm0, %xmm9, %xmm0
1701; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1702; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
1703; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1704; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1705; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1706; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1707; AVX1-NEXT:    vmovd %xmm0, %eax
1708; AVX1-NEXT:    vzeroupper
1709; AVX1-NEXT:    retq
1710;
1711; AVX2-LABEL: test_unsigned_short_1024:
1712; AVX2:       # %bb.0: # %entry
1713; AVX2-NEXT:    movl %edx, %eax
1714; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1715; AVX2-NEXT:    xorl %ecx, %ecx
1716; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1717; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1718; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1719; AVX2-NEXT:    .p2align 4, 0x90
1720; AVX2-NEXT:  .LBB11_1: # %vector.body
1721; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
1722; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1723; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1724; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1725; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1726; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1727; AVX2-NEXT:    vpmulld %ymm4, %ymm8, %ymm4
1728; AVX2-NEXT:    vpaddd %ymm2, %ymm4, %ymm2
1729; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1730; AVX2-NEXT:    vpmulld %ymm5, %ymm4, %ymm4
1731; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
1732; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1733; AVX2-NEXT:    vpmulld %ymm6, %ymm4, %ymm4
1734; AVX2-NEXT:    vpaddd %ymm0, %ymm4, %ymm0
1735; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1736; AVX2-NEXT:    vpmulld %ymm7, %ymm4, %ymm4
1737; AVX2-NEXT:    vpaddd %ymm3, %ymm4, %ymm3
1738; AVX2-NEXT:    addq $16, %rcx
1739; AVX2-NEXT:    cmpq %rcx, %rax
1740; AVX2-NEXT:    jne .LBB11_1
1741; AVX2-NEXT:  # %bb.2: # %middle.block
1742; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
1743; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
1744; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1745; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1746; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1747; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1748; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1749; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1750; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1751; AVX2-NEXT:    vmovd %xmm0, %eax
1752; AVX2-NEXT:    vzeroupper
1753; AVX2-NEXT:    retq
1754;
1755; AVX512-LABEL: test_unsigned_short_1024:
1756; AVX512:       # %bb.0: # %entry
1757; AVX512-NEXT:    movl %edx, %eax
1758; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1759; AVX512-NEXT:    xorl %ecx, %ecx
1760; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1761; AVX512-NEXT:    .p2align 4, 0x90
1762; AVX512-NEXT:  .LBB11_1: # %vector.body
1763; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
1764; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1765; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1766; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1767; AVX512-NEXT:    vpmulld %zmm2, %zmm4, %zmm2
1768; AVX512-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
1769; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1770; AVX512-NEXT:    vpmulld %zmm3, %zmm2, %zmm2
1771; AVX512-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
1772; AVX512-NEXT:    addq $16, %rcx
1773; AVX512-NEXT:    cmpq %rcx, %rax
1774; AVX512-NEXT:    jne .LBB11_1
1775; AVX512-NEXT:  # %bb.2: # %middle.block
1776; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1777; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1778; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
1779; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
1780; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1781; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1782; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1783; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1784; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1785; AVX512-NEXT:    vmovd %xmm0, %eax
1786; AVX512-NEXT:    vzeroupper
1787; AVX512-NEXT:    retq
1788entry:
1789  %3 = zext i32 %2 to i64
1790  br label %vector.body
1791
1792vector.body:
1793  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
1794  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
1795  %4 = getelementptr inbounds i16, i16* %0, i64 %index
1796  %5 = bitcast i16* %4 to <32 x i16>*
1797  %wide.load = load <32 x i16>, <32 x i16>* %5, align 2
1798  %6 = zext <32 x i16> %wide.load to <32 x i32>
1799  %7 = getelementptr inbounds i16, i16* %1, i64 %index
1800  %8 = bitcast i16* %7 to <32 x i16>*
1801  %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2
1802  %9 = zext <32 x i16> %wide.load14 to <32 x i32>
1803  %10 = mul nsw <32 x i32> %9, %6
1804  %11 = add nsw <32 x i32> %10, %vec.phi
1805  %index.next = add i64 %index, 16
1806  %12 = icmp eq i64 %index.next, %3
1807  br i1 %12, label %middle.block, label %vector.body
1808
1809middle.block:
1810  %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1811  %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2
1812  %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1813  %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1
1814  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1815  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
1816  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1817  %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15
1818  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1819  %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17
1820  %13 = extractelement <32 x i32> %bin.rdx18, i32 0
1821  ret i32 %13
1822}
1823
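; The pmaddwd_* tests check the basic pattern that should become (v)pmaddwd:
; sign-extend two <N x i16> inputs to <N x i32>, multiply, then add the odd and
; even lanes of the product, i.e. each 32-bit result is
;   res[i] = a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1]
; at 128-, 256- and 512-bit widths (wider types are split to match the target's
; vector width).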
1824define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {
1825; SSE2-LABEL: pmaddwd_8:
1826; SSE2:       # %bb.0:
1827; SSE2-NEXT:    pmaddwd %xmm1, %xmm0
1828; SSE2-NEXT:    retq
1829;
1830; AVX-LABEL: pmaddwd_8:
1831; AVX:       # %bb.0:
1832; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1833; AVX-NEXT:    retq
1834   %a = sext <8 x i16> %A to <8 x i32>
1835   %b = sext <8 x i16> %B to <8 x i32>
1836   %m = mul nsw <8 x i32> %a, %b
1837   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1838   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1839   %ret = add <4 x i32> %odd, %even
1840   ret <4 x i32> %ret
1841}
1842
1843define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) {
1844; SSE2-LABEL: pmaddwd_8_swapped:
1845; SSE2:       # %bb.0:
1846; SSE2-NEXT:    pmaddwd %xmm1, %xmm0
1847; SSE2-NEXT:    retq
1848;
1849; AVX-LABEL: pmaddwd_8_swapped:
1850; AVX:       # %bb.0:
1851; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1852; AVX-NEXT:    retq
1853   %a = sext <8 x i16> %A to <8 x i32>
1854   %b = sext <8 x i16> %B to <8 x i32>
1855   %m = mul nsw <8 x i32> %a, %b
1856   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1857   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1858   %ret = add <4 x i32> %even, %odd
1859   ret <4 x i32> %ret
1860}
1861
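; larger_mul starts from a <16 x i32> product, but the shuffles only use elements
; 0-7, so the multiply can be shrunk to a single 128-bit pmaddwd. The AVX1 and AVX2
; lowerings manage this; the SSE2 and AVX512 lowerings below do not yet form pmaddwd.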
1862define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) {
1863; SSE2-LABEL: larger_mul:
1864; SSE2:       # %bb.0:
1865; SSE2-NEXT:    movdqa %xmm0, %xmm1
1866; SSE2-NEXT:    pmulhw %xmm2, %xmm1
1867; SSE2-NEXT:    pmullw %xmm2, %xmm0
1868; SSE2-NEXT:    movdqa %xmm0, %xmm2
1869; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1870; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1871; SSE2-NEXT:    movdqa %xmm0, %xmm1
1872; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1873; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
1874; SSE2-NEXT:    paddd %xmm1, %xmm0
1875; SSE2-NEXT:    retq
1876;
1877; AVX1-LABEL: larger_mul:
1878; AVX1:       # %bb.0:
1879; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1880; AVX1-NEXT:    vzeroupper
1881; AVX1-NEXT:    retq
1882;
1883; AVX2-LABEL: larger_mul:
1884; AVX2:       # %bb.0:
1885; AVX2-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1886; AVX2-NEXT:    vzeroupper
1887; AVX2-NEXT:    retq
1888;
1889; AVX512-LABEL: larger_mul:
1890; AVX512:       # %bb.0:
1891; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
1892; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
1893; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
1894; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
1895; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
1896; AVX512-NEXT:    vzeroupper
1897; AVX512-NEXT:    retq
1898   %a = sext <16 x i16> %A to <16 x i32>
1899   %b = sext <16 x i16> %B to <16 x i32>
1900   %m = mul nsw <16 x i32> %a, %b
1901   %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1902   %even = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1903   %ret = add <4 x i32> %odd, %even
1904   ret <4 x i32> %ret
1905}
1906
1907define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
1908; SSE2-LABEL: pmaddwd_16:
1909; SSE2:       # %bb.0:
1910; SSE2-NEXT:    pmaddwd %xmm2, %xmm0
1911; SSE2-NEXT:    pmaddwd %xmm3, %xmm1
1912; SSE2-NEXT:    retq
1913;
1914; AVX1-LABEL: pmaddwd_16:
1915; AVX1:       # %bb.0:
1916; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1917; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1918; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
1919; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
1920; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1921; AVX1-NEXT:    retq
1922;
1923; AVX256-LABEL: pmaddwd_16:
1924; AVX256:       # %bb.0:
1925; AVX256-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
1926; AVX256-NEXT:    retq
1927   %a = sext <16 x i16> %A to <16 x i32>
1928   %b = sext <16 x i16> %B to <16 x i32>
1929   %m = mul nsw <16 x i32> %a, %b
1930   %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
1931   %even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
1932   %ret = add <8 x i32> %odd, %even
1933   ret <8 x i32> %ret
1934}
1935
1936define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
1937; SSE2-LABEL: pmaddwd_32:
1938; SSE2:       # %bb.0:
1939; SSE2-NEXT:    pmaddwd %xmm4, %xmm0
1940; SSE2-NEXT:    pmaddwd %xmm5, %xmm1
1941; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
1942; SSE2-NEXT:    pmaddwd %xmm7, %xmm3
1943; SSE2-NEXT:    retq
1944;
1945; AVX1-LABEL: pmaddwd_32:
1946; AVX1:       # %bb.0:
1947; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
1948; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
1949; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
1950; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
1951; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
1952; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
1953; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
1954; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
1955; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
1956; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1957; AVX1-NEXT:    retq
1958;
1959; AVX2-LABEL: pmaddwd_32:
1960; AVX2:       # %bb.0:
1961; AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
1962; AVX2-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
1963; AVX2-NEXT:    retq
1964;
1965; AVX512F-LABEL: pmaddwd_32:
1966; AVX512F:       # %bb.0:
1967; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
1968; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
1969; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
1970; AVX512F-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
1971; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1972; AVX512F-NEXT:    retq
1973;
1974; AVX512BW-LABEL: pmaddwd_32:
1975; AVX512BW:       # %bb.0:
1976; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
1977; AVX512BW-NEXT:    retq
1978   %a = sext <32 x i16> %A to <32 x i32>
1979   %b = sext <32 x i16> %B to <32 x i32>
1980   %m = mul nsw <32 x i32> %a, %b
1981   %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1982   %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
1983   %ret = add <16 x i32> %odd, %even
1984   ret <16 x i32> %ret
1985}
1986
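; A constant operand is accepted as long as every element fits in a signed 16-bit
; value (here 32767 and -32768 are the extremes); compare pmaddwd_negative2 below,
; where 32768 is out of range and no pmaddwd is formed.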
1987define <4 x i32> @pmaddwd_const(<8 x i16> %A) {
1988; SSE2-LABEL: pmaddwd_const:
1989; SSE2:       # %bb.0:
1990; SSE2-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
1991; SSE2-NEXT:    retq
1992;
1993; AVX-LABEL: pmaddwd_const:
1994; AVX:       # %bb.0:
1995; AVX-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
1996; AVX-NEXT:    retq
1997   %a = sext <8 x i16> %A to <8 x i32>
1998   %m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
1999   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2000   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2001   %ret = add <4 x i32> %odd, %even
2002   ret <4 x i32> %ret
2003}
2004
2005; Do not select pmaddwd for unsigned (zero-extended) i16 multiplication
2006define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
2007; SSE2-LABEL: pmaddwd_negative1:
2008; SSE2:       # %bb.0:
2009; SSE2-NEXT:    movdqa %xmm0, %xmm2
2010; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
2011; SSE2-NEXT:    pmullw %xmm1, %xmm0
2012; SSE2-NEXT:    movdqa %xmm0, %xmm1
2013; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2014; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2015; SSE2-NEXT:    movdqa %xmm0, %xmm2
2016; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
2017; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
2018; SSE2-NEXT:    paddd %xmm2, %xmm0
2019; SSE2-NEXT:    retq
2020;
2021; AVX1-LABEL: pmaddwd_negative1:
2022; AVX1:       # %bb.0:
2023; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2024; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2025; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2026; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2027; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
2028; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2029; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
2030; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
2031; AVX1-NEXT:    retq
2032;
2033; AVX256-LABEL: pmaddwd_negative1:
2034; AVX256:       # %bb.0:
2035; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2036; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2037; AVX256-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
2038; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
2039; AVX256-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
2040; AVX256-NEXT:    vzeroupper
2041; AVX256-NEXT:    retq
2042   %a = zext <8 x i16> %A to <8 x i32>
2043   %b = zext <8 x i16> %B to <8 x i32>
2044   %m = mul nuw <8 x i32> %a, %b
2045   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2046   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2047   %ret = add <4 x i32> %odd, %even
2048   ret <4 x i32> %ret
2049}
2050
2051; Do not select pmaddwd if a constant element does not fit in a signed 16-bit value (32768 is out of range)
2052define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
2053; SSE2-LABEL: pmaddwd_negative2:
2054; SSE2:       # %bb.0:
2055; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2056; SSE2-NEXT:    psrad $16, %xmm1
2057; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
2058; SSE2-NEXT:    psrad $16, %xmm0
2059; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
2060; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,7,42,32]
2061; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2062; SSE2-NEXT:    pmuludq %xmm2, %xmm4
2063; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
2064; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [32768,4294934528,0,0]
2065; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
2066; SSE2-NEXT:    pmuludq %xmm2, %xmm6
2067; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2]
2068; SSE2-NEXT:    pmuludq %xmm3, %xmm0
2069; SSE2-NEXT:    pmuludq %xmm5, %xmm1
2070; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
2071; SSE2-NEXT:    paddd %xmm6, %xmm1
2072; SSE2-NEXT:    movdqa %xmm1, %xmm0
2073; SSE2-NEXT:    retq
2074;
2075; AVX1-LABEL: pmaddwd_negative2:
2076; AVX1:       # %bb.0:
2077; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2078; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
2079; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
2080; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2081; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
2082; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
2083; AVX1-NEXT:    retq
2084;
2085; AVX256-LABEL: pmaddwd_negative2:
2086; AVX256:       # %bb.0:
2087; AVX256-NEXT:    vpmovsxwd %xmm0, %ymm0
2088; AVX256-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
2089; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
2090; AVX256-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
2091; AVX256-NEXT:    vzeroupper
2092; AVX256-NEXT:    retq
2093   %a = sext <8 x i16> %A to <8 x i32>
2094   %m = mul nsw <8 x i32> %a, <i32 32768, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
2095   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2096   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2097   %ret = add <4 x i32> %odd, %even
2098   ret <4 x i32> %ret
2099}
2100
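; The jumbled_indices* tests check that the two shuffle masks may list the lanes in
; any order, as long as each final add still pairs product elements 2*i and 2*i+1.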
2101define <4 x i32> @jumbled_indices4(<8 x i16> %A, <8 x i16> %B) {
2102; SSE2-LABEL: jumbled_indices4:
2103; SSE2:       # %bb.0:
2104; SSE2-NEXT:    pmaddwd %xmm1, %xmm0
2105; SSE2-NEXT:    retq
2106;
2107; AVX-LABEL: jumbled_indices4:
2108; AVX:       # %bb.0:
2109; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
2110; AVX-NEXT:    retq
2111  %exta = sext <8 x i16> %A to <8 x i32>
2112  %extb = sext <8 x i16> %B to <8 x i32>
2113  %m = mul <8 x i32> %exta, %extb
2114  %sa = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 3, i32 1, i32 5, i32 6>
2115  %sb = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 2, i32 0, i32 4, i32 7>
2116  %a = add <4 x i32> %sa, %sb
2117  ret <4 x i32> %a
2118}
2119
2120define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) {
2121; SSE2-LABEL: jumbled_indices8:
2122; SSE2:       # %bb.0:
2123; SSE2-NEXT:    pmaddwd %xmm2, %xmm0
2124; SSE2-NEXT:    pmaddwd %xmm3, %xmm1
2125; SSE2-NEXT:    retq
2126;
2127; AVX1-LABEL: jumbled_indices8:
2128; AVX1:       # %bb.0:
2129; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2130; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
2131; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
2132; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
2133; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2134; AVX1-NEXT:    retq
2135;
2136; AVX256-LABEL: jumbled_indices8:
2137; AVX256:       # %bb.0:
2138; AVX256-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
2139; AVX256-NEXT:    retq
2140  %exta = sext <16 x i16> %A to <16 x i32>
2141  %extb = sext <16 x i16> %B to <16 x i32>
2142  %m = mul <16 x i32> %exta, %extb
2143  %sa = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 7, i32 4, i32 11, i32 8, i32 15, i32 12>
2144  %sb = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 6, i32 5, i32 10, i32 9, i32 14, i32 13>
2145  %a = add <8 x i32> %sa, %sb
2146  ret <8 x i32> %a
2147}
2148
2149define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
2150; SSE2-LABEL: jumbled_indices16:
2151; SSE2:       # %bb.0:
2152; SSE2-NEXT:    pmaddwd %xmm4, %xmm0
2153; SSE2-NEXT:    pmaddwd %xmm5, %xmm1
2154; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
2155; SSE2-NEXT:    pmaddwd %xmm7, %xmm3
2156; SSE2-NEXT:    retq
2157;
2158; AVX1-LABEL: jumbled_indices16:
2159; AVX1:       # %bb.0:
2160; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
2161; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
2162; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
2163; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
2164; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
2165; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
2166; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
2167; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
2168; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
2169; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2170; AVX1-NEXT:    retq
2171;
2172; AVX2-LABEL: jumbled_indices16:
2173; AVX2:       # %bb.0:
2174; AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
2175; AVX2-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
2176; AVX2-NEXT:    retq
2177;
2178; AVX512F-LABEL: jumbled_indices16:
2179; AVX512F:       # %bb.0:
2180; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
2181; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
2182; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
2183; AVX512F-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
2184; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
2185; AVX512F-NEXT:    retq
2186;
2187; AVX512BW-LABEL: jumbled_indices16:
2188; AVX512BW:       # %bb.0:
2189; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
2190; AVX512BW-NEXT:    retq
2191  %exta = sext <32 x i16> %A to <32 x i32>
2192  %extb = sext <32 x i16> %B to <32 x i32>
2193  %m = mul <32 x i32> %exta, %extb
2194  %sa = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 2, i32 0, i32 5, i32 6, i32 11, i32 9, i32 15, i32 12, i32 17, i32 18, i32 20, i32 23, i32 27, i32 24, i32 31, i32 29>
2195  %sb = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 3, i32 1, i32 4, i32 7, i32 10, i32 8, i32 14, i32 13, i32 16, i32 19, i32 21, i32 22, i32 26, i32 25, i32 30, i32 28>
2196  %a = add <16 x i32> %sa, %sb
2197  ret <16 x i32> %a
2198}
2199
2200define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
2201; SSE2-LABEL: jumbled_indices32:
2202; SSE2:       # %bb.0:
2203; SSE2-NEXT:    movq %rdi, %rax
2204; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm0
2205; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm1
2206; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm2
2207; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm3
2208; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm4
2209; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm5
2210; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm6
2211; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm7
2212; SSE2-NEXT:    movdqa %xmm7, 112(%rdi)
2213; SSE2-NEXT:    movdqa %xmm6, 96(%rdi)
2214; SSE2-NEXT:    movdqa %xmm5, 80(%rdi)
2215; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
2216; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
2217; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
2218; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
2219; SSE2-NEXT:    movdqa %xmm0, (%rdi)
2220; SSE2-NEXT:    retq
2221;
2222; AVX1-LABEL: jumbled_indices32:
2223; AVX1:       # %bb.0:
2224; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm8
2225; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
2226; AVX1-NEXT:    vpmaddwd %xmm8, %xmm9, %xmm8
2227; AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
2228; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm0
2229; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm8
2230; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
2231; AVX1-NEXT:    vpmaddwd %xmm4, %xmm8, %xmm4
2232; AVX1-NEXT:    vpmaddwd %xmm5, %xmm1, %xmm1
2233; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
2234; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm4
2235; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
2236; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
2237; AVX1-NEXT:    vpmaddwd %xmm6, %xmm2, %xmm2
2238; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
2239; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm4
2240; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
2241; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
2242; AVX1-NEXT:    vpmaddwd %xmm7, %xmm3, %xmm3
2243; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
2244; AVX1-NEXT:    retq
2245;
2246; AVX2-LABEL: jumbled_indices32:
2247; AVX2:       # %bb.0:
2248; AVX2-NEXT:    vpmaddwd %ymm4, %ymm0, %ymm0
2249; AVX2-NEXT:    vpmaddwd %ymm5, %ymm1, %ymm1
2250; AVX2-NEXT:    vpmaddwd %ymm6, %ymm2, %ymm2
2251; AVX2-NEXT:    vpmaddwd %ymm7, %ymm3, %ymm3
2252; AVX2-NEXT:    retq
2253;
2254; AVX512F-LABEL: jumbled_indices32:
2255; AVX512F:       # %bb.0:
2256; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
2257; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
2258; AVX512F-NEXT:    vpmaddwd %ymm4, %ymm5, %ymm4
2259; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
2260; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
2261; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
2262; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
2263; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm4, %ymm2
2264; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
2265; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
2266; AVX512F-NEXT:    retq
2267;
2268; AVX512BW-LABEL: jumbled_indices32:
2269; AVX512BW:       # %bb.0:
2270; AVX512BW-NEXT:    vpmaddwd %zmm2, %zmm0, %zmm0
2271; AVX512BW-NEXT:    vpmaddwd %zmm3, %zmm1, %zmm1
2272; AVX512BW-NEXT:    retq
2273  %exta = sext <64 x i16> %A to <64 x i32>
2274  %extb = sext <64 x i16> %B to <64 x i32>
2275  %m = mul <64 x i32> %exta, %extb
2276  %sa = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 1, i32 2, i32 6, i32 5, i32 10, i32 8, i32 14, i32 12, i32 19, i32 17, i32 22, i32 20, i32 25, i32 27, i32 30, i32 28, i32 32, i32 34, i32 37, i32 38, i32 41, i32 43, i32 45, i32 47, i32 50, i32 48, i32 52, i32 54, i32 59, i32 56, i32 61, i32 63>
2277  %sb = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 0, i32 3, i32 7, i32 4, i32 11, i32 9, i32 15, i32 13, i32 18, i32 16, i32 23, i32 21, i32 24, i32 26, i32 31, i32 29, i32 33, i32 35, i32 36, i32 39, i32 40, i32 42, i32 44, i32 46, i32 51, i32 49, i32 53, i32 55, i32 58, i32 57, i32 60, i32 62>
2278  %a = add <32 x i32> %sa, %sb
2279  ret <32 x i32> %a
2280}
2281
2282; NOTE: We test with loads because, with direct vector arguments, ABI lowering creates a concat_vectors that extract_vector_elt creation can see through.
2283; Handling that case would require the combine to recreate the concat_vectors.
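;
; The pmaddwd_* tests below hand-build the even/odd multiply-add pattern that the
; X86 backend is expected to fold into PMADDWD: sign-extend the even and the odd
; i16 lanes of both inputs, multiply even with even and odd with odd, then add the
; two products. As a rough C-level sketch (illustrative only, not part of the
; test; the helper name is made up), each i32 result lane is:
;
;   int32_t lane(const int16_t *a, const int16_t *b, int i) {
;     return (int32_t)a[2*i] * b[2*i] + (int32_t)a[2*i+1] * b[2*i+1];
;   }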
2284define <4 x i32> @pmaddwd_128(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
2285; SSE2-LABEL: pmaddwd_128:
2286; SSE2:       # %bb.0:
2287; SSE2-NEXT:    movdqa (%rdi), %xmm0
2288; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
2289; SSE2-NEXT:    retq
2290;
2291; AVX-LABEL: pmaddwd_128:
2292; AVX:       # %bb.0:
2293; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2294; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
2295; AVX-NEXT:    retq
2296  %A = load <8 x i16>, <8 x i16>* %Aptr
2297  %B = load <8 x i16>, <8 x i16>* %Bptr
2298  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2299  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2300  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2301  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2302  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
2303  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
2304  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
2305  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
2306  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
2307  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
2308  %add = add <4 x i32> %even_mul, %odd_mul
2309  ret <4 x i32> %add
2310}
2311
2312define <8 x i32> @pmaddwd_256(<16 x i16>* %Aptr, <16 x i16>* %Bptr) {
2313; SSE2-LABEL: pmaddwd_256:
2314; SSE2:       # %bb.0:
2315; SSE2-NEXT:    movdqa (%rdi), %xmm0
2316; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
2317; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
2318; SSE2-NEXT:    pmaddwd 16(%rsi), %xmm1
2319; SSE2-NEXT:    retq
2320;
2321; AVX1-LABEL: pmaddwd_256:
2322; AVX1:       # %bb.0:
2323; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
2324; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
2325; AVX1-NEXT:    vpmaddwd 16(%rsi), %xmm1, %xmm1
2326; AVX1-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
2327; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2328; AVX1-NEXT:    retq
2329;
2330; AVX256-LABEL: pmaddwd_256:
2331; AVX256:       # %bb.0:
2332; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
2333; AVX256-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
2334; AVX256-NEXT:    retq
2335  %A = load <16 x i16>, <16 x i16>* %Aptr
2336  %B = load <16 x i16>, <16 x i16>* %Bptr
2337  %A_even = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2338  %A_odd = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2339  %B_even = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2340  %B_odd = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2341  %A_even_ext = sext <8 x i16> %A_even to <8 x i32>
2342  %B_even_ext = sext <8 x i16> %B_even to <8 x i32>
2343  %A_odd_ext = sext <8 x i16> %A_odd to <8 x i32>
2344  %B_odd_ext = sext <8 x i16> %B_odd to <8 x i32>
2345  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
2346  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
2347  %add = add <8 x i32> %even_mul, %odd_mul
2348  ret <8 x i32> %add
2349}
2350
2351define <16 x i32> @pmaddwd_512(<32 x i16>* %Aptr, <32 x i16>* %Bptr) {
2352; SSE2-LABEL: pmaddwd_512:
2353; SSE2:       # %bb.0:
2354; SSE2-NEXT:    movdqa (%rdi), %xmm0
2355; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
2356; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
2357; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
2358; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
2359; SSE2-NEXT:    pmaddwd 16(%rsi), %xmm1
2360; SSE2-NEXT:    pmaddwd 32(%rsi), %xmm2
2361; SSE2-NEXT:    pmaddwd 48(%rsi), %xmm3
2362; SSE2-NEXT:    retq
2363;
2364; AVX1-LABEL: pmaddwd_512:
2365; AVX1:       # %bb.0:
2366; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
2367; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
2368; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
2369; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
2370; AVX1-NEXT:    vpmaddwd 16(%rsi), %xmm1, %xmm1
2371; AVX1-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
2372; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2373; AVX1-NEXT:    vpmaddwd 48(%rsi), %xmm3, %xmm1
2374; AVX1-NEXT:    vpmaddwd 32(%rsi), %xmm2, %xmm2
2375; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
2376; AVX1-NEXT:    retq
2377;
2378; AVX2-LABEL: pmaddwd_512:
2379; AVX2:       # %bb.0:
2380; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2381; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
2382; AVX2-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
2383; AVX2-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
2384; AVX2-NEXT:    retq
2385;
2386; AVX512F-LABEL: pmaddwd_512:
2387; AVX512F:       # %bb.0:
2388; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
2389; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
2390; AVX512F-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
2391; AVX512F-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
2392; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2393; AVX512F-NEXT:    retq
2394;
2395; AVX512BW-LABEL: pmaddwd_512:
2396; AVX512BW:       # %bb.0:
2397; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
2398; AVX512BW-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
2399; AVX512BW-NEXT:    retq
2400  %A = load <32 x i16>, <32 x i16>* %Aptr
2401  %B = load <32 x i16>, <32 x i16>* %Bptr
2402  %A_even = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
2403  %A_odd = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
2404  %B_even = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
2405  %B_odd = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
2406  %A_even_ext = sext <16 x i16> %A_even to <16 x i32>
2407  %B_even_ext = sext <16 x i16> %B_even to <16 x i32>
2408  %A_odd_ext = sext <16 x i16> %A_odd to <16 x i32>
2409  %B_odd_ext = sext <16 x i16> %B_odd to <16 x i32>
2410  %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext
2411  %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext
2412  %add = add <16 x i32> %even_mul, %odd_mul
2413  ret <16 x i32> %add
2414}
2415
2416define <32 x i32> @pmaddwd_1024(<64 x i16>* %Aptr, <64 x i16>* %Bptr) {
2417; SSE2-LABEL: pmaddwd_1024:
2418; SSE2:       # %bb.0:
2419; SSE2-NEXT:    movq %rdi, %rax
2420; SSE2-NEXT:    movdqa (%rsi), %xmm0
2421; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
2422; SSE2-NEXT:    movdqa 32(%rsi), %xmm2
2423; SSE2-NEXT:    movdqa 48(%rsi), %xmm3
2424; SSE2-NEXT:    pmaddwd (%rdx), %xmm0
2425; SSE2-NEXT:    pmaddwd 16(%rdx), %xmm1
2426; SSE2-NEXT:    pmaddwd 32(%rdx), %xmm2
2427; SSE2-NEXT:    pmaddwd 48(%rdx), %xmm3
2428; SSE2-NEXT:    movdqa 64(%rsi), %xmm4
2429; SSE2-NEXT:    pmaddwd 64(%rdx), %xmm4
2430; SSE2-NEXT:    movdqa 80(%rsi), %xmm5
2431; SSE2-NEXT:    pmaddwd 80(%rdx), %xmm5
2432; SSE2-NEXT:    movdqa 96(%rsi), %xmm6
2433; SSE2-NEXT:    pmaddwd 96(%rdx), %xmm6
2434; SSE2-NEXT:    movdqa 112(%rsi), %xmm7
2435; SSE2-NEXT:    pmaddwd 112(%rdx), %xmm7
2436; SSE2-NEXT:    movdqa %xmm7, 112(%rdi)
2437; SSE2-NEXT:    movdqa %xmm6, 96(%rdi)
2438; SSE2-NEXT:    movdqa %xmm5, 80(%rdi)
2439; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
2440; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
2441; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
2442; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
2443; SSE2-NEXT:    movdqa %xmm0, (%rdi)
2444; SSE2-NEXT:    retq
2445;
2446; AVX1-LABEL: pmaddwd_1024:
2447; AVX1:       # %bb.0:
2448; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
2449; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
2450; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
2451; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
2452; AVX1-NEXT:    vpmaddwd 16(%rsi), %xmm1, %xmm1
2453; AVX1-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
2454; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2455; AVX1-NEXT:    vpmaddwd 48(%rsi), %xmm3, %xmm1
2456; AVX1-NEXT:    vpmaddwd 32(%rsi), %xmm2, %xmm2
2457; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
2458; AVX1-NEXT:    vmovdqa 80(%rdi), %xmm2
2459; AVX1-NEXT:    vpmaddwd 80(%rsi), %xmm2, %xmm2
2460; AVX1-NEXT:    vmovdqa 64(%rdi), %xmm3
2461; AVX1-NEXT:    vpmaddwd 64(%rsi), %xmm3, %xmm3
2462; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
2463; AVX1-NEXT:    vmovdqa 112(%rdi), %xmm3
2464; AVX1-NEXT:    vpmaddwd 112(%rsi), %xmm3, %xmm3
2465; AVX1-NEXT:    vmovdqa 96(%rdi), %xmm4
2466; AVX1-NEXT:    vpmaddwd 96(%rsi), %xmm4, %xmm4
2467; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
2468; AVX1-NEXT:    retq
2469;
2470; AVX2-LABEL: pmaddwd_1024:
2471; AVX2:       # %bb.0:
2472; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2473; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
2474; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm2
2475; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm3
2476; AVX2-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
2477; AVX2-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
2478; AVX2-NEXT:    vpmaddwd 64(%rsi), %ymm2, %ymm2
2479; AVX2-NEXT:    vpmaddwd 96(%rsi), %ymm3, %ymm3
2480; AVX2-NEXT:    retq
2481;
2482; AVX512F-LABEL: pmaddwd_1024:
2483; AVX512F:       # %bb.0:
2484; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
2485; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
2486; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm2
2487; AVX512F-NEXT:    vmovdqa 96(%rdi), %ymm3
2488; AVX512F-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
2489; AVX512F-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
2490; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2491; AVX512F-NEXT:    vpmaddwd 96(%rsi), %ymm3, %ymm1
2492; AVX512F-NEXT:    vpmaddwd 64(%rsi), %ymm2, %ymm2
2493; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
2494; AVX512F-NEXT:    retq
2495;
2496; AVX512BW-LABEL: pmaddwd_1024:
2497; AVX512BW:       # %bb.0:
2498; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
2499; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
2500; AVX512BW-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
2501; AVX512BW-NEXT:    vpmaddwd 64(%rsi), %zmm1, %zmm1
2502; AVX512BW-NEXT:    retq
2503  %A = load <64 x i16>, <64 x i16>* %Aptr
2504  %B = load <64 x i16>, <64 x i16>* %Bptr
2505  %A_even = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
2506  %A_odd = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
2507  %B_even = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
2508  %B_odd = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
2509  %A_even_ext = sext <32 x i16> %A_even to <32 x i32>
2510  %B_even_ext = sext <32 x i16> %B_even to <32 x i32>
2511  %A_odd_ext = sext <32 x i16> %A_odd to <32 x i32>
2512  %B_odd_ext = sext <32 x i16> %B_odd to <32 x i32>
2513  %even_mul = mul <32 x i32> %A_even_ext, %B_even_ext
2514  %odd_mul = mul <32 x i32> %A_odd_ext, %B_odd_ext
2515  %add = add <32 x i32> %even_mul, %odd_mul
2516  ret <32 x i32> %add
2517}
2518
2519define <4 x i32> @pmaddwd_commuted_mul(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
2520; SSE2-LABEL: pmaddwd_commuted_mul:
2521; SSE2:       # %bb.0:
2522; SSE2-NEXT:    movdqa (%rdi), %xmm0
2523; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
2524; SSE2-NEXT:    retq
2525;
2526; AVX-LABEL: pmaddwd_commuted_mul:
2527; AVX:       # %bb.0:
2528; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2529; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
2530; AVX-NEXT:    retq
2531  %A = load <8 x i16>, <8 x i16>* %Aptr
2532  %B = load <8 x i16>, <8 x i16>* %Bptr
2533  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2534  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2535  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
2536  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
2537  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
2538  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
2539  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
2540  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
2541  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
2542  %odd_mul = mul <4 x i32> %B_odd_ext, %A_odd_ext ; Different order than previous mul
2543  %add = add <4 x i32> %even_mul, %odd_mul
2544  ret <4 x i32> %add
2545}
2546
2547define <4 x i32> @pmaddwd_swapped_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
2548; SSE2-LABEL: pmaddwd_swapped_indices:
2549; SSE2:       # %bb.0:
2550; SSE2-NEXT:    movdqa (%rdi), %xmm0
2551; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
2552; SSE2-NEXT:    retq
2553;
2554; AVX-LABEL: pmaddwd_swapped_indices:
2555; AVX:       # %bb.0:
2556; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2557; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
2558; AVX-NEXT:    retq
2559  %A = load <8 x i16>, <8 x i16>* %Aptr
2560  %B = load <8 x i16>, <8 x i16>* %Bptr
2561  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; indices aren't all even
2562  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; indices aren't all odd
2563  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; same indices as A
2564  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; same indices as A
2565  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
2566  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
2567  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
2568  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
2569  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
2570  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
2571  %add = add <4 x i32> %even_mul, %odd_mul
2572  ret <4 x i32> %add
2573}
2574
2575; Negative test where indices aren't paired properly
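; A is split with masks {1,2,5,6}/{0,3,4,7} while B uses {0,2,4,6}/{1,3,5,7}, so the
; multiplied elements no longer come from matching pairs of adjacent lanes and no
; PMADDWD can be formed; the checks expect the plain sign-extended multiply sequence.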
2576define <4 x i32> @pmaddwd_bad_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
2577; SSE2-LABEL: pmaddwd_bad_indices:
2578; SSE2:       # %bb.0:
2579; SSE2-NEXT:    movdqa (%rdi), %xmm0
2580; SSE2-NEXT:    movdqa (%rsi), %xmm1
2581; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[2,1,2,3,4,5,6,7]
2582; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2583; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2584; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
2585; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
2586; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2587; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2588; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7]
2589; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
2590; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2591; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
2592; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
2593; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2594; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
2595; SSE2-NEXT:    movdqa %xmm2, %xmm4
2596; SSE2-NEXT:    pmulhw %xmm3, %xmm4
2597; SSE2-NEXT:    pmullw %xmm3, %xmm2
2598; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
2599; SSE2-NEXT:    movdqa %xmm0, %xmm3
2600; SSE2-NEXT:    pmulhw %xmm1, %xmm3
2601; SSE2-NEXT:    pmullw %xmm1, %xmm0
2602; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
2603; SSE2-NEXT:    paddd %xmm2, %xmm0
2604; SSE2-NEXT:    retq
2605;
2606; AVX-LABEL: pmaddwd_bad_indices:
2607; AVX:       # %bb.0:
2608; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2609; AVX-NEXT:    vmovdqa (%rsi), %xmm1
2610; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u]
2611; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u]
2612; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
2613; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2614; AVX-NEXT:    vpmovsxwd %xmm2, %xmm2
2615; AVX-NEXT:    vpmovsxwd %xmm3, %xmm3
2616; AVX-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
2617; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
2618; AVX-NEXT:    vpmovsxwd %xmm1, %xmm1
2619; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
2620; AVX-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
2621; AVX-NEXT:    retq
2622  %A = load <8 x i16>, <8 x i16>* %Aptr
2623  %B = load <8 x i16>, <8 x i16>* %Bptr
2624  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
2625  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
2626  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; different indices than A
2627  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; different indices than A
2628  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
2629  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
2630  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
2631  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
2632  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
2633  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
2634  %add = add <4 x i32> %even_mul, %odd_mul
2635  ret <4 x i32> %add
2636}
2637
2638; This test contains two multiplies joined by an add. The result of that add is then reduced to a single element.
2639; SelectionDAGBuilder should tag the joining add as a vector reduction. We need to recognize that both sides can use pmaddwd.
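; Roughly, as an illustrative C sketch (not part of the test; a, b, c and d stand for
; the four loaded <8 x i16> operands):
;
;   int32_t acc = 0;
;   for (int i = 0; i < 8; ++i)
;     acc += (int32_t)a[i] * b[i] + (int32_t)c[i] * d[i];
;
; Both multiplies should lower to PMADDWD feeding the shared reduction.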
2640define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %arg2, <8 x i16>* %arg3) {
2641; SSE2-LABEL: madd_double_reduction:
2642; SSE2:       # %bb.0:
2643; SSE2-NEXT:    movdqu (%rdi), %xmm0
2644; SSE2-NEXT:    movdqu (%rsi), %xmm1
2645; SSE2-NEXT:    pmaddwd %xmm0, %xmm1
2646; SSE2-NEXT:    movdqu (%rdx), %xmm0
2647; SSE2-NEXT:    movdqu (%rcx), %xmm2
2648; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
2649; SSE2-NEXT:    paddd %xmm1, %xmm2
2650; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
2651; SSE2-NEXT:    paddd %xmm2, %xmm0
2652; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2653; SSE2-NEXT:    paddd %xmm0, %xmm1
2654; SSE2-NEXT:    movd %xmm1, %eax
2655; SSE2-NEXT:    retq
2656;
2657; AVX-LABEL: madd_double_reduction:
2658; AVX:       # %bb.0:
2659; AVX-NEXT:    vmovdqu (%rdi), %xmm0
2660; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
2661; AVX-NEXT:    vmovdqu (%rdx), %xmm1
2662; AVX-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
2663; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
2664; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2665; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2666; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2667; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2668; AVX-NEXT:    vmovd %xmm0, %eax
2669; AVX-NEXT:    retq
2670  %tmp = load <8 x i16>, <8 x i16>* %arg, align 1
2671  %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1
2672  %tmp7 = sext <8 x i16> %tmp to <8 x i32>
2673  %tmp17 = sext <8 x i16> %tmp6 to <8 x i32>
2674  %tmp19 = mul nsw <8 x i32> %tmp7, %tmp17
2675  %tmp20 = load <8 x i16>, <8 x i16>* %arg2, align 1
2676  %tmp21 = load <8 x i16>, <8 x i16>* %arg3, align 1
2677  %tmp22 = sext <8 x i16> %tmp20 to <8 x i32>
2678  %tmp23 = sext <8 x i16> %tmp21 to <8 x i32>
2679  %tmp25 = mul nsw <8 x i32> %tmp22, %tmp23
2680  %tmp26 = add nuw nsw <8 x i32> %tmp25, %tmp19
2681  %tmp29 = shufflevector <8 x i32> %tmp26, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
2682  %tmp30 = add <8 x i32> %tmp26, %tmp29
2683  %tmp31 = shufflevector <8 x i32> %tmp30, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2684  %tmp32 = add <8 x i32> %tmp30, %tmp31
2685  %tmp33 = shufflevector <8 x i32> %tmp32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2686  %tmp34 = add <8 x i32> %tmp32, %tmp33
2687  %tmp35 = extractelement <8 x i32> %tmp34, i64 0
2688  ret i32 %tmp35
2689}
2690
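; Same as madd_double_reduction, but with four multiplies chained into the final
; reduction; all four should become PMADDWD.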
2691define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %arg2, <8 x i16>* %arg3, <8 x i16>* %arg4, <8 x i16>* %arg5, <8 x i16>* %arg6, <8 x i16>* %arg7) {
2692; SSE2-LABEL: madd_quad_reduction:
2693; SSE2:       # %bb.0:
2694; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2695; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2696; SSE2-NEXT:    movdqu (%rdi), %xmm0
2697; SSE2-NEXT:    movdqu (%rsi), %xmm1
2698; SSE2-NEXT:    pmaddwd %xmm0, %xmm1
2699; SSE2-NEXT:    movdqu (%rdx), %xmm0
2700; SSE2-NEXT:    movdqu (%rcx), %xmm2
2701; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
2702; SSE2-NEXT:    movdqu (%r8), %xmm0
2703; SSE2-NEXT:    movdqu (%r9), %xmm3
2704; SSE2-NEXT:    pmaddwd %xmm0, %xmm3
2705; SSE2-NEXT:    paddd %xmm1, %xmm3
2706; SSE2-NEXT:    movdqu (%rax), %xmm0
2707; SSE2-NEXT:    movdqu (%r10), %xmm1
2708; SSE2-NEXT:    pmaddwd %xmm0, %xmm1
2709; SSE2-NEXT:    paddd %xmm3, %xmm1
2710; SSE2-NEXT:    paddd %xmm2, %xmm1
2711; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
2712; SSE2-NEXT:    paddd %xmm1, %xmm0
2713; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2714; SSE2-NEXT:    paddd %xmm0, %xmm1
2715; SSE2-NEXT:    movd %xmm1, %eax
2716; SSE2-NEXT:    retq
2717;
2718; AVX-LABEL: madd_quad_reduction:
2719; AVX:       # %bb.0:
2720; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2721; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2722; AVX-NEXT:    vmovdqu (%rdi), %xmm0
2723; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
2724; AVX-NEXT:    vmovdqu (%rdx), %xmm1
2725; AVX-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
2726; AVX-NEXT:    vmovdqu (%r8), %xmm2
2727; AVX-NEXT:    vpmaddwd (%r9), %xmm2, %xmm2
2728; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
2729; AVX-NEXT:    vmovdqu (%rax), %xmm2
2730; AVX-NEXT:    vpmaddwd (%r10), %xmm2, %xmm2
2731; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
2732; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
2733; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2734; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2735; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2736; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2737; AVX-NEXT:    vmovd %xmm0, %eax
2738; AVX-NEXT:    retq
2739  %tmp = load <8 x i16>, <8 x i16>* %arg, align 1
2740  %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1
2741  %tmp7 = sext <8 x i16> %tmp to <8 x i32>
2742  %tmp17 = sext <8 x i16> %tmp6 to <8 x i32>
2743  %tmp19 = mul nsw <8 x i32> %tmp7, %tmp17
2744  %tmp20 = load <8 x i16>, <8 x i16>* %arg2, align 1
2745  %tmp21 = load <8 x i16>, <8 x i16>* %arg3, align 1
2746  %tmp22 = sext <8 x i16> %tmp20 to <8 x i32>
2747  %tmp23 = sext <8 x i16> %tmp21 to <8 x i32>
2748  %tmp25 = mul nsw <8 x i32> %tmp22, %tmp23
2749  %tmp26 = add nuw nsw <8 x i32> %tmp25, %tmp19
2750
2751  %tmp40 = load <8 x i16>, <8 x i16>* %arg4, align 1
2752  %tmp41 = load <8 x i16>, <8 x i16>* %arg5, align 1
2753  %tmp42 = sext <8 x i16> %tmp40 to <8 x i32>
2754  %tmp43 = sext <8 x i16> %tmp41 to <8 x i32>
2755  %tmp45 = mul nsw <8 x i32> %tmp42, %tmp43
2756  %tmp56 = add nuw nsw <8 x i32> %tmp26, %tmp45
2757
2758  %tmp50 = load <8 x i16>, <8 x i16>* %arg6, align 1
2759  %tmp51 = load <8 x i16>, <8 x i16>* %arg7, align 1
2760  %tmp52 = sext <8 x i16> %tmp50 to <8 x i32>
2761  %tmp53 = sext <8 x i16> %tmp51 to <8 x i32>
2762  %tmp55 = mul nsw <8 x i32> %tmp52, %tmp53
2763  %tmp57 = add nuw nsw <8 x i32> %tmp55, %tmp56
2764
2765  %tmp29 = shufflevector <8 x i32> %tmp57, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
2766  %tmp30 = add <8 x i32> %tmp57, %tmp29
2767  %tmp31 = shufflevector <8 x i32> %tmp30, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2768  %tmp32 = add <8 x i32> %tmp30, %tmp31
2769  %tmp33 = shufflevector <8 x i32> %tmp32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2770  %tmp34 = add <8 x i32> %tmp32, %tmp33
2771  %tmp35 = extractelement <8 x i32> %tmp34, i64 0
2772  ret i32 %tmp35
2773}
2774
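; The loop below accumulates both the sum of the zero-extended bytes and the sum of
; their squares. Each byte fits in a signed 16-bit lane, so the squaring multiply
; should narrow to PMADDWD; the two reduced sums are returned packed into one i64.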
2775define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
2776; SSE2-LABEL: sum_and_sum_of_squares:
2777; SSE2:       # %bb.0: # %entry
2778; SSE2-NEXT:    movl %esi, %eax
2779; SSE2-NEXT:    pxor %xmm0, %xmm0
2780; SSE2-NEXT:    pxor %xmm1, %xmm1
2781; SSE2-NEXT:    pxor %xmm2, %xmm2
2782; SSE2-NEXT:    pxor %xmm3, %xmm3
2783; SSE2-NEXT:    .p2align 4, 0x90
2784; SSE2-NEXT:  .LBB33_1: # %vector.body
2785; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
2786; SSE2-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
2787; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2788; SSE2-NEXT:    movdqa %xmm4, %xmm5
2789; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2790; SSE2-NEXT:    paddd %xmm5, %xmm2
2791; SSE2-NEXT:    movdqa %xmm4, %xmm5
2792; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
2793; SSE2-NEXT:    paddd %xmm5, %xmm3
2794; SSE2-NEXT:    pmaddwd %xmm4, %xmm4
2795; SSE2-NEXT:    paddd %xmm4, %xmm1
2796; SSE2-NEXT:    addq $8, %rdi
2797; SSE2-NEXT:    addq $-8, %rax
2798; SSE2-NEXT:    jne .LBB33_1
2799; SSE2-NEXT:  # %bb.2: # %middle.block
2800; SSE2-NEXT:    paddd %xmm3, %xmm2
2801; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
2802; SSE2-NEXT:    paddd %xmm2, %xmm3
2803; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
2804; SSE2-NEXT:    paddd %xmm3, %xmm2
2805; SSE2-NEXT:    movd %xmm2, %ecx
2806; SSE2-NEXT:    paddd %xmm0, %xmm1
2807; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
2808; SSE2-NEXT:    paddd %xmm1, %xmm0
2809; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2810; SSE2-NEXT:    paddd %xmm0, %xmm1
2811; SSE2-NEXT:    movd %xmm1, %eax
2812; SSE2-NEXT:    shlq $32, %rcx
2813; SSE2-NEXT:    orq %rcx, %rax
2814; SSE2-NEXT:    retq
2815;
2816; AVX1-LABEL: sum_and_sum_of_squares:
2817; AVX1:       # %bb.0: # %entry
2818; AVX1-NEXT:    movl %esi, %eax
2819; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2820; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2821; AVX1-NEXT:    .p2align 4, 0x90
2822; AVX1-NEXT:  .LBB33_1: # %vector.body
2823; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
2824; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2825; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2826; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
2827; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
2828; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
2829; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
2830; AVX1-NEXT:    vpmaddwd %xmm2, %xmm2, %xmm2
2831; AVX1-NEXT:    vpmaddwd %xmm3, %xmm3, %xmm3
2832; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
2833; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
2834; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
2835; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
2836; AVX1-NEXT:    addq $8, %rdi
2837; AVX1-NEXT:    addq $-8, %rax
2838; AVX1-NEXT:    jne .LBB33_1
2839; AVX1-NEXT:  # %bb.2: # %middle.block
2840; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2841; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
2842; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
2843; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
2844; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
2845; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
2846; AVX1-NEXT:    vmovd %xmm1, %ecx
2847; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2848; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2849; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2850; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2851; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2852; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2853; AVX1-NEXT:    vmovd %xmm0, %eax
2854; AVX1-NEXT:    shlq $32, %rcx
2855; AVX1-NEXT:    orq %rcx, %rax
2856; AVX1-NEXT:    vzeroupper
2857; AVX1-NEXT:    retq
2858;
2859; AVX256-LABEL: sum_and_sum_of_squares:
2860; AVX256:       # %bb.0: # %entry
2861; AVX256-NEXT:    movl %esi, %eax
2862; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2863; AVX256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2864; AVX256-NEXT:    .p2align 4, 0x90
2865; AVX256-NEXT:  .LBB33_1: # %vector.body
2866; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
2867; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
2868; AVX256-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
2869; AVX256-NEXT:    vpmaddwd %ymm2, %ymm2, %ymm2
2870; AVX256-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
2871; AVX256-NEXT:    addq $8, %rdi
2872; AVX256-NEXT:    addq $-8, %rax
2873; AVX256-NEXT:    jne .LBB33_1
2874; AVX256-NEXT:  # %bb.2: # %middle.block
2875; AVX256-NEXT:    vextracti128 $1, %ymm1, %xmm2
2876; AVX256-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
2877; AVX256-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
2878; AVX256-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
2879; AVX256-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
2880; AVX256-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
2881; AVX256-NEXT:    vmovd %xmm1, %ecx
2882; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
2883; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2884; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2885; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2886; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2887; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2888; AVX256-NEXT:    vmovd %xmm0, %eax
2889; AVX256-NEXT:    shlq $32, %rcx
2890; AVX256-NEXT:    orq %rcx, %rax
2891; AVX256-NEXT:    vzeroupper
2892; AVX256-NEXT:    retq
2893entry:
2894  %0 = zext i32 %n to i64
2895  br label %vector.body
2896
2897vector.body:
2898  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
2899  %vec.phi = phi <8 x i32> [ %6, %vector.body ], [ zeroinitializer, %entry ]
2900  %sum.phi = phi <8 x i32> [ %4, %vector.body ], [ zeroinitializer, %entry ]
2901  %1 = getelementptr inbounds i8, i8* %a, i64 %index
2902  %2 = bitcast i8* %1 to <8 x i8>*
2903  %wide.load = load <8 x i8>, <8 x i8>* %2, align 1
2904  %3 = zext <8 x i8> %wide.load to <8 x i32>
2905  %4 = add nsw <8 x i32> %3, %sum.phi
2906  %5 = mul nsw <8 x i32> %3, %3
2907  %6 = add nsw <8 x i32> %5, %vec.phi
2908  %index.next = add i64 %index, 8
2909  %7 = icmp eq i64 %index.next, %0
2910  br i1 %7, label %middle.block, label %vector.body
2911
2912middle.block:
2913  %rdx.shuf35 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
2914  %bin.rdx36 = add <8 x i32> %4, %rdx.shuf35
2915  %rdx.shuf37 = shufflevector <8 x i32> %bin.rdx36, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2916  %bin.rdx38 = add <8 x i32> %bin.rdx36, %rdx.shuf37
2917  %rdx.shuf39 = shufflevector <8 x i32> %bin.rdx38, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2918  %bin.rdx40 = add <8 x i32> %bin.rdx38, %rdx.shuf39
2919  %8 = extractelement <8 x i32> %bin.rdx40, i32 0
2920  %rdx.shuf = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
2921  %bin.rdx = add <8 x i32> %6, %rdx.shuf
2922  %rdx.shuf31 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2923  %bin.rdx32 = add <8 x i32> %bin.rdx, %rdx.shuf31
2924  %rdx.shuf33 = shufflevector <8 x i32> %bin.rdx32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2925  %bin.rdx34 = add <8 x i32> %bin.rdx32, %rdx.shuf33
2926  %9 = extractelement <8 x i32> %bin.rdx34, i32 0
2927  %tmp = zext i32 %8 to i64
2928  %tmp28 = shl nuw i64 %tmp, 32
2929  %tmp29 = zext i32 %9 to i64
2930  %tmp30 = or i64 %tmp28, %tmp29
2931  ret i64 %tmp30
2932}
2933
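; The loop below accumulates (b[i] - a[i])^2 over zero-extended bytes. The difference
; of two u8 values fits in a signed 16-bit lane, so the squaring multiply should
; narrow to PMADDWD.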
2934define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
2935; SSE2-LABEL: sum_of_square_differences:
2936; SSE2:       # %bb.0: # %entry
2937; SSE2-NEXT:    movl %edx, %eax
2938; SSE2-NEXT:    pxor %xmm0, %xmm0
2939; SSE2-NEXT:    xorl %ecx, %ecx
2940; SSE2-NEXT:    pxor %xmm1, %xmm1
2941; SSE2-NEXT:    .p2align 4, 0x90
2942; SSE2-NEXT:  .LBB34_1: # %vector.body
2943; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
2944; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
2945; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
2946; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2947; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2948; SSE2-NEXT:    psubw %xmm2, %xmm3
2949; SSE2-NEXT:    pmaddwd %xmm3, %xmm3
2950; SSE2-NEXT:    paddd %xmm3, %xmm1
2951; SSE2-NEXT:    addq $8, %rcx
2952; SSE2-NEXT:    cmpq %rcx, %rax
2953; SSE2-NEXT:    jne .LBB34_1
2954; SSE2-NEXT:  # %bb.2: # %middle.block
2955; SSE2-NEXT:    paddd %xmm0, %xmm1
2956; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
2957; SSE2-NEXT:    paddd %xmm1, %xmm0
2958; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2959; SSE2-NEXT:    paddd %xmm0, %xmm1
2960; SSE2-NEXT:    movd %xmm1, %eax
2961; SSE2-NEXT:    retq
2962;
2963; AVX1-LABEL: sum_of_square_differences:
2964; AVX1:       # %bb.0: # %entry
2965; AVX1-NEXT:    movl %edx, %eax
2966; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2967; AVX1-NEXT:    xorl %ecx, %ecx
2968; AVX1-NEXT:    .p2align 4, 0x90
2969; AVX1-NEXT:  .LBB34_1: # %vector.body
2970; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
2971; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2972; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2973; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
2974; AVX1-NEXT:    vpmaddwd %xmm1, %xmm1, %xmm1
2975; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
2976; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2977; AVX1-NEXT:    addq $8, %rcx
2978; AVX1-NEXT:    cmpq %rcx, %rax
2979; AVX1-NEXT:    jne .LBB34_1
2980; AVX1-NEXT:  # %bb.2: # %middle.block
2981; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2982; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2983; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2984; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2985; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2986; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2987; AVX1-NEXT:    vmovd %xmm0, %eax
2988; AVX1-NEXT:    vzeroupper
2989; AVX1-NEXT:    retq
2990;
2991; AVX256-LABEL: sum_of_square_differences:
2992; AVX256:       # %bb.0: # %entry
2993; AVX256-NEXT:    movl %edx, %eax
2994; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2995; AVX256-NEXT:    xorl %ecx, %ecx
2996; AVX256-NEXT:    .p2align 4, 0x90
2997; AVX256-NEXT:  .LBB34_1: # %vector.body
2998; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
2999; AVX256-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
3000; AVX256-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
3001; AVX256-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
3002; AVX256-NEXT:    vpmaddwd %xmm1, %xmm1, %xmm1
3003; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
3004; AVX256-NEXT:    addq $8, %rcx
3005; AVX256-NEXT:    cmpq %rcx, %rax
3006; AVX256-NEXT:    jne .LBB34_1
3007; AVX256-NEXT:  # %bb.2: # %middle.block
3008; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
3009; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
3010; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3011; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
3012; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
3013; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
3014; AVX256-NEXT:    vmovd %xmm0, %eax
3015; AVX256-NEXT:    vzeroupper
3016; AVX256-NEXT:    retq
3017entry:
3018  %0 = zext i32 %n to i64
3019  br label %vector.body
3020
3021vector.body:
3022  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
3023  %vec.phi = phi <8 x i32> [ %9, %vector.body ], [ zeroinitializer, %entry ]
3024  %1 = getelementptr inbounds i8, i8* %a, i64 %index
3025  %2 = bitcast i8* %1 to <8 x i8>*
3026  %wide.load = load <8 x i8>, <8 x i8>* %2, align 1
3027  %3 = zext <8 x i8> %wide.load to <8 x i32>
3028  %4 = getelementptr inbounds i8, i8* %b, i64 %index
3029  %5 = bitcast i8* %4 to <8 x i8>*
3030  %wide.load2 = load <8 x i8>, <8 x i8>* %5, align 1
3031  %6 = zext <8 x i8> %wide.load2 to <8 x i32>
3032  %7 = sub <8 x i32> %6, %3
3033  %8 = mul <8 x i32> %7, %7
3034  %9 = add nsw <8 x i32> %8, %vec.phi
3035  %index.next = add i64 %index, 8
3036  %10 = icmp eq i64 %index.next, %0
3037  br i1 %10, label %middle.block, label %vector.body
3038
3039middle.block:
3040  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
3041  %bin.rdx = add <8 x i32> %9, %rdx.shuf
3042  %rdx.shuf31 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3043  %bin.rdx32 = add <8 x i32> %bin.rdx, %rdx.shuf31
3044  %rdx.shuf33 = shufflevector <8 x i32> %bin.rdx32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3045  %bin.rdx34 = add <8 x i32> %bin.rdx32, %rdx.shuf33
3046  %11 = extractelement <8 x i32> %bin.rdx34, i32 0
3047  ret i32 %11
3048}
3049