; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL
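; These tests exercise the @llvm.experimental.vector.reduce.mul.* intrinsics:
; each RUN line lowers the same IR for a different subtarget, so the check
; blocks below show how the multiply reductions degrade from AVX512DQ (native
; 64-bit vpmullq) down to plain SSE2.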

;
; vXi64
;
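; Without AVX512DQ there is no 64-bit element multiply, so each v2i64 product
; is built from 32-bit halves with pmuludq/psrlq/psllq, using
;   a*b = lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32)   (mod 2^64)
; where hi/lo are the upper/lower 32 bits. The reduction then shuffles the
; high element down and repeats. AVX512DQ targets use vpmullq directly.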

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512BW-LABEL: test_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v2i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v2i64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovq %xmm0, %rax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v2i64:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
; AVX512DQVL-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a0)
  ret i64 %1
}

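; For wider vectors the reduction first multiplies the high and low halves
; together (the vextractf128/vextracti128 below), then finishes with the same
; v2i64 sequence as above.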
define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX512BW-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BW-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX512BW-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BW-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v4i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BWVL-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX512BWVL-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX512BWVL-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BWVL-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX512BWVL-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX512BWVL-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v4i64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovq %xmm0, %rax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v4i64:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmullq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psrlq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm1, %xmm5
; SSE-NEXT:    paddq %xmm4, %xmm5
; SSE-NEXT:    psllq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm0, %xmm4
; SSE-NEXT:    paddq %xmm3, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm4
; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm5
; AVX1-NEXT:    vpmuludq %xmm5, %xmm3, %xmm5
; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm1
; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v8i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v8i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v8i64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovq %xmm0, %rax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v8i64:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> %a0)
  ret i64 %1
}

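; With AVX512DQ the whole v16i64 reduction below is a vpmullq tree: multiply
; the two zmm inputs, then fold 512 -> 256 -> 128 -> 64 bits with extracts and
; shuffles.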
define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm2, %xmm8
; SSE-NEXT:    psrlq $32, %xmm8
; SSE-NEXT:    pmuludq %xmm6, %xmm8
; SSE-NEXT:    movdqa %xmm6, %xmm9
; SSE-NEXT:    psrlq $32, %xmm9
; SSE-NEXT:    pmuludq %xmm2, %xmm9
; SSE-NEXT:    paddq %xmm8, %xmm9
; SSE-NEXT:    psllq $32, %xmm9
; SSE-NEXT:    pmuludq %xmm6, %xmm2
; SSE-NEXT:    paddq %xmm9, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm8
; SSE-NEXT:    psrlq $32, %xmm8
; SSE-NEXT:    pmuludq %xmm4, %xmm8
; SSE-NEXT:    movdqa %xmm4, %xmm6
; SSE-NEXT:    psrlq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm0, %xmm6
; SSE-NEXT:    paddq %xmm8, %xmm6
; SSE-NEXT:    psllq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm4, %xmm0
; SSE-NEXT:    paddq %xmm6, %xmm0
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm7, %xmm4
; SSE-NEXT:    movdqa %xmm7, %xmm6
; SSE-NEXT:    psrlq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm3, %xmm6
; SSE-NEXT:    paddq %xmm4, %xmm6
; SSE-NEXT:    psllq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm7, %xmm3
; SSE-NEXT:    paddq %xmm6, %xmm3
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm5, %xmm4
; SSE-NEXT:    movdqa %xmm5, %xmm6
; SSE-NEXT:    psrlq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm1, %xmm6
; SSE-NEXT:    paddq %xmm4, %xmm6
; SSE-NEXT:    psllq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm5, %xmm1
; SSE-NEXT:    paddq %xmm6, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psrlq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm1, %xmm5
; SSE-NEXT:    paddq %xmm4, %xmm5
; SSE-NEXT:    psllq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm0, %xmm4
; SSE-NEXT:    paddq %xmm3, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm5
; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm5
; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm5
; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm6
; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm6
; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm6
; AVX1-NEXT:    vpmuludq %xmm3, %xmm6, %xmm6
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm7
; AVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
; AVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm6
; AVX1-NEXT:    vpaddq %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm1
; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm2
; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm4, %xmm5, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm2
; AVX1-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm4
; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm5
; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm5
; AVX2-NEXT:    vpaddq %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsllq $32, %ymm4, %ymm4
; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm4
; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm4
; AVX2-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v16i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v16i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v16i64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vmovq %xmm0, %rax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v16i64:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;
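; SSE2 has no 32-bit element multiply (pmulld is SSE4.1), so the even and odd
; lanes are multiplied separately with pmuludq and re-interleaved with
; pshufd/punpckldq; SSE41 and AVX targets use pmulld directly.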

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a0)
  ret i32 %1
}

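; In the wider tests below, the AVX2/AVX512 sequences keep operating on full
; ymm/zmm registers after the 128-bit extract; the upper lanes then hold
; don't-care values, which is harmless because only the low element is moved
; to the scalar result register.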
define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-LABEL: test_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-LABEL: test_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld %xmm3, %xmm1
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> %a0)
  ret i32 %1
}

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-LABEL: test_v32i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm8, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm7, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld %xmm6, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm7, %xmm3
; SSE41-NEXT:    pmulld %xmm5, %xmm1
; SSE41-NEXT:    pmulld %xmm3, %xmm1
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpmulld %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;
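; pmullw has been available since SSE2, so the vXi16 reductions are a simple
; halve-and-multiply tree; the last step shifts the odd i16 lane down with
; psrld $16 before the final multiply.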
1128
1129define i16 @test_v8i16(<8 x i16> %a0) {
1130; SSE-LABEL: test_v8i16:
1131; SSE:       # %bb.0:
1132; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1133; SSE-NEXT:    pmullw %xmm0, %xmm1
1134; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1135; SSE-NEXT:    pmullw %xmm1, %xmm0
1136; SSE-NEXT:    movdqa %xmm0, %xmm1
1137; SSE-NEXT:    psrld $16, %xmm1
1138; SSE-NEXT:    pmullw %xmm0, %xmm1
1139; SSE-NEXT:    movd %xmm1, %eax
1140; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
1141; SSE-NEXT:    retq
1142;
1143; AVX-LABEL: test_v8i16:
1144; AVX:       # %bb.0:
1145; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1146; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1147; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1148; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1149; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
1150; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1151; AVX-NEXT:    vmovd %xmm0, %eax
1152; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
1153; AVX-NEXT:    retq
1154;
1155; AVX512-LABEL: test_v8i16:
1156; AVX512:       # %bb.0:
1157; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1158; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1159; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1160; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1161; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
1162; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1163; AVX512-NEXT:    vmovd %xmm0, %eax
1164; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
1165; AVX512-NEXT:    retq
1166  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> %a0)
1167  ret i16 %1
1168}
1169
1170define i16 @test_v16i16(<16 x i16> %a0) {
1171; SSE-LABEL: test_v16i16:
1172; SSE:       # %bb.0:
1173; SSE-NEXT:    pmullw %xmm1, %xmm0
1174; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1175; SSE-NEXT:    pmullw %xmm0, %xmm1
1176; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1177; SSE-NEXT:    pmullw %xmm1, %xmm0
1178; SSE-NEXT:    movdqa %xmm0, %xmm1
1179; SSE-NEXT:    psrld $16, %xmm1
1180; SSE-NEXT:    pmullw %xmm0, %xmm1
1181; SSE-NEXT:    movd %xmm1, %eax
1182; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
1183; SSE-NEXT:    retq
1184;
1185; AVX1-LABEL: test_v16i16:
1186; AVX1:       # %bb.0:
1187; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1188; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1189; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1190; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1191; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1192; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1193; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
1194; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1195; AVX1-NEXT:    vmovd %xmm0, %eax
1196; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
1197; AVX1-NEXT:    vzeroupper
1198; AVX1-NEXT:    retq
1199;
1200; AVX2-LABEL: test_v16i16:
1201; AVX2:       # %bb.0:
1202; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1203; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1204; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1205; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1206; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1207; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1208; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
1209; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1210; AVX2-NEXT:    vmovd %xmm0, %eax
1211; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
1212; AVX2-NEXT:    vzeroupper
1213; AVX2-NEXT:    retq
1214;
1215; AVX512-LABEL: test_v16i16:
1216; AVX512:       # %bb.0:
1217; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
1218; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1219; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1220; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1221; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1222; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1223; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
1224; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1225; AVX512-NEXT:    vmovd %xmm0, %eax
1226; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
1227; AVX512-NEXT:    vzeroupper
1228; AVX512-NEXT:    retq
1229  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> %a0)
1230  ret i16 %1
1231}
1232
1233define i16 @test_v32i16(<32 x i16> %a0) {
1234; SSE-LABEL: test_v32i16:
1235; SSE:       # %bb.0:
1236; SSE-NEXT:    pmullw %xmm3, %xmm1
1237; SSE-NEXT:    pmullw %xmm2, %xmm0
1238; SSE-NEXT:    pmullw %xmm1, %xmm0
1239; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1240; SSE-NEXT:    pmullw %xmm0, %xmm1
1241; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1242; SSE-NEXT:    pmullw %xmm1, %xmm0
1243; SSE-NEXT:    movdqa %xmm0, %xmm1
1244; SSE-NEXT:    psrld $16, %xmm1
1245; SSE-NEXT:    pmullw %xmm0, %xmm1
1246; SSE-NEXT:    movd %xmm1, %eax
1247; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
1248; SSE-NEXT:    retq
1249;
; AVX1-LABEL: test_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v32i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovd %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v32i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw %xmm6, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm7, %xmm3
; SSE-NEXT:    pmullw %xmm5, %xmm1
; SSE-NEXT:    pmullw %xmm3, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v64i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v64i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovd %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v64i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;

define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmullw %xmm3, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT:    pmullw %xmm3, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> %a0)
  ret i8 %1
}

define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    packuswb %xmm3, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    packuswb %xmm3, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    packuswb %xmm3, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    pand %xmm1, %xmm3
; SSE41-NEXT:    packuswb %xmm0, %xmm3
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm0, %xmm3
; SSE41-NEXT:    pand %xmm1, %xmm3
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm3
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    pand %xmm1, %xmm3
; SSE41-NEXT:    packuswb %xmm0, %xmm3
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    pand %xmm1, %xmm3
; SSE41-NEXT:    packuswb %xmm0, %xmm3
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    pmullw %xmm0, %xmm3
; SSE41-NEXT:    pextrb $0, %xmm3, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v32i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v32i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> %a0)
  ret i8 %1
}

2073define i8 @test_v64i8(<64 x i8> %a0) {
2074; SSE2-LABEL: test_v64i8:
2075; SSE2:       # %bb.0:
2076; SSE2-NEXT:    movdqa %xmm2, %xmm4
2077; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2078; SSE2-NEXT:    movdqa %xmm0, %xmm5
2079; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2080; SSE2-NEXT:    pmullw %xmm4, %xmm5
2081; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2082; SSE2-NEXT:    pand %xmm4, %xmm5
2083; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2084; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2085; SSE2-NEXT:    pmullw %xmm2, %xmm0
2086; SSE2-NEXT:    pand %xmm4, %xmm0
2087; SSE2-NEXT:    packuswb %xmm5, %xmm0
2088; SSE2-NEXT:    movdqa %xmm3, %xmm2
2089; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2090; SSE2-NEXT:    movdqa %xmm1, %xmm5
2091; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2092; SSE2-NEXT:    pmullw %xmm2, %xmm5
2093; SSE2-NEXT:    pand %xmm4, %xmm5
2094; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2095; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2096; SSE2-NEXT:    pmullw %xmm3, %xmm1
2097; SSE2-NEXT:    pand %xmm4, %xmm1
2098; SSE2-NEXT:    packuswb %xmm5, %xmm1
2099; SSE2-NEXT:    movdqa %xmm1, %xmm2
2100; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2101; SSE2-NEXT:    movdqa %xmm0, %xmm3
2102; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2103; SSE2-NEXT:    pmullw %xmm2, %xmm3
2104; SSE2-NEXT:    pand %xmm4, %xmm3
2105; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2106; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2107; SSE2-NEXT:    pmullw %xmm1, %xmm0
2108; SSE2-NEXT:    pand %xmm4, %xmm0
2109; SSE2-NEXT:    packuswb %xmm3, %xmm0
2110; SSE2-NEXT:    movdqa %xmm0, %xmm1
2111; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2112; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2113; SSE2-NEXT:    pmullw %xmm1, %xmm0
2114; SSE2-NEXT:    pand %xmm4, %xmm0
2115; SSE2-NEXT:    pxor %xmm2, %xmm2
2116; SSE2-NEXT:    packuswb %xmm2, %xmm0
2117; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
2118; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2119; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2120; SSE2-NEXT:    pmullw %xmm0, %xmm1
2121; SSE2-NEXT:    pand %xmm4, %xmm1
2122; SSE2-NEXT:    packuswb %xmm2, %xmm1
2123; SSE2-NEXT:    movdqa %xmm1, %xmm0
2124; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2125; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2126; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2127; SSE2-NEXT:    pmullw %xmm0, %xmm1
2128; SSE2-NEXT:    pand %xmm4, %xmm1
2129; SSE2-NEXT:    packuswb %xmm2, %xmm1
2130; SSE2-NEXT:    movdqa %xmm1, %xmm0
2131; SSE2-NEXT:    psrlw $8, %xmm0
2132; SSE2-NEXT:    movdqa %xmm1, %xmm2
2133; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2134; SSE2-NEXT:    pmullw %xmm0, %xmm2
2135; SSE2-NEXT:    pand %xmm4, %xmm2
2136; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2137; SSE2-NEXT:    pmullw %xmm0, %xmm1
2138; SSE2-NEXT:    pand %xmm4, %xmm1
2139; SSE2-NEXT:    packuswb %xmm2, %xmm1
2140; SSE2-NEXT:    movd %xmm1, %eax
2141; SSE2-NEXT:    # kill: def $al killed $al killed $eax
2142; SSE2-NEXT:    retq
2143;
2144; SSE41-LABEL: test_v64i8:
2145; SSE41:       # %bb.0:
2146; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2147; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2148; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2149; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2150; SSE41-NEXT:    pmullw %xmm2, %xmm0
2151; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2152; SSE41-NEXT:    pand %xmm2, %xmm0
2153; SSE41-NEXT:    pmullw %xmm5, %xmm4
2154; SSE41-NEXT:    pand %xmm2, %xmm4
2155; SSE41-NEXT:    packuswb %xmm0, %xmm4
2156; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2157; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2158; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2159; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2160; SSE41-NEXT:    pmullw %xmm3, %xmm1
2161; SSE41-NEXT:    pand %xmm2, %xmm1
2162; SSE41-NEXT:    pmullw %xmm0, %xmm5
2163; SSE41-NEXT:    pand %xmm2, %xmm5
2164; SSE41-NEXT:    packuswb %xmm1, %xmm5
2165; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
2166; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2167; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
2168; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2169; SSE41-NEXT:    pmullw %xmm5, %xmm4
2170; SSE41-NEXT:    pand %xmm2, %xmm4
2171; SSE41-NEXT:    pmullw %xmm0, %xmm1
2172; SSE41-NEXT:    pand %xmm2, %xmm1
2173; SSE41-NEXT:    packuswb %xmm4, %xmm1
2174; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2175; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2176; SSE41-NEXT:    pmullw %xmm0, %xmm1
2177; SSE41-NEXT:    pand %xmm2, %xmm1
2178; SSE41-NEXT:    pxor %xmm0, %xmm0
2179; SSE41-NEXT:    packuswb %xmm0, %xmm1
2180; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2181; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2182; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2183; SSE41-NEXT:    pmullw %xmm3, %xmm1
2184; SSE41-NEXT:    pand %xmm2, %xmm1
2185; SSE41-NEXT:    packuswb %xmm0, %xmm1
2186; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2187; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2188; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
2189; SSE41-NEXT:    pmullw %xmm3, %xmm1
2190; SSE41-NEXT:    pand %xmm2, %xmm1
2191; SSE41-NEXT:    packuswb %xmm0, %xmm1
2192; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2193; SSE41-NEXT:    psrlw $8, %xmm1
2194; SSE41-NEXT:    pmullw %xmm0, %xmm1
2195; SSE41-NEXT:    pextrb $0, %xmm1, %eax
2196; SSE41-NEXT:    # kill: def $al killed $al killed $eax
2197; SSE41-NEXT:    retq
2198;
2199; AVX1-LABEL: test_v64i8:
2200; AVX1:       # %bb.0:
2201; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2202; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2203; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm3
2204; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2205; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
2206; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2207; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2208; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
2209; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
2210; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
2211; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2212; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2213; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2214; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2215; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
2216; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
2217; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2218; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2219; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2220; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2221; AVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
2222; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2223; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2224; AVX1-NEXT:    vpmullw %xmm1, %xmm4, %xmm1
2225; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
2226; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2227; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2228; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
2229; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2230; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2231; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
2232; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2233; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2234; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
2235; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
2236; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2237; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2238; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2239; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2240; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
2241; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
2242; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2243; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2244; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
2245; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
2246; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2247; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2248; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2249; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2250; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
2251; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
2252; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2253; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2254; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
2255; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
2256; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2257; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2258; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2259; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2260; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
2261; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
2262; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2263; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2264; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
2265; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
2266; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2267; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2268; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2269; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2270; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
2271; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
2272; AVX1-NEXT:    # kill: def $al killed $al killed $eax
2273; AVX1-NEXT:    vzeroupper
2274; AVX1-NEXT:    retq
2275;
2276; AVX2-LABEL: test_v64i8:
2277; AVX2:       # %bb.0:
2278; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm2
2279; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm3
2280; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm3
2281; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
2282; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2283; AVX2-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
2284; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
2285; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
2286; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
2287; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
2288; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
2289; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
2290; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2291; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2292; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2293; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2294; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2295; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
2296; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm1
2297; AVX2-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
2298; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2299; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2300; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2301; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2302; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
2303; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
2304; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
2305; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2306; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2307; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2308; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2309; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2310; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
2311; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
2312; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
2313; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2314; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2315; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2316; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2317; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2318; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
2319; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
2320; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
2321; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2322; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2323; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2324; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2325; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2326; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
2327; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
2328; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
2329; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2330; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
2331; AVX2-NEXT:    # kill: def $al killed $al killed $eax
2332; AVX2-NEXT:    vzeroupper
2333; AVX2-NEXT:    retq
2334;
; AVX512BW-LABEL: test_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v64i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm2
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm3
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm2, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v64i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm2
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm3
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX512DQVL-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512DQVL-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm2, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> %a0)
  ret i8 %1
}

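; test_v128i8 is the widest byte reduction here: eight 128-bit input
; registers' worth of bytes. x86 has no byte multiply, so (as the checks
; below show) every target widens to i16 lanes, multiplies with
; pmullw/vpmullw, narrows back to bytes, and keeps halving the vector until
; the product sits in element 0.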
define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-LABEL: test_v128i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm5, %xmm8
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
; SSE2-NEXT:    movdqa %xmm1, %xmm9
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm8, %xmm9
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm8, %xmm9
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm5, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    packuswb %xmm9, %xmm1
; SSE2-NEXT:    movdqa %xmm7, %xmm9
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
; SSE2-NEXT:    movdqa %xmm3, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm9, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm7, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    packuswb %xmm5, %xmm3
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm5, %xmm7
; SSE2-NEXT:    pand %xmm8, %xmm7
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm4, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    packuswb %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm6, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm6, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    packuswb %xmm5, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    packuswb %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    packuswb %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v128i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm5, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm5, %xmm1
; SSE41-NEXT:    pmullw %xmm9, %xmm8
; SSE41-NEXT:    pand %xmm5, %xmm8
; SSE41-NEXT:    packuswb %xmm1, %xmm8
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm7, %xmm3
; SSE41-NEXT:    pand %xmm5, %xmm3
; SSE41-NEXT:    pmullw %xmm9, %xmm1
; SSE41-NEXT:    pand %xmm5, %xmm1
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm4, %xmm0
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    pmullw %xmm7, %xmm3
; SSE41-NEXT:    pand %xmm5, %xmm3
; SSE41-NEXT:    packuswb %xmm0, %xmm3
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm6, %xmm2
; SSE41-NEXT:    pand %xmm5, %xmm2
; SSE41-NEXT:    pmullw %xmm0, %xmm4
; SSE41-NEXT:    pand %xmm5, %xmm4
; SSE41-NEXT:    packuswb %xmm2, %xmm4
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm5, %xmm3
; SSE41-NEXT:    pmullw %xmm2, %xmm0
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    packuswb %xmm3, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm1, %xmm8
; SSE41-NEXT:    pand %xmm5, %xmm8
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    pand %xmm5, %xmm3
; SSE41-NEXT:    packuswb %xmm8, %xmm3
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm3, %xmm0
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    pmullw %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm5, %xmm2
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm5, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT:    pmullw %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm5, %xmm2
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; SSE41-NEXT:    pmullw %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm5, %xmm2
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    pmullw %xmm0, %xmm2
; SSE41-NEXT:    pextrb $0, %xmm2, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v128i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm4, %xmm7, %xmm7
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT:    vpmullw %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm9, %xmm7, %xmm7
; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm6
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v128i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT:    vpmovsxbw %xmm4, %ymm4
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
; AVX2-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
; AVX2-NEXT:    vpmovsxbw %xmm6, %ymm6
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
; AVX2-NEXT:    vpmovsxbw %xmm7, %ymm7
; AVX2-NEXT:    vpmullw %ymm6, %ymm7, %ymm6
; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
; AVX2-NEXT:    vpshufb %xmm4, %xmm7, %xmm7
; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm2
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpmovsxbw %xmm6, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm2
; AVX2-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v128i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm2
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm3
; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm2, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v128i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm2
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm3
; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm2, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v128i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX512DQ-NEXT:    vpmovsxbw %xmm4, %ymm4
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm5
; AVX512DQ-NEXT:    vpmovsxbw %xmm5, %ymm5
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
; AVX512DQ-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512DQ-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm5
; AVX512DQ-NEXT:    vpmovsxbw %xmm5, %ymm5
; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm6
; AVX512DQ-NEXT:    vpmovsxbw %xmm6, %ymm6
; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT:    vpmovsxwd %ymm5, %zmm5
; AVX512DQ-NEXT:    vpmovdb %zmm5, %xmm5
; AVX512DQ-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm3, %ymm2
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm5, %ymm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm4, %ymm2
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v128i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX512DQVL-NEXT:    vpmovsxbw %xmm4, %ymm4
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm5
; AVX512DQVL-NEXT:    vpmovsxbw %xmm5, %ymm5
; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
; AVX512DQVL-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512DQVL-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm3, %xmm5
; AVX512DQVL-NEXT:    vpmovsxbw %xmm5, %ymm5
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm1, %xmm6
; AVX512DQVL-NEXT:    vpmovsxbw %xmm6, %ymm6
; AVX512DQVL-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
; AVX512DQVL-NEXT:    vpmovsxwd %ymm5, %zmm5
; AVX512DQVL-NEXT:    vpmovdb %zmm5, %xmm5
; AVX512DQVL-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm3, %ymm2
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm5, %ymm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm4, %ymm2
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQVL-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> %a0)
  ret i8 %1
}

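; Declarations for the llvm.experimental.vector.reduce.mul intrinsics
; exercised by the tests above; each overload is mangled with both the
; scalar result type and the vector argument type.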
declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>)

declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)

declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)

declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>)
declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>)