; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK32,SLM32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK64,SLM64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK32,SLOW32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK64,SLOW64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-32,AVX2-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-64,AVX2-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512DQ-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512DQ-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512BW-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512BW-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-32,AVX512-32,KNL-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-64,AVX512-64,KNL-64

; Make sure that the slow-pmulld feature can be used without SSE4.1.
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1

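; The multiplier (18778 = 0x495A) fits in an unsigned 16-bit immediate and the
; operands are zero-extended from i8, so a v4i32 multiply is expressible
; without PMULLD: per the CHECK lines, the SSE-level targets mask the odd
; 16-bit lanes with PAND and use a single PMADDWD, and only KNL keeps a
; broadcast constant plus VPMULLD.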
define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
; CHECK32-LABEL: test_mul_v4i32_v4i8:
; CHECK32:       # %bb.0:
; CHECK32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT:    retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8:
; CHECK64:       # %bb.0:
; CHECK64-NEXT:    pand {{.*}}(%rip), %xmm0
; CHECK64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT:    retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

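; With eight i8 elements the result needs two XMM registers, so the lowerings
; diverge: Silvermont and the generic slow-pmulld target expand to a
; PMULLW/PMULHW pair interleaved via PUNPCKLWD/PUNPCKHWD, SSE4.1 splits into
; two PMADDWD, and the AVX targets widen to YMM for a single VPMADDWD
; (VPMULLD on KNL).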
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i8:
; SLM32:       # %bb.0:
; SLM32-NEXT:    movdqa %xmm0, %xmm1
; SLM32-NEXT:    pand {{\.LCPI.*}}, %xmm1
; SLM32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT:    movdqa %xmm1, %xmm2
; SLM32-NEXT:    pmullw %xmm0, %xmm1
; SLM32-NEXT:    pmulhw %xmm0, %xmm2
; SLM32-NEXT:    movdqa %xmm1, %xmm0
; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8:
; SLM64:       # %bb.0:
; SLM64-NEXT:    movdqa %xmm0, %xmm1
; SLM64-NEXT:    pand {{.*}}(%rip), %xmm1
; SLM64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT:    movdqa %xmm1, %xmm2
; SLM64-NEXT:    pmullw %xmm0, %xmm1
; SLM64-NEXT:    pmulhw %xmm0, %xmm2
; SLM64-NEXT:    movdqa %xmm1, %xmm0
; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT:    retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8:
; SLOW32:       # %bb.0:
; SLOW32-NEXT:    movdqa %xmm0, %xmm1
; SLOW32-NEXT:    pand {{\.LCPI.*}}, %xmm1
; SLOW32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT:    movdqa %xmm1, %xmm2
; SLOW32-NEXT:    pmulhw %xmm0, %xmm2
; SLOW32-NEXT:    pmullw %xmm0, %xmm1
; SLOW32-NEXT:    movdqa %xmm1, %xmm0
; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW32-NEXT:    retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i8:
; SLOW64:       # %bb.0:
; SLOW64-NEXT:    movdqa %xmm0, %xmm1
; SLOW64-NEXT:    pand {{.*}}(%rip), %xmm1
; SLOW64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT:    movdqa %xmm1, %xmm2
; SLOW64-NEXT:    pmulhw %xmm0, %xmm2
; SLOW64-NEXT:    pmullw %xmm0, %xmm1
; SLOW64-NEXT:    movdqa %xmm1, %xmm0
; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT:    retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

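; A full <16 x i8> input is quartered on SSE4.1 (four PMADDWD), halved on AVX2
; (two YMM VPMADDWD), and zero-extended to a single ZMM on the 512-bit
; targets: AVX512BW uses VPMADDWD while AVX512DQ and KNL fold the constant as
; a {1to16} broadcast operand of VPMULLD.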
define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i8:
; SLM32:       # %bb.0:
; SLM32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLM32-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT:    movdqa %xmm1, %xmm4
; SLM32-NEXT:    movdqa %xmm3, %xmm5
; SLM32-NEXT:    pmullw %xmm2, %xmm1
; SLM32-NEXT:    pmullw %xmm2, %xmm3
; SLM32-NEXT:    pmulhw %xmm2, %xmm4
; SLM32-NEXT:    pmulhw %xmm2, %xmm5
; SLM32-NEXT:    movdqa %xmm1, %xmm0
; SLM32-NEXT:    movdqa %xmm3, %xmm2
; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLM32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLM32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v16i32_v16i8:
; SLM64:       # %bb.0:
; SLM64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLM64-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT:    movdqa %xmm1, %xmm4
; SLM64-NEXT:    movdqa %xmm3, %xmm5
; SLM64-NEXT:    pmullw %xmm2, %xmm1
; SLM64-NEXT:    pmullw %xmm2, %xmm3
; SLM64-NEXT:    pmulhw %xmm2, %xmm4
; SLM64-NEXT:    pmulhw %xmm2, %xmm5
; SLM64-NEXT:    movdqa %xmm1, %xmm0
; SLM64-NEXT:    movdqa %xmm3, %xmm2
; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLM64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLM64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM64-NEXT:    retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i8:
; SLOW32:       # %bb.0:
; SLOW32-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT:    movdqa %xmm1, %xmm3
; SLOW32-NEXT:    pmulhw %xmm2, %xmm3
; SLOW32-NEXT:    pmullw %xmm2, %xmm1
; SLOW32-NEXT:    movdqa %xmm1, %xmm4
; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLOW32-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT:    movdqa %xmm3, %xmm0
; SLOW32-NEXT:    pmulhw %xmm2, %xmm0
; SLOW32-NEXT:    pmullw %xmm2, %xmm3
; SLOW32-NEXT:    movdqa %xmm3, %xmm2
; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SLOW32-NEXT:    movdqa %xmm4, %xmm0
; SLOW32-NEXT:    retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i8:
; SLOW64:       # %bb.0:
; SLOW64-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT:    movdqa %xmm1, %xmm3
; SLOW64-NEXT:    pmulhw %xmm2, %xmm3
; SLOW64-NEXT:    pmullw %xmm2, %xmm1
; SLOW64-NEXT:    movdqa %xmm1, %xmm4
; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLOW64-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT:    movdqa %xmm3, %xmm0
; SLOW64-NEXT:    pmulhw %xmm2, %xmm0
; SLOW64-NEXT:    pmullw %xmm2, %xmm3
; SLOW64-NEXT:    movdqa %xmm3, %xmm2
; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SLOW64-NEXT:    movdqa %xmm4, %xmm0
; SLOW64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i8:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm0
; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm1
; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm2
; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm3
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i8:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm0
; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm1
; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm2
; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm3
; SSE4-64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT:    retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

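; i16 sources can use the whole unsigned 16-bit range, so the signed PMADDWD
; trick would be unsafe here. The slow-pmulld targets instead pair PMULLW with
; the unsigned-high PMULHUW and interleave the halves, while the SSE4.1 and
; AVX targets zero the odd lanes with a blend and accept a single PMULLD.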
define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
; SLM32-LABEL: test_mul_v4i32_v4i16:
; SLM32:       # %bb.0:
; SLM32-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SLM32-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLM32-NEXT:    movdqa %xmm0, %xmm2
; SLM32-NEXT:    pmullw %xmm1, %xmm0
; SLM32-NEXT:    pmulhuw %xmm1, %xmm2
; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v4i32_v4i16:
; SLM64:       # %bb.0:
; SLM64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SLM64-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLM64-NEXT:    movdqa %xmm0, %xmm2
; SLM64-NEXT:    pmullw %xmm1, %xmm0
; SLM64-NEXT:    pmulhuw %xmm1, %xmm2
; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT:    retq
;
; SLOW32-LABEL: test_mul_v4i32_v4i16:
; SLOW32:       # %bb.0:
; SLOW32-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SLOW32-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLOW32-NEXT:    movdqa %xmm0, %xmm2
; SLOW32-NEXT:    pmulhuw %xmm1, %xmm2
; SLOW32-NEXT:    pmullw %xmm1, %xmm0
; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT:    retl
;
; SLOW64-LABEL: test_mul_v4i32_v4i16:
; SLOW64:       # %bb.0:
; SLOW64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SLOW64-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLOW64-NEXT:    movdqa %xmm0, %xmm2
; SLOW64-NEXT:    pmulhuw %xmm1, %xmm2
; SLOW64-NEXT:    pmullw %xmm1, %xmm0
; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pxor %xmm1, %xmm1
; SSE4-32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pxor %xmm1, %xmm1
; SSE4-64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-64-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16:
; AVX-32:       # %bb.0:
; AVX-32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT:    retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

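; Eight i16 elements follow the same pattern: PMULLW/PMULHUW plus unpacks on
; the slow-pmulld targets, two PMULLD after zero-extension on SSE4.1, and one
; YMM VPMULLD on the AVX targets.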
define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i16:
; SLM32:       # %bb.0:
; SLM32-NEXT:    movdqa %xmm0, %xmm1
; SLM32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT:    movdqa %xmm1, %xmm2
; SLM32-NEXT:    pmullw %xmm0, %xmm1
; SLM32-NEXT:    pmulhuw %xmm0, %xmm2
; SLM32-NEXT:    movdqa %xmm1, %xmm0
; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v8i32_v8i16:
; SLM64:       # %bb.0:
; SLM64-NEXT:    movdqa %xmm0, %xmm1
; SLM64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT:    movdqa %xmm1, %xmm2
; SLM64-NEXT:    pmullw %xmm0, %xmm1
; SLM64-NEXT:    pmulhuw %xmm0, %xmm2
; SLM64-NEXT:    movdqa %xmm1, %xmm0
; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT:    retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i16:
; SLOW32:       # %bb.0:
; SLOW32-NEXT:    movdqa %xmm0, %xmm1
; SLOW32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT:    movdqa %xmm1, %xmm2
; SLOW32-NEXT:    pmulhuw %xmm0, %xmm2
; SLOW32-NEXT:    pmullw %xmm0, %xmm1
; SLOW32-NEXT:    movdqa %xmm1, %xmm0
; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW32-NEXT:    retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i16:
; SLOW64:       # %bb.0:
; SLOW64-NEXT:    movdqa %xmm0, %xmm1
; SLOW64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT:    movdqa %xmm1, %xmm2
; SLOW64-NEXT:    pmulhuw %xmm0, %xmm2
; SLOW64-NEXT:    pmullw %xmm0, %xmm1
; SLOW64-NEXT:    movdqa %xmm1, %xmm0
; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT:    pmulld %xmm2, %xmm0
; SSE4-32-NEXT:    pmulld %xmm2, %xmm1
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT:    pmulld %xmm2, %xmm0
; SSE4-64-NEXT:    pmulld %xmm2, %xmm1
; SSE4-64-NEXT:    retq
;
; AVX-32-LABEL: test_mul_v8i32_v8i16:
; AVX-32:       # %bb.0:
; AVX-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT:    retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT:    retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

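; The <16 x i16> case doubles everything again: the slow-pmulld targets run
; the PMULLW/PMULHUW expansion on both input halves, SSE4.1 emits four PMULLD,
; AVX2 two YMM VPMULLD, and AVX512 a single VPMULLD with the constant
; broadcast {1to16} from memory.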
define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i16:
; SLM32:       # %bb.0:
; SLM32-NEXT:    movdqa %xmm1, %xmm3
; SLM32-NEXT:    movdqa %xmm0, %xmm1
; SLM32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT:    movdqa %xmm1, %xmm2
; SLM32-NEXT:    movdqa %xmm3, %xmm4
; SLM32-NEXT:    pmullw %xmm0, %xmm1
; SLM32-NEXT:    pmulhuw %xmm0, %xmm2
; SLM32-NEXT:    pmullw %xmm0, %xmm3
; SLM32-NEXT:    pmulhuw %xmm0, %xmm4
; SLM32-NEXT:    movdqa %xmm1, %xmm0
; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT:    movdqa %xmm3, %xmm2
; SLM32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16:
; SLM64:       # %bb.0:
; SLM64-NEXT:    movdqa %xmm1, %xmm3
; SLM64-NEXT:    movdqa %xmm0, %xmm1
; SLM64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT:    movdqa %xmm1, %xmm2
; SLM64-NEXT:    movdqa %xmm3, %xmm4
; SLM64-NEXT:    pmullw %xmm0, %xmm1
; SLM64-NEXT:    pmulhuw %xmm0, %xmm2
; SLM64-NEXT:    pmullw %xmm0, %xmm3
; SLM64-NEXT:    pmulhuw %xmm0, %xmm4
; SLM64-NEXT:    movdqa %xmm1, %xmm0
; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT:    movdqa %xmm3, %xmm2
; SLM64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM64-NEXT:    retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16:
; SLOW32:       # %bb.0:
; SLOW32-NEXT:    movdqa %xmm1, %xmm3
; SLOW32-NEXT:    movdqa %xmm0, %xmm1
; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT:    movdqa %xmm0, %xmm4
; SLOW32-NEXT:    pmulhuw %xmm2, %xmm4
; SLOW32-NEXT:    pmullw %xmm2, %xmm1
; SLOW32-NEXT:    movdqa %xmm1, %xmm0
; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW32-NEXT:    movdqa %xmm3, %xmm4
; SLOW32-NEXT:    pmulhuw %xmm2, %xmm4
; SLOW32-NEXT:    pmullw %xmm2, %xmm3
; SLOW32-NEXT:    movdqa %xmm3, %xmm2
; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW32-NEXT:    retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16:
; SLOW64:       # %bb.0:
; SLOW64-NEXT:    movdqa %xmm1, %xmm3
; SLOW64-NEXT:    movdqa %xmm0, %xmm1
; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT:    movdqa %xmm0, %xmm4
; SLOW64-NEXT:    pmulhuw %xmm2, %xmm4
; SLOW64-NEXT:    pmullw %xmm2, %xmm1
; SLOW64-NEXT:    movdqa %xmm1, %xmm0
; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW64-NEXT:    movdqa %xmm3, %xmm4
; SLOW64-NEXT:    pmulhuw %xmm2, %xmm4
; SLOW64-NEXT:    pmullw %xmm2, %xmm3
; SLOW64-NEXT:    movdqa %xmm3, %xmm2
; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-32-NEXT:    pmulld %xmm1, %xmm0
; SSE4-32-NEXT:    pmulld %xmm1, %xmm2
; SSE4-32-NEXT:    pmulld %xmm1, %xmm4
; SSE4-32-NEXT:    pmulld %xmm1, %xmm3
; SSE4-32-NEXT:    movdqa %xmm4, %xmm1
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-64-NEXT:    pmulld %xmm1, %xmm0
; SSE4-64-NEXT:    pmulld %xmm1, %xmm2
; SSE4-64-NEXT:    pmulld %xmm1, %xmm4
; SSE4-64-NEXT:    pmulld %xmm1, %xmm3
; SSE4-64-NEXT:    movdqa %xmm4, %xmm1
; SSE4-64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16:
; AVX512-32:       # %bb.0:
; AVX512-32-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT:    retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16:
; AVX512-64:       # %bb.0:
; AVX512-64-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT:    retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

;
; MinSize Tests
;

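; The same IR is compiled again under minsize to check that avoiding PMULLD
; never costs extra code size. For the v4i8 case the CHECK lines match the
; default lowering: PAND plus PMADDWD everywhere except KNL.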
define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK32:       # %bb.0:
; CHECK32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT:    retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK64:       # %bb.0:
; CHECK64-NEXT:    pand {{.*}}(%rip), %xmm0
; CHECK64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT:    retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

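; Here minsize does change the Silvermont/slow-pmulld output: instead of the
; longer PMULLW/PMULHW plus unpack sequence, they zero-extend and use two
; PMADDWD, matching the SSE4.1 lowering.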
define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM32:       # %bb.0:
; SLM32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; SLM32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT:    pmaddwd %xmm2, %xmm0
; SLM32-NEXT:    pmaddwd %xmm2, %xmm1
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM64:       # %bb.0:
; SLM64-NEXT:    pand {{.*}}(%rip), %xmm0
; SLM64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT:    pmaddwd %xmm2, %xmm0
; SLM64-NEXT:    pmaddwd %xmm2, %xmm1
; SLM64-NEXT:    retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW32:       # %bb.0:
; SLOW32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT:    pmaddwd %xmm2, %xmm0
; SLOW32-NEXT:    pmaddwd %xmm2, %xmm1
; SLOW32-NEXT:    retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW64:       # %bb.0:
; SLOW64-NEXT:    pand {{.*}}(%rip), %xmm0
; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT:    pmaddwd %xmm2, %xmm0
; SLOW64-NEXT:    pmaddwd %xmm2, %xmm1
; SLOW64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT:    retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-64:       # %bb.0:
; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT:    retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-32:       # %bb.0:
; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT:    retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT:    retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-32:       # %bb.0:
; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT:    retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-64:       # %bb.0:
; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT:    retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

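; Likewise for sixteen i8 elements: the SSE-level targets, slow ones included,
; settle on four PMADDWD over zero-extended quarters, and the AVX2 and
; AVX512DQ checks mirror their non-minsize lowering.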
define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLM32-LABEL: test_mul_v16i32_v16i8_minsize:
; SLM32:       # %bb.0:
; SLM32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SLM32-NEXT:    movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
; SLM32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; SLM32-NEXT:    pmaddwd %xmm5, %xmm0
; SLM32-NEXT:    pmaddwd %xmm5, %xmm1
; SLM32-NEXT:    pmaddwd %xmm5, %xmm2
; SLM32-NEXT:    pmaddwd %xmm5, %xmm3
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v16i32_v16i8_minsize:
; SLM64:       # %bb.0:
; SLM64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SLM64-NEXT:    movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
; SLM64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; SLM64-NEXT:    pmaddwd %xmm5, %xmm0
; SLM64-NEXT:    pmaddwd %xmm5, %xmm1
; SLM64-NEXT:    pmaddwd %xmm5, %xmm2
; SLM64-NEXT:    pmaddwd %xmm5, %xmm3
; SLM64-NEXT:    retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i8_minsize:
; SLOW32:       # %bb.0:
; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SLOW32-NEXT:    pmaddwd %xmm4, %xmm0
; SLOW32-NEXT:    pmaddwd %xmm4, %xmm1
; SLOW32-NEXT:    pmaddwd %xmm4, %xmm2
; SLOW32-NEXT:    pmaddwd %xmm4, %xmm3
; SLOW32-NEXT:    retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i8_minsize:
; SLOW64:       # %bb.0:
; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW64-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SLOW64-NEXT:    pmaddwd %xmm4, %xmm0
; SLOW64-NEXT:    pmaddwd %xmm4, %xmm1
; SLOW64-NEXT:    pmaddwd %xmm4, %xmm2
; SLOW64-NEXT:    pmaddwd %xmm4, %xmm3
; SLOW64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm0
; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm1
; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm2
; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm3
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm0
; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm1
; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm2
; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm3
; SSE4-64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-32:       # %bb.0:
; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
1051; AVX512DQ-32-NEXT:    retl
1052;
1053; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
1054; AVX512DQ-64:       # %bb.0:
1055; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1056; AVX512DQ-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
1057; AVX512DQ-64-NEXT:    retq
1058;
1059; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
1060; AVX512BW-32:       # %bb.0:
1061; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1062; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
1063; AVX512BW-32-NEXT:    retl
1064;
1065; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
1066; AVX512BW-64:       # %bb.0:
1067; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1068; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
1069; AVX512BW-64-NEXT:    retq
1070;
1071; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize:
1072; KNL-32:       # %bb.0:
1073; KNL-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1074; KNL-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
1075; KNL-32-NEXT:    retl
1076;
1077; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize:
1078; KNL-64:       # %bb.0:
1079; KNL-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1080; KNL-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
1081; KNL-64-NEXT:    retq
1082  %z = zext <16 x i8> %A to <16 x i32>
1083  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
1084  ret <16 x i32> %m
1085}
1086
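; With minsize, the zext + splat-constant multiply stays a single pmulld
; from a constant-pool operand: the shared CHECK32/CHECK64 prefixes below
; also cover the silvermont and slow-pmulld runs, so even those configs
; keep the compact pmulld form here.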
define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK32:       # %bb.0:
; CHECK32-NEXT:    pxor %xmm1, %xmm1
; CHECK32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; CHECK32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT:    retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK64:       # %bb.0:
; CHECK64-NEXT:    pxor %xmm1, %xmm1
; CHECK64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; CHECK64-NEXT:    pmulld {{.*}}(%rip), %xmm0
; CHECK64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pxor %xmm1, %xmm1
; SSE4-32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pxor %xmm1, %xmm1
; SSE4-64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-64-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT:    retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-32:       # %bb.0:
; AVX-32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT:    retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

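; 8 x i16 variant: the SSE4-class configs below split the input into two
; pmovzxwd halves and reuse one [18778 x 4] vector for both pmulld ops,
; while the AVX configs widen to a single ymm vpmulld against a
; broadcasted constant.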
define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM32:       # %bb.0:
; SLM32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT:    pmulld %xmm2, %xmm0
; SLM32-NEXT:    pmulld %xmm2, %xmm1
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM64:       # %bb.0:
; SLM64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT:    pmulld %xmm2, %xmm0
; SLM64-NEXT:    pmulld %xmm2, %xmm1
; SLM64-NEXT:    retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW32:       # %bb.0:
; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT:    pmulld %xmm2, %xmm0
; SLOW32-NEXT:    pmulld %xmm2, %xmm1
; SLOW32-NEXT:    retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW64:       # %bb.0:
; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT:    pmulld %xmm2, %xmm0
; SLOW64-NEXT:    pmulld %xmm2, %xmm1
; SLOW64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT:    pmulld %xmm2, %xmm0
; SSE4-32-NEXT:    pmulld %xmm2, %xmm1
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT:    pmulld %xmm2, %xmm0
; SSE4-64-NEXT:    pmulld %xmm2, %xmm1
; SSE4-64-NEXT:    retq
;
; AVX-32-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-32:       # %bb.0:
; AVX-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT:    retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT:    retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

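; 16 x i16 variant: the SSE4-class configs need four pmovzxwd quarters and
; four pmulld ops, AVX2 splits the ymm input into two vpmulld halves, and
; AVX512 does a single zmm vpmulld with the constant broadcast {1to16}
; from memory.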
define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM32:       # %bb.0:
; SLM32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM32-NEXT:    pmulld %xmm1, %xmm4
; SLM32-NEXT:    pmulld %xmm1, %xmm0
; SLM32-NEXT:    pmulld %xmm1, %xmm2
; SLM32-NEXT:    pmulld %xmm1, %xmm3
; SLM32-NEXT:    movdqa %xmm4, %xmm1
; SLM32-NEXT:    retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM64:       # %bb.0:
; SLM64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM64-NEXT:    pmulld %xmm1, %xmm4
; SLM64-NEXT:    pmulld %xmm1, %xmm0
; SLM64-NEXT:    pmulld %xmm1, %xmm2
; SLM64-NEXT:    pmulld %xmm1, %xmm3
; SLM64-NEXT:    movdqa %xmm4, %xmm1
; SLM64-NEXT:    retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW32:       # %bb.0:
; SLOW32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW32-NEXT:    pmulld %xmm1, %xmm0
; SLOW32-NEXT:    pmulld %xmm1, %xmm2
; SLOW32-NEXT:    pmulld %xmm1, %xmm4
; SLOW32-NEXT:    pmulld %xmm1, %xmm3
; SLOW32-NEXT:    movdqa %xmm4, %xmm1
; SLOW32-NEXT:    retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW64:       # %bb.0:
; SLOW64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW64-NEXT:    pmulld %xmm1, %xmm0
; SLOW64-NEXT:    pmulld %xmm1, %xmm2
; SLOW64-NEXT:    pmulld %xmm1, %xmm4
; SLOW64-NEXT:    pmulld %xmm1, %xmm3
; SLOW64-NEXT:    movdqa %xmm4, %xmm1
; SLOW64-NEXT:    retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-32:       # %bb.0:
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-32-NEXT:    pmulld %xmm1, %xmm0
; SSE4-32-NEXT:    pmulld %xmm1, %xmm2
; SSE4-32-NEXT:    pmulld %xmm1, %xmm4
; SSE4-32-NEXT:    pmulld %xmm1, %xmm3
; SSE4-32-NEXT:    movdqa %xmm4, %xmm1
; SSE4-32-NEXT:    retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-64:       # %bb.0:
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-64-NEXT:    pmulld %xmm1, %xmm0
; SSE4-64-NEXT:    pmulld %xmm1, %xmm2
; SSE4-64-NEXT:    pmulld %xmm1, %xmm4
; SSE4-64-NEXT:    pmulld %xmm1, %xmm3
; SSE4-64-NEXT:    movdqa %xmm4, %xmm1
; SSE4-64-NEXT:    retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-32:       # %bb.0:
; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT:    retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-32:       # %bb.0:
; AVX512-32-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT:    retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-64:       # %bb.0:
; AVX512-64-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT:    retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}
