• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
3; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1
4; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86-AVX,X86-AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2
8
9@c = external dso_local global i32*, align 8
10
11; %val1 = load <2 x i8>
12; %op1 = zext<2 x i32> %val1
13; %val2 = load <2 x i8>
14; %op2 = zext<2 x i32> %val2
15; %rst = mul <2 x i32> %op1, %op2
16;
17define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
18; X86-SSE-LABEL: mul_2xi8:
19; X86-SSE:       # %bb.0: # %entry
20; X86-SSE-NEXT:    pushl %esi
21; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
22; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
23; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
24; X86-SSE-NEXT:    movl c, %esi
25; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
26; X86-SSE-NEXT:    movd %edx, %xmm0
27; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
28; X86-SSE-NEXT:    movd %eax, %xmm1
29; X86-SSE-NEXT:    pxor %xmm2, %xmm2
30; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
31; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
32; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
33; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
34; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
35; X86-SSE-NEXT:    popl %esi
36; X86-SSE-NEXT:    retl
37;
38; X86-AVX-LABEL: mul_2xi8:
39; X86-AVX:       # %bb.0: # %entry
40; X86-AVX-NEXT:    pushl %esi
41; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
42; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
43; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
44; X86-AVX-NEXT:    movl c, %esi
45; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
46; X86-AVX-NEXT:    vmovd %edx, %xmm0
47; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
48; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
49; X86-AVX-NEXT:    vmovd %eax, %xmm1
50; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
51; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
52; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
53; X86-AVX-NEXT:    popl %esi
54; X86-AVX-NEXT:    retl
55;
56; X64-SSE-LABEL: mul_2xi8:
57; X64-SSE:       # %bb.0: # %entry
58; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
59; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
60; X64-SSE-NEXT:    movd %ecx, %xmm0
61; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
62; X64-SSE-NEXT:    movd %ecx, %xmm1
63; X64-SSE-NEXT:    pxor %xmm2, %xmm2
64; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
65; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
66; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
67; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
68; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
69; X64-SSE-NEXT:    retq
70;
71; X64-AVX-LABEL: mul_2xi8:
72; X64-AVX:       # %bb.0: # %entry
73; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
74; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
75; X64-AVX-NEXT:    vmovd %ecx, %xmm0
76; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
77; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
78; X64-AVX-NEXT:    vmovd %ecx, %xmm1
79; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
80; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
81; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
82; X64-AVX-NEXT:    retq
83entry:
84  %pre = load i32*, i32** @c
85  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
86  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
87  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
88  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
89  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
90  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
91  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
92  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
93  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
94  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
95  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
96  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
97  ret void
98}
99
100; %val1 = load <4 x i8>
101; %op1 = zext<4 x i32> %val1
102; %val2 = load <4 x i8>
103; %op2 = zext<4 x i32> %val2
104; %rst = mul <4 x i32> %op1, %op2
105;
106define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
107; X86-SSE-LABEL: mul_4xi8:
108; X86-SSE:       # %bb.0: # %entry
109; X86-SSE-NEXT:    pushl %esi
110; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
111; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
112; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
113; X86-SSE-NEXT:    movl c, %esi
114; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
115; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
116; X86-SSE-NEXT:    pxor %xmm2, %xmm2
117; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
118; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
119; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
120; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
121; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
122; X86-SSE-NEXT:    popl %esi
123; X86-SSE-NEXT:    retl
124;
125; X86-AVX-LABEL: mul_4xi8:
126; X86-AVX:       # %bb.0: # %entry
127; X86-AVX-NEXT:    pushl %esi
128; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
129; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
130; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
131; X86-AVX-NEXT:    movl c, %esi
132; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
133; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
134; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
135; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
136; X86-AVX-NEXT:    popl %esi
137; X86-AVX-NEXT:    retl
138;
139; X64-SSE-LABEL: mul_4xi8:
140; X64-SSE:       # %bb.0: # %entry
141; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
142; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
143; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
144; X64-SSE-NEXT:    pxor %xmm2, %xmm2
145; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
146; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
147; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
148; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
149; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
150; X64-SSE-NEXT:    retq
151;
152; X64-AVX-LABEL: mul_4xi8:
153; X64-AVX:       # %bb.0: # %entry
154; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
155; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
156; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
157; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
158; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
159; X64-AVX-NEXT:    retq
160entry:
161  %pre = load i32*, i32** @c
162  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
163  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
164  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
165  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
166  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
167  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
168  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
169  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
170  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
171  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
172  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
173  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
174  ret void
175}
176
177; %val1 = load <8 x i8>
178; %op1 = zext<8 x i32> %val1
179; %val2 = load <8 x i8>
180; %op2 = zext<8 x i32> %val2
181; %rst = mul <8 x i32> %op1, %op2
182;
183define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
184; X86-SSE-LABEL: mul_8xi8:
185; X86-SSE:       # %bb.0: # %entry
186; X86-SSE-NEXT:    pushl %esi
187; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
188; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
189; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
190; X86-SSE-NEXT:    movl c, %esi
191; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
192; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
193; X86-SSE-NEXT:    pxor %xmm2, %xmm2
194; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
195; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
196; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
197; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
198; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
199; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
200; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
201; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
202; X86-SSE-NEXT:    popl %esi
203; X86-SSE-NEXT:    retl
204;
205; X86-AVX1-LABEL: mul_8xi8:
206; X86-AVX1:       # %bb.0: # %entry
207; X86-AVX1-NEXT:    pushl %esi
208; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
209; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
210; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
211; X86-AVX1-NEXT:    movl c, %esi
212; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
213; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
214; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
215; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
216; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
217; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
218; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%esi,%ecx,4)
219; X86-AVX1-NEXT:    vmovdqu %xmm1, (%esi,%ecx,4)
220; X86-AVX1-NEXT:    popl %esi
221; X86-AVX1-NEXT:    retl
222;
223; X86-AVX2-LABEL: mul_8xi8:
224; X86-AVX2:       # %bb.0: # %entry
225; X86-AVX2-NEXT:    pushl %esi
226; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
227; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
228; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
229; X86-AVX2-NEXT:    movl c, %esi
230; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
231; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
232; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
233; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
234; X86-AVX2-NEXT:    popl %esi
235; X86-AVX2-NEXT:    vzeroupper
236; X86-AVX2-NEXT:    retl
237;
238; X64-SSE-LABEL: mul_8xi8:
239; X64-SSE:       # %bb.0: # %entry
240; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
241; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
242; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
243; X64-SSE-NEXT:    pxor %xmm2, %xmm2
244; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
245; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
246; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
247; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
248; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
249; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
250; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
251; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
252; X64-SSE-NEXT:    retq
253;
254; X64-AVX1-LABEL: mul_8xi8:
255; X64-AVX1:       # %bb.0: # %entry
256; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
257; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
258; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
259; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
260; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
261; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
262; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
263; X64-AVX1-NEXT:    vmovdqu %xmm0, 16(%rax,%rdx,4)
264; X64-AVX1-NEXT:    vmovdqu %xmm1, (%rax,%rdx,4)
265; X64-AVX1-NEXT:    retq
266;
267; X64-AVX2-LABEL: mul_8xi8:
268; X64-AVX2:       # %bb.0: # %entry
269; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
270; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
271; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
272; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
273; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
274; X64-AVX2-NEXT:    vzeroupper
275; X64-AVX2-NEXT:    retq
276entry:
277  %pre = load i32*, i32** @c
278  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
279  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
280  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
281  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
282  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
283  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
284  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
285  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
286  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
287  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
288  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
289  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
290  ret void
291}
292
293; %val1 = load <16 x i8>
294; %op1 = zext<16 x i32> %val1
295; %val2 = load <16 x i8>
296; %op2 = zext<16 x i32> %val2
297; %rst = mul <16 x i32> %op1, %op2
298;
299define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
300; X86-SSE-LABEL: mul_16xi8:
301; X86-SSE:       # %bb.0: # %entry
302; X86-SSE-NEXT:    pushl %esi
303; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
304; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
305; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
306; X86-SSE-NEXT:    movl c, %esi
307; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
308; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
309; X86-SSE-NEXT:    pxor %xmm2, %xmm2
310; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
311; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
312; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
313; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
314; X86-SSE-NEXT:    pmullw %xmm3, %xmm4
315; X86-SSE-NEXT:    movdqa %xmm4, %xmm3
316; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
317; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
318; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
319; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
320; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
321; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
322; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
323; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
324; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
325; X86-SSE-NEXT:    movdqu %xmm0, 32(%esi,%ecx,4)
326; X86-SSE-NEXT:    movdqu %xmm4, 16(%esi,%ecx,4)
327; X86-SSE-NEXT:    movdqu %xmm3, (%esi,%ecx,4)
328; X86-SSE-NEXT:    popl %esi
329; X86-SSE-NEXT:    retl
330;
331; X86-AVX1-LABEL: mul_16xi8:
332; X86-AVX1:       # %bb.0: # %entry
333; X86-AVX1-NEXT:    pushl %esi
334; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
335; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
336; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
337; X86-AVX1-NEXT:    movl c, %esi
338; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
339; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
340; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
341; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
342; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
343; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
344; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
345; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
346; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
347; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
348; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
349; X86-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
350; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%esi,%ecx,4)
351; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%esi,%ecx,4)
352; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%esi,%ecx,4)
353; X86-AVX1-NEXT:    vmovdqu %xmm3, (%esi,%ecx,4)
354; X86-AVX1-NEXT:    popl %esi
355; X86-AVX1-NEXT:    retl
356;
357; X86-AVX2-LABEL: mul_16xi8:
358; X86-AVX2:       # %bb.0: # %entry
359; X86-AVX2-NEXT:    pushl %esi
360; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
361; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
362; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
363; X86-AVX2-NEXT:    movl c, %esi
364; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
365; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
366; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
367; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
368; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
369; X86-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
370; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
371; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
372; X86-AVX2-NEXT:    popl %esi
373; X86-AVX2-NEXT:    vzeroupper
374; X86-AVX2-NEXT:    retl
375;
376; X64-SSE-LABEL: mul_16xi8:
377; X64-SSE:       # %bb.0: # %entry
378; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
379; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
380; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
381; X64-SSE-NEXT:    pxor %xmm2, %xmm2
382; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
383; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
384; X64-SSE-NEXT:    movdqa %xmm1, %xmm4
385; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
386; X64-SSE-NEXT:    pmullw %xmm3, %xmm4
387; X64-SSE-NEXT:    movdqa %xmm4, %xmm3
388; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
389; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
390; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
391; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
392; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
393; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
394; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
395; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
396; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
397; X64-SSE-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
398; X64-SSE-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
399; X64-SSE-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
400; X64-SSE-NEXT:    retq
401;
402; X64-AVX1-LABEL: mul_16xi8:
403; X64-AVX1:       # %bb.0: # %entry
404; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
405; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
406; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
407; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
408; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
409; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
410; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
411; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
412; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
413; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
414; X64-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
415; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
416; X64-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
417; X64-AVX1-NEXT:    vmovdqu %xmm0, 48(%rax,%rdx,4)
418; X64-AVX1-NEXT:    vmovdqu %xmm1, 32(%rax,%rdx,4)
419; X64-AVX1-NEXT:    vmovdqu %xmm2, 16(%rax,%rdx,4)
420; X64-AVX1-NEXT:    vmovdqu %xmm3, (%rax,%rdx,4)
421; X64-AVX1-NEXT:    retq
422;
423; X64-AVX2-LABEL: mul_16xi8:
424; X64-AVX2:       # %bb.0: # %entry
425; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
426; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
427; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
428; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
429; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
430; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
431; X64-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
432; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
433; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
434; X64-AVX2-NEXT:    vzeroupper
435; X64-AVX2-NEXT:    retq
436entry:
437  %pre = load i32*, i32** @c
438  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
439  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
440  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
441  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
442  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
443  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
444  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
445  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
446  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
447  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
448  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
449  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
450  ret void
451}
452
453; %val1 = load <2 x i16>
454; %op1 = zext<2 x i32> %val1
455; %val2 = load <2 x i16>
456; %op2 = zext<2 x i32> %val2
457; %rst = mul <2 x i32> %op1, %op2
458;
459define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
460; X86-SSE-LABEL: mul_2xi16:
461; X86-SSE:       # %bb.0: # %entry
462; X86-SSE-NEXT:    pushl %esi
463; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
464; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
465; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
466; X86-SSE-NEXT:    movl c, %esi
467; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
468; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
469; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
470; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
471; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
472; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
473; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
474; X86-SSE-NEXT:    popl %esi
475; X86-SSE-NEXT:    retl
476;
477; X86-AVX-LABEL: mul_2xi16:
478; X86-AVX:       # %bb.0: # %entry
479; X86-AVX-NEXT:    pushl %esi
480; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
481; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
482; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
483; X86-AVX-NEXT:    movl c, %esi
484; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
485; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
486; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
487; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
488; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
489; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
490; X86-AVX-NEXT:    popl %esi
491; X86-AVX-NEXT:    retl
492;
493; X64-SSE-LABEL: mul_2xi16:
494; X64-SSE:       # %bb.0: # %entry
495; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
496; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
497; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
498; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
499; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
500; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
501; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
502; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
503; X64-SSE-NEXT:    retq
504;
505; X64-AVX-LABEL: mul_2xi16:
506; X64-AVX:       # %bb.0: # %entry
507; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
508; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
509; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
510; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
511; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
512; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
513; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
514; X64-AVX-NEXT:    retq
515entry:
516  %pre = load i32*, i32** @c
517  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
518  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
519  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
520  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
521  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
522  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
523  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
524  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
525  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
526  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
527  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
528  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
529  ret void
530}
531
532; %val1 = load <4 x i16>
533; %op1 = zext<4 x i32> %val1
534; %val2 = load <4 x i16>
535; %op2 = zext<4 x i32> %val2
536; %rst = mul <4 x i32> %op1, %op2
537;
538define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
539; X86-SSE-LABEL: mul_4xi16:
540; X86-SSE:       # %bb.0: # %entry
541; X86-SSE-NEXT:    pushl %esi
542; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
543; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
544; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
545; X86-SSE-NEXT:    movl c, %esi
546; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
547; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
548; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
549; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
550; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
551; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
552; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
553; X86-SSE-NEXT:    popl %esi
554; X86-SSE-NEXT:    retl
555;
556; X86-AVX-LABEL: mul_4xi16:
557; X86-AVX:       # %bb.0: # %entry
558; X86-AVX-NEXT:    pushl %esi
559; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
560; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
561; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
562; X86-AVX-NEXT:    movl c, %esi
563; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
564; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
565; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
566; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
567; X86-AVX-NEXT:    popl %esi
568; X86-AVX-NEXT:    retl
569;
570; X64-SSE-LABEL: mul_4xi16:
571; X64-SSE:       # %bb.0: # %entry
572; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
573; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
574; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
575; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
576; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
577; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
578; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
579; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
580; X64-SSE-NEXT:    retq
581;
582; X64-AVX-LABEL: mul_4xi16:
583; X64-AVX:       # %bb.0: # %entry
584; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
585; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
586; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
587; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
588; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
589; X64-AVX-NEXT:    retq
590entry:
591  %pre = load i32*, i32** @c
592  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
593  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
594  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
595  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
596  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
597  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
598  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
599  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
600  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
601  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
602  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
603  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
604  ret void
605}
606
607; %val1 = load <8 x i16>
608; %op1 = zext<8 x i32> %val1
609; %val2 = load <8 x i16>
610; %op2 = zext<8 x i32> %val2
611; %rst = mul <8 x i32> %op1, %op2
612;
613define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
614; X86-SSE-LABEL: mul_8xi16:
615; X86-SSE:       # %bb.0: # %entry
616; X86-SSE-NEXT:    pushl %esi
617; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
618; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
619; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
620; X86-SSE-NEXT:    movl c, %esi
621; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
622; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
623; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
624; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
625; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
626; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
627; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
628; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
629; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
630; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
631; X86-SSE-NEXT:    popl %esi
632; X86-SSE-NEXT:    retl
633;
634; X86-AVX1-LABEL: mul_8xi16:
635; X86-AVX1:       # %bb.0: # %entry
636; X86-AVX1-NEXT:    pushl %esi
637; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
638; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
639; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
640; X86-AVX1-NEXT:    movl c, %esi
641; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
642; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
643; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
644; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
645; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
646; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
647; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%esi,%ecx,4)
648; X86-AVX1-NEXT:    vmovdqu %xmm1, (%esi,%ecx,4)
649; X86-AVX1-NEXT:    popl %esi
650; X86-AVX1-NEXT:    retl
651;
652; X86-AVX2-LABEL: mul_8xi16:
653; X86-AVX2:       # %bb.0: # %entry
654; X86-AVX2-NEXT:    pushl %esi
655; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
656; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
657; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
658; X86-AVX2-NEXT:    movl c, %esi
659; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
660; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
661; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
662; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
663; X86-AVX2-NEXT:    popl %esi
664; X86-AVX2-NEXT:    vzeroupper
665; X86-AVX2-NEXT:    retl
666;
667; X64-SSE-LABEL: mul_8xi16:
668; X64-SSE:       # %bb.0: # %entry
669; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
670; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
671; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
672; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
673; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
674; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
675; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
676; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
677; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
678; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
679; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
680; X64-SSE-NEXT:    retq
681;
682; X64-AVX1-LABEL: mul_8xi16:
683; X64-AVX1:       # %bb.0: # %entry
684; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
685; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
686; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
687; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
688; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
689; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
690; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
691; X64-AVX1-NEXT:    vmovdqu %xmm0, 16(%rax,%rdx,4)
692; X64-AVX1-NEXT:    vmovdqu %xmm1, (%rax,%rdx,4)
693; X64-AVX1-NEXT:    retq
694;
695; X64-AVX2-LABEL: mul_8xi16:
696; X64-AVX2:       # %bb.0: # %entry
697; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
698; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
699; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
700; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
701; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
702; X64-AVX2-NEXT:    vzeroupper
703; X64-AVX2-NEXT:    retq
704entry:
705  %pre = load i32*, i32** @c
706  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
707  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
708  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
709  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
710  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
711  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
712  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
713  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
714  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
715  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
716  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
717  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
718  ret void
719}
720
721; %val1 = load <16 x i16>
722; %op1 = zext<16 x i32> %val1
723; %val2 = load <16 x i16>
724; %op2 = zext<16 x i32> %val2
725; %rst = mul <16 x i32> %op1, %op2
726;
727define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
728; X86-SSE-LABEL: mul_16xi16:
729; X86-SSE:       # %bb.0: # %entry
730; X86-SSE-NEXT:    pushl %esi
731; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
732; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
733; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
734; X86-SSE-NEXT:    movl c, %esi
735; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
736; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
737; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
738; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
739; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
740; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm4
741; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
742; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
743; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
744; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
745; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
746; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm4
747; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
748; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
749; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
750; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
751; X86-SSE-NEXT:    movdqu %xmm3, 32(%esi,%ecx,4)
752; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
753; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
754; X86-SSE-NEXT:    movdqu %xmm0, 16(%esi,%ecx,4)
755; X86-SSE-NEXT:    popl %esi
756; X86-SSE-NEXT:    retl
757;
758; X86-AVX1-LABEL: mul_16xi16:
759; X86-AVX1:       # %bb.0: # %entry
760; X86-AVX1-NEXT:    pushl %esi
761; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
762; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
763; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
764; X86-AVX1-NEXT:    movl c, %esi
765; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
766; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
767; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
768; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
769; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
770; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
771; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
772; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
773; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
774; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
775; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
776; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
777; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%esi,%ecx,4)
778; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%esi,%ecx,4)
779; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%esi,%ecx,4)
780; X86-AVX1-NEXT:    vmovdqu %xmm3, (%esi,%ecx,4)
781; X86-AVX1-NEXT:    popl %esi
782; X86-AVX1-NEXT:    retl
783;
784; X86-AVX2-LABEL: mul_16xi16:
785; X86-AVX2:       # %bb.0: # %entry
786; X86-AVX2-NEXT:    pushl %esi
787; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
788; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
789; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
790; X86-AVX2-NEXT:    movl c, %esi
791; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
792; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
793; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
794; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
795; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
796; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
797; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
798; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
799; X86-AVX2-NEXT:    popl %esi
800; X86-AVX2-NEXT:    vzeroupper
801; X86-AVX2-NEXT:    retl
802;
803; X64-SSE-LABEL: mul_16xi16:
804; X64-SSE:       # %bb.0: # %entry
805; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
806; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
807; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
808; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
809; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
810; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
811; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm4
812; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
813; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
814; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
815; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
816; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
817; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm4
818; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
819; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
820; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
821; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
822; X64-SSE-NEXT:    movdqu %xmm3, 32(%rax,%rdx,4)
823; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
824; X64-SSE-NEXT:    movdqu %xmm2, (%rax,%rdx,4)
825; X64-SSE-NEXT:    movdqu %xmm0, 16(%rax,%rdx,4)
826; X64-SSE-NEXT:    retq
827;
828; X64-AVX1-LABEL: mul_16xi16:
829; X64-AVX1:       # %bb.0: # %entry
830; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
831; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
832; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
833; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
834; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
835; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
836; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
837; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
838; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
839; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
840; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
841; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
842; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
843; X64-AVX1-NEXT:    vmovdqu %xmm0, 48(%rax,%rdx,4)
844; X64-AVX1-NEXT:    vmovdqu %xmm1, 32(%rax,%rdx,4)
845; X64-AVX1-NEXT:    vmovdqu %xmm2, 16(%rax,%rdx,4)
846; X64-AVX1-NEXT:    vmovdqu %xmm3, (%rax,%rdx,4)
847; X64-AVX1-NEXT:    retq
848;
849; X64-AVX2-LABEL: mul_16xi16:
850; X64-AVX2:       # %bb.0: # %entry
851; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
852; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
853; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
854; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
855; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
856; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
857; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
858; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
859; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
860; X64-AVX2-NEXT:    vzeroupper
861; X64-AVX2-NEXT:    retq
862entry:
863  %pre = load i32*, i32** @c
864  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
865  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
866  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
867  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
868  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
869  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
870  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
871  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
872  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
873  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
874  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
875  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
876  ret void
877}
878
879; %val1 = load <2 x i8>
880; %op1 = sext<2 x i32> %val1
881; %val2 = load <2 x i8>
882; %op2 = sext<2 x i32> %val2
883; %rst = mul <2 x i32> %op1, %op2
884;
885define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
886; X86-SSE-LABEL: mul_2xi8_sext:
887; X86-SSE:       # %bb.0: # %entry
888; X86-SSE-NEXT:    pushl %esi
889; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
890; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
891; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
892; X86-SSE-NEXT:    movl c, %esi
893; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
894; X86-SSE-NEXT:    movd %edx, %xmm0
895; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
896; X86-SSE-NEXT:    movd %eax, %xmm1
897; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
898; X86-SSE-NEXT:    psraw $8, %xmm0
899; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
900; X86-SSE-NEXT:    psraw $8, %xmm1
901; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
902; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
903; X86-SSE-NEXT:    psrad $16, %xmm0
904; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
905; X86-SSE-NEXT:    popl %esi
906; X86-SSE-NEXT:    retl
907;
908; X86-AVX-LABEL: mul_2xi8_sext:
909; X86-AVX:       # %bb.0: # %entry
910; X86-AVX-NEXT:    pushl %esi
911; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
912; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
913; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
914; X86-AVX-NEXT:    movl c, %esi
915; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
916; X86-AVX-NEXT:    vmovd %edx, %xmm0
917; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
918; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
919; X86-AVX-NEXT:    vmovd %eax, %xmm1
920; X86-AVX-NEXT:    vpmovsxbd %xmm1, %xmm1
921; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
922; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
923; X86-AVX-NEXT:    popl %esi
924; X86-AVX-NEXT:    retl
925;
926; X64-SSE-LABEL: mul_2xi8_sext:
927; X64-SSE:       # %bb.0: # %entry
928; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
929; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
930; X64-SSE-NEXT:    movd %ecx, %xmm0
931; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
932; X64-SSE-NEXT:    movd %ecx, %xmm1
933; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
934; X64-SSE-NEXT:    psraw $8, %xmm0
935; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
936; X64-SSE-NEXT:    psraw $8, %xmm1
937; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
938; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
939; X64-SSE-NEXT:    psrad $16, %xmm0
940; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
941; X64-SSE-NEXT:    retq
942;
943; X64-AVX-LABEL: mul_2xi8_sext:
944; X64-AVX:       # %bb.0: # %entry
945; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
946; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
947; X64-AVX-NEXT:    vmovd %ecx, %xmm0
948; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
949; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
950; X64-AVX-NEXT:    vmovd %ecx, %xmm1
951; X64-AVX-NEXT:    vpmovsxbd %xmm1, %xmm1
952; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
953; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
954; X64-AVX-NEXT:    retq
955entry:
956  %pre = load i32*, i32** @c
957  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
958  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
959  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
960  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
961  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
962  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
963  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
964  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
965  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
966  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
967  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
968  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
969  ret void
970}
971
972; %val1 = load <2 x i8>
973; %op1 = sext<2 x i32> %val1
974; %val2 = load <2 x i8>
975; %op2 = zext<2 x i32> %val2
976; %rst = mul <2 x i32> %op1, %op2
977;
978define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
979; X86-SSE-LABEL: mul_2xi8_sext_zext:
980; X86-SSE:       # %bb.0: # %entry
981; X86-SSE-NEXT:    pushl %esi
982; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
983; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
984; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
985; X86-SSE-NEXT:    movl c, %esi
986; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
987; X86-SSE-NEXT:    movd %edx, %xmm0
988; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
989; X86-SSE-NEXT:    movd %eax, %xmm1
990; X86-SSE-NEXT:    pxor %xmm2, %xmm2
991; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
992; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
993; X86-SSE-NEXT:    psraw $8, %xmm0
994; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
995; X86-SSE-NEXT:    pmulhw %xmm0, %xmm2
996; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
997; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
998; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
999; X86-SSE-NEXT:    popl %esi
1000; X86-SSE-NEXT:    retl
1001;
1002; X86-AVX-LABEL: mul_2xi8_sext_zext:
1003; X86-AVX:       # %bb.0: # %entry
1004; X86-AVX-NEXT:    pushl %esi
1005; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1006; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1007; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
1008; X86-AVX-NEXT:    movl c, %esi
1009; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
1010; X86-AVX-NEXT:    vmovd %edx, %xmm0
1011; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1012; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
1013; X86-AVX-NEXT:    vmovd %eax, %xmm1
1014; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1015; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
1016; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
1017; X86-AVX-NEXT:    popl %esi
1018; X86-AVX-NEXT:    retl
1019;
1020; X64-SSE-LABEL: mul_2xi8_sext_zext:
1021; X64-SSE:       # %bb.0: # %entry
1022; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1023; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
1024; X64-SSE-NEXT:    movd %ecx, %xmm0
1025; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
1026; X64-SSE-NEXT:    movd %ecx, %xmm1
1027; X64-SSE-NEXT:    pxor %xmm2, %xmm2
1028; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1029; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1030; X64-SSE-NEXT:    psraw $8, %xmm0
1031; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
1032; X64-SSE-NEXT:    pmulhw %xmm0, %xmm2
1033; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
1034; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1035; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
1036; X64-SSE-NEXT:    retq
1037;
1038; X64-AVX-LABEL: mul_2xi8_sext_zext:
1039; X64-AVX:       # %bb.0: # %entry
1040; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1041; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
1042; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1043; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1044; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
1045; X64-AVX-NEXT:    vmovd %ecx, %xmm1
1046; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1047; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
1048; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
1049; X64-AVX-NEXT:    retq
1050entry:
1051  %pre = load i32*, i32** @c
1052  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1053  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1054  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1055  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1056  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1057  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
1058  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
1059  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
1060  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1061  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1062  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1063  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1064  ret void
1065}
1066
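; Explanatory sketch (added for clarity; not exercised by the FileCheck
; assertions): in the SSE2 code for mul_2xi8_sext_zext, punpcklbw with a
; zeroed register performs the zero extension of the loaded bytes, while
; punpcklbw of a register with itself followed by psraw $8 performs the sign
; extension. The scalar identity behind the sign-extension idiom, written out
; as plain IR (the function name is illustrative only):
define i16 @sext_byte_via_shift(i8 %b) {
  %w   = zext i8 %b to i16
  %hi  = shl i16 %w, 8
  %dup = or i16 %hi, %w      ; punpcklbw %x, %x: the byte sits in both halves
  %s   = ashr i16 %dup, 8    ; psraw $8: arithmetic shift restores the sign
  ret i16 %s                 ; equal to sext i8 %b to i16
}
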
1067; %val1 = load <2 x i16>
1068; %op1 = sext<2 x i32> %val1
1069; %val2 = load <2 x i16>
1070; %op2 = sext<2 x i32> %val2
1071; %rst = mul <2 x i32> %op1, %op2
1072;
1073define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
1074; X86-SSE-LABEL: mul_2xi16_sext:
1075; X86-SSE:       # %bb.0: # %entry
1076; X86-SSE-NEXT:    pushl %esi
1077; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1078; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1079; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
1080; X86-SSE-NEXT:    movl c, %esi
1081; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1082; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1083; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
1084; X86-SSE-NEXT:    pmulhw %xmm0, %xmm2
1085; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
1086; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1087; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
1088; X86-SSE-NEXT:    popl %esi
1089; X86-SSE-NEXT:    retl
1090;
1091; X86-AVX-LABEL: mul_2xi16_sext:
1092; X86-AVX:       # %bb.0: # %entry
1093; X86-AVX-NEXT:    pushl %esi
1094; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1095; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1096; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
1097; X86-AVX-NEXT:    movl c, %esi
1098; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1099; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1100; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1101; X86-AVX-NEXT:    vpmovsxwd %xmm1, %xmm1
1102; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
1103; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
1104; X86-AVX-NEXT:    popl %esi
1105; X86-AVX-NEXT:    retl
1106;
1107; X64-SSE-LABEL: mul_2xi16_sext:
1108; X64-SSE:       # %bb.0: # %entry
1109; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1110; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1111; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1112; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
1113; X64-SSE-NEXT:    pmulhw %xmm0, %xmm2
1114; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
1115; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1116; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
1117; X64-SSE-NEXT:    retq
1118;
1119; X64-AVX-LABEL: mul_2xi16_sext:
1120; X64-AVX:       # %bb.0: # %entry
1121; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1122; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1123; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1124; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1125; X64-AVX-NEXT:    vpmovsxwd %xmm1, %xmm1
1126; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
1127; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
1128; X64-AVX-NEXT:    retq
1129entry:
1130  %pre = load i32*, i32** @c
1131  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1132  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1133  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1134  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1135  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1136  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
1137  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
1138  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
1139  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1140  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1141  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1142  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1143  ret void
1144}
1145
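; Explanatory sketch (added for clarity; not exercised by the FileCheck
; assertions): the SSE code for mul_2xi16_sext rebuilds each 32-bit product
; from two 16-bit halves: pmullw produces the low 16 bits of the product,
; pmulhw the high 16 bits of the signed product, and punpcklwd interleaves
; the halves back into 32-bit lanes (the unsigned tests use pmulhuw instead).
; The per-lane identity, written out as scalar IR with an illustrative name:
define i32 @smul16_from_halves(i16 %a, i16 %b) {
  %a32  = sext i16 %a to i32
  %b32  = sext i16 %b to i32
  %full = mul nsw i32 %a32, %b32   ; the 32-bit product the test computes
  %lo   = trunc i32 %full to i16   ; what pmullw leaves in its lane
  %shr  = ashr i32 %full, 16
  %hi   = trunc i32 %shr to i16    ; what pmulhw leaves in its lane
  ; punpcklwd pairs the halves back up: (zext lo) | (hi << 16) == full
  %lo32 = zext i16 %lo to i32
  %hi32 = zext i16 %hi to i32
  %hish = shl i32 %hi32, 16
  %join = or i32 %lo32, %hish
  ret i32 %join
}
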
1146; %val1 = load <2 x i16>
1147; %op1 = sext<2 x i32> %val1
1148; %val2 = load <2 x i16>
1149; %op2 = zext<2 x i32> %val2
1150; %rst = mul <2 x i32> %op1, %op2
1151;
1152define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
1153; X86-SSE-LABEL: mul_2xi16_sext_zext:
1154; X86-SSE:       # %bb.0: # %entry
1155; X86-SSE-NEXT:    pushl %esi
1156; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1157; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1158; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
1159; X86-SSE-NEXT:    movl c, %esi
1160; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1161; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1162; X86-SSE-NEXT:    psrad $16, %xmm0
1163; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1164; X86-SSE-NEXT:    pxor %xmm2, %xmm2
1165; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1166; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1167; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
1168; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1169; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
1170; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1171; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
1172; X86-SSE-NEXT:    popl %esi
1173; X86-SSE-NEXT:    retl
1174;
1175; X86-AVX-LABEL: mul_2xi16_sext_zext:
1176; X86-AVX:       # %bb.0: # %entry
1177; X86-AVX-NEXT:    pushl %esi
1178; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1179; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1180; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
1181; X86-AVX-NEXT:    movl c, %esi
1182; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1183; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1184; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1185; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1186; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
1187; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
1188; X86-AVX-NEXT:    popl %esi
1189; X86-AVX-NEXT:    retl
1190;
1191; X64-SSE-LABEL: mul_2xi16_sext_zext:
1192; X64-SSE:       # %bb.0: # %entry
1193; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1194; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1195; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1196; X64-SSE-NEXT:    psrad $16, %xmm0
1197; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1198; X64-SSE-NEXT:    pxor %xmm2, %xmm2
1199; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1200; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1201; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
1202; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1203; X64-SSE-NEXT:    pmuludq %xmm2, %xmm0
1204; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1205; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
1206; X64-SSE-NEXT:    retq
1207;
1208; X64-AVX-LABEL: mul_2xi16_sext_zext:
1209; X64-AVX:       # %bb.0: # %entry
1210; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1211; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1212; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1213; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1214; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1215; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
1216; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
1217; X64-AVX-NEXT:    retq
1218entry:
1219  %pre = load i32*, i32** @c
1220  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1221  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1222  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1223  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1224  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1225  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
1226  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
1227  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
1228  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
1229  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1230  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1231  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1232  ret void
1233}
1234
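; Explanatory sketch (added for clarity; not exercised by the FileCheck
; assertions): SSE2 has no 32-bit pmulld, so the SSE code for
; mul_2xi16_sext_zext implements the 32-bit vector multiply with pshufd +
; two pmuludq + punpckldq. pmuludq forms a full 64-bit unsigned product in
; each even lane, and only the low 32 bits of each product are kept, which
; matches the wrapping 32-bit multiply regardless of signedness. The per-lane
; identity as scalar IR (illustrative name):
define i32 @low32_of_widening_umul(i32 %a, i32 %b) {
  %a64  = zext i32 %a to i64
  %b64  = zext i32 %b to i64
  %wide = mul i64 %a64, %b64      ; what pmuludq computes for one lane pair
  %low  = trunc i64 %wide to i32  ; the 32 bits punpckldq gathers per lane
  ret i32 %low                    ; equal to the wrapping mul i32 %a, %b
}
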
1235; %val1 = load <16 x i16>
1236; %op1 = sext<16 x i32> %val1
1237; %val2 = load <16 x i16>
1238; %op2 = sext<16 x i32> %val2
1239; %rst = mul <16 x i32> %op1, %op2
1240;
1241define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
1242; X86-SSE-LABEL: mul_16xi16_sext:
1243; X86-SSE:       # %bb.0: # %entry
1244; X86-SSE-NEXT:    pushl %esi
1245; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1246; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1247; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
1248; X86-SSE-NEXT:    movl c, %esi
1249; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
1250; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
1251; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
1252; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
1253; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
1254; X86-SSE-NEXT:    pmulhw %xmm0, %xmm4
1255; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
1256; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
1257; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1258; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1259; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
1260; X86-SSE-NEXT:    pmulhw %xmm1, %xmm4
1261; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
1262; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
1263; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1264; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1265; X86-SSE-NEXT:    movdqu %xmm3, 32(%esi,%ecx,4)
1266; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
1267; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
1268; X86-SSE-NEXT:    movdqu %xmm0, 16(%esi,%ecx,4)
1269; X86-SSE-NEXT:    popl %esi
1270; X86-SSE-NEXT:    retl
1271;
1272; X86-AVX1-LABEL: mul_16xi16_sext:
1273; X86-AVX1:       # %bb.0: # %entry
1274; X86-AVX1-NEXT:    pushl %esi
1275; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1276; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1277; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
1278; X86-AVX1-NEXT:    movl c, %esi
1279; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%ecx), %xmm0
1280; X86-AVX1-NEXT:    vpmovsxwd 16(%edx,%ecx), %xmm1
1281; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%ecx), %xmm2
1282; X86-AVX1-NEXT:    vpmovsxwd (%edx,%ecx), %xmm3
1283; X86-AVX1-NEXT:    vpmovsxwd 24(%eax,%ecx), %xmm4
1284; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
1285; X86-AVX1-NEXT:    vpmovsxwd 16(%eax,%ecx), %xmm4
1286; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
1287; X86-AVX1-NEXT:    vpmovsxwd 8(%eax,%ecx), %xmm4
1288; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
1289; X86-AVX1-NEXT:    vpmovsxwd (%eax,%ecx), %xmm4
1290; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
1291; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%esi,%ecx,4)
1292; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%esi,%ecx,4)
1293; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%esi,%ecx,4)
1294; X86-AVX1-NEXT:    vmovdqu %xmm3, (%esi,%ecx,4)
1295; X86-AVX1-NEXT:    popl %esi
1296; X86-AVX1-NEXT:    retl
1297;
1298; X86-AVX2-LABEL: mul_16xi16_sext:
1299; X86-AVX2:       # %bb.0: # %entry
1300; X86-AVX2-NEXT:    pushl %esi
1301; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1302; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1303; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
1304; X86-AVX2-NEXT:    movl c, %esi
1305; X86-AVX2-NEXT:    vpmovsxwd 16(%edx,%ecx), %ymm0
1306; X86-AVX2-NEXT:    vpmovsxwd (%edx,%ecx), %ymm1
1307; X86-AVX2-NEXT:    vpmovsxwd 16(%eax,%ecx), %ymm2
1308; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
1309; X86-AVX2-NEXT:    vpmovsxwd (%eax,%ecx), %ymm2
1310; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
1311; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
1312; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
1313; X86-AVX2-NEXT:    popl %esi
1314; X86-AVX2-NEXT:    vzeroupper
1315; X86-AVX2-NEXT:    retl
1316;
1317; X64-SSE-LABEL: mul_16xi16_sext:
1318; X64-SSE:       # %bb.0: # %entry
1319; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1320; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
1321; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
1322; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
1323; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
1324; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
1325; X64-SSE-NEXT:    pmulhw %xmm0, %xmm4
1326; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
1327; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
1328; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1329; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1330; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
1331; X64-SSE-NEXT:    pmulhw %xmm1, %xmm4
1332; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
1333; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
1334; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1335; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1336; X64-SSE-NEXT:    movdqu %xmm3, 32(%rax,%rdx,4)
1337; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
1338; X64-SSE-NEXT:    movdqu %xmm2, (%rax,%rdx,4)
1339; X64-SSE-NEXT:    movdqu %xmm0, 16(%rax,%rdx,4)
1340; X64-SSE-NEXT:    retq
1341;
1342; X64-AVX1-LABEL: mul_16xi16_sext:
1343; X64-AVX1:       # %bb.0: # %entry
1344; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
1345; X64-AVX1-NEXT:    vpmovsxwd 24(%rdi,%rdx), %xmm0
1346; X64-AVX1-NEXT:    vpmovsxwd 16(%rdi,%rdx), %xmm1
1347; X64-AVX1-NEXT:    vpmovsxwd 8(%rdi,%rdx), %xmm2
1348; X64-AVX1-NEXT:    vpmovsxwd (%rdi,%rdx), %xmm3
1349; X64-AVX1-NEXT:    vpmovsxwd 24(%rsi,%rdx), %xmm4
1350; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
1351; X64-AVX1-NEXT:    vpmovsxwd 16(%rsi,%rdx), %xmm4
1352; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
1353; X64-AVX1-NEXT:    vpmovsxwd 8(%rsi,%rdx), %xmm4
1354; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
1355; X64-AVX1-NEXT:    vpmovsxwd (%rsi,%rdx), %xmm4
1356; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
1357; X64-AVX1-NEXT:    vmovdqu %xmm0, 48(%rax,%rdx,4)
1358; X64-AVX1-NEXT:    vmovdqu %xmm1, 32(%rax,%rdx,4)
1359; X64-AVX1-NEXT:    vmovdqu %xmm2, 16(%rax,%rdx,4)
1360; X64-AVX1-NEXT:    vmovdqu %xmm3, (%rax,%rdx,4)
1361; X64-AVX1-NEXT:    retq
1362;
1363; X64-AVX2-LABEL: mul_16xi16_sext:
1364; X64-AVX2:       # %bb.0: # %entry
1365; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
1366; X64-AVX2-NEXT:    vpmovsxwd 16(%rdi,%rdx), %ymm0
1367; X64-AVX2-NEXT:    vpmovsxwd (%rdi,%rdx), %ymm1
1368; X64-AVX2-NEXT:    vpmovsxwd 16(%rsi,%rdx), %ymm2
1369; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
1370; X64-AVX2-NEXT:    vpmovsxwd (%rsi,%rdx), %ymm2
1371; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
1372; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
1373; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
1374; X64-AVX2-NEXT:    vzeroupper
1375; X64-AVX2-NEXT:    retq
1376entry:
1377  %pre = load i32*, i32** @c
1378  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1379  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
1380  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
1381  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
1382  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
1383  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
1384  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
1385  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
1386  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
1387  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1388  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
1389  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
1390  ret void
1391}
1392
1393; %val = load <2 x i8>
1394; %op1 = zext<2 x i32> %val
1395; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 255]
1396; %rst = mul <2 x i32> %op1, %op2
1397;
1398define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
1399; X86-SSE-LABEL: mul_2xi8_varconst1:
1400; X86-SSE:       # %bb.0: # %entry
1401; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1402; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1403; X86-SSE-NEXT:    movl c, %edx
1404; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1405; X86-SSE-NEXT:    movd %ecx, %xmm0
1406; X86-SSE-NEXT:    pxor %xmm1, %xmm1
1407; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1408; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1409; X86-SSE-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
1410; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1411; X86-SSE-NEXT:    retl
1412;
1413; X86-AVX-LABEL: mul_2xi8_varconst1:
1414; X86-AVX:       # %bb.0: # %entry
1415; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1416; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1417; X86-AVX-NEXT:    movl c, %edx
1418; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1419; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1420; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1421; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
1422; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1423; X86-AVX-NEXT:    retl
1424;
1425; X64-SSE-LABEL: mul_2xi8_varconst1:
1426; X64-SSE:       # %bb.0: # %entry
1427; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1428; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1429; X64-SSE-NEXT:    movd %ecx, %xmm0
1430; X64-SSE-NEXT:    pxor %xmm1, %xmm1
1431; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1432; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1433; X64-SSE-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
1434; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1435; X64-SSE-NEXT:    retq
1436;
1437; X64-AVX-LABEL: mul_2xi8_varconst1:
1438; X64-AVX:       # %bb.0: # %entry
1439; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1440; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1441; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1442; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1443; X64-AVX-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
1444; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1445; X64-AVX-NEXT:    retq
1446entry:
1447  %pre = load i32*, i32** @c
1448  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1449  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1450  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1451  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1452  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
1453  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1454  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1455  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1456  ret void
1457}
1458
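; Explanatory sketch (added for clarity; not exercised by the FileCheck
; assertions): mul_2xi8_varconst1 is lowered with pmaddwd. pmaddwd multiplies
; corresponding signed 16-bit elements and adds each adjacent pair of
; products into a 32-bit lane. Because the zero extension leaves the upper
; 16-bit element of every lane at zero, and the constants fit in 16 bits so
; their partner element is also zero, the pairwise multiply-add degenerates
; into a plain 16x16->32 multiply. One lane as scalar IR (illustrative name):
define i32 @pmaddwd_lane(i16 %lo, i16 %hi, i16 %c.lo, i16 %c.hi) {
  %lo32 = sext i16 %lo to i32
  %hi32 = sext i16 %hi to i32
  %cl32 = sext i16 %c.lo to i32
  %ch32 = sext i16 %c.hi to i32
  %p0   = mul nsw i32 %lo32, %cl32
  %p1   = mul nsw i32 %hi32, %ch32
  %madd = add i32 %p0, %p1         ; what pmaddwd stores in the 32-bit lane
  ; With %hi == 0 and %c.hi == 0, as in the test above, %madd == %p0.
  ret i32 %madd
}
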
1459; %val = load <2 x i8>
1460; %op1 = sext<2 x i32> %val
1461; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-128, 127]
1462; %rst = mul <2 x i32> %op1, %op2
1463;
1464define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
1465; X86-SSE-LABEL: mul_2xi8_varconst2:
1466; X86-SSE:       # %bb.0: # %entry
1467; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1468; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1469; X86-SSE-NEXT:    movl c, %edx
1470; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1471; X86-SSE-NEXT:    movd %ecx, %xmm0
1472; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1473; X86-SSE-NEXT:    psraw $8, %xmm0
1474; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
1475; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1476; X86-SSE-NEXT:    psrad $16, %xmm0
1477; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1478; X86-SSE-NEXT:    retl
1479;
1480; X86-AVX-LABEL: mul_2xi8_varconst2:
1481; X86-AVX:       # %bb.0: # %entry
1482; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1483; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1484; X86-AVX-NEXT:    movl c, %edx
1485; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1486; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1487; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1488; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
1489; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1490; X86-AVX-NEXT:    retl
1491;
1492; X64-SSE-LABEL: mul_2xi8_varconst2:
1493; X64-SSE:       # %bb.0: # %entry
1494; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1495; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1496; X64-SSE-NEXT:    movd %ecx, %xmm0
1497; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1498; X64-SSE-NEXT:    psraw $8, %xmm0
1499; X64-SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
1500; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1501; X64-SSE-NEXT:    psrad $16, %xmm0
1502; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1503; X64-SSE-NEXT:    retq
1504;
1505; X64-AVX-LABEL: mul_2xi8_varconst2:
1506; X64-AVX:       # %bb.0: # %entry
1507; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1508; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1509; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1510; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1511; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
1512; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1513; X64-AVX-NEXT:    retq
1514entry:
1515  %pre = load i32*, i32** @c
1516  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1517  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1518  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1519  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1520  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
1521  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1522  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1523  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1524  ret void
1525}
1526
1527; %val = load <2 x i8>
1528; %op1 = zext<2 x i32> %val
1529; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 256]
1530; %rst = mul <2 x i32> %op1, %op2
1531;
1532define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
1533; X86-SSE-LABEL: mul_2xi8_varconst3:
1534; X86-SSE:       # %bb.0: # %entry
1535; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1536; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1537; X86-SSE-NEXT:    movl c, %edx
1538; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1539; X86-SSE-NEXT:    movd %ecx, %xmm0
1540; X86-SSE-NEXT:    pxor %xmm1, %xmm1
1541; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1542; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1543; X86-SSE-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
1544; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1545; X86-SSE-NEXT:    retl
1546;
1547; X86-AVX-LABEL: mul_2xi8_varconst3:
1548; X86-AVX:       # %bb.0: # %entry
1549; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1550; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1551; X86-AVX-NEXT:    movl c, %edx
1552; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1553; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1554; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1555; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
1556; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1557; X86-AVX-NEXT:    retl
1558;
1559; X64-SSE-LABEL: mul_2xi8_varconst3:
1560; X64-SSE:       # %bb.0: # %entry
1561; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1562; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1563; X64-SSE-NEXT:    movd %ecx, %xmm0
1564; X64-SSE-NEXT:    pxor %xmm1, %xmm1
1565; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1566; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1567; X64-SSE-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
1568; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1569; X64-SSE-NEXT:    retq
1570;
1571; X64-AVX-LABEL: mul_2xi8_varconst3:
1572; X64-AVX:       # %bb.0: # %entry
1573; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1574; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1575; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1576; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1577; X64-AVX-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
1578; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1579; X64-AVX-NEXT:    retq
1580entry:
1581  %pre = load i32*, i32** @c
1582  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1583  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1584  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1585  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1586  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
1587  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1588  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1589  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1590  ret void
1591}
1592
1593; %val = load <2 x i8>
1594; %op1 = zext<2 x i32> %val
1595; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-1, 255]
1596; %rst = mul <2 x i32> %op1, %op2
1597;
1598define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
1599; X86-SSE-LABEL: mul_2xi8_varconst4:
1600; X86-SSE:       # %bb.0: # %entry
1601; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1602; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1603; X86-SSE-NEXT:    movl c, %edx
1604; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1605; X86-SSE-NEXT:    movd %ecx, %xmm0
1606; X86-SSE-NEXT:    pxor %xmm1, %xmm1
1607; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1608; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
1609; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1610; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
1611; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
1612; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1613; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1614; X86-SSE-NEXT:    retl
1615;
1616; X86-AVX-LABEL: mul_2xi8_varconst4:
1617; X86-AVX:       # %bb.0: # %entry
1618; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1619; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1620; X86-AVX-NEXT:    movl c, %edx
1621; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1622; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1623; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1624; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
1625; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1626; X86-AVX-NEXT:    retl
1627;
1628; X64-SSE-LABEL: mul_2xi8_varconst4:
1629; X64-SSE:       # %bb.0: # %entry
1630; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1631; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1632; X64-SSE-NEXT:    movd %ecx, %xmm0
1633; X64-SSE-NEXT:    pxor %xmm1, %xmm1
1634; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1635; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
1636; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
1637; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
1638; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
1639; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1640; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1641; X64-SSE-NEXT:    retq
1642;
1643; X64-AVX-LABEL: mul_2xi8_varconst4:
1644; X64-AVX:       # %bb.0: # %entry
1645; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1646; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1647; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1648; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1649; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
1650; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1651; X64-AVX-NEXT:    retq
1652entry:
1653  %pre = load i32*, i32** @c
1654  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1655  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1656  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1657  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
1658  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
1659  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1660  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1661  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1662  ret void
1663}
1664
1665; %val = load <2 x i8>
1666; %op1 = sext<2 x i32> %val
1667; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-129, 127]
1668; %rst = mul <2 x i32> %op1, %op2
1669;
1670define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
1671; X86-SSE-LABEL: mul_2xi8_varconst5:
1672; X86-SSE:       # %bb.0: # %entry
1673; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1674; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1675; X86-SSE-NEXT:    movl c, %edx
1676; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1677; X86-SSE-NEXT:    movd %ecx, %xmm0
1678; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1679; X86-SSE-NEXT:    psraw $8, %xmm0
1680; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
1681; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1682; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
1683; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
1684; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1685; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1686; X86-SSE-NEXT:    retl
1687;
1688; X86-AVX-LABEL: mul_2xi8_varconst5:
1689; X86-AVX:       # %bb.0: # %entry
1690; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1691; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1692; X86-AVX-NEXT:    movl c, %edx
1693; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1694; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1695; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1696; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
1697; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1698; X86-AVX-NEXT:    retl
1699;
1700; X64-SSE-LABEL: mul_2xi8_varconst5:
1701; X64-SSE:       # %bb.0: # %entry
1702; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1703; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1704; X64-SSE-NEXT:    movd %ecx, %xmm0
1705; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1706; X64-SSE-NEXT:    psraw $8, %xmm0
1707; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
1708; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
1709; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
1710; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
1711; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1712; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1713; X64-SSE-NEXT:    retq
1714;
1715; X64-AVX-LABEL: mul_2xi8_varconst5:
1716; X64-AVX:       # %bb.0: # %entry
1717; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1718; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1719; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1720; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1721; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
1722; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1723; X64-AVX-NEXT:    retq
1724entry:
1725  %pre = load i32*, i32** @c
1726  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1727  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1728  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1729  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1730  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
1731  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1732  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1733  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1734  ret void
1735}
1736
1737; %val = load <2 x i8>
1738; %op1 = sext<2 x i32> %val
1739; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-128, 128]
1740; %rst = mul <2 x i32> %op1, %op2
1741;
1742define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
1743; X86-SSE-LABEL: mul_2xi8_varconst6:
1744; X86-SSE:       # %bb.0: # %entry
1745; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1746; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1747; X86-SSE-NEXT:    movl c, %edx
1748; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
1749; X86-SSE-NEXT:    movd %ecx, %xmm0
1750; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1751; X86-SSE-NEXT:    psraw $8, %xmm0
1752; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
1753; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1754; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
1755; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
1756; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1757; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1758; X86-SSE-NEXT:    retl
1759;
1760; X86-AVX-LABEL: mul_2xi8_varconst6:
1761; X86-AVX:       # %bb.0: # %entry
1762; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1763; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1764; X86-AVX-NEXT:    movl c, %edx
1765; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
1766; X86-AVX-NEXT:    vmovd %ecx, %xmm0
1767; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1768; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
1769; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1770; X86-AVX-NEXT:    retl
1771;
1772; X64-SSE-LABEL: mul_2xi8_varconst6:
1773; X64-SSE:       # %bb.0: # %entry
1774; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1775; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
1776; X64-SSE-NEXT:    movd %ecx, %xmm0
1777; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1778; X64-SSE-NEXT:    psraw $8, %xmm0
1779; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
1780; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
1781; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
1782; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
1783; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1784; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1785; X64-SSE-NEXT:    retq
1786;
1787; X64-AVX-LABEL: mul_2xi8_varconst6:
1788; X64-AVX:       # %bb.0: # %entry
1789; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1790; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
1791; X64-AVX-NEXT:    vmovd %ecx, %xmm0
1792; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1793; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
1794; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1795; X64-AVX-NEXT:    retq
1796entry:
1797  %pre = load i32*, i32** @c
1798  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1799  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
1800  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
1801  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
1802  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
1803  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1804  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1805  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1806  ret void
1807}
1808
1809; %val = load <2 x i16>
1810; %op1 = zext<2 x i32> %val
1811; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 65535]
1812; %rst = mul <2 x i32> %op1, %op2
1813;
1814define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
1815; X86-SSE-LABEL: mul_2xi16_varconst1:
1816; X86-SSE:       # %bb.0: # %entry
1817; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1818; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1819; X86-SSE-NEXT:    movl c, %edx
1820; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1821; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
1822; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1823; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm2
1824; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
1825; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1826; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1827; X86-SSE-NEXT:    retl
1828;
1829; X86-AVX-LABEL: mul_2xi16_varconst1:
1830; X86-AVX:       # %bb.0: # %entry
1831; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1832; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1833; X86-AVX-NEXT:    movl c, %edx
1834; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1835; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1836; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
1837; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1838; X86-AVX-NEXT:    retl
1839;
1840; X64-SSE-LABEL: mul_2xi16_varconst1:
1841; X64-SSE:       # %bb.0: # %entry
1842; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1843; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1844; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
1845; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
1846; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm2
1847; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
1848; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1849; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1850; X64-SSE-NEXT:    retq
1851;
1852; X64-AVX-LABEL: mul_2xi16_varconst1:
1853; X64-AVX:       # %bb.0: # %entry
1854; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1855; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1856; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1857; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
1858; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1859; X64-AVX-NEXT:    retq
1860entry:
1861  %pre = load i32*, i32** @c
1862  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1863  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1864  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1865  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1866  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
1867  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1868  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1869  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1870  ret void
1871}
1872
1873; %val = load <2 x i16>
1874; %op1 = sext<2 x i32> %val
1875; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-32768, 32767]
1876; %rst = mul <2 x i32> %op1, %op2
1877;
1878define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
1879; X86-SSE-LABEL: mul_2xi16_varconst2:
1880; X86-SSE:       # %bb.0: # %entry
1881; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1882; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1883; X86-SSE-NEXT:    movl c, %edx
1884; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1885; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
1886; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1887; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
1888; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
1889; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1890; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1891; X86-SSE-NEXT:    retl
1892;
1893; X86-AVX-LABEL: mul_2xi16_varconst2:
1894; X86-AVX:       # %bb.0: # %entry
1895; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1896; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1897; X86-AVX-NEXT:    movl c, %edx
1898; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1899; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1900; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
1901; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1902; X86-AVX-NEXT:    retl
1903;
1904; X64-SSE-LABEL: mul_2xi16_varconst2:
1905; X64-SSE:       # %bb.0: # %entry
1906; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1907; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1908; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
1909; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
1910; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
1911; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
1912; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1913; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1914; X64-SSE-NEXT:    retq
1915;
1916; X64-AVX-LABEL: mul_2xi16_varconst2:
1917; X64-AVX:       # %bb.0: # %entry
1918; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1919; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1920; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1921; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
1922; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1923; X64-AVX-NEXT:    retq
1924entry:
1925  %pre = load i32*, i32** @c
1926  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1927  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1928  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1929  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
1930  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
1931  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1932  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1933  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
1934  ret void
1935}
1936
1937; %val = load <2 x i16>
1938; %op1 = zext<2 x i32> %val
1939; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 65536]
1940; %rst = mul <2 x i32> %op1, %op2
1941;
1942define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
1943; X86-SSE-LABEL: mul_2xi16_varconst3:
1944; X86-SSE:       # %bb.0: # %entry
1945; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1946; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1947; X86-SSE-NEXT:    movl c, %edx
1948; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1949; X86-SSE-NEXT:    pxor %xmm1, %xmm1
1950; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1951; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1952; X86-SSE-NEXT:    pmuludq {{\.LCPI.*}}, %xmm0
1953; X86-SSE-NEXT:    pmuludq {{\.LCPI.*}}, %xmm1
1954; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1955; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
1956; X86-SSE-NEXT:    retl
1957;
1958; X86-AVX-LABEL: mul_2xi16_varconst3:
1959; X86-AVX:       # %bb.0: # %entry
1960; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
1961; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1962; X86-AVX-NEXT:    movl c, %edx
1963; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1964; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1965; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
1966; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
1967; X86-AVX-NEXT:    retl
1968;
1969; X64-SSE-LABEL: mul_2xi16_varconst3:
1970; X64-SSE:       # %bb.0: # %entry
1971; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
1972; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1973; X64-SSE-NEXT:    pxor %xmm1, %xmm1
1974; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1975; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1976; X64-SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm0
1977; X64-SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm1
1978; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1979; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
1980; X64-SSE-NEXT:    retq
1981;
1982; X64-AVX-LABEL: mul_2xi16_varconst3:
1983; X64-AVX:       # %bb.0: # %entry
1984; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
1985; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1986; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1987; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
1988; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
1989; X64-AVX-NEXT:    retq
1990entry:
1991  %pre = load i32*, i32** @c
1992  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
1993  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
1994  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
1995  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
1996  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
1997  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
1998  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
1999  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
2000  ret void
2001}
2002
2003; %val = load <2 x i16>
2004; %op1 = sext<2 x i32> %val
2005; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 32768]
2006; %rst = mul <2 x i32> %op1, %op2
2007;
2008define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
2009; X86-SSE-LABEL: mul_2xi16_varconst4:
2010; X86-SSE:       # %bb.0: # %entry
2011; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2012; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2013; X86-SSE-NEXT:    movl c, %edx
2014; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2015; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2016; X86-SSE-NEXT:    psrad $16, %xmm0
2017; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
2018; X86-SSE-NEXT:    pmuludq {{\.LCPI.*}}, %xmm0
2019; X86-SSE-NEXT:    pmuludq {{\.LCPI.*}}, %xmm1
2020; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2021; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
2022; X86-SSE-NEXT:    retl
2023;
2024; X86-AVX-LABEL: mul_2xi16_varconst4:
2025; X86-AVX:       # %bb.0: # %entry
2026; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
2027; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2028; X86-AVX-NEXT:    movl c, %edx
2029; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2030; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
2031; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
2032; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
2033; X86-AVX-NEXT:    retl
2034;
2035; X64-SSE-LABEL: mul_2xi16_varconst4:
2036; X64-SSE:       # %bb.0: # %entry
2037; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
2038; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2039; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2040; X64-SSE-NEXT:    psrad $16, %xmm0
2041; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
2042; X64-SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm0
2043; X64-SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm1
2044; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2045; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
2046; X64-SSE-NEXT:    retq
2047;
2048; X64-AVX-LABEL: mul_2xi16_varconst4:
2049; X64-AVX:       # %bb.0: # %entry
2050; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
2051; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2052; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
2053; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2054; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
2055; X64-AVX-NEXT:    retq
2056entry:
2057  %pre = load i32*, i32** @c
2058  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
2059  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
2060  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
2061  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
2062  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
2063  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
2064  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
2065  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
2066  ret void
2067}
2068
2069;
2070; Illegal Types
2071;
2072
2073define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
2074; X86-SSE-LABEL: PR34947:
2075; X86-SSE:       # %bb.0:
2076; X86-SSE-NEXT:    pushl %esi
2077; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2078; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2079; X86-SSE-NEXT:    movdqa (%eax), %xmm4
2080; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2081; X86-SSE-NEXT:    movdqa (%ecx), %xmm1
2082; X86-SSE-NEXT:    movdqa 16(%ecx), %xmm5
2083; X86-SSE-NEXT:    pxor %xmm3, %xmm3
2084; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
2085; X86-SSE-NEXT:    movdqa %xmm4, %xmm2
2086; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2087; X86-SSE-NEXT:    movdqa %xmm4, %xmm6
2088; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
2089; X86-SSE-NEXT:    movdqa %xmm4, %xmm3
2090; X86-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2091; X86-SSE-NEXT:    movd %xmm3, %eax
2092; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3]
2093; X86-SSE-NEXT:    movd %xmm3, %esi
2094; X86-SSE-NEXT:    xorl %edx, %edx
2095; X86-SSE-NEXT:    divl %esi
2096; X86-SSE-NEXT:    movd %edx, %xmm3
2097; X86-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
2098; X86-SSE-NEXT:    movd %xmm7, %eax
2099; X86-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
2100; X86-SSE-NEXT:    movd %xmm7, %esi
2101; X86-SSE-NEXT:    xorl %edx, %edx
2102; X86-SSE-NEXT:    divl %esi
2103; X86-SSE-NEXT:    movd %edx, %xmm7
2104; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
2105; X86-SSE-NEXT:    movd %xmm6, %eax
2106; X86-SSE-NEXT:    movd %xmm5, %esi
2107; X86-SSE-NEXT:    xorl %edx, %edx
2108; X86-SSE-NEXT:    divl %esi
2109; X86-SSE-NEXT:    movd %edx, %xmm3
2110; X86-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1]
2111; X86-SSE-NEXT:    movd %xmm6, %eax
2112; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
2113; X86-SSE-NEXT:    movd %xmm5, %esi
2114; X86-SSE-NEXT:    xorl %edx, %edx
2115; X86-SSE-NEXT:    divl %esi
2116; X86-SSE-NEXT:    movd %edx, %xmm5
2117; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
2118; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
2119; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
2120; X86-SSE-NEXT:    psrld $16, %xmm5
2121; X86-SSE-NEXT:    movd %xmm5, %eax
2122; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
2123; X86-SSE-NEXT:    movd %xmm5, %esi
2124; X86-SSE-NEXT:    xorl %edx, %edx
2125; X86-SSE-NEXT:    divl %esi
2126; X86-SSE-NEXT:    movd %edx, %xmm6
2127; X86-SSE-NEXT:    movd %xmm2, %eax
2128; X86-SSE-NEXT:    movd %xmm1, %esi
2129; X86-SSE-NEXT:    xorl %edx, %edx
2130; X86-SSE-NEXT:    divl %esi
2131; X86-SSE-NEXT:    movd %edx, %xmm5
2132; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
2133; X86-SSE-NEXT:    psrlq $48, %xmm4
2134; X86-SSE-NEXT:    movd %xmm4, %eax
2135; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
2136; X86-SSE-NEXT:    movd %xmm4, %esi
2137; X86-SSE-NEXT:    xorl %edx, %edx
2138; X86-SSE-NEXT:    divl %esi
2139; X86-SSE-NEXT:    movd %edx, %xmm4
2140; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
2141; X86-SSE-NEXT:    movd %xmm2, %eax
2142; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2143; X86-SSE-NEXT:    movd %xmm1, %esi
2144; X86-SSE-NEXT:    xorl %edx, %edx
2145; X86-SSE-NEXT:    divl %esi
2146; X86-SSE-NEXT:    movd %edx, %xmm1
2147; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2148; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
2149; X86-SSE-NEXT:    movd %xmm0, %eax
2150; X86-SSE-NEXT:    xorl %edx, %edx
2151; X86-SSE-NEXT:    divl 32(%ecx)
2152; X86-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
2153; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
2154; X86-SSE-NEXT:    pmuludq %xmm0, %xmm5
2155; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
2156; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
2157; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2158; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2159; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
2160; X86-SSE-NEXT:    pmuludq %xmm0, %xmm3
2161; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2162; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
2163; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
2164; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2165; X86-SSE-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
2166; X86-SSE-NEXT:    movl %eax, (%eax)
2167; X86-SSE-NEXT:    movdqa %xmm3, (%eax)
2168; X86-SSE-NEXT:    movdqa %xmm2, (%eax)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: PR34947:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    pushl %ebp
; X86-AVX1-NEXT:    pushl %ebx
; X86-AVX1-NEXT:    pushl %edi
; X86-AVX1-NEXT:    pushl %esi
; X86-AVX1-NEXT:    subl $16, %esp
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vmovd %xmm1, %eax
; X86-AVX1-NEXT:    xorl %edx, %edx
; X86-AVX1-NEXT:    divl 32(%ecx)
; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
; X86-AVX1-NEXT:    vmovdqa (%ecx), %xmm1
; X86-AVX1-NEXT:    vmovdqa 16(%ecx), %xmm3
; X86-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
; X86-AVX1-NEXT:    xorl %edx, %edx
; X86-AVX1-NEXT:    divl %ecx
; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
; X86-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
; X86-AVX1-NEXT:    xorl %edx, %edx
; X86-AVX1-NEXT:    divl %ecx
; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
; X86-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
; X86-AVX1-NEXT:    xorl %edx, %edx
; X86-AVX1-NEXT:    divl %ecx
; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
; X86-AVX1-NEXT:    vmovd %xmm2, %eax
; X86-AVX1-NEXT:    vmovd %xmm3, %ecx
; X86-AVX1-NEXT:    xorl %edx, %edx
; X86-AVX1-NEXT:    divl %ecx
; X86-AVX1-NEXT:    movl %edx, %ebp
; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
; X86-AVX1-NEXT:    xorl %edx, %edx
; X86-AVX1-NEXT:    divl %ecx
; X86-AVX1-NEXT:    movl %edx, %ebx
; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
; X86-AVX1-NEXT:    xorl %edx, %edx
; X86-AVX1-NEXT:    divl %esi
; X86-AVX1-NEXT:    movl %edx, %esi
; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %edi
; X86-AVX1-NEXT:    xorl %edx, %edx
; X86-AVX1-NEXT:    divl %edi
; X86-AVX1-NEXT:    movl %edx, %edi
; X86-AVX1-NEXT:    vmovd %xmm0, %eax
; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
; X86-AVX1-NEXT:    xorl %edx, %edx
; X86-AVX1-NEXT:    divl %ecx
; X86-AVX1-NEXT:    vmovd %edx, %xmm0
; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpinsrd $3, %ebx, %xmm0, %xmm0
; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
; X86-AVX1-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
; X86-AVX1-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
; X86-AVX1-NEXT:    imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-AVX1-NEXT:    # imm = 0x2007
; X86-AVX1-NEXT:    movl %eax, (%eax)
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT:    vmovdqa %xmm1, (%eax)
; X86-AVX1-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX1-NEXT:    addl $16, %esp
; X86-AVX1-NEXT:    popl %esi
; X86-AVX1-NEXT:    popl %edi
; X86-AVX1-NEXT:    popl %ebx
; X86-AVX1-NEXT:    popl %ebp
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: PR34947:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    pushl %edi
; X86-AVX2-NEXT:    pushl %esi
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT:    vmovdqa (%esi), %xmm2
; X86-AVX2-NEXT:    vmovdqa 16(%esi), %xmm3
; X86-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
; X86-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
; X86-AVX2-NEXT:    xorl %edx, %edx
; X86-AVX2-NEXT:    divl %ecx
; X86-AVX2-NEXT:    movl %edx, %ecx
; X86-AVX2-NEXT:    vmovd %xmm3, %edi
; X86-AVX2-NEXT:    vmovd %xmm4, %eax
; X86-AVX2-NEXT:    xorl %edx, %edx
; X86-AVX2-NEXT:    divl %edi
; X86-AVX2-NEXT:    vmovd %edx, %xmm5
; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
; X86-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
; X86-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
; X86-AVX2-NEXT:    xorl %edx, %edx
; X86-AVX2-NEXT:    divl %ecx
; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
; X86-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
; X86-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
; X86-AVX2-NEXT:    xorl %edx, %edx
; X86-AVX2-NEXT:    divl %ecx
; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
; X86-AVX2-NEXT:    xorl %edx, %edx
; X86-AVX2-NEXT:    divl %ecx
; X86-AVX2-NEXT:    movl %edx, %ecx
; X86-AVX2-NEXT:    vmovd %xmm2, %edi
; X86-AVX2-NEXT:    vmovd %xmm1, %eax
; X86-AVX2-NEXT:    xorl %edx, %edx
; X86-AVX2-NEXT:    divl %edi
; X86-AVX2-NEXT:    vmovd %edx, %xmm4
; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
; X86-AVX2-NEXT:    xorl %edx, %edx
; X86-AVX2-NEXT:    divl %ecx
; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; X86-AVX2-NEXT:    xorl %edx, %edx
; X86-AVX2-NEXT:    divl %ecx
; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
; X86-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; X86-AVX2-NEXT:    vmovd %xmm0, %eax
; X86-AVX2-NEXT:    xorl %edx, %edx
; X86-AVX2-NEXT:    divl 32(%esi)
; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
; X86-AVX2-NEXT:    movl %eax, (%eax)
; X86-AVX2-NEXT:    vmovdqa %ymm0, (%eax)
; X86-AVX2-NEXT:    popl %esi
; X86-AVX2-NEXT:    popl %edi
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-SSE-LABEL: PR34947:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa (%rdi), %xmm4
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movdqa (%rsi), %xmm1
; X64-SSE-NEXT:    movdqa 16(%rsi), %xmm5
; X64-SSE-NEXT:    pxor %xmm3, %xmm3
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; X64-SSE-NEXT:    movdqa %xmm4, %xmm2
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-SSE-NEXT:    movdqa %xmm4, %xmm6
; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
; X64-SSE-NEXT:    movdqa %xmm4, %xmm3
; X64-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-SSE-NEXT:    movd %xmm3, %eax
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3]
; X64-SSE-NEXT:    movd %xmm3, %ecx
; X64-SSE-NEXT:    xorl %edx, %edx
; X64-SSE-NEXT:    divl %ecx
; X64-SSE-NEXT:    movd %edx, %xmm3
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
; X64-SSE-NEXT:    movd %xmm7, %eax
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
; X64-SSE-NEXT:    movd %xmm7, %ecx
; X64-SSE-NEXT:    xorl %edx, %edx
; X64-SSE-NEXT:    divl %ecx
; X64-SSE-NEXT:    movd %edx, %xmm7
; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
; X64-SSE-NEXT:    movd %xmm6, %eax
; X64-SSE-NEXT:    movd %xmm5, %ecx
; X64-SSE-NEXT:    xorl %edx, %edx
; X64-SSE-NEXT:    divl %ecx
; X64-SSE-NEXT:    movd %edx, %xmm3
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1]
; X64-SSE-NEXT:    movd %xmm6, %eax
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
; X64-SSE-NEXT:    movd %xmm5, %ecx
; X64-SSE-NEXT:    xorl %edx, %edx
; X64-SSE-NEXT:    divl %ecx
; X64-SSE-NEXT:    movd %edx, %xmm5
; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; X64-SSE-NEXT:    movdqa %xmm4, %xmm5
; X64-SSE-NEXT:    psrld $16, %xmm5
; X64-SSE-NEXT:    movd %xmm5, %eax
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; X64-SSE-NEXT:    movd %xmm5, %ecx
; X64-SSE-NEXT:    xorl %edx, %edx
; X64-SSE-NEXT:    divl %ecx
; X64-SSE-NEXT:    movd %edx, %xmm6
; X64-SSE-NEXT:    movd %xmm2, %eax
; X64-SSE-NEXT:    movd %xmm1, %ecx
; X64-SSE-NEXT:    xorl %edx, %edx
; X64-SSE-NEXT:    divl %ecx
; X64-SSE-NEXT:    movd %edx, %xmm5
; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; X64-SSE-NEXT:    psrlq $48, %xmm4
; X64-SSE-NEXT:    movd %xmm4, %eax
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
; X64-SSE-NEXT:    movd %xmm4, %ecx
; X64-SSE-NEXT:    xorl %edx, %edx
; X64-SSE-NEXT:    divl %ecx
; X64-SSE-NEXT:    movd %edx, %xmm4
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X64-SSE-NEXT:    movd %xmm2, %eax
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-SSE-NEXT:    movd %xmm1, %ecx
; X64-SSE-NEXT:    xorl %edx, %edx
; X64-SSE-NEXT:    divl %ecx
; X64-SSE-NEXT:    movd %edx, %xmm1
; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
; X64-SSE-NEXT:    movd %xmm0, %eax
; X64-SSE-NEXT:    xorl %edx, %edx
; X64-SSE-NEXT:    divl 32(%rsi)
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; X64-SSE-NEXT:    pmuludq %xmm0, %xmm5
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; X64-SSE-NEXT:    pmuludq %xmm0, %xmm3
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; X64-SSE-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
; X64-SSE-NEXT:    movl %eax, (%rax)
; X64-SSE-NEXT:    movdqa %xmm3, (%rax)
; X64-SSE-NEXT:    movdqa %xmm2, (%rax)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: PR34947:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    pushq %rbp
; X64-AVX1-NEXT:    pushq %rbx
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vmovd %xmm1, %eax
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl 32(%rsi)
; X64-AVX1-NEXT:    movl %edx, %r8d
; X64-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
; X64-AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; X64-AVX1-NEXT:    vmovdqa 16(%rsi), %xmm3
; X64-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %ecx
; X64-AVX1-NEXT:    movl %edx, %r9d
; X64-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
; X64-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %ecx
; X64-AVX1-NEXT:    movl %edx, %r10d
; X64-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
; X64-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %ecx
; X64-AVX1-NEXT:    movl %edx, %r11d
; X64-AVX1-NEXT:    vmovd %xmm2, %eax
; X64-AVX1-NEXT:    vmovd %xmm3, %ecx
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %ecx
; X64-AVX1-NEXT:    movl %edx, %esi
; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %ecx
; X64-AVX1-NEXT:    movl %edx, %edi
; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %ecx
; X64-AVX1-NEXT:    movl %edx, %ecx
; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ebx
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %ebx
; X64-AVX1-NEXT:    movl %edx, %ebx
; X64-AVX1-NEXT:    vmovd %xmm0, %eax
; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %ebp
; X64-AVX1-NEXT:    vmovd %edx, %xmm0
; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vmovd %esi, %xmm2
; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT:    imull $8199, %r8d, %eax # imm = 0x2007
; X64-AVX1-NEXT:    movl %eax, (%rax)
; X64-AVX1-NEXT:    vmovdqa %xmm1, (%rax)
; X64-AVX1-NEXT:    vmovdqa %xmm0, (%rax)
; X64-AVX1-NEXT:    popq %rbx
; X64-AVX1-NEXT:    popq %rbp
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: PR34947:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT:    vmovdqa (%rsi), %xmm2
; X64-AVX2-NEXT:    vmovdqa 16(%rsi), %xmm3
; X64-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
; X64-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %ecx
; X64-AVX2-NEXT:    movl %edx, %ecx
; X64-AVX2-NEXT:    vmovd %xmm3, %edi
; X64-AVX2-NEXT:    vmovd %xmm4, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %edi
; X64-AVX2-NEXT:    vmovd %edx, %xmm5
; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
; X64-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
; X64-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %ecx
; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
; X64-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
; X64-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %ecx
; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
; X64-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %ecx
; X64-AVX2-NEXT:    movl %edx, %ecx
; X64-AVX2-NEXT:    vmovd %xmm2, %edi
; X64-AVX2-NEXT:    vmovd %xmm1, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %edi
; X64-AVX2-NEXT:    vmovd %edx, %xmm4
; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
; X64-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %ecx
; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
; X64-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %ecx
; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
; X64-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vmovd %xmm0, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl 32(%rsi)
; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
; X64-AVX2-NEXT:    movl %eax, (%rax)
; X64-AVX2-NEXT:    vmovdqa %ymm0, (%rax)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
  %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
  %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
  %ext0 = zext <9 x i16> %a0 to <9 x i32>
  %rem = urem <9 x i32> %ext0, %a1
  %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
  store <9 x i32> %mul, <9 x i32>* undef, align 64
  ret void
}
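; Per-lane view of the PR34947 test above (informal sketch only; the ".lane"
; names are illustrative, not real IR). Each of the nine lanes computes:
;
;   %x = zext i16 %a0.lane to i32
;   %r = urem i32 %x, %a1.lane
;   %m = mul i32 %r, 8199            ; 8199 == 0x2007
;
; x86 has no vector integer division, so every urem lane is scalarized into a
; 'divl'; the multiply by 8199 is then re-vectorized for the first eight
; lanes, while the ninth lane is handled with a scalar 'imull $8199'.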