; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s

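; These tests check that a multiply of vector operands extended from a
; narrower element type is shrunk back to the narrow type: the product is
; computed with 16-bit multiplies (pmullw, plus pmulhuw/pmulhw where the high
; half is needed) and widened to 32 bits afterwards with punpck shuffles,
; instead of being lowered as a full 32-bit vector multiply.
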
@c = external global i32*, align 8

; %val1 = load <2 x i8>
; %op1 = zext %val1 to <2 x i32>
; %val2 = load <2 x i8>
; %op2 = zext %val2 to <2 x i32>
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i8>
; %op1 = zext %val1 to <4 x i32>
; %val2 = load <4 x i8>
; %op2 = zext %val2 to <4 x i32>
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i8>
; %op1 = zext %val1 to <8 x i32>
; %val2 = load <8 x i8>
; %op2 = zext %val2 to <8 x i32>
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i8>
; %op1 = zext %val1 to <16 x i32>
; %val2 = load <16 x i8>
; %op2 = zext %val2 to <16 x i32>
; %rst = mul <16 x i32> %op1, %op2
;
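; The full 16-byte vectors are split into low and high halves with
; punpcklbw/punpckhbw, each half is multiplied with pmullw, and the 16-bit
; products are widened to 32 bits with punpcklwd/punpckhwd.
;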
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    movdqa %xmm0, %xmm3
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; CHECK-NEXT:    movdqa %xmm1, %xmm4
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm3, %xmm4
; CHECK-NEXT:    movdqa %xmm4, %xmm3
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; CHECK-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = zext %val1 to <2 x i32>
; %val2 = load <2 x i16>
; %op2 = zext %val2 to <2 x i32>
; %rst = mul <2 x i32> %op1, %op2
;
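; For zero-extended i16 operands the 32-bit product is assembled from two
; 16-bit multiplies: pmullw produces the low half and pmulhuw the high half,
; and the halves are interleaved with punpcklwd.
;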
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i16>
; %op1 = zext %val1 to <4 x i32>
; %val2 = load <4 x i16>
; %op2 = zext %val2 to <4 x i32>
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i16>
; %op1 = zext %val1 to <8 x i32>
; %val2 = load <8 x i16>
; %op2 = zext %val2 to <8 x i32>
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = zext %val1 to <16 x i32>
; %val2 = load <16 x i16>
; %op2 = zext %val2 to <16 x i32>
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT:    movdqa %xmm2, %xmm4
; CHECK-NEXT:    pmulhuw %xmm0, %xmm4
; CHECK-NEXT:    pmullw %xmm0, %xmm2
; CHECK-NEXT:    movdqa %xmm2, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT:    movdqa %xmm3, %xmm4
; CHECK-NEXT:    pmulhuw %xmm1, %xmm4
; CHECK-NEXT:    pmullw %xmm1, %xmm3
; CHECK-NEXT:    movdqa %xmm3, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext %val1 to <2 x i32>
; %val2 = load <2 x i8>
; %op2 = sext %val2 to <2 x i32>
; %rst = mul <2 x i32> %op1, %op2
;
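; For sign-extended i8 operands the bytes are sign-extended to i16 with
; punpcklbw + psraw before the 16-bit multiply, and the 32-bit result is
; recovered by interleaving with punpcklwd and sign-extending with psrad.
;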
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm1
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext %val1 to <2 x i32>
; %val2 = load <2 x i8>
; %op2 = zext %val2 to <2 x i32>
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext_zext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext %val1 to <2 x i32>
; %val2 = load <2 x i16>
; %op2 = sext %val2 to <2 x i32>
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext %val1 to <2 x i32>
; %val2 = load <2 x i16>
; %op2 = zext %val2 to <2 x i32>
; %rst = mul <2 x i32> %op1, %op2
;
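; With one sign-extended and one zero-extended i16 operand, neither pmulhw
; nor pmulhuw produces the correct high half, so this is currently lowered
; as a full 32-bit multiply built from pmuludq.
;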
define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext_zext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmuludq %xmm0, %xmm2
; CHECK-NEXT:    movdqa %xmm0, %xmm3
; CHECK-NEXT:    psrlq $32, %xmm3
; CHECK-NEXT:    pmuludq %xmm1, %xmm3
; CHECK-NEXT:    psllq $32, %xmm3
; CHECK-NEXT:    paddq %xmm2, %xmm3
; CHECK-NEXT:    psrlq $32, %xmm1
; CHECK-NEXT:    pmuludq %xmm0, %xmm1
; CHECK-NEXT:    psllq $32, %xmm1
; CHECK-NEXT:    paddq %xmm3, %xmm1
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = sext %val1 to <16 x i32>
; %val2 = load <16 x i16>
; %op2 = sext %val2 to <16 x i32>
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16_sext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT:    movdqa %xmm2, %xmm4
; CHECK-NEXT:    pmulhw %xmm0, %xmm4
; CHECK-NEXT:    pmullw %xmm0, %xmm2
; CHECK-NEXT:    movdqa %xmm2, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT:    movdqa %xmm3, %xmm4
; CHECK-NEXT:    pmulhw %xmm1, %xmm4
; CHECK-NEXT:    pmullw %xmm1, %xmm3
; CHECK-NEXT:    movdqa %xmm3, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 255]
; %rst = mul <2 x i32> %op1, %op2
;
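; With both constants inside the zero-extended range of i8, the high 16 bits
; of each product are known to be zero, so a single pmullw against a
; constant-pool vector is enough.
;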
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst1:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-128, 127]
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst2:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 256]
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst3:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-1, 255]
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst4:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-129, 127]
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst5:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-128, 128]
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst6:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 65535]
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst1:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhuw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [-32768, 32767]
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst2:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 65536]
; %rst = mul <2 x i32> %op1, %op2
;
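; 65536 does not fit in 16 bits, so the multiply cannot be narrowed and is
; lowered as a full 32-bit multiply built from pmuludq.
;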
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst3:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movl $65536, %ecx # imm = 0x10000
; CHECK-NEXT:    movd %rcx, %xmm1
; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmuludq %xmm1, %xmm2
; CHECK-NEXT:    psrlq $32, %xmm0
; CHECK-NEXT:    pmuludq %xmm1, %xmm0
; CHECK-NEXT:    psllq $32, %xmm0
; CHECK-NEXT:    paddq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext %val to <2 x i32>
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are in the range [0, 32768]
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst4:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT:    movd %rcx, %xmm1
; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmuludq %xmm1, %xmm2
; CHECK-NEXT:    psrlq $32, %xmm0
; CHECK-NEXT:    pmuludq %xmm1, %xmm0
; CHECK-NEXT:    psllq $32, %xmm0
; CHECK-NEXT:    paddq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}