; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2

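; Multiply <16 x i8> by a splat of 117. There is no byte-wide vector multiply,
; so the elements are sign-extended to 16-bit lanes, multiplied with pmullw,
; and the low bytes are packed back together.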
define <16 x i8> @mul8c(<16 x i8> %i) nounwind  {
; SSE2-LABEL: mul8c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul8c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pmullw %xmm2, %xmm0
; SSE41-NEXT:    pand %xmm3, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul8c:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
entry:
  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <16 x i8> %A
}

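; Multiply <8 x i16> by a splat of 117; this maps directly to a single pmullw
; with a constant-pool operand.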
define <8 x i16> @mul16c(<8 x i16> %i) nounwind  {
; SSE-LABEL: mul16c:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: mul16c:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <8 x i16> %A
}

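; Multiply <4 x i32> by a splat of 117. SSE2 has no 32-bit element multiply and
; expands through pmuludq plus shuffles; SSE4.1 and AVX2 use pmulld.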
define <4 x i32> @a(<4 x i32> %i) nounwind  {
; SSE2-LABEL: a:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: a:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: a:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
  ret <4 x i32> %A
}

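; Multiply <2 x i64> by a splat of 117. There is no 64-bit vector multiply, so
; the product is assembled from 32x32->64-bit pmuludq partial products combined
; with shifts and adds.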
define <2 x i64> @b(<2 x i64> %i) nounwind  {
; SSE-LABEL: b:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: b:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [117,117]
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <2 x i64> %i, < i64 117, i64 117 >
  ret <2 x i64> %A
}

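; Variable <16 x i8> multiply; both operands are sign-extended to 16-bit lanes
; before pmullw, then the low bytes are packed back together.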
define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind  {
; SSE2-LABEL: mul8:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul8:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm3
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
; SSE41-NEXT:    pmullw %xmm3, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    pand %xmm3, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul8:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
entry:
  %A = mul <16 x i8> %i, %j
  ret <16 x i8> %A
}

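; Variable <8 x i16> multiply; a single pmullw.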
define <8 x i16> @mul16(<8 x i16> %i, <8 x i16> %j) nounwind  {
; SSE-LABEL: mul16:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: mul16:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <8 x i16> %i, %j
  ret <8 x i16> %A
}

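; Variable <4 x i32> multiply; SSE2 again goes through pmuludq and shuffles,
; while SSE4.1 and AVX2 use pmulld.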
define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind  {
; SSE2-LABEL: c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: c:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

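; Variable <2 x i64> multiply; both 32-bit cross products are needed now, so
; three pmuludq are required instead of the two used for the constant splat.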
define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind  {
; SSE-LABEL: d:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    paddq %xmm3, %xmm2
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: d:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX2-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpsllq $32, %xmm3, %xmm3
; AVX2-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

declare void @foo()

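; As @c, but the call to @foo forces both operands to be spilled; SSE4.1 and
; AVX2 fold one of the reloads directly into pmulld.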
define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind  {
; SSE2-LABEL: e:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    callq foo
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: e:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    subq $40, %rsp
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE41-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE41-NEXT:    callq foo
; SSE41-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE41-NEXT:    pmulld {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE41-NEXT:    addq $40, %rsp
; SSE41-NEXT:    retq
;
; AVX2-LABEL: e:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    subq $40, %rsp
; AVX2-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    callq foo
; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    addq $40, %rsp
; AVX2-NEXT:    retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

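; As @d, but with the operands spilled and reloaded around the call.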
define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind  {
; SSE-LABEL: f:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    subq $40, %rsp
; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    callq foo
; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE-NEXT:    pmuludq %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm1
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm0, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    paddq %xmm1, %xmm2
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm3, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    addq $40, %rsp
; SSE-NEXT:    retq
;
; AVX2-LABEL: f:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    subq $40, %rsp
; AVX2-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    callq foo
; AVX2-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; AVX2-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vpsrlq $32, %xmm2, %xmm1
; AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlq $32, %xmm3, %xmm1
; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    addq $40, %rsp
; AVX2-NEXT:    retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

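; Multiply <4 x i64> by a splat of 117. SSE processes the two 128-bit halves
; separately; AVX2 handles the whole vector in one ymm register.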
define <4 x i64> @b1(<4 x i64> %i) nounwind  {
; SSE-LABEL: b1:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: b1:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
entry:
  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
  ret <4 x i64> %A
}

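; Variable <4 x i64> multiply.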
define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind  {
; SSE-LABEL: b2:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm4
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    psrlq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm0, %xmm5
; SSE-NEXT:    psllq $32, %xmm5
; SSE-NEXT:    paddq %xmm5, %xmm4
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    pmuludq %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm1, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    paddq %xmm4, %xmm2
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    paddq %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: b2:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
entry:
  %A = mul <4 x i64> %i, %j
  ret <4 x i64> %A
}