; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse41-builtins.c
; _mm_blend_epi16: constant-mask word blend, expressed as a shufflevector.
define <2 x i64> @test_mm_blend_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_blend_epi16:
; SSE:       # %bb.0:
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_epi16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

; _mm_blend_pd: constant-mask double blend (element 0 from %a0, element 1 from %a1).
define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_blend_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %res
}

; _mm_blend_ps: constant-mask float blend.
define <4 x float> @test_mm_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_blend_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x float> %res
}
55
; _mm_blendv_epi8: variable byte blend via the pblendvb intrinsic.
define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; SSE-LABEL: test_mm_blendv_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
  %call = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %arg0, <16 x i8> %arg1, <16 x i8> %arg2)
  %res = bitcast <16 x i8> %call to <2 x i64>
  ret <2 x i64> %res
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

; _mm_blendv_pd: variable double blend via the blendvpd intrinsic.
define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: test_mm_blendv_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

; _mm_blendv_ps: variable float blend via the blendvps intrinsic.
define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: test_mm_blendv_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
113
; _mm_ceil_pd/ps/sd/ss: rounding intrinsics with mode 2 (round toward +inf).
define <2 x double> @test_mm_ceil_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_ceil_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundpd $2, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundpd $2, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <4 x float> @test_mm_ceil_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_ceil_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    roundps $2, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundps $2, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <2 x double> @test_mm_ceil_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_ceil_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundsd $2, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundsd $2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define <4 x float> @test_mm_ceil_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_ceil_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    roundss $2, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundss $2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
173
; _mm_cmpeq_epi64: 64-bit equality compare; AVX512 produces a mask register first.
define <2 x i64> @test_mm_cmpeq_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_cmpeq_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test_mm_cmpeq_epi64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test_mm_cmpeq_epi64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <2 x i64> %a0, %a1
  %res = sext <2 x i1> %cmp to <2 x i64>
  ret <2 x i64> %res
}
194
; _mm_cvtepi8_epi16 .. _mm_cvtepi32_epi64: sign-extending conversions
; (shufflevector narrows to the low elements, then sext widens them).
define <2 x i64> @test_mm_cvtepi8_epi16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sext = sext <8 x i8> %ext0 to <8 x i16>
  %res = bitcast <8 x i16> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi8_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = sext <4 x i8> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi8_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbq %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i8> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepi16_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi16_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxwd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi16_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = sext <4 x i16> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi16_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi16_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxwq %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi16_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i16> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepi32_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi32_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxdq %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi32_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i32> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
293
; _mm_cvtepu8_epi16 .. _mm_cvtepu32_epi64: zero-extending conversions
; (same narrowing shuffle as the signed tests, but zext instead of sext).
define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sext = zext <8 x i8> %ext0 to <8 x i16>
  %res = bitcast <8 x i16> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = zext <4 x i8> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i8> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu16_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu16_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = zext <4 x i16> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu16_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu16_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i16> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu32_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu32_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i32> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
392
; _mm_dp_pd / _mm_dp_ps: dot-product intrinsics with immediate mask 7.
define <2 x double> @test_mm_dp_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_dp_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    dppd $7, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_dp_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_mm_dp_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_dp_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    dpps $7, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_dp_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
422
; _mm_extract_* lane extraction tests. The epi64 variant splits into two
; 32-bit extracts on i386 (no 64-bit GPRs).
define i32 @test_mm_extract_epi8(<2 x i64> %a0) {
; SSE-LABEL: test_mm_extract_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrb $1, %xmm0, %eax
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrb $1, %xmm0, %eax
; AVX-NEXT:    movzbl %al, %eax
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = extractelement <16 x i8> %arg0, i32 1
  %res = zext i8 %ext to i32
  ret i32 %res
}

define i32 @test_mm_extract_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_extract_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    extractps $1, %xmm0, %eax
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractps $1, %xmm0, %eax
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = extractelement <4 x i32> %arg0, i32 1
  ret i32 %ext
}

define i64 @test_mm_extract_epi64(<2 x i64> %a0) {
; X86-SSE-LABEL: test_mm_extract_epi64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    extractps $2, %xmm0, %eax
; X86-SSE-NEXT:    extractps $3, %xmm0, %edx
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_extract_epi64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vextractps $2, %xmm0, %eax
; X86-AVX-NEXT:    vextractps $3, %xmm0, %edx
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_extract_epi64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pextrq $1, %xmm0, %rax
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_extract_epi64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = extractelement <2 x i64> %a0, i32 1
  ret i64 %ext
}

define i32 @test_mm_extract_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_extract_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    ret{{[l|q]}}
  %ext = extractelement <4 x float> %a0, i32 1
  %bc = bitcast float %ext to i32
  ret i32 %bc
}
499
; _mm_floor_pd/ps/sd/ss: rounding intrinsics with mode 1 (round toward -inf).
define <2 x double> @test_mm_floor_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_floor_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundpd $1, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundpd $1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 1)
  ret <2 x double> %res
}

define <4 x float> @test_mm_floor_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_floor_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    roundps $1, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundps $1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 1)
  ret <4 x float> %res
}

define <2 x double> @test_mm_floor_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_floor_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundsd $1, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundsd $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 1)
  ret <2 x double> %res
}

define <4 x float> @test_mm_floor_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_floor_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    roundss $1, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundss $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 1)
  ret <4 x float> %res
}
555
; _mm_insert_* lane insertion tests. The epi64 variant becomes two 32-bit
; pinsrd operations on i386.
define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    pinsrb $1, %eax, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_insert_epi8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_insert_epi8:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movzbl %dil, %eax
; X64-SSE-NEXT:    pinsrb $1, %eax, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_insert_epi8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movzbl %dil, %eax
; X64-AVX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res = insertelement <16 x i8> %arg0, i8 %a1,i32 1
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_insert_epi32(<2 x i64> %a0, i32 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_insert_epi32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_insert_epi32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pinsrd $1, %edi, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_insert_epi32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res = insertelement <4 x i32> %arg0, i32 %a1,i32 1
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_insert_epi64(<2 x i64> %a0, i64 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pinsrd $2, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    pinsrd $3, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_insert_epi64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_insert_epi64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pinsrq $1, %rdi, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_insert_epi64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %res = insertelement <2 x i64> %a0, i64 %a1,i32 1
  ret <2 x i64> %res
}

define <4 x float> @test_mm_insert_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_insert_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_insert_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
652
; _mm_max_* tests: icmp+select patterns that should lower to single min/max ops.
define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxsb %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %cmp = icmp sgt <16 x i8> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
  %bc = bitcast <16 x i8> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp sgt <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epu16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxuw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epu16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %cmp = icmp ugt <8 x i16> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
  %bc = bitcast <8 x i16> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epu32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxud %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epu32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp ugt <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
724
; _mm_min_* tests: mirror of the max tests using slt/ult compares.
define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    pminsb %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %cmp = icmp slt <16 x i8> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
  %bc = bitcast <16 x i8> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pminsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp slt <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epu16:
; SSE:       # %bb.0:
; SSE-NEXT:    pminuw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epu16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %cmp = icmp ult <8 x i16> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
  %bc = bitcast <8 x i16> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epu32:
; SSE:       # %bb.0:
; SSE-NEXT:    pminud %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epu32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp ult <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
796
; _mm_minpos_epu16 / _mm_mpsadbw_epu8: direct intrinsic pass-throughs.
define <2 x i64> @test_mm_minpos_epu16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_minpos_epu16:
; SSE:       # %bb.0:
; SSE-NEXT:    phminposuw %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_minpos_epu16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphminposuw %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %arg0)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone

define <2 x i64> @test_mm_mpsadbw_epu8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mpsadbw_epu8:
; SSE:       # %bb.0:
; SSE-NEXT:    mpsadbw $1, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_mpsadbw_epu8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmpsadbw $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %arg0, <16 x i8> %arg1, i8 1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
831
; _mm_mul_epi32 expressed as generic IR (shl 32 / ashr exact 32 on each operand,
; then a 64-bit mul). The captured codegen shows the sign-extension halves are
; materialized explicitly (shifts + shuffle/blend on SSE/AVX1, vpsraq on AVX512)
; before the multiply; SSE/AVX1 use (v)pmuldq while AVX512DQ selects vpmullq.
define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mul_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE-NEXT:    pmuldq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test_mm_mul_epi32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test_mm_mul_epi32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
; AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    ret{{[l|q]}}
  %A = shl <2 x i64> %a0, <i64 32, i64 32>
  %A1 = ashr exact <2 x i64> %A, <i64 32, i64 32>
  %B = shl <2 x i64> %a1, <i64 32, i64 32>
  %B1 = ashr exact <2 x i64> %B, <i64 32, i64 32>
  %res = mul nsw <2 x i64> %A1, %B1
  ret <2 x i64> %res
}
874
; _mm_mullo_epi32: a plain <4 x i32> mul should select to a single (v)pmulld.
define <2 x i64> @test_mm_mullo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mullo_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_mullo_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = mul <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
891
; _mm_packus_epi32: the llvm.x86.sse41.packusdw intrinsic should select to a
; single (v)packusdw.
define <2 x i64> @test_mm_packus_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_packus_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    packusdw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_packus_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
909
; _mm_round_pd with rounding immediate 4 (_MM_FROUND_CUR_DIRECTION): should
; select to (v)roundpd $4. (llvm.x86.sse41.round.pd is declared elsewhere in
; this file.)
define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_round_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundpd $4, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
  ret <2 x double> %res
}
923
; _mm_round_ps with rounding immediate 4: should select to (v)roundps $4.
define <4 x float> @test_mm_round_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_round_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    roundps $4, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundps $4, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
  ret <4 x float> %res
}
937
; _mm_round_sd with rounding immediate 4: should select to (v)roundsd $4,
; rounding the low element of %a1 into the low lane of %a0.
define <2 x double> @test_mm_round_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_round_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundsd $4, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundsd $4, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 4)
  ret <2 x double> %res
}
951
; _mm_round_ss with rounding immediate 4: should select to (v)roundss $4,
; rounding the low element of %a1 into the low lane of %a0.
define <4 x float> @test_mm_round_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_round_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    roundss $4, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundss $4, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 4)
  ret <4 x float> %res
}
965
; _mm_stream_load_si128: the llvm.x86.sse41.movntdqa intrinsic should select to
; a non-temporal (v)movntdqa load. i386 first reloads the pointer argument from
; the stack; x86-64 uses it directly from %rdi.
define <2 x i64> @test_mm_stream_load_si128(<2 x i64>* %a0) {
; X86-SSE-LABEL: test_mm_stream_load_si128:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movntdqa (%eax), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_stream_load_si128:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovntdqa (%eax), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_stream_load_si128:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_stream_load_si128:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT:    retq
  %arg0 = bitcast <2 x i64>* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %arg0)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone
993
; _mm_test_all_ones: ptestc against an all-ones mask. Expect an all-ones vector
; materialized via pcmpeqd, then (v)ptest + setb (CF) into a zeroed %eax.
define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
; SSE-LABEL: test_mm_test_all_ones:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    setb %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_test_all_ones:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> <i64 -1, i64 -1>)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
1014
; _mm_test_all_zeros: llvm.x86.sse41.ptestz should lower to (v)ptest + sete
; (ZF) into a zeroed %eax.
define i32 @test_mm_test_all_zeros(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_test_all_zeros:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    sete %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_test_all_zeros:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    sete %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
1033
; _mm_test_mix_ones_zeros: llvm.x86.sse41.ptestnzc should lower to (v)ptest +
; seta (CF=0 and ZF=0) into a zeroed %eax.
define i32 @test_mm_test_mix_ones_zeros(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_test_mix_ones_zeros:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    seta %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_test_mix_ones_zeros:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    seta %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
1052
; _mm_testc_si128: llvm.x86.sse41.ptestc should lower to (v)ptest + setb (CF)
; into a zeroed %eax. (ptestc is declared after test_mm_test_all_ones above.)
define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_testc_si128:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    setb %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_testc_si128:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
1070
; _mm_testnzc_si128: llvm.x86.sse41.ptestnzc should lower to (v)ptest + seta
; into a zeroed %eax — same codegen as test_mm_test_mix_ones_zeros.
define i32 @test_mm_testnzc_si128(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_testnzc_si128:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    seta %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_testnzc_si128:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    seta %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
1088
; _mm_testz_si128: llvm.x86.sse41.ptestz should lower to (v)ptest + sete (ZF)
; into a zeroed %eax — same codegen as test_mm_test_all_zeros.
define i32 @test_mm_testz_si128(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_testz_si128:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    sete %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_testz_si128:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    sete %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
1106