; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop  | FileCheck %s --check-prefix=XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
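
; These tests cover lowering of the vector bit-select pattern
;   (a & mask) | (b & ~mask)
; which SSE/AVX expand to and/andn/or sequences, XOP folds into a single
; VPCMOV, and AVX512 folds into a single VPTERNLOG (AVX512F only provides
; the 512-bit form, hence the separate AVX512VL runs for the 128-bit and
; 256-bit cases).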

;
; 128-bit vectors
;

define <2 x i64> @bitselect_v2i64_rr(<2 x i64>, <2 x i64>) {
; SSE-LABEL: bitselect_v2i64_rr:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_rr:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcmov {{.*}}(%rip), %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_rr:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_rr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_rr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = and <2 x i64> %0, <i64 4294967296, i64 12884901890>
  %4 = and <2 x i64> %1, <i64 -4294967297, i64 -12884901891>
  %5 = or <2 x i64> %4, %3
  ret <2 x i64> %5
}
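
; A decoding aid for the vpternlogq immediates in this file: the imm8 is a
; truth table indexed by (src1_bit << 2) | (src2_bit << 1) | (src3_bit << 0),
; with src1 doubling as the destination. 216 (0xd8) evaluates to
; src3 ? src2 : src1, so the constant-pool operand above acts as the bit mask
; selecting between the two registers.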

define <2 x i64> @bitselect_v2i64_rm(<2 x i64>, <2 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v2i64_rm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_rm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vpcmov {{.*}}(%rip), %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_rm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm1
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_rm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_rm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = load <2 x i64>, <2 x i64>* %1
  %4 = and <2 x i64> %0, <i64 8589934593, i64 3>
  %5 = and <2 x i64> %3, <i64 -8589934594, i64 -4>
  %6 = or <2 x i64> %5, %4
  ret <2 x i64> %6
}

define <2 x i64> @bitselect_v2i64_mr(<2 x i64>* nocapture readonly, <2 x i64>) {
; SSE-LABEL: bitselect_v2i64_mr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_mr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vpcmov {{.*}}(%rip), %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_mr:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm1
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_mr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_mr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = load <2 x i64>, <2 x i64>* %0
  %4 = and <2 x i64> %3, <i64 12884901890, i64 4294967296>
  %5 = and <2 x i64> %1, <i64 -12884901891, i64 -4294967297>
  %6 = or <2 x i64> %4, %5
  ret <2 x i64> %6
}

define <2 x i64> @bitselect_v2i64_mm(<2 x i64>* nocapture readonly, <2 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v2i64_mm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm1
; SSE-NEXT:    movaps (%rsi), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_mm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %xmm0
; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022]
; XOP-NEXT:    vpcmov %xmm1, (%rdi), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_mm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps (%rsi), %xmm1
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_mm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vmovaps (%rsi), %xmm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_mm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm0 = [18446744073709551612,18446744065119617022]
; AVX512VL-NEXT:    vpternlogq $202, (%rdi), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = load <2 x i64>, <2 x i64>* %0
  %4 = load <2 x i64>, <2 x i64>* %1
  %5 = and <2 x i64> %3, <i64 3, i64 8589934593>
  %6 = and <2 x i64> %4, <i64 -4, i64 -8589934594>
  %7 = or <2 x i64> %6, %5
  ret <2 x i64> %7
}
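
; 202 (0xca) is the plain mux src1 ? src2 : src3, used here once the
; selector mask itself has been materialized in the destination register.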

define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v2i64_broadcast_rrr:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm2
; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %rdi, %xmm2
; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq %rdi, %xmm2
; AVX512VL-NEXT:    vpternlogq $226, %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %1 = insertelement <2 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  %3 = xor <2 x i64> %1, <i64 -1, i64 undef>
  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  %5 = and <2 x i64> %a0, %2
  %6 = and <2 x i64> %a1, %4
  %7 = or <2 x i64> %5, %6
  ret <2 x i64> %7
}
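
; 226 (0xe2) is src2 ? src1 : src3, letting the splatted GPR mask in xmm2
; select between the two vector arguments with no and/andn/or expansion.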

define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v2i64_broadcast_rrm:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; XOP-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX2-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX512F-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $228, (%rdi){1to2}, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <2 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  %3 = xor <2 x i64> %1, <i64 -1, i64 undef>
  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  %5 = and <2 x i64> %a0, %2
  %6 = and <2 x i64> %a1, %4
  %7 = or <2 x i64> %5, %6
  ret <2 x i64> %7
}
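
; 228 (0xe4) is src3 ? src1 : src2; with the (%rdi){1to2} embedded broadcast
; the scalar mask load, the splat, and the select all fold into a single
; vpternlogq.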

;
; 256-bit vectors
;

define <4 x i64> @bitselect_v4i64_rr(<4 x i64>, <4 x i64>) {
; SSE-LABEL: bitselect_v4i64_rr:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm3
; SSE-NEXT:    orps %xmm3, %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm2
; SSE-NEXT:    orps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_rr:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_rr:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_rr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_rr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = and <4 x i64> %0, <i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890>
  %4 = and <4 x i64> %1, <i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891>
  %5 = or <4 x i64> %4, %3
  ret <4 x i64> %5
}

define <4 x i64> @bitselect_v4i64_rm(<4 x i64>, <4 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v4i64_rm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [18446744065119617022,18446744073709551612]
; SSE-NEXT:    movaps 16(%rdi), %xmm4
; SSE-NEXT:    andps %xmm2, %xmm4
; SSE-NEXT:    movaps (%rdi), %xmm5
; SSE-NEXT:    andps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andnps %xmm0, %xmm3
; SSE-NEXT:    orps %xmm5, %xmm3
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_rm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm1
; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_rm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm1
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_rm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_rm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %1
  %4 = and <4 x i64> %0, <i64 8589934593, i64 3, i64 8589934593, i64 3>
  %5 = and <4 x i64> %3, <i64 -8589934594, i64 -4, i64 -8589934594, i64 -4>
  %6 = or <4 x i64> %5, %4
  ret <4 x i64> %6
}

define <4 x i64> @bitselect_v4i64_mr(<4 x i64>* nocapture readonly, <4 x i64>) {
; SSE-LABEL: bitselect_v4i64_mr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [12884901890,4294967296]
; SSE-NEXT:    movaps 16(%rdi), %xmm4
; SSE-NEXT:    andps %xmm2, %xmm4
; SSE-NEXT:    movaps (%rdi), %xmm5
; SSE-NEXT:    andps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andnps %xmm0, %xmm3
; SSE-NEXT:    orps %xmm5, %xmm3
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_mr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm1
; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_mr:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm1
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_mr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_mr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %0
  %4 = and <4 x i64> %3, <i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296>
  %5 = and <4 x i64> %1, <i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297>
  %6 = or <4 x i64> %4, %5
  ret <4 x i64> %6
}

define <4 x i64> @bitselect_v4i64_mm(<4 x i64>* nocapture readonly, <4 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v4i64_mm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022]
; SSE-NEXT:    movaps 16(%rsi), %xmm2
; SSE-NEXT:    andps %xmm1, %xmm2
; SSE-NEXT:    movaps (%rsi), %xmm3
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    andnps (%rdi), %xmm0
; SSE-NEXT:    orps %xmm3, %xmm0
; SSE-NEXT:    andnps 16(%rdi), %xmm1
; SSE-NEXT:    orps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_mm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %ymm0
; XOP-NEXT:    vmovdqa {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; XOP-NEXT:    vpcmov %ymm1, (%rdi), %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_mm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vmovaps (%rsi), %ymm1
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_mm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; AVX512F-NEXT:    vmovaps (%rsi), %ymm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_mm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX512VL-NEXT:    vpternlogq $202, (%rdi), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %0
  %4 = load <4 x i64>, <4 x i64>* %1
  %5 = and <4 x i64> %3, <i64 3, i64 8589934593, i64 3, i64 8589934593>
  %6 = and <4 x i64> %4, <i64 -4, i64 -8589934594, i64 -4, i64 -8589934594>
  %7 = or <4 x i64> %6, %5
  ret <4 x i64> %7
}

define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v4i64_broadcast_rrr:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    pandn %xmm3, %xmm5
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    pandn %xmm2, %xmm4
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm2
; XOP-NEXT:    vmovq %rdi, %xmm3
; XOP-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; XOP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
; XOP-NEXT:    vandps %ymm2, %ymm0, %ymm0
; XOP-NEXT:    vandnps %ymm1, %ymm3, %ymm1
; XOP-NEXT:    vorps %ymm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm2
; AVX1-NEXT:    vmovq %rdi, %xmm3
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vandnps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpandn %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %rdi, %xmm2
; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpandn %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq %rdi, %ymm2
; AVX512VL-NEXT:    vpternlogq $226, %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %1 = insertelement <4 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer
  %3 = xor <4 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> zeroinitializer
  %5 = and <4 x i64> %a0, %2
  %6 = and <4 x i64> %a1, %4
  %7 = or <4 x i64> %5, %6
  ret <4 x i64> %7
}
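
; Without AVX2's vpbroadcastq the 256-bit splat is built with vmovq +
; shuffle + vinsertf128; note that the XOP/AVX1 lowering currently moves the
; GPR into two separate xmm temporaries, one for the mask and one for its
; complement.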

define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v4i64_broadcast_rrm:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    pandn %xmm3, %xmm5
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    pandn %xmm2, %xmm4
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrm:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastsd (%rdi), %ymm2
; XOP-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm2
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastsd (%rdi), %ymm2
; AVX512F-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $228, (%rdi){1to4}, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <4 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer
  %3 = xor <4 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> zeroinitializer
  %5 = and <4 x i64> %a0, %2
  %6 = and <4 x i64> %a1, %4
  %7 = or <4 x i64> %5, %6
  ret <4 x i64> %7
}

;
; 512-bit vectors
;

define <8 x i64> @bitselect_v8i64_rr(<8 x i64>, <8 x i64>) {
; SSE-LABEL: bitselect_v8i64_rr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm8 = [18446744060824649725,18446744060824649725]
; SSE-NEXT:    andps %xmm8, %xmm7
; SSE-NEXT:    movaps {{.*#+}} xmm9 = [18446744069414584319,18446744060824649725]
; SSE-NEXT:    andps %xmm9, %xmm6
; SSE-NEXT:    andps %xmm8, %xmm5
; SSE-NEXT:    andps %xmm9, %xmm4
; SSE-NEXT:    movaps %xmm9, %xmm10
; SSE-NEXT:    andnps %xmm0, %xmm10
; SSE-NEXT:    orps %xmm4, %xmm10
; SSE-NEXT:    movaps %xmm8, %xmm4
; SSE-NEXT:    andnps %xmm1, %xmm4
; SSE-NEXT:    orps %xmm5, %xmm4
; SSE-NEXT:    andnps %xmm2, %xmm9
; SSE-NEXT:    orps %xmm6, %xmm9
; SSE-NEXT:    andnps %xmm3, %xmm8
; SSE-NEXT:    orps %xmm7, %xmm8
; SSE-NEXT:    movaps %xmm10, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm9, %xmm2
; SSE-NEXT:    movaps %xmm8, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_rr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725]
; XOP-NEXT:    vpcmov %ymm4, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm1, %ymm3, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_rr:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725]
; AVX-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_rr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = and <8 x i64> %0, <i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890, i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890>
  %4 = and <8 x i64> %1, <i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891>
  %5 = or <8 x i64> %4, %3
  ret <8 x i64> %5
}

define <8 x i64> @bitselect_v8i64_rm(<8 x i64>, <8 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v8i64_rm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [18446744065119617022,18446744073709551612]
; SSE-NEXT:    movaps 48(%rdi), %xmm8
; SSE-NEXT:    andps %xmm4, %xmm8
; SSE-NEXT:    movaps 32(%rdi), %xmm9
; SSE-NEXT:    andps %xmm4, %xmm9
; SSE-NEXT:    movaps 16(%rdi), %xmm7
; SSE-NEXT:    andps %xmm4, %xmm7
; SSE-NEXT:    movaps (%rdi), %xmm6
; SSE-NEXT:    andps %xmm4, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andnps %xmm0, %xmm5
; SSE-NEXT:    orps %xmm6, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    andnps %xmm1, %xmm6
; SSE-NEXT:    orps %xmm7, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    andnps %xmm2, %xmm7
; SSE-NEXT:    orps %xmm9, %xmm7
; SSE-NEXT:    andnps %xmm3, %xmm4
; SSE-NEXT:    orps %xmm8, %xmm4
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_rm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm2
; XOP-NEXT:    vmovdqa 32(%rdi), %ymm3
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
; XOP-NEXT:    # ymm4 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm4, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm1, %ymm3, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_rm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vandps 32(%rdi), %ymm2, %ymm3
; AVX-NEXT:    vandps (%rdi), %ymm2, %ymm4
; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_rm:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512-NEXT:    vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = load <8 x i64>, <8 x i64>* %1
  %4 = and <8 x i64> %0, <i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3>
  %5 = and <8 x i64> %3, <i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4>
  %6 = or <8 x i64> %5, %4
  ret <8 x i64> %6
}

define <8 x i64> @bitselect_v8i64_mr(<8 x i64>* nocapture readonly, <8 x i64>) {
; SSE-LABEL: bitselect_v8i64_mr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [12884901890,4294967296]
; SSE-NEXT:    movaps 48(%rdi), %xmm8
; SSE-NEXT:    andps %xmm4, %xmm8
; SSE-NEXT:    movaps 32(%rdi), %xmm9
; SSE-NEXT:    andps %xmm4, %xmm9
; SSE-NEXT:    movaps 16(%rdi), %xmm7
; SSE-NEXT:    andps %xmm4, %xmm7
; SSE-NEXT:    movaps (%rdi), %xmm6
; SSE-NEXT:    andps %xmm4, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andnps %xmm0, %xmm5
; SSE-NEXT:    orps %xmm6, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    andnps %xmm1, %xmm6
; SSE-NEXT:    orps %xmm7, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    andnps %xmm2, %xmm7
; SSE-NEXT:    orps %xmm9, %xmm7
; SSE-NEXT:    andnps %xmm3, %xmm4
; SSE-NEXT:    orps %xmm8, %xmm4
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_mr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm2
; XOP-NEXT:    vmovdqa 32(%rdi), %ymm3
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [12884901890,4294967296,12884901890,4294967296]
; XOP-NEXT:    # ymm4 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm4, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm1, %ymm3, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_mr:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vandps 32(%rdi), %ymm2, %ymm3
; AVX-NEXT:    vandps (%rdi), %ymm2, %ymm4
; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_mr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512-NEXT:    vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = load <8 x i64>, <8 x i64>* %0
  %4 = and <8 x i64> %3, <i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296>
  %5 = and <8 x i64> %1, <i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297>
  %6 = or <8 x i64> %4, %5
  ret <8 x i64> %6
}

define <8 x i64> @bitselect_v8i64_mm(<8 x i64>* nocapture readonly, <8 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v8i64_mm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [18446744073709551612,18446744065119617022]
; SSE-NEXT:    movaps 48(%rsi), %xmm4
; SSE-NEXT:    andps %xmm3, %xmm4
; SSE-NEXT:    movaps 32(%rsi), %xmm5
; SSE-NEXT:    andps %xmm3, %xmm5
; SSE-NEXT:    movaps 16(%rsi), %xmm2
; SSE-NEXT:    andps %xmm3, %xmm2
; SSE-NEXT:    movaps (%rsi), %xmm1
; SSE-NEXT:    andps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    andnps (%rdi), %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    andnps 16(%rdi), %xmm1
; SSE-NEXT:    orps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    andnps 32(%rdi), %xmm2
; SSE-NEXT:    orps %xmm5, %xmm2
; SSE-NEXT:    andnps 48(%rdi), %xmm3
; SSE-NEXT:    orps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_mm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %ymm0
; XOP-NEXT:    vmovdqa 32(%rsi), %ymm1
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; XOP-NEXT:    # ymm2 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0
; XOP-NEXT:    vpcmov %ymm2, 32(%rdi), %ymm1, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_mm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-NEXT:    vandps 32(%rsi), %ymm1, %ymm2
; AVX-NEXT:    vandps (%rsi), %ymm1, %ymm0
; AVX-NEXT:    vandnps (%rdi), %ymm1, %ymm3
; AVX-NEXT:    vorps %ymm3, %ymm0, %ymm0
; AVX-NEXT:    vandnps 32(%rdi), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_mm:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rsi), %zmm1
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX512-NEXT:    vpternlogq $202, (%rdi), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = load <8 x i64>, <8 x i64>* %0
  %4 = load <8 x i64>, <8 x i64>* %1
  %5 = and <8 x i64> %3, <i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593>
  %6 = and <8 x i64> %4, <i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594>
  %7 = or <8 x i64> %6, %5
  ret <8 x i64> %7
}

define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrr:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm8
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    movdqa %xmm8, %xmm9
; SSE-NEXT:    pandn %xmm7, %xmm9
; SSE-NEXT:    por %xmm9, %xmm3
; SSE-NEXT:    movdqa %xmm8, %xmm7
; SSE-NEXT:    pandn %xmm6, %xmm7
; SSE-NEXT:    por %xmm7, %xmm2
; SSE-NEXT:    movdqa %xmm8, %xmm6
; SSE-NEXT:    pandn %xmm5, %xmm6
; SSE-NEXT:    por %xmm6, %xmm1
; SSE-NEXT:    pandn %xmm4, %xmm8
; SSE-NEXT:    por %xmm8, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm4
; XOP-NEXT:    vmovq %rdi, %xmm5
; XOP-NEXT:    vmovddup {{.*#+}} xmm4 = xmm4[0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm5, %ymm5
; XOP-NEXT:    vandps %ymm4, %ymm1, %ymm1
; XOP-NEXT:    vandps %ymm4, %ymm0, %ymm0
; XOP-NEXT:    vandnps %ymm3, %ymm5, %ymm3
; XOP-NEXT:    vorps %ymm3, %ymm1, %ymm1
; XOP-NEXT:    vandnps %ymm2, %ymm5, %ymm2
; XOP-NEXT:    vorps %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm4
; AVX1-NEXT:    vmovq %rdi, %xmm5
; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = xmm4[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm5, %ymm5
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vandnps %ymm3, %ymm5, %ymm3
; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vandnps %ymm2, %ymm5, %ymm2
; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm4
; AVX2-NEXT:    vpbroadcastq %xmm4, %ymm4
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpandn %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %rdi, %zmm2
; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512-NEXT:    retq
  %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
  %3 = xor <8 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> zeroinitializer
  %5 = and <8 x i64> %a0, %2
  %6 = and <8 x i64> %a1, %4
  %7 = or <8 x i64> %5, %6
  ret <8 x i64> %7
}

define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrm:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm8 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    movdqa %xmm8, %xmm9
; SSE-NEXT:    pandn %xmm7, %xmm9
; SSE-NEXT:    por %xmm9, %xmm3
; SSE-NEXT:    movdqa %xmm8, %xmm7
; SSE-NEXT:    pandn %xmm6, %xmm7
; SSE-NEXT:    por %xmm7, %xmm2
; SSE-NEXT:    movdqa %xmm8, %xmm6
; SSE-NEXT:    pandn %xmm5, %xmm6
; SSE-NEXT:    por %xmm6, %xmm1
; SSE-NEXT:    pandn %xmm4, %xmm8
; SSE-NEXT:    por %xmm8, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrm:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastsd (%rdi), %ymm4
; XOP-NEXT:    vpcmov %ymm4, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm3, %ymm1, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm4
; AVX-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX-NEXT:    vandnps %ymm3, %ymm4, %ymm3
; AVX-NEXT:    vorps %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vandnps %ymm2, %ymm4, %ymm2
; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogq $228, (%rdi){1to8}, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
  %3 = xor <8 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> zeroinitializer
  %5 = and <8 x i64> %a0, %2
  %6 = and <8 x i64> %a1, %4
  %7 = or <8 x i64> %5, %6
  ret <8 x i64> %7
}

; Check that mask registers don't get canonicalized.
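; On AVX512 the i1 values stay in k-registers: the compares against the
; splatted constants write k1/k2 directly, vptestmd/vptestnmd fold the %a0
; test under those masks, and korw joins the two sides without round-tripping
; through vector registers.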
define <4 x i1> @bitselect_v4i1_loop(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: bitselect_v4i1_loop:
; SSE:       # %bb.0: # %bb
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [12,12,12,12]
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    pcmpeqd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pandn %xmm0, %xmm2
; SSE-NEXT:    por %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i1_loop:
; XOP:       # %bb.0: # %bb
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpcomneqd %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vpcomeqd {{.*}}(%rip), %xmm1, %xmm2
; XOP-NEXT:    vpcomeqd {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v4i1_loop:
; AVX1:       # %bb.0: # %bb
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v4i1_loop:
; AVX2:       # %bb.0: # %bb
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [12,12,12,12]
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [15,15,15,15]
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i1_loop:
; AVX512F:       # %bb.0: # %bb
; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpcmpeqd {{.*}}(%rip){1to16}, %zmm1, %k1
; AVX512F-NEXT:    vpcmpeqd {{.*}}(%rip){1to16}, %zmm1, %k2
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0 {%k2}
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-NEXT:    korw %k0, %k1, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i1_loop:
; AVX512VL:       # %bb.0: # %bb
; AVX512VL-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm1, %k2
; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k0 {%k2}
; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1 {%k1}
; AVX512VL-NEXT:    korw %k0, %k1, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
bb:
  %tmp = icmp ne <4 x i32> %a0, zeroinitializer
  %tmp2 = icmp eq <4 x i32> %a1, <i32 12, i32 12, i32 12, i32 12>
  %tmp3 = icmp eq <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
  %tmp4 = select <4 x i1> %tmp, <4 x i1> %tmp2, <4 x i1> %tmp3
  ret <4 x i1> %tmp4
}