; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=KNL %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s

; Insert a float loaded from %br into lane 1 and the scalar %y into lane 14 of
; a <16 x float>. Lane 14 is element 2 of the fourth 128-bit quarter, which is
; why the expected code extracts and re-inserts subvector 3.
define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; CHECK-LABEL: test1:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; CHECK-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %rrr = load float, float* %br
  %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
  %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
  ret <16 x float> %rrr3
}

; Insert a double loaded from %br into lane 1 and the scalar %y into lane 6 of
; an <8 x double>. Lane 6 is element 0 of the fourth 128-bit quarter.
define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; CHECK-LABEL: test2:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; CHECK-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; CHECK-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; CHECK-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %rrr = load double, double* %br
  %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
  %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
  ret <8 x double> %rrr3
}

; Copy lane 4 of a <16 x float> into lane 1 of the same vector.
define <16 x float> @test3(<16 x float> %x) nounwind {
; CHECK-LABEL: test3:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %eee = extractelement <16 x float> %x, i32 4
  %rrr2 = insertelement <16 x float> %x, float %eee, i32 1
  ret <16 x float> %rrr2
}

; Copy lane 4 of an <8 x i64> into lane 1 of the same vector. Lane 4 is
; element 0 of the third 128-bit quarter (subvector index 2).
define <8 x i64> @test4(<8 x i64> %x) nounwind {
; CHECK-LABEL: test4:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
; CHECK-NEXT:    vmovq %xmm1, %rax
; CHECK-NEXT:    vpinsrq $1, %rax, %xmm0, %xmm1
; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %eee = extractelement <8 x i64> %x, i32 4
  %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
  ret <8 x i64> %rrr2
}

; Extract lane 3 of a <4 x float> and return its bit pattern as an i32.
define i32 @test5(<4 x float> %x) nounwind {
; CHECK-LABEL: test5:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vextractps $3, %xmm0, %eax
; CHECK-NEXT:    retq
  %ef = extractelement <4 x float> %x, i32 3
  %ei = bitcast float %ef to i32
  ret i32 %ei
}

; Extract lane 3 of a <4 x float> and store it straight to %out.
define void @test6(<4 x float> %x, float* %out) nounwind {
; CHECK-LABEL: test6:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vextractps $3, %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %ef = extractelement <4 x float> %x, i32 3
  store float %ef, float* %out, align 4
  ret void
}

; Extract a float from a <16 x float> at the run-time index %ind. The expected
; code spills the vector to a 64-byte aligned stack slot and masks the index
; with 15 before the scalar load.
define float @test7(<16 x float> %x, i32 %ind) nounwind {
; CHECK-LABEL: test7:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    andl $15, %edi
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %e = extractelement <16 x float> %x, i32 %ind
  ret float %e
}

; Extract a double from an <8 x double> at the run-time index %ind. The
; expected code goes through a stack spill and masks the index with 7.
define double @test8(<8 x double> %x, i32 %ind) nounwind {
; CHECK-LABEL: test8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    andl $7, %edi
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %e = extractelement <8 x double> %x, i32 %ind
  ret double %e
}

; Extract a float from an <8 x float> at the run-time index %ind. This is the
; 256-bit variant: the spill slot is 32-byte aligned and the index is masked
; with 7.
define float @test9(<8 x float> %x, i32 %ind) nounwind {
; CHECK-LABEL: test9:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    andq $-32, %rsp
; CHECK-NEXT:    subq $64, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
; CHECK-NEXT:    andl $7, %edi
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %e = extractelement <8 x float> %x, i32 %ind
  ret float %e
}

; Extract an i32 from a <16 x i32> at the run-time index %ind. The expected
; code spills the vector and loads the element with a scaled-index address.
define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; CHECK-LABEL: test10:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    andl $15, %edi
; CHECK-NEXT:    movl (%rsp,%rdi,4), %eax
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %e = extractelement <16 x i32> %x, i32 %ind
  ret i32 %e
}

; Branch on lane 4 of a <16 x i1> unsigned less-than compare: return %b when
; the bit is set, otherwise return %b + %a. The expected code shifts the mask
; right by 4 and tests the low bit.
define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
; KNL-LABEL: test11:
; KNL:       ## %bb.0:
; KNL-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
; KNL-NEXT:    kshiftrw $4, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    testb $1, %al
; KNL-NEXT:    je LBB10_2
; KNL-NEXT:  ## %bb.1: ## %A
; KNL-NEXT:    vmovdqa64 %zmm1, %zmm0
; KNL-NEXT:    retq
; KNL-NEXT:  LBB10_2: ## %B
; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: test11:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
; SKX-NEXT:    kshiftrw $4, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    testb $1, %al
; SKX-NEXT:    je LBB10_2
; SKX-NEXT:  ## %bb.1: ## %A
; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
; SKX-NEXT:    retq
; SKX-NEXT:  LBB10_2: ## %B
; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %cmp_res = icmp ult <16 x i32> %a, %b
  %ia = extractelement <16 x i1> %cmp_res, i32 4
  br i1 %ia, label %A, label %B
  A:
    ret <16 x i32>%b
  B:
   %c = add <16 x i32>%b, %a
   ret <16 x i32>%c
}

; Select between %a1 and %b1 on lane 0 of a <16 x i64> signed less-than
; compare. Only lane 0 is consumed, and the expected code issues a single
; 512-bit compare.
define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
; KNL-LABEL: test12:
; KNL:       ## %bb.0:
; KNL-NEXT:    vpcmpgtq %zmm0, %zmm2, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    testb $1, %al
; KNL-NEXT:    cmoveq %rsi, %rdi
; KNL-NEXT:    movq %rdi, %rax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test12:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpcmpgtq %zmm0, %zmm2, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    testb $1, %al
; SKX-NEXT:    cmoveq %rsi, %rdi
; SKX-NEXT:    movq %rdi, %rax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %cmpvector_func.i = icmp slt <16 x i64> %a, %b
  %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
  %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
  ret i64 %res
}

; Insert the scalar compare result (%a u< %b) into lane 0 of a constant
; <16 x i1> mask and return the mask as an i16. The expected code clears bit 0
; of the constant with a shift pair and then ORs in the new bit.
define i16 @test13(i32 %a, i32 %b) {
; KNL-LABEL: test13:
; KNL:       ## %bb.0:
; KNL-NEXT:    cmpl %esi, %edi
; KNL-NEXT:    setb %al
; KNL-NEXT:    movw $-4, %cx
; KNL-NEXT:    kmovw %ecx, %k0
; KNL-NEXT:    kshiftrw $1, %k0, %k0
; KNL-NEXT:    kshiftlw $1, %k0, %k0
; KNL-NEXT:    andl $1, %eax
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    korw %k1, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    ## kill: def $ax killed $ax killed $eax
; KNL-NEXT:    retq
;
; SKX-LABEL: test13:
; SKX:       ## %bb.0:
; SKX-NEXT:    cmpl %esi, %edi
; SKX-NEXT:    setb %al
; SKX-NEXT:    movw $-4, %cx
; SKX-NEXT:    kmovd %ecx, %k0
; SKX-NEXT:    kshiftrw $1, %k0, %k0
; SKX-NEXT:    kshiftlw $1, %k0, %k0
; SKX-NEXT:    andl $1, %eax
; SKX-NEXT:    kmovw %eax, %k1
; SKX-NEXT:    korw %k1, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    ## kill: def $ax killed $ax killed $eax
; SKX-NEXT:    retq
  %cmp_res = icmp ult i32 %a, %b
  %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
  %res = bitcast <16 x i1> %maskv to i16
  ret i16 %res
}

; Select between %a1 and %b1 on lane 4 of an <8 x i64> signed less-than
; compare.
define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
; KNL-LABEL: test14:
; KNL:       ## %bb.0:
; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
; KNL-NEXT:    kshiftrw $4, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    testb $1, %al
; KNL-NEXT:    cmoveq %rsi, %rdi
; KNL-NEXT:    movq %rdi, %rax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test14:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
; SKX-NEXT:    kshiftrw $4, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    testb $1, %al
; SKX-NEXT:    cmoveq %rsi, %rdi
; SKX-NEXT:    movq %rdi, %rax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %cmpvector_func.i = icmp slt <8 x i64> %a, %b
  %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
  %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
  ret i64 %res
}

; Insert a loaded i1 into lane 10 of an otherwise undef <16 x i1> and return
; the mask as an i16. The expected code returns either 0 or 0xFFFF depending
; on the loaded byte.
define i16 @test15(i1 *%addr) {
; CHECK-LABEL: test15:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    cmpb $0, (%rdi)
; CHECK-NEXT:    movl $65535, %eax ## imm = 0xFFFF
; CHECK-NEXT:    cmovel %ecx, %eax
; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %x = load i1 , i1 * %addr, align 1
  %x1 = insertelement <16 x i1> undef, i1 %x, i32 10
  %x2 = bitcast <16 x i1>%x1 to i16
  ret i16 %x2
}

; Insert a loaded i1 into lane 10 of the <16 x i1> mask formed from %a and
; return the updated mask as an i16. The expected code replaces the bit with
; an xor / shift / xor sequence on mask registers.
define i16 @test16(i1 *%addr, i16 %a) {
; KNL-LABEL: test16:
; KNL:       ## %bb.0:
; KNL-NEXT:    movb (%rdi), %al
; KNL-NEXT:    kmovw %esi, %k0
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    kshiftrw $10, %k0, %k2
; KNL-NEXT:    kxorw %k1, %k2, %k1
; KNL-NEXT:    kshiftlw $15, %k1, %k1
; KNL-NEXT:    kshiftrw $5, %k1, %k1
; KNL-NEXT:    kxorw %k1, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    ## kill: def $ax killed $ax killed $eax
; KNL-NEXT:    retq
;
; SKX-LABEL: test16:
; SKX:       ## %bb.0:
; SKX-NEXT:    kmovb (%rdi), %k0
; SKX-NEXT:    kmovd %esi, %k1
; SKX-NEXT:    kshiftrw $10, %k1, %k2
; SKX-NEXT:    kxorw %k0, %k2, %k0
; SKX-NEXT:    kshiftlw $15, %k0, %k0
; SKX-NEXT:    kshiftrw $5, %k0, %k0
; SKX-NEXT:    kxorw %k0, %k1, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    ## kill: def $ax killed $ax killed $eax
; SKX-NEXT:    retq
  %x = load i1 , i1 * %addr, align 128
  %a1 = bitcast i16 %a to <16 x i1>
  %x1 = insertelement <16 x i1> %a1, i1 %x, i32 10
  %x2 = bitcast <16 x i1>%x1 to i16
  ret i16 %x2
}

; Insert a loaded i1 into lane 4 of the <8 x i1> mask formed from %a and
; return the updated mask as an i8. The KNL run is expected to use 16-bit mask
; operations, the SKX runs 8-bit ones.
define i8 @test17(i1 *%addr, i8 %a) {
; KNL-LABEL: test17:
; KNL:       ## %bb.0:
; KNL-NEXT:    movb (%rdi), %al
; KNL-NEXT:    kmovw %esi, %k0
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    kshiftrw $4, %k0, %k2
; KNL-NEXT:    kxorw %k1, %k2, %k1
; KNL-NEXT:    kshiftlw $15, %k1, %k1
; KNL-NEXT:    kshiftrw $11, %k1, %k1
; KNL-NEXT:    kxorw %k1, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    ## kill: def $al killed $al killed $eax
; KNL-NEXT:    retq
;
; SKX-LABEL: test17:
; SKX:       ## %bb.0:
; SKX-NEXT:    kmovb (%rdi), %k0
; SKX-NEXT:    kmovd %esi, %k1
; SKX-NEXT:    kshiftrb $4, %k1, %k2
; SKX-NEXT:    kxorb %k0, %k2, %k0
; SKX-NEXT:    kshiftlb $7, %k0, %k0
; SKX-NEXT:    kshiftrb $3, %k0, %k0
; SKX-NEXT:    kxorb %k0, %k1, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    ## kill: def $al killed $al killed $eax
; SKX-NEXT:    retq
  %x = load i1 , i1 * %addr, align 128
  %a1 = bitcast i8 %a to <8 x i1>
  %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
  %x2 = bitcast <8 x i1>%x1 to i8
  ret i8 %x2
}

; Return lane 1 of an <8 x i64> and store lane 3 to %dst.
define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
; CHECK-LABEL: extract_v8i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpextrq $1, %xmm0, %rax
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %r1 = extractelement <8 x i64> %x, i32 1
  %r2 = extractelement <8 x i64> %x, i32 3
  store i64 %r2, i64* %dst, align 1
  ret i64 %r1
}

; Return lane 1 of a <4 x i64> and store lane 3 to %dst.
define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
; CHECK-LABEL: extract_v4i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpextrq $1, %xmm0, %rax
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %r1 = extractelement <4 x i64> %x, i32 1
  %r2 = extractelement <4 x i64> %x, i32 3
  store i64 %r2, i64* %dst, align 1
  ret i64 %r1
}

; Return lane 0 of a <2 x i64> and store lane 1 to %dst.
define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
; CHECK-LABEL: extract_v2i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovq %xmm0, %rax
; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %r1 = extractelement <2 x i64> %x, i32 0
  %r2 = extractelement <2 x i64> %x, i32 1
  store i64 %r2, i64* %dst, align 1
  ret i64 %r1
}

; Return lane 1 of a <16 x i32> and store lane 5 to %dst.
define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
; CHECK-LABEL: extract_v16i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vextractps $1, %xmm0, %eax
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vextractps $1, %xmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %r1 = extractelement <16 x i32> %x, i32 1
  %r2 = extractelement <16 x i32> %x, i32 5
  store i32 %r2, i32* %dst, align 1
  ret i32 %r1
}

; Return lane 1 of an <8 x i32> and store lane 5 to %dst.
define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
; CHECK-LABEL: extract_v8i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vextractps $1, %xmm0, %eax
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vextractps $1, %xmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %r1 = extractelement <8 x i32> %x, i32 1
  %r2 = extractelement <8 x i32> %x, i32 5
  store i32 %r2, i32* %dst, align 1
  ret i32 %r1
}

; Return lane 1 of a <4 x i32> and store lane 3 to %dst.
define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
; CHECK-LABEL: extract_v4i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vextractps $1, %xmm0, %eax
; CHECK-NEXT:    vextractps $3, %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %r1 = extractelement <4 x i32> %x, i32 1
  %r2 = extractelement <4 x i32> %x, i32 3
  store i32 %r2, i32* %dst, align 1
  ret i32 %r1
}

; Return lane 1 of a <32 x i16> and store lane 9 to %dst.
define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
; CHECK-LABEL: extract_v32i16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrw $1, %xmm0, (%rdi)
; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %r1 = extractelement <32 x i16> %x, i32 1
  %r2 = extractelement <32 x i16> %x, i32 9
  store i16 %r2, i16* %dst, align 1
  ret i16 %r1
}

; Return lane 1 of a <16 x i16> and store lane 9 to %dst.
define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
; CHECK-LABEL: extract_v16i16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrw $1, %xmm0, (%rdi)
; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %r1 = extractelement <16 x i16> %x, i32 1
  %r2 = extractelement <16 x i16> %x, i32 9
  store i16 %r2, i16* %dst, align 1
  ret i16 %r1
}

; Return lane 1 of an <8 x i16> and store lane 3 to %dst.
define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
; CHECK-LABEL: extract_v8i16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
; CHECK-NEXT:    vpextrw $3, %xmm0, (%rdi)
; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %r1 = extractelement <8 x i16> %x, i32 1
  %r2 = extractelement <8 x i16> %x, i32 3
  store i16 %r2, i16* %dst, align 1
  ret i16 %r1
}

; Return lane 1 of a <64 x i8> and store lane 17 to %dst.
define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
; CHECK-LABEL: extract_v64i8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpextrb $1, %xmm0, %eax
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrb $1, %xmm0, (%rdi)
; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %r1 = extractelement <64 x i8> %x, i32 1
  %r2 = extractelement <64 x i8> %x, i32 17
  store i8 %r2, i8* %dst, align 1
  ret i8 %r1
}

; Return lane 1 of a <32 x i8> and store lane 17 to %dst.
define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
; CHECK-LABEL: extract_v32i8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpextrb $1, %xmm0, %eax
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrb $1, %xmm0, (%rdi)
; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %r1 = extractelement <32 x i8> %x, i32 1
  %r2 = extractelement <32 x i8> %x, i32 17
  store i8 %r2, i8* %dst, align 1
  ret i8 %r1
}

; Return lane 1 of a <16 x i8> and store lane 3 to %dst.
define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
; CHECK-LABEL: extract_v16i8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpextrb $1, %xmm0, %eax
; CHECK-NEXT:    vpextrb $3, %xmm0, (%rdi)
; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %r1 = extractelement <16 x i8> %x, i32 1
  %r2 = extractelement <16 x i8> %x, i32 3
  store i8 %r2, i8* %dst, align 1
  ret i8 %r1
}

; Insert an i64 loaded from %ptr into lane 1 and the scalar %y into lane 3 of
; an <8 x i64>.
define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v8i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %val = load i64, i64* %ptr
  %r1 = insertelement <8 x i64> %x, i64 %val, i32 1
  %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
  ret <8 x i64> %r2
}

; Insert an i64 loaded from %ptr into lane 1 and the scalar %y into lane 3 of
; a <4 x i64>.
define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v4i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %val = load i64, i64* %ptr
  %r1 = insertelement <4 x i64> %x, i64 %val, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %y, i32 3
  ret <4 x i64> %r2
}

; Insert an i64 loaded from %ptr into lane 1 and the scalar %y into lane 0 of
; a <2 x i64>, so both lanes of %x are overwritten.
define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v2i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrq $0, %rdi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %val = load i64, i64* %ptr
  %r1 = insertelement <2 x i64> %x, i64 %val, i32 1
  %r2 = insertelement <2 x i64> %r1, i64 %y, i32 0
  ret <2 x i64> %r2
}

; Insert an i32 loaded from %ptr into lane 1 and the scalar %y into lane 5 of
; a <16 x i32>.
define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
; CHECK-LABEL: insert_v16i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %val = load i32, i32* %ptr
  %r1 = insertelement <16 x i32> %x, i32 %val, i32 1
  %r2 = insertelement <16 x i32> %r1, i32 %y, i32 5
  ret <16 x i32> %r2
}

; Insert an i32 loaded from %ptr into lane 1 and the scalar %y into lane 5 of
; an <8 x i32>.
define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
; CHECK-LABEL: insert_v8i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %val = load i32, i32* %ptr
  %r1 = insertelement <8 x i32> %x, i32 %val, i32 1
  %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5
  ret <8 x i32> %r2
}

; Insert an i32 loaded from %ptr into lane 1 and the scalar %y into lane 3 of
; a <4 x i32>.
define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) {
; CHECK-LABEL: insert_v4i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %val = load i32, i32* %ptr
  %r1 = insertelement <4 x i32> %x, i32 %val, i32 1
  %r2 = insertelement <4 x i32> %r1, i32 %y, i32 3
  ret <4 x i32> %r2
}

; Insert an i16 loaded from %ptr into lane 1 and the scalar %y into lane 9 of
; a <32 x i16>. The KNL run is expected to use 256-bit operations, the SKX
; runs 512-bit subvector inserts.
define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
; KNL-LABEL: insert_v32i16:
; KNL:       ## %bb.0:
; KNL-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm2
; KNL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
; KNL-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
; KNL-NEXT:    retq
;
; SKX-LABEL: insert_v32i16:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
; SKX-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %val = load i16, i16* %ptr
  %r1 = insertelement <32 x i16> %x, i16 %val, i32 1
  %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9
  ret <32 x i16> %r2
}

; Insert an i16 loaded from %ptr into lane 1 and the scalar %y into lane 9 of
; a <16 x i16>.
define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
; CHECK-LABEL: insert_v16i16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %val = load i16, i16* %ptr
  %r1 = insertelement <16 x i16> %x, i16 %val, i32 1
  %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
  ret <16 x i16> %r2
}

; Insert an i16 loaded from %ptr into lane 1 and the scalar %y into lane 5 of
; an <8 x i16>.
define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) {
; CHECK-LABEL: insert_v8i16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $5, %edi, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %val = load i16, i16* %ptr
  %r1 = insertelement <8 x i16> %x, i16 %val, i32 1
  %r2 = insertelement <8 x i16> %r1, i16 %y, i32 5
  ret <8 x i16> %r2
}

; Insert an i8 loaded from %ptr into lane 1 and the scalar %y into lane 50 of
; a <64 x i8>. Lane 50 is element 2 of the fourth 128-bit quarter.
define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
; KNL-LABEL: insert_v64i8:
; KNL:       ## %bb.0:
; KNL-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm2
; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT:    vpinsrb $2, %edi, %xmm2, %xmm2
; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT:    retq
;
; SKX-LABEL: insert_v64i8:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; SKX-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
; SKX-NEXT:    vpinsrb $2, %edi, %xmm0, %xmm0
; SKX-NEXT:    vinserti32x4 $3, %xmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %val = load i8, i8* %ptr
  %r1 = insertelement <64 x i8> %x, i8 %val, i32 1
  %r2 = insertelement <64 x i8> %r1, i8 %y, i32 50
  ret <64 x i8> %r2
}

; Insert an i8 loaded from %ptr into lane 1 and the scalar %y into lane 17 of
; a <32 x i8>.
define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
; CHECK-LABEL: insert_v32i8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %val = load i8, i8* %ptr
  %r1 = insertelement <32 x i8> %x, i8 %val, i32 1
  %r2 = insertelement <32 x i8> %r1, i8 %y, i32 17
  ret <32 x i8> %r2
}

; Insert an i8 loaded from %ptr into lane 3 and the scalar %y into lane 10 of
; a <16 x i8>.
define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) {
; CHECK-LABEL: insert_v16i8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0
; CHECK-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %val = load i8, i8* %ptr
  %r1 = insertelement <16 x i8> %x, i8 %val, i32 3
  %r2 = insertelement <16 x i8> %r1, i8 %y, i32 10
  ret <16 x i8> %r2
}

; Insert the scalar %y into lane 1 of an <8 x i64>. The lane lies in the low
; 128 bits, so the expected code edits xmm0 and re-inserts it as subvector 0.
define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) {
; CHECK-LABEL: test_insert_128_v8i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %r = insertelement <8 x i64> %x, i64 %y, i32 1
  ret <8 x i64> %r
}

; Insert the scalar %y into lane 1 of a <16 x i32>.
define <16 x i32> @test_insert_128_v16i32(<16 x i32> %x, i32 %y) {
; CHECK-LABEL: test_insert_128_v16i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm1
; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %r = insertelement <16 x i32> %x, i32 %y, i32 1
  ret <16 x i32> %r
}

; Insert the scalar %y into lane 1 of an <8 x double>.
define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
; CHECK-LABEL: test_insert_128_v8f64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; CHECK-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %r = insertelement <8 x double> %x, double %y, i32 1
  ret <8 x double> %r
}

; Insert the scalar %y into lane 1 of a <16 x float>.
define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) {
; CHECK-LABEL: test_insert_128_v16f32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %r = insertelement <16 x float> %x, float %y, i32 1
  ret <16 x float> %r
}

; Insert the scalar %y into lane 10 of a <16 x i16>. Lane 10 is element 2 of
; the upper 128-bit half.
define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
; CHECK-LABEL: test_insert_128_v16i16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpinsrw $2, %edi, %xmm1, %xmm1
; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %r = insertelement <16 x i16> %x, i16 %y, i32 10
  ret <16 x i16> %r
}

; Insert the scalar %y into lane 20 of a <32 x i8>. Lane 20 is element 4 of
; the upper 128-bit half.
define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
; CHECK-LABEL: test_insert_128_v32i8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpinsrb $4, %edi, %xmm1, %xmm1
; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %r = insertelement <32 x i8> %x, i8 %y, i32 20
  ret <32 x i8> %r
}

; Insert the scalar compare result (%a u< %b) into lane 4 of a <32 x i1>
; vector compare result and return the mask as an i32. The KNL run is expected
; to build the result from two 16-bit masks, while the SKX runs combine them
; with kunpckwd and use 32-bit mask operations.
define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) {
; KNL-LABEL: test_insertelement_v32i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    cmpl %esi, %edi
; KNL-NEXT:    setb %al
; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
; KNL-NEXT:    kmovw %k0, %ecx
; KNL-NEXT:    shll $16, %ecx
; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
; KNL-NEXT:    kshiftrw $4, %k0, %k1
; KNL-NEXT:    kmovw %eax, %k2
; KNL-NEXT:    kxorw %k2, %k1, %k1
; KNL-NEXT:    kshiftlw $15, %k1, %k1
; KNL-NEXT:    kshiftrw $11, %k1, %k1
; KNL-NEXT:    kxorw %k1, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    orl %ecx, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_insertelement_v32i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    cmpl %esi, %edi
; SKX-NEXT:    setb %al
; SKX-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
; SKX-NEXT:    vpcmpltud %zmm3, %zmm1, %k1
; SKX-NEXT:    kunpckwd %k0, %k1, %k0
; SKX-NEXT:    kshiftrd $4, %k0, %k1
; SKX-NEXT:    kmovd %eax, %k2
; SKX-NEXT:    kxord %k2, %k1, %k1
; SKX-NEXT:    kshiftld $31, %k1, %k1
; SKX-NEXT:    kshiftrd $27, %k1, %k1
; SKX-NEXT:    kxord %k1, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %cmp_res_i1 = icmp ult i32 %a, %b
  %cmp_cmp_vec = icmp ult <32 x i32> %x, %y
  %maskv = insertelement <32 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 4
  %res = bitcast <32 x i1> %maskv to i32
  ret i32 %res
}

; Insert the scalar compare result (%a u< %b) into lane 2 of a <4 x i1> vector
; compare result, widen it to <8 x i1> and return it as an i8. The upper four
; lanes are undef because shuffle index 4 selects from the undef second
; operand.
define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) {
; KNL-LABEL: test_iinsertelement_v4i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    cmpl %esi, %edi
; KNL-NEXT:    setb %al
; KNL-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
; KNL-NEXT:    kshiftrw $2, %k0, %k1
; KNL-NEXT:    kmovw %eax, %k2
; KNL-NEXT:    kxorw %k2, %k1, %k1
; KNL-NEXT:    kshiftlw $15, %k1, %k1
; KNL-NEXT:    kshiftrw $13, %k1, %k1
; KNL-NEXT:    kxorw %k1, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    ## kill: def $al killed $al killed $eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_iinsertelement_v4i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    cmpl %esi, %edi
; SKX-NEXT:    setb %al
; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k0
; SKX-NEXT:    kshiftrb $2, %k0, %k1
; SKX-NEXT:    kmovd %eax, %k2
; SKX-NEXT:    kxorb %k2, %k1, %k1
; SKX-NEXT:    kshiftlb $7, %k1, %k1
; SKX-NEXT:    kshiftrb $5, %k1, %k1
; SKX-NEXT:    kxorb %k1, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    ## kill: def $al killed $al killed $eax
; SKX-NEXT:    retq
  %cmp_res_i1 = icmp ult i32 %a, %b
  %cmp_cmp_vec = icmp ult <4 x i32> %x, %y
  %maskv = insertelement <4 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 2
  %res0 = shufflevector <4 x i1> %maskv, <4 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  %res = bitcast <8 x i1> %res0 to i8
  ret i8 %res
}

; Insert the scalar compare result (%a u< %b) into lane 1 of a <2 x i1> vector
; compare result, widen it to <8 x i1> and return it as an i8. The upper six
; lanes are undef because shuffle index 2 selects from the undef second
; operand.
define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) {
; KNL-LABEL: test_iinsertelement_v2i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    cmpl %esi, %edi
; KNL-NEXT:    setb %al
; KNL-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
; KNL-NEXT:    kshiftlw $15, %k0, %k0
; KNL-NEXT:    kshiftrw $15, %k0, %k0
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    kshiftlw $1, %k1, %k1
; KNL-NEXT:    korw %k1, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    ## kill: def $al killed $al killed $eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_iinsertelement_v2i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    cmpl %esi, %edi
; SKX-NEXT:    setb %al
; SKX-NEXT:    vpcmpltuq %xmm1, %xmm0, %k0
; SKX-NEXT:    kshiftlb $7, %k0, %k0
; SKX-NEXT:    kshiftrb $7, %k0, %k0
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    kshiftlb $1, %k1, %k1
; SKX-NEXT:    korb %k1, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    ## kill: def $al killed $al killed $eax
; SKX-NEXT:    retq
  %cmp_res_i1 = icmp ult i32 %a, %b
  %cmp_cmp_vec = icmp ult <2 x i64> %x, %y
  %maskv = insertelement <2 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 1
  %res0 = shufflevector <2 x i1> %maskv, <2 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %res = bitcast <8 x i1> %res0 to i8
  ret i8 %res
}

; Extracts lane 0 of the <2 x i1> result of (%a >u %b) and returns 3 when the
; bit is set, 4 otherwise (via select).
905define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
906; KNL-LABEL: test_extractelement_v2i1:
907; KNL:       ## %bb.0:
908; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
909; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
910; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0
911; KNL-NEXT:    kmovw %k0, %eax
912; KNL-NEXT:    andb $1, %al
913; KNL-NEXT:    movb $4, %cl
914; KNL-NEXT:    subb %al, %cl
915; KNL-NEXT:    movzbl %cl, %eax
916; KNL-NEXT:    vzeroupper
917; KNL-NEXT:    retq
918;
919; SKX-LABEL: test_extractelement_v2i1:
920; SKX:       ## %bb.0:
921; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
922; SKX-NEXT:    kmovd %k0, %eax
923; SKX-NEXT:    andb $1, %al
924; SKX-NEXT:    movb $4, %cl
925; SKX-NEXT:    subb %al, %cl
926; SKX-NEXT:    movzbl %cl, %eax
927; SKX-NEXT:    retq
928  %t1 = icmp ugt <2 x i64> %a, %b
929  %t2 = extractelement <2 x i1> %t1, i32 0
930  %res = select i1 %t2, i8 3, i8 4
931  ret i8 %res
932}
933
; Extracts lane 0 of the <2 x i1> result of (%a >u %b), sign-extends it to i8
; and adds 4, so the result is 3 when the bit is set and 4 when it is clear.
934define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
935; KNL-LABEL: extractelement_v2i1_alt:
936; KNL:       ## %bb.0:
937; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
938; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
939; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0
940; KNL-NEXT:    kmovw %k0, %eax
941; KNL-NEXT:    andb $1, %al
942; KNL-NEXT:    movb $4, %cl
943; KNL-NEXT:    subb %al, %cl
944; KNL-NEXT:    movzbl %cl, %eax
945; KNL-NEXT:    vzeroupper
946; KNL-NEXT:    retq
947;
948; SKX-LABEL: extractelement_v2i1_alt:
949; SKX:       ## %bb.0:
950; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
951; SKX-NEXT:    kmovd %k0, %eax
952; SKX-NEXT:    andb $1, %al
953; SKX-NEXT:    movb $4, %cl
954; SKX-NEXT:    subb %al, %cl
955; SKX-NEXT:    movzbl %cl, %eax
956; SKX-NEXT:    retq
957  %t1 = icmp ugt <2 x i64> %a, %b
958  %t2 = extractelement <2 x i1> %t1, i32 0
959  %sext = sext i1 %t2 to i8
960  %res = add i8 %sext, 4
961  ret i8 %res
962}
963
; Extracts lane 3 of the <4 x i1> result of (%a >u %b) and zero-extends it
; to i8.
964define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
965; KNL-LABEL: test_extractelement_v4i1:
966; KNL:       ## %bb.0:
967; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
968; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
969; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
970; KNL-NEXT:    kshiftrw $3, %k0, %k0
971; KNL-NEXT:    kmovw %k0, %eax
972; KNL-NEXT:    andl $1, %eax
973; KNL-NEXT:    vzeroupper
974; KNL-NEXT:    retq
975;
976; SKX-LABEL: test_extractelement_v4i1:
977; SKX:       ## %bb.0:
978; SKX-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0
979; SKX-NEXT:    kshiftrw $3, %k0, %k0
980; SKX-NEXT:    kmovd %k0, %eax
981; SKX-NEXT:    andl $1, %eax
982; SKX-NEXT:    retq
983  %t1 = icmp ugt <4 x i32> %a, %b
984  %t2 = extractelement <4 x i1> %t1, i32 3
985  %res = zext i1 %t2 to i8
986  ret i8 %res
987}
988
; Extracts lane 2 of the <32 x i1> result of (%a >u %b) on <32 x i8> inputs
; and zero-extends it to i8.
989define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
990; KNL-LABEL: test_extractelement_v32i1:
991; KNL:       ## %bb.0:
992; KNL-NEXT:    vpminub %ymm1, %ymm0, %ymm1
993; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
994; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
995; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
996; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
997; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
998; KNL-NEXT:    kshiftrw $2, %k0, %k0
999; KNL-NEXT:    kmovw %k0, %eax
1000; KNL-NEXT:    andl $1, %eax
1001; KNL-NEXT:    vzeroupper
1002; KNL-NEXT:    retq
1003;
1004; SKX-LABEL: test_extractelement_v32i1:
1005; SKX:       ## %bb.0:
1006; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0
1007; SKX-NEXT:    kshiftrd $2, %k0, %k0
1008; SKX-NEXT:    kmovd %k0, %eax
1009; SKX-NEXT:    andl $1, %eax
1010; SKX-NEXT:    vzeroupper
1011; SKX-NEXT:    retq
1012  %t1 = icmp ugt <32 x i8> %a, %b
1013  %t2 = extractelement <32 x i1> %t1, i32 2
1014  %res = zext i1 %t2 to i8
1015  ret i8 %res
1016}
1017
; Extracts the last lane (63) of the <64 x i1> result of (%a >u %b) and
; returns 3 when the bit is set, 4 otherwise (via select).
1018define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
1019; KNL-LABEL: test_extractelement_v64i1:
1020; KNL:       ## %bb.0:
1021; KNL-NEXT:    vpminub %ymm3, %ymm1, %ymm0
1022; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm0
1023; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1024; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
1025; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
1026; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1027; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1028; KNL-NEXT:    kshiftrw $15, %k0, %k0
1029; KNL-NEXT:    kmovw %k0, %eax
1030; KNL-NEXT:    andb $1, %al
1031; KNL-NEXT:    movb $4, %cl
1032; KNL-NEXT:    subb %al, %cl
1033; KNL-NEXT:    movzbl %cl, %eax
1034; KNL-NEXT:    vzeroupper
1035; KNL-NEXT:    retq
1036;
1037; SKX-LABEL: test_extractelement_v64i1:
1038; SKX:       ## %bb.0:
1039; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
1040; SKX-NEXT:    kshiftrq $63, %k0, %k0
1041; SKX-NEXT:    kmovd %k0, %eax
1042; SKX-NEXT:    andb $1, %al
1043; SKX-NEXT:    movb $4, %cl
1044; SKX-NEXT:    subb %al, %cl
1045; SKX-NEXT:    movzbl %cl, %eax
1046; SKX-NEXT:    vzeroupper
1047; SKX-NEXT:    retq
1048  %t1 = icmp ugt <64 x i8> %a, %b
1049  %t2 = extractelement <64 x i1> %t1, i32 63
1050  %res = select i1 %t2, i8 3, i8 4
1051  ret i8 %res
1052}
1053
; Extracts the last lane (63) of the <64 x i1> result of (%a >u %b),
; sign-extends it to i8 and adds 4, so the result is 3 when the bit is set and
; 4 when it is clear.
1054define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
1055; KNL-LABEL: extractelement_v64i1_alt:
1056; KNL:       ## %bb.0:
1057; KNL-NEXT:    vpminub %ymm3, %ymm1, %ymm0
1058; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm0
1059; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1060; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
1061; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
1062; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1063; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1064; KNL-NEXT:    kshiftrw $15, %k0, %k0
1065; KNL-NEXT:    kmovw %k0, %eax
1066; KNL-NEXT:    andb $1, %al
1067; KNL-NEXT:    movb $4, %cl
1068; KNL-NEXT:    subb %al, %cl
1069; KNL-NEXT:    movzbl %cl, %eax
1070; KNL-NEXT:    vzeroupper
1071; KNL-NEXT:    retq
1072;
1073; SKX-LABEL: extractelement_v64i1_alt:
1074; SKX:       ## %bb.0:
1075; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
1076; SKX-NEXT:    kshiftrq $63, %k0, %k0
1077; SKX-NEXT:    kmovd %k0, %eax
1078; SKX-NEXT:    andb $1, %al
1079; SKX-NEXT:    movb $4, %cl
1080; SKX-NEXT:    subb %al, %cl
1081; SKX-NEXT:    movzbl %cl, %eax
1082; SKX-NEXT:    vzeroupper
1083; SKX-NEXT:    retq
1084  %t1 = icmp ugt <64 x i8> %a, %b
1085  %t2 = extractelement <64 x i1> %t1, i32 63
1086  %sext = sext i1 %t2 to i8
1087  %res = add i8 %sext, 4
1088  ret i8 %res
1089}
1090
; Extracts the i64 element selected by the runtime %index from a <2 x i64>.
; The expected code stores the vector to the stack and loads the element at
; the index masked with 1.
1091define i64 @test_extractelement_variable_v2i64(<2 x i64> %t1, i32 %index) {
1092; CHECK-LABEL: test_extractelement_variable_v2i64:
1093; CHECK:       ## %bb.0:
1094; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1095; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1096; CHECK-NEXT:    andl $1, %edi
1097; CHECK-NEXT:    movq -24(%rsp,%rdi,8), %rax
1098; CHECK-NEXT:    retq
1099  %t2 = extractelement <2 x i64> %t1, i32 %index
1100  ret i64 %t2
1101}
1102
; Extracts the i64 element selected by the runtime %index from a <4 x i64>.
; The expected code stores the vector to a 32-byte-aligned stack slot and
; loads the element at the index masked with 3.
1103define i64 @test_extractelement_variable_v4i64(<4 x i64> %t1, i32 %index) {
1104; CHECK-LABEL: test_extractelement_variable_v4i64:
1105; CHECK:       ## %bb.0:
1106; CHECK-NEXT:    pushq %rbp
1107; CHECK-NEXT:    .cfi_def_cfa_offset 16
1108; CHECK-NEXT:    .cfi_offset %rbp, -16
1109; CHECK-NEXT:    movq %rsp, %rbp
1110; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1111; CHECK-NEXT:    andq $-32, %rsp
1112; CHECK-NEXT:    subq $64, %rsp
1113; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1114; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1115; CHECK-NEXT:    andl $3, %edi
1116; CHECK-NEXT:    movq (%rsp,%rdi,8), %rax
1117; CHECK-NEXT:    movq %rbp, %rsp
1118; CHECK-NEXT:    popq %rbp
1119; CHECK-NEXT:    vzeroupper
1120; CHECK-NEXT:    retq
1121  %t2 = extractelement <4 x i64> %t1, i32 %index
1122  ret i64 %t2
1123}
1124
; Extracts the i64 element selected by the runtime %index from an <8 x i64>.
; The expected code stores the vector to a 64-byte-aligned stack slot and
; loads the element at the index masked with 7.
1125define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
1126; CHECK-LABEL: test_extractelement_variable_v8i64:
1127; CHECK:       ## %bb.0:
1128; CHECK-NEXT:    pushq %rbp
1129; CHECK-NEXT:    .cfi_def_cfa_offset 16
1130; CHECK-NEXT:    .cfi_offset %rbp, -16
1131; CHECK-NEXT:    movq %rsp, %rbp
1132; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1133; CHECK-NEXT:    andq $-64, %rsp
1134; CHECK-NEXT:    subq $128, %rsp
1135; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1136; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1137; CHECK-NEXT:    andl $7, %edi
1138; CHECK-NEXT:    movq (%rsp,%rdi,8), %rax
1139; CHECK-NEXT:    movq %rbp, %rsp
1140; CHECK-NEXT:    popq %rbp
1141; CHECK-NEXT:    vzeroupper
1142; CHECK-NEXT:    retq
1143  %t2 = extractelement <8 x i64> %t1, i32 %index
1144  ret i64 %t2
1145}
1146
; Extracts the double element selected by the runtime %index from a
; <2 x double>. The expected code stores the vector to the stack, masks the
; index with 1 and reloads the scalar from memory with vmovsd.
1147define double @test_extractelement_variable_v2f64(<2 x double> %t1, i32 %index) {
1148; CHECK-LABEL: test_extractelement_variable_v2f64:
1149; CHECK:       ## %bb.0:
1150; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1151; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1152; CHECK-NEXT:    andl $1, %edi
1153; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1154; CHECK-NEXT:    retq
1155  %t2 = extractelement <2 x double> %t1, i32 %index
1156  ret double %t2
1157}
1158
; Extracts the double element selected by the runtime %index from a
; <4 x double>. The expected code stores the vector to a 32-byte-aligned stack
; slot, masks the index with 3 and reloads the scalar from memory with vmovsd.
1159define double @test_extractelement_variable_v4f64(<4 x double> %t1, i32 %index) {
1160; CHECK-LABEL: test_extractelement_variable_v4f64:
1161; CHECK:       ## %bb.0:
1162; CHECK-NEXT:    pushq %rbp
1163; CHECK-NEXT:    .cfi_def_cfa_offset 16
1164; CHECK-NEXT:    .cfi_offset %rbp, -16
1165; CHECK-NEXT:    movq %rsp, %rbp
1166; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1167; CHECK-NEXT:    andq $-32, %rsp
1168; CHECK-NEXT:    subq $64, %rsp
1169; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1170; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1171; CHECK-NEXT:    andl $3, %edi
1172; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1173; CHECK-NEXT:    movq %rbp, %rsp
1174; CHECK-NEXT:    popq %rbp
1175; CHECK-NEXT:    vzeroupper
1176; CHECK-NEXT:    retq
1177  %t2 = extractelement <4 x double> %t1, i32 %index
1178  ret double %t2
1179}
1180
; Extracts the double element selected by the runtime %index from an
; <8 x double>. The expected code stores the vector to a 64-byte-aligned stack
; slot, masks the index with 7 and reloads the scalar from memory with vmovsd.
1181define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
1182; CHECK-LABEL: test_extractelement_variable_v8f64:
1183; CHECK:       ## %bb.0:
1184; CHECK-NEXT:    pushq %rbp
1185; CHECK-NEXT:    .cfi_def_cfa_offset 16
1186; CHECK-NEXT:    .cfi_offset %rbp, -16
1187; CHECK-NEXT:    movq %rsp, %rbp
1188; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1189; CHECK-NEXT:    andq $-64, %rsp
1190; CHECK-NEXT:    subq $128, %rsp
1191; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1192; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1193; CHECK-NEXT:    andl $7, %edi
1194; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1195; CHECK-NEXT:    movq %rbp, %rsp
1196; CHECK-NEXT:    popq %rbp
1197; CHECK-NEXT:    vzeroupper
1198; CHECK-NEXT:    retq
1199  %t2 = extractelement <8 x double> %t1, i32 %index
1200  ret double %t2
1201}
1202
; Extracts the i32 element selected by the runtime %index from a <4 x i32>.
; The expected code stores the vector to the stack and loads the element at
; the index masked with 3.
1203define i32 @test_extractelement_variable_v4i32(<4 x i32> %t1, i32 %index) {
1204; CHECK-LABEL: test_extractelement_variable_v4i32:
1205; CHECK:       ## %bb.0:
1206; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1207; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1208; CHECK-NEXT:    andl $3, %edi
1209; CHECK-NEXT:    movl -24(%rsp,%rdi,4), %eax
1210; CHECK-NEXT:    retq
1211  %t2 = extractelement <4 x i32> %t1, i32 %index
1212  ret i32 %t2
1213}
1214
; Extracts the i32 element selected by the runtime %index from an <8 x i32>.
; The expected code stores the vector to a 32-byte-aligned stack slot and
; loads the element at the index masked with 7.
1215define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
1216; CHECK-LABEL: test_extractelement_variable_v8i32:
1217; CHECK:       ## %bb.0:
1218; CHECK-NEXT:    pushq %rbp
1219; CHECK-NEXT:    .cfi_def_cfa_offset 16
1220; CHECK-NEXT:    .cfi_offset %rbp, -16
1221; CHECK-NEXT:    movq %rsp, %rbp
1222; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1223; CHECK-NEXT:    andq $-32, %rsp
1224; CHECK-NEXT:    subq $64, %rsp
1225; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1226; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1227; CHECK-NEXT:    andl $7, %edi
1228; CHECK-NEXT:    movl (%rsp,%rdi,4), %eax
1229; CHECK-NEXT:    movq %rbp, %rsp
1230; CHECK-NEXT:    popq %rbp
1231; CHECK-NEXT:    vzeroupper
1232; CHECK-NEXT:    retq
1233  %t2 = extractelement <8 x i32> %t1, i32 %index
1234  ret i32 %t2
1235}
1236
; Extracts the i32 element selected by the runtime %index from a <16 x i32>.
; The expected code stores the vector to a 64-byte-aligned stack slot and
; loads the element at the index masked with 15.
1237define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
1238; CHECK-LABEL: test_extractelement_variable_v16i32:
1239; CHECK:       ## %bb.0:
1240; CHECK-NEXT:    pushq %rbp
1241; CHECK-NEXT:    .cfi_def_cfa_offset 16
1242; CHECK-NEXT:    .cfi_offset %rbp, -16
1243; CHECK-NEXT:    movq %rsp, %rbp
1244; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1245; CHECK-NEXT:    andq $-64, %rsp
1246; CHECK-NEXT:    subq $128, %rsp
1247; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1248; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1249; CHECK-NEXT:    andl $15, %edi
1250; CHECK-NEXT:    movl (%rsp,%rdi,4), %eax
1251; CHECK-NEXT:    movq %rbp, %rsp
1252; CHECK-NEXT:    popq %rbp
1253; CHECK-NEXT:    vzeroupper
1254; CHECK-NEXT:    retq
1255  %t2 = extractelement <16 x i32> %t1, i32 %index
1256  ret i32 %t2
1257}
1258
; Extracts the float element selected by the runtime %index from a
; <4 x float>. The expected code stores the vector to the stack, masks the
; index with 3 and reloads the scalar from memory with vmovss.
1259define float @test_extractelement_variable_v4f32(<4 x float> %t1, i32 %index) {
1260; CHECK-LABEL: test_extractelement_variable_v4f32:
1261; CHECK:       ## %bb.0:
1262; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1263; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1264; CHECK-NEXT:    andl $3, %edi
1265; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1266; CHECK-NEXT:    retq
1267  %t2 = extractelement <4 x float> %t1, i32 %index
1268  ret float %t2
1269}
1270
; Extracts the float element selected by the runtime %index from an
; <8 x float>. The expected code stores the vector to a 32-byte-aligned stack
; slot, masks the index with 7 and reloads the scalar from memory with vmovss.
1271define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
1272; CHECK-LABEL: test_extractelement_variable_v8f32:
1273; CHECK:       ## %bb.0:
1274; CHECK-NEXT:    pushq %rbp
1275; CHECK-NEXT:    .cfi_def_cfa_offset 16
1276; CHECK-NEXT:    .cfi_offset %rbp, -16
1277; CHECK-NEXT:    movq %rsp, %rbp
1278; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1279; CHECK-NEXT:    andq $-32, %rsp
1280; CHECK-NEXT:    subq $64, %rsp
1281; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1282; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1283; CHECK-NEXT:    andl $7, %edi
1284; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1285; CHECK-NEXT:    movq %rbp, %rsp
1286; CHECK-NEXT:    popq %rbp
1287; CHECK-NEXT:    vzeroupper
1288; CHECK-NEXT:    retq
1289  %t2 = extractelement <8 x float> %t1, i32 %index
1290  ret float %t2
1291}
1292
; Extracts the float element selected by the runtime %index from a
; <16 x float>. The expected code stores the vector to a 64-byte-aligned stack
; slot, masks the index with 15 and reloads the scalar from memory with vmovss.
1293define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
1294; CHECK-LABEL: test_extractelement_variable_v16f32:
1295; CHECK:       ## %bb.0:
1296; CHECK-NEXT:    pushq %rbp
1297; CHECK-NEXT:    .cfi_def_cfa_offset 16
1298; CHECK-NEXT:    .cfi_offset %rbp, -16
1299; CHECK-NEXT:    movq %rsp, %rbp
1300; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1301; CHECK-NEXT:    andq $-64, %rsp
1302; CHECK-NEXT:    subq $128, %rsp
1303; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1304; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1305; CHECK-NEXT:    andl $15, %edi
1306; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1307; CHECK-NEXT:    movq %rbp, %rsp
1308; CHECK-NEXT:    popq %rbp
1309; CHECK-NEXT:    vzeroupper
1310; CHECK-NEXT:    retq
1311  %t2 = extractelement <16 x float> %t1, i32 %index
1312  ret float %t2
1313}
1314
; Extracts the i16 element selected by the runtime %index from an <8 x i16>.
; The expected code stores the vector to the stack and loads the element
; (zero-extending, movzwl) at the index masked with 7.
1315define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
1316; CHECK-LABEL: test_extractelement_variable_v8i16:
1317; CHECK:       ## %bb.0:
1318; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1319; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1320; CHECK-NEXT:    andl $7, %edi
1321; CHECK-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
1322; CHECK-NEXT:    retq
1323  %t2 = extractelement <8 x i16> %t1, i32 %index
1324  ret i16 %t2
1325}
1326
; Extracts the i16 element selected by the runtime %index from a <16 x i16>.
; The expected code stores the vector to a 32-byte-aligned stack slot and
; loads the element (movzwl) at the index masked with 15.
1327define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {
1328; CHECK-LABEL: test_extractelement_variable_v16i16:
1329; CHECK:       ## %bb.0:
1330; CHECK-NEXT:    pushq %rbp
1331; CHECK-NEXT:    .cfi_def_cfa_offset 16
1332; CHECK-NEXT:    .cfi_offset %rbp, -16
1333; CHECK-NEXT:    movq %rsp, %rbp
1334; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1335; CHECK-NEXT:    andq $-32, %rsp
1336; CHECK-NEXT:    subq $64, %rsp
1337; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1338; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1339; CHECK-NEXT:    andl $15, %edi
1340; CHECK-NEXT:    movzwl (%rsp,%rdi,2), %eax
1341; CHECK-NEXT:    movq %rbp, %rsp
1342; CHECK-NEXT:    popq %rbp
1343; CHECK-NEXT:    vzeroupper
1344; CHECK-NEXT:    retq
1345  %t2 = extractelement <16 x i16> %t1, i32 %index
1346  ret i16 %t2
1347}
1348
; Extracts the i16 element selected by the runtime %index from a <32 x i16>.
; The expected code stores the vector to the stack (as two ymm halves for the
; KNL run, as one zmm for the SKX runs) and loads the element at the index
; masked with 31.
1349define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
1350; KNL-LABEL: test_extractelement_variable_v32i16:
1351; KNL:       ## %bb.0:
1352; KNL-NEXT:    pushq %rbp
1353; KNL-NEXT:    .cfi_def_cfa_offset 16
1354; KNL-NEXT:    .cfi_offset %rbp, -16
1355; KNL-NEXT:    movq %rsp, %rbp
1356; KNL-NEXT:    .cfi_def_cfa_register %rbp
1357; KNL-NEXT:    andq $-64, %rsp
1358; KNL-NEXT:    subq $128, %rsp
1359; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1360; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
1361; KNL-NEXT:    vmovaps %ymm0, (%rsp)
1362; KNL-NEXT:    andl $31, %edi
1363; KNL-NEXT:    movzwl (%rsp,%rdi,2), %eax
1364; KNL-NEXT:    movq %rbp, %rsp
1365; KNL-NEXT:    popq %rbp
1366; KNL-NEXT:    vzeroupper
1367; KNL-NEXT:    retq
1368;
1369; SKX-LABEL: test_extractelement_variable_v32i16:
1370; SKX:       ## %bb.0:
1371; SKX-NEXT:    pushq %rbp
1372; SKX-NEXT:    .cfi_def_cfa_offset 16
1373; SKX-NEXT:    .cfi_offset %rbp, -16
1374; SKX-NEXT:    movq %rsp, %rbp
1375; SKX-NEXT:    .cfi_def_cfa_register %rbp
1376; SKX-NEXT:    andq $-64, %rsp
1377; SKX-NEXT:    subq $128, %rsp
1378; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1379; SKX-NEXT:    vmovaps %zmm0, (%rsp)
1380; SKX-NEXT:    andl $31, %edi
1381; SKX-NEXT:    movzwl (%rsp,%rdi,2), %eax
1382; SKX-NEXT:    movq %rbp, %rsp
1383; SKX-NEXT:    popq %rbp
1384; SKX-NEXT:    vzeroupper
1385; SKX-NEXT:    retq
1386  %t2 = extractelement <32 x i16> %t1, i32 %index
1387  ret i16 %t2
1388}
1389
; Extracts the i8 element selected by the runtime %index from a <16 x i8>.
; The expected code stores the vector to the stack and loads the byte at the
; index masked with 15.
1390define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
1391; CHECK-LABEL: test_extractelement_variable_v16i8:
1392; CHECK:       ## %bb.0:
1393; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1394; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1395; CHECK-NEXT:    andl $15, %edi
1396; CHECK-NEXT:    movb -24(%rsp,%rdi), %al
1397; CHECK-NEXT:    retq
1398  %t2 = extractelement <16 x i8> %t1, i32 %index
1399  ret i8 %t2
1400}
1401
; Extracts the i8 element selected by the runtime %index from a <32 x i8>.
; The expected code stores the vector to a 32-byte-aligned stack slot and
; loads the byte at the index masked with 31.
1402define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
1403; CHECK-LABEL: test_extractelement_variable_v32i8:
1404; CHECK:       ## %bb.0:
1405; CHECK-NEXT:    pushq %rbp
1406; CHECK-NEXT:    .cfi_def_cfa_offset 16
1407; CHECK-NEXT:    .cfi_offset %rbp, -16
1408; CHECK-NEXT:    movq %rsp, %rbp
1409; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1410; CHECK-NEXT:    andq $-32, %rsp
1411; CHECK-NEXT:    subq $64, %rsp
1412; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1413; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1414; CHECK-NEXT:    andl $31, %edi
1415; CHECK-NEXT:    movb (%rsp,%rdi), %al
1416; CHECK-NEXT:    movq %rbp, %rsp
1417; CHECK-NEXT:    popq %rbp
1418; CHECK-NEXT:    vzeroupper
1419; CHECK-NEXT:    retq
1420
1421  %t2 = extractelement <32 x i8> %t1, i32 %index
1422  ret i8 %t2
1423}
1424
; Extracts the i8 element selected by the runtime %index from a <64 x i8>.
; The expected code stores the vector to the stack (as two ymm halves for the
; KNL run, as one zmm for the SKX runs) and loads the byte at the index masked
; with 63.
1425define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
1426; KNL-LABEL: test_extractelement_variable_v64i8:
1427; KNL:       ## %bb.0:
1428; KNL-NEXT:    pushq %rbp
1429; KNL-NEXT:    .cfi_def_cfa_offset 16
1430; KNL-NEXT:    .cfi_offset %rbp, -16
1431; KNL-NEXT:    movq %rsp, %rbp
1432; KNL-NEXT:    .cfi_def_cfa_register %rbp
1433; KNL-NEXT:    andq $-64, %rsp
1434; KNL-NEXT:    subq $128, %rsp
1435; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1436; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
1437; KNL-NEXT:    vmovaps %ymm0, (%rsp)
1438; KNL-NEXT:    andl $63, %edi
1439; KNL-NEXT:    movb (%rsp,%rdi), %al
1440; KNL-NEXT:    movq %rbp, %rsp
1441; KNL-NEXT:    popq %rbp
1442; KNL-NEXT:    vzeroupper
1443; KNL-NEXT:    retq
1444;
1445; SKX-LABEL: test_extractelement_variable_v64i8:
1446; SKX:       ## %bb.0:
1447; SKX-NEXT:    pushq %rbp
1448; SKX-NEXT:    .cfi_def_cfa_offset 16
1449; SKX-NEXT:    .cfi_offset %rbp, -16
1450; SKX-NEXT:    movq %rsp, %rbp
1451; SKX-NEXT:    .cfi_def_cfa_register %rbp
1452; SKX-NEXT:    andq $-64, %rsp
1453; SKX-NEXT:    subq $128, %rsp
1454; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1455; SKX-NEXT:    vmovaps %zmm0, (%rsp)
1456; SKX-NEXT:    andl $63, %edi
1457; SKX-NEXT:    movb (%rsp,%rdi), %al
1458; SKX-NEXT:    movq %rbp, %rsp
1459; SKX-NEXT:    popq %rbp
1460; SKX-NEXT:    vzeroupper
1461; SKX-NEXT:    retq
1462
1463  %t2 = extractelement <64 x i8> %t1, i32 %index
1464  ret i8 %t2
1465}
1466
; Variable extract from a <64 x i8> where the index is itself an i8 value
; computed as %index + %index. The expected code doubles the index with an
; 8-bit add, zero-extends it (movzbl) and masks it with 63 before the load.
1467define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
1468; KNL-LABEL: test_extractelement_variable_v64i8_indexi8:
1469; KNL:       ## %bb.0:
1470; KNL-NEXT:    pushq %rbp
1471; KNL-NEXT:    .cfi_def_cfa_offset 16
1472; KNL-NEXT:    .cfi_offset %rbp, -16
1473; KNL-NEXT:    movq %rsp, %rbp
1474; KNL-NEXT:    .cfi_def_cfa_register %rbp
1475; KNL-NEXT:    andq $-64, %rsp
1476; KNL-NEXT:    subq $128, %rsp
1477; KNL-NEXT:    addb %dil, %dil
1478; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
1479; KNL-NEXT:    vmovaps %ymm0, (%rsp)
1480; KNL-NEXT:    movzbl %dil, %eax
1481; KNL-NEXT:    andl $63, %eax
1482; KNL-NEXT:    movb (%rsp,%rax), %al
1483; KNL-NEXT:    movq %rbp, %rsp
1484; KNL-NEXT:    popq %rbp
1485; KNL-NEXT:    vzeroupper
1486; KNL-NEXT:    retq
1487;
1488; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
1489; SKX:       ## %bb.0:
1490; SKX-NEXT:    pushq %rbp
1491; SKX-NEXT:    .cfi_def_cfa_offset 16
1492; SKX-NEXT:    .cfi_offset %rbp, -16
1493; SKX-NEXT:    movq %rsp, %rbp
1494; SKX-NEXT:    .cfi_def_cfa_register %rbp
1495; SKX-NEXT:    andq $-64, %rsp
1496; SKX-NEXT:    subq $128, %rsp
1497; SKX-NEXT:    addb %dil, %dil
1498; SKX-NEXT:    vmovaps %zmm0, (%rsp)
1499; SKX-NEXT:    movzbl %dil, %eax
1500; SKX-NEXT:    andl $63, %eax
1501; SKX-NEXT:    movb (%rsp,%rax), %al
1502; SKX-NEXT:    movq %rbp, %rsp
1503; SKX-NEXT:    popq %rbp
1504; SKX-NEXT:    vzeroupper
1505; SKX-NEXT:    retq
1506
1507  %i  = add i8 %index, %index
1508  %t2 = extractelement <64 x i8> %t1, i8 %i
1509  ret i8 %t2
1510}
1511
; Extracts the i1 lane selected by the runtime %index from the <2 x i1> result
; of (%a >u %b) and zero-extends it to i8. The expected code expands the mask
; into a vector register, stores it to the stack, loads one byte at the index
; masked with 1 and keeps only bit 0.
1512define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
1513; KNL-LABEL: test_extractelement_varible_v2i1:
1514; KNL:       ## %bb.0:
1515; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1516; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
1517; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
1518; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
1519; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1520; KNL-NEXT:    vextracti32x4 $0, %zmm0, -{{[0-9]+}}(%rsp)
1521; KNL-NEXT:    andl $1, %edi
1522; KNL-NEXT:    movzbl -24(%rsp,%rdi,8), %eax
1523; KNL-NEXT:    andl $1, %eax
1524; KNL-NEXT:    vzeroupper
1525; KNL-NEXT:    retq
1526;
1527; SKX-LABEL: test_extractelement_varible_v2i1:
1528; SKX:       ## %bb.0:
1529; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1530; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
1531; SKX-NEXT:    vpmovm2q %k0, %xmm0
1532; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1533; SKX-NEXT:    andl $1, %edi
1534; SKX-NEXT:    movzbl -24(%rsp,%rdi,8), %eax
1535; SKX-NEXT:    andl $1, %eax
1536; SKX-NEXT:    retq
1537  %t1 = icmp ugt <2 x i64> %a, %b
1538  %t2 = extractelement <2 x i1> %t1, i32 %index
1539  %res = zext i1 %t2 to i8
1540  ret i8 %res
1541}
1542
; Extracts the i1 lane selected by the runtime %index from the <4 x i1> result
; of (%a >u %b) and zero-extends it to i8. The expected code expands the mask
; into a vector register, stores it to the stack, loads one byte at the index
; masked with 3 and keeps only bit 0.
1543define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
1544; KNL-LABEL: test_extractelement_varible_v4i1:
1545; KNL:       ## %bb.0:
1546; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1547; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
1548; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
1549; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
1550; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1551; KNL-NEXT:    vextracti32x4 $0, %zmm0, -{{[0-9]+}}(%rsp)
1552; KNL-NEXT:    andl $3, %edi
1553; KNL-NEXT:    movzbl -24(%rsp,%rdi,4), %eax
1554; KNL-NEXT:    andl $1, %eax
1555; KNL-NEXT:    vzeroupper
1556; KNL-NEXT:    retq
1557;
1558; SKX-LABEL: test_extractelement_varible_v4i1:
1559; SKX:       ## %bb.0:
1560; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1561; SKX-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0
1562; SKX-NEXT:    vpmovm2d %k0, %xmm0
1563; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1564; SKX-NEXT:    andl $3, %edi
1565; SKX-NEXT:    movzbl -24(%rsp,%rdi,4), %eax
1566; SKX-NEXT:    andl $1, %eax
1567; SKX-NEXT:    retq
1568  %t1 = icmp ugt <4 x i32> %a, %b
1569  %t2 = extractelement <4 x i1> %t1, i32 %index
1570  %res = zext i1 %t2 to i8
1571  ret i8 %res
1572}
1573
; Extracts the i1 lane selected by the runtime %index from the <8 x i1> result
; of (%a >u %b) and zero-extends it to i8. The expected code expands the mask
; into a vector of 16-bit elements, stores it to the stack, loads one byte at
; the index masked with 7 and keeps only bit 0.
1574define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
1575; KNL-LABEL: test_extractelement_varible_v8i1:
1576; KNL:       ## %bb.0:
1577; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1578; KNL-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
1579; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
1580; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
1581; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1582; KNL-NEXT:    vpmovdw %zmm0, %ymm0
1583; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1584; KNL-NEXT:    andl $7, %edi
1585; KNL-NEXT:    movzbl -24(%rsp,%rdi,2), %eax
1586; KNL-NEXT:    andl $1, %eax
1587; KNL-NEXT:    vzeroupper
1588; KNL-NEXT:    retq
1589;
1590; SKX-LABEL: test_extractelement_varible_v8i1:
1591; SKX:       ## %bb.0:
1592; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1593; SKX-NEXT:    vpcmpnleud %ymm1, %ymm0, %k0
1594; SKX-NEXT:    vpmovm2w %k0, %xmm0
1595; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1596; SKX-NEXT:    andl $7, %edi
1597; SKX-NEXT:    movzbl -24(%rsp,%rdi,2), %eax
1598; SKX-NEXT:    andl $1, %eax
1599; SKX-NEXT:    vzeroupper
1600; SKX-NEXT:    retq
1601  %t1 = icmp ugt <8 x i32> %a, %b
1602  %t2 = extractelement <8 x i1> %t1, i32 %index
1603  %res = zext i1 %t2 to i8
1604  ret i8 %res
1605}
1606
; Extracts the i1 lane selected by the runtime %index from the <16 x i1>
; result of (%a >u %b) and zero-extends it to i8. The expected code expands
; the mask into a vector of bytes on the stack, loads one byte at the index
; masked with 15 and keeps only bit 0.
1607define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
1608; KNL-LABEL: test_extractelement_varible_v16i1:
1609; KNL:       ## %bb.0:
1610; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1611; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
1612; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1613; KNL-NEXT:    vpmovdb %zmm0, -{{[0-9]+}}(%rsp)
1614; KNL-NEXT:    andl $15, %edi
1615; KNL-NEXT:    movzbl -24(%rsp,%rdi), %eax
1616; KNL-NEXT:    andl $1, %eax
1617; KNL-NEXT:    vzeroupper
1618; KNL-NEXT:    retq
1619;
1620; SKX-LABEL: test_extractelement_varible_v16i1:
1621; SKX:       ## %bb.0:
1622; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1623; SKX-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
1624; SKX-NEXT:    vpmovm2b %k0, %xmm0
1625; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1626; SKX-NEXT:    andl $15, %edi
1627; SKX-NEXT:    movzbl -24(%rsp,%rdi), %eax
1628; SKX-NEXT:    andl $1, %eax
1629; SKX-NEXT:    vzeroupper
1630; SKX-NEXT:    retq
1631  %t1 = icmp ugt <16 x i32> %a, %b
1632  %t2 = extractelement <16 x i1> %t1, i32 %index
1633  %res = zext i1 %t2 to i8
1634  ret i8 %res
1635}
1636
; Extracts the i1 lane selected by the runtime %index from the <32 x i1>
; result of (%a >u %b) on <32 x i8> inputs and zero-extends it to i8. The
; expected code materialises the compare result as a byte vector in a
; 32-byte-aligned stack slot, loads one byte at the index masked with 31 and
; keeps only bit 0.
1637define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
1638; KNL-LABEL: test_extractelement_varible_v32i1:
1639; KNL:       ## %bb.0:
1640; KNL-NEXT:    pushq %rbp
1641; KNL-NEXT:    .cfi_def_cfa_offset 16
1642; KNL-NEXT:    .cfi_offset %rbp, -16
1643; KNL-NEXT:    movq %rsp, %rbp
1644; KNL-NEXT:    .cfi_def_cfa_register %rbp
1645; KNL-NEXT:    andq $-32, %rsp
1646; KNL-NEXT:    subq $64, %rsp
1647; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1648; KNL-NEXT:    vpminub %ymm1, %ymm0, %ymm1
1649; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
1650; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1651; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
1652; KNL-NEXT:    andl $31, %edi
1653; KNL-NEXT:    movzbl (%rsp,%rdi), %eax
1654; KNL-NEXT:    andl $1, %eax
1655; KNL-NEXT:    movq %rbp, %rsp
1656; KNL-NEXT:    popq %rbp
1657; KNL-NEXT:    vzeroupper
1658; KNL-NEXT:    retq
1659;
1660; SKX-LABEL: test_extractelement_varible_v32i1:
1661; SKX:       ## %bb.0:
1662; SKX-NEXT:    pushq %rbp
1663; SKX-NEXT:    .cfi_def_cfa_offset 16
1664; SKX-NEXT:    .cfi_offset %rbp, -16
1665; SKX-NEXT:    movq %rsp, %rbp
1666; SKX-NEXT:    .cfi_def_cfa_register %rbp
1667; SKX-NEXT:    andq $-32, %rsp
1668; SKX-NEXT:    subq $64, %rsp
1669; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1670; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0
1671; SKX-NEXT:    vpmovm2b %k0, %ymm0
1672; SKX-NEXT:    vmovdqa %ymm0, (%rsp)
1673; SKX-NEXT:    andl $31, %edi
1674; SKX-NEXT:    movzbl (%rsp,%rdi), %eax
1675; SKX-NEXT:    andl $1, %eax
1676; SKX-NEXT:    movq %rbp, %rsp
1677; SKX-NEXT:    popq %rbp
1678; SKX-NEXT:    vzeroupper
1679; SKX-NEXT:    retq
1680  %t1 = icmp ugt <32 x i8> %a, %b
1681  %t2 = extractelement <32 x i1> %t1, i32 %index
1682  %res = zext i1 %t2 to i8
1683  ret i8 %res
1684}
1685
; Builds an <8 x i64> through a chain of shufflevectors: %a is first padded
; with zeros to <4 x i64>, widened to <8 x i64>, then moved into the upper
; half of a zero vector. The result is zero except for lanes 4-5, which hold
; %a; the expected code is a single 128-bit insert at position 2 into a zeroed
; register.
1686define <8 x i64> @insert_double_zero(<2 x i64> %a) nounwind {
1687; CHECK-LABEL: insert_double_zero:
1688; CHECK:       ## %bb.0:
1689; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1690; CHECK-NEXT:    vinsertf32x4 $2, %xmm0, %zmm1, %zmm0
1691; CHECK-NEXT:    retq
1692  %b = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1693  %d = shufflevector <4 x i64> %b, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1694  %e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
1695  ret <8 x i64> %e
1696}
1697
; Inserts the scalar bit (%b >u 0) at the runtime %index into the <32 x i1>
; result of (%a >u 0) and returns the mask bitcast to i32. The expected code
; materialises the mask as a byte vector in a 32-byte-aligned stack slot,
; writes the new lane with setne at the index masked with 31, and converts the
; bytes back into a 32-bit mask.
1698define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
1699; KNL-LABEL: test_insertelement_variable_v32i1:
1700; KNL:       ## %bb.0:
1701; KNL-NEXT:    pushq %rbp
1702; KNL-NEXT:    .cfi_def_cfa_offset 16
1703; KNL-NEXT:    .cfi_offset %rbp, -16
1704; KNL-NEXT:    movq %rsp, %rbp
1705; KNL-NEXT:    .cfi_def_cfa_register %rbp
1706; KNL-NEXT:    andq $-32, %rsp
1707; KNL-NEXT:    subq $64, %rsp
1708; KNL-NEXT:    ## kill: def $esi killed $esi def $rsi
1709; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1710; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
1711; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1712; KNL-NEXT:    andl $31, %esi
1713; KNL-NEXT:    testb %dil, %dil
1714; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
1715; KNL-NEXT:    setne (%rsp,%rsi)
1716; KNL-NEXT:    vpmovsxbd (%rsp), %zmm0
1717; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1718; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1719; KNL-NEXT:    kmovw %k0, %ecx
1720; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1721; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1722; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1723; KNL-NEXT:    kmovw %k0, %eax
1724; KNL-NEXT:    shll $16, %eax
1725; KNL-NEXT:    orl %ecx, %eax
1726; KNL-NEXT:    movq %rbp, %rsp
1727; KNL-NEXT:    popq %rbp
1728; KNL-NEXT:    vzeroupper
1729; KNL-NEXT:    retq
1730;
1731; SKX-LABEL: test_insertelement_variable_v32i1:
1732; SKX:       ## %bb.0:
1733; SKX-NEXT:    pushq %rbp
1734; SKX-NEXT:    .cfi_def_cfa_offset 16
1735; SKX-NEXT:    .cfi_offset %rbp, -16
1736; SKX-NEXT:    movq %rsp, %rbp
1737; SKX-NEXT:    .cfi_def_cfa_register %rbp
1738; SKX-NEXT:    andq $-32, %rsp
1739; SKX-NEXT:    subq $64, %rsp
1740; SKX-NEXT:    ## kill: def $esi killed $esi def $rsi
1741; SKX-NEXT:    vptestmb %ymm0, %ymm0, %k0
1742; SKX-NEXT:    andl $31, %esi
1743; SKX-NEXT:    testb %dil, %dil
1744; SKX-NEXT:    vpmovm2b %k0, %ymm0
1745; SKX-NEXT:    vmovdqa %ymm0, (%rsp)
1746; SKX-NEXT:    setne (%rsp,%rsi)
1747; SKX-NEXT:    vpsllw $7, (%rsp), %ymm0
1748; SKX-NEXT:    vpmovb2m %ymm0, %k0
1749; SKX-NEXT:    kmovd %k0, %eax
1750; SKX-NEXT:    movq %rbp, %rsp
1751; SKX-NEXT:    popq %rbp
1752; SKX-NEXT:    vzeroupper
1753; SKX-NEXT:    retq
1754  %t1 = icmp ugt <32 x i8> %a, zeroinitializer
1755  %t2 = icmp ugt i8 %b, 0
1756  %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
1757  %t4 = bitcast <32 x i1> %t3 to i32
1758  ret i32 %t4
1759}
1760
; test_insertelement_variable_v64i1: 64-lane variant of the variable-index
; i1 insert test. Builds a <64 x i1> mask from the non-zero bytes of %a,
; overwrites the lane selected by the runtime value %index with (%b != 0),
; and returns the mask bitcast to i64.
;
; Expected lowering, as recorded in the assertions below: the mask is spilled
; to a 64-byte-aligned stack slot as one byte per lane, the index is clamped
; with `andl $63`, and the selected byte is patched with `setne`. The KNL
; checks handle the input as two ymm registers and rebuild the result from
; four 16-lane pieces (shll $16 / orl, then shlq $32 / orq). The SKX checks
; operate on a single zmm register and read the mask with kmovq.
1761define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
1762; KNL-LABEL: test_insertelement_variable_v64i1:
1763; KNL:       ## %bb.0:
1764; KNL-NEXT:    pushq %rbp
1765; KNL-NEXT:    .cfi_def_cfa_offset 16
1766; KNL-NEXT:    .cfi_offset %rbp, -16
1767; KNL-NEXT:    movq %rsp, %rbp
1768; KNL-NEXT:    .cfi_def_cfa_register %rbp
1769; KNL-NEXT:    andq $-64, %rsp
1770; KNL-NEXT:    subq $128, %rsp
1771; KNL-NEXT:    ## kill: def $esi killed $esi def $rsi
1772; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1773; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
1774; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1775; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
1776; KNL-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
1777; KNL-NEXT:    andl $63, %esi
1778; KNL-NEXT:    testb %dil, %dil
1779; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
1780; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
1781; KNL-NEXT:    setne (%rsp,%rsi)
1782; KNL-NEXT:    vmovdqa (%rsp), %ymm0
1783; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
1784; KNL-NEXT:    vpmovsxbd %xmm0, %zmm2
1785; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
1786; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
1787; KNL-NEXT:    kmovw %k0, %eax
1788; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
1789; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
1790; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1791; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1792; KNL-NEXT:    kmovw %k0, %ecx
1793; KNL-NEXT:    shll $16, %ecx
1794; KNL-NEXT:    orl %eax, %ecx
1795; KNL-NEXT:    vpmovsxbd %xmm1, %zmm0
1796; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1797; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1798; KNL-NEXT:    kmovw %k0, %edx
1799; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm0
1800; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
1801; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1802; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1803; KNL-NEXT:    kmovw %k0, %eax
1804; KNL-NEXT:    shll $16, %eax
1805; KNL-NEXT:    orl %edx, %eax
1806; KNL-NEXT:    shlq $32, %rax
1807; KNL-NEXT:    orq %rcx, %rax
1808; KNL-NEXT:    movq %rbp, %rsp
1809; KNL-NEXT:    popq %rbp
1810; KNL-NEXT:    vzeroupper
1811; KNL-NEXT:    retq
1812;
1813; SKX-LABEL: test_insertelement_variable_v64i1:
1814; SKX:       ## %bb.0:
1815; SKX-NEXT:    pushq %rbp
1816; SKX-NEXT:    .cfi_def_cfa_offset 16
1817; SKX-NEXT:    .cfi_offset %rbp, -16
1818; SKX-NEXT:    movq %rsp, %rbp
1819; SKX-NEXT:    .cfi_def_cfa_register %rbp
1820; SKX-NEXT:    andq $-64, %rsp
1821; SKX-NEXT:    subq $128, %rsp
1822; SKX-NEXT:    ## kill: def $esi killed $esi def $rsi
1823; SKX-NEXT:    vptestmb %zmm0, %zmm0, %k0
1824; SKX-NEXT:    andl $63, %esi
1825; SKX-NEXT:    testb %dil, %dil
1826; SKX-NEXT:    vpmovm2b %k0, %zmm0
1827; SKX-NEXT:    vmovdqa64 %zmm0, (%rsp)
1828; SKX-NEXT:    setne (%rsp,%rsi)
1829; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0
1830; SKX-NEXT:    vpmovb2m %zmm0, %k0
1831; SKX-NEXT:    kmovq %k0, %rax
1832; SKX-NEXT:    movq %rbp, %rsp
1833; SKX-NEXT:    popq %rbp
1834; SKX-NEXT:    vzeroupper
1835; SKX-NEXT:    retq
  ; Unsigned "greater than zero" is the same predicate as "not equal to zero".
1836  %t1 = icmp ugt <64 x i8> %a, zeroinitializer
1837  %t2 = icmp ugt i8 %b, 0
  ; The lane index is a runtime value, so it cannot be folded to a constant.
1838  %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
1839  %t4 = bitcast <64 x i1> %t3 to i64
1840  ret i64 %t4
1841}
1842
; test_insertelement_variable_v96i1: variable-index i1 insert on a vector
; whose lane count (96) is not a power of two. Builds a <96 x i1> mask from
; the non-zero bytes of %a, overwrites the lane selected by the runtime value
; %index with (%b != 0), and returns the mask bitcast to i96.
;
; Expected lowering, as recorded in the assertions below:
;  * The bytes of %a arrive as individual values: the first six in integer
;    registers (edi, esi, edx, ecx, r8d, r9d) and the rest in 8-byte stack
;    slots, and are reassembled with vmovd / vpinsrb / vinserti128.
;  * %b is read from 736(%rbp) and %index from 744(%rbp).
;  * The byte-per-lane buffer on the stack is 128 bytes and the index is
;    clamped with `andl $127`. In the KNL output the fourth 32-byte slot is
;    filled from the zeroed ymm3.
;  * The result is returned in rax (low 64 bits) and rdx (remaining bits).
1843define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
1844; KNL-LABEL: test_insertelement_variable_v96i1:
1845; KNL:       ## %bb.0:
1846; KNL-NEXT:    pushq %rbp
1847; KNL-NEXT:    .cfi_def_cfa_offset 16
1848; KNL-NEXT:    .cfi_offset %rbp, -16
1849; KNL-NEXT:    movq %rsp, %rbp
1850; KNL-NEXT:    .cfi_def_cfa_register %rbp
1851; KNL-NEXT:    andq $-128, %rsp
1852; KNL-NEXT:    subq $256, %rsp ## imm = 0x100
1853; KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1854; KNL-NEXT:    vpinsrb $1, 488(%rbp), %xmm0, %xmm0
1855; KNL-NEXT:    vpinsrb $2, 496(%rbp), %xmm0, %xmm0
1856; KNL-NEXT:    vpinsrb $3, 504(%rbp), %xmm0, %xmm0
1857; KNL-NEXT:    vpinsrb $4, 512(%rbp), %xmm0, %xmm0
1858; KNL-NEXT:    vpinsrb $5, 520(%rbp), %xmm0, %xmm0
1859; KNL-NEXT:    vpinsrb $6, 528(%rbp), %xmm0, %xmm0
1860; KNL-NEXT:    vpinsrb $7, 536(%rbp), %xmm0, %xmm0
1861; KNL-NEXT:    vpinsrb $8, 544(%rbp), %xmm0, %xmm0
1862; KNL-NEXT:    vpinsrb $9, 552(%rbp), %xmm0, %xmm0
1863; KNL-NEXT:    vpinsrb $10, 560(%rbp), %xmm0, %xmm0
1864; KNL-NEXT:    vpinsrb $11, 568(%rbp), %xmm0, %xmm0
1865; KNL-NEXT:    vpinsrb $12, 576(%rbp), %xmm0, %xmm0
1866; KNL-NEXT:    vpinsrb $13, 584(%rbp), %xmm0, %xmm0
1867; KNL-NEXT:    vpinsrb $14, 592(%rbp), %xmm0, %xmm0
1868; KNL-NEXT:    vpinsrb $15, 600(%rbp), %xmm0, %xmm0
1869; KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1870; KNL-NEXT:    vpinsrb $1, 616(%rbp), %xmm1, %xmm1
1871; KNL-NEXT:    vpinsrb $2, 624(%rbp), %xmm1, %xmm1
1872; KNL-NEXT:    vpinsrb $3, 632(%rbp), %xmm1, %xmm1
1873; KNL-NEXT:    vpinsrb $4, 640(%rbp), %xmm1, %xmm1
1874; KNL-NEXT:    vpinsrb $5, 648(%rbp), %xmm1, %xmm1
1875; KNL-NEXT:    vpinsrb $6, 656(%rbp), %xmm1, %xmm1
1876; KNL-NEXT:    vpinsrb $7, 664(%rbp), %xmm1, %xmm1
1877; KNL-NEXT:    vpinsrb $8, 672(%rbp), %xmm1, %xmm1
1878; KNL-NEXT:    vpinsrb $9, 680(%rbp), %xmm1, %xmm1
1879; KNL-NEXT:    vpinsrb $10, 688(%rbp), %xmm1, %xmm1
1880; KNL-NEXT:    vpinsrb $11, 696(%rbp), %xmm1, %xmm1
1881; KNL-NEXT:    vpinsrb $12, 704(%rbp), %xmm1, %xmm1
1882; KNL-NEXT:    vpinsrb $13, 712(%rbp), %xmm1, %xmm1
1883; KNL-NEXT:    vpinsrb $14, 720(%rbp), %xmm1, %xmm1
1884; KNL-NEXT:    vpinsrb $15, 728(%rbp), %xmm1, %xmm1
1885; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1886; KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1887; KNL-NEXT:    vpinsrb $1, 232(%rbp), %xmm1, %xmm1
1888; KNL-NEXT:    vpinsrb $2, 240(%rbp), %xmm1, %xmm1
1889; KNL-NEXT:    vpinsrb $3, 248(%rbp), %xmm1, %xmm1
1890; KNL-NEXT:    vpinsrb $4, 256(%rbp), %xmm1, %xmm1
1891; KNL-NEXT:    vpinsrb $5, 264(%rbp), %xmm1, %xmm1
1892; KNL-NEXT:    vpinsrb $6, 272(%rbp), %xmm1, %xmm1
1893; KNL-NEXT:    vpinsrb $7, 280(%rbp), %xmm1, %xmm1
1894; KNL-NEXT:    vpinsrb $8, 288(%rbp), %xmm1, %xmm1
1895; KNL-NEXT:    vpinsrb $9, 296(%rbp), %xmm1, %xmm1
1896; KNL-NEXT:    vpinsrb $10, 304(%rbp), %xmm1, %xmm1
1897; KNL-NEXT:    vpinsrb $11, 312(%rbp), %xmm1, %xmm1
1898; KNL-NEXT:    vpinsrb $12, 320(%rbp), %xmm1, %xmm1
1899; KNL-NEXT:    vpinsrb $13, 328(%rbp), %xmm1, %xmm1
1900; KNL-NEXT:    vpinsrb $14, 336(%rbp), %xmm1, %xmm1
1901; KNL-NEXT:    vpinsrb $15, 344(%rbp), %xmm1, %xmm1
1902; KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1903; KNL-NEXT:    vpinsrb $1, 360(%rbp), %xmm2, %xmm2
1904; KNL-NEXT:    vpinsrb $2, 368(%rbp), %xmm2, %xmm2
1905; KNL-NEXT:    vpinsrb $3, 376(%rbp), %xmm2, %xmm2
1906; KNL-NEXT:    vpinsrb $4, 384(%rbp), %xmm2, %xmm2
1907; KNL-NEXT:    vpinsrb $5, 392(%rbp), %xmm2, %xmm2
1908; KNL-NEXT:    vpinsrb $6, 400(%rbp), %xmm2, %xmm2
1909; KNL-NEXT:    vpinsrb $7, 408(%rbp), %xmm2, %xmm2
1910; KNL-NEXT:    vpinsrb $8, 416(%rbp), %xmm2, %xmm2
1911; KNL-NEXT:    vpinsrb $9, 424(%rbp), %xmm2, %xmm2
1912; KNL-NEXT:    vpinsrb $10, 432(%rbp), %xmm2, %xmm2
1913; KNL-NEXT:    vpinsrb $11, 440(%rbp), %xmm2, %xmm2
1914; KNL-NEXT:    vpinsrb $12, 448(%rbp), %xmm2, %xmm2
1915; KNL-NEXT:    vpinsrb $13, 456(%rbp), %xmm2, %xmm2
1916; KNL-NEXT:    vpinsrb $14, 464(%rbp), %xmm2, %xmm2
1917; KNL-NEXT:    vpinsrb $15, 472(%rbp), %xmm2, %xmm2
1918; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1919; KNL-NEXT:    vmovd %edi, %xmm2
1920; KNL-NEXT:    vpinsrb $1, %esi, %xmm2, %xmm2
1921; KNL-NEXT:    vpinsrb $2, %edx, %xmm2, %xmm2
1922; KNL-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
1923; KNL-NEXT:    vpinsrb $4, %r8d, %xmm2, %xmm2
1924; KNL-NEXT:    vpinsrb $5, %r9d, %xmm2, %xmm2
1925; KNL-NEXT:    vpinsrb $6, 16(%rbp), %xmm2, %xmm2
1926; KNL-NEXT:    vpinsrb $7, 24(%rbp), %xmm2, %xmm2
1927; KNL-NEXT:    vpinsrb $8, 32(%rbp), %xmm2, %xmm2
1928; KNL-NEXT:    vpinsrb $9, 40(%rbp), %xmm2, %xmm2
1929; KNL-NEXT:    vpinsrb $10, 48(%rbp), %xmm2, %xmm2
1930; KNL-NEXT:    vpinsrb $11, 56(%rbp), %xmm2, %xmm2
1931; KNL-NEXT:    vpinsrb $12, 64(%rbp), %xmm2, %xmm2
1932; KNL-NEXT:    vpinsrb $13, 72(%rbp), %xmm2, %xmm2
1933; KNL-NEXT:    vpinsrb $14, 80(%rbp), %xmm2, %xmm2
1934; KNL-NEXT:    vpinsrb $15, 88(%rbp), %xmm2, %xmm2
1935; KNL-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
1936; KNL-NEXT:    vpinsrb $1, 104(%rbp), %xmm3, %xmm3
1937; KNL-NEXT:    vpinsrb $2, 112(%rbp), %xmm3, %xmm3
1938; KNL-NEXT:    vpinsrb $3, 120(%rbp), %xmm3, %xmm3
1939; KNL-NEXT:    vpinsrb $4, 128(%rbp), %xmm3, %xmm3
1940; KNL-NEXT:    vpinsrb $5, 136(%rbp), %xmm3, %xmm3
1941; KNL-NEXT:    vpinsrb $6, 144(%rbp), %xmm3, %xmm3
1942; KNL-NEXT:    vpinsrb $7, 152(%rbp), %xmm3, %xmm3
1943; KNL-NEXT:    vpinsrb $8, 160(%rbp), %xmm3, %xmm3
1944; KNL-NEXT:    vpinsrb $9, 168(%rbp), %xmm3, %xmm3
1945; KNL-NEXT:    vpinsrb $10, 176(%rbp), %xmm3, %xmm3
1946; KNL-NEXT:    vpinsrb $11, 184(%rbp), %xmm3, %xmm3
1947; KNL-NEXT:    vpinsrb $12, 192(%rbp), %xmm3, %xmm3
1948; KNL-NEXT:    vpinsrb $13, 200(%rbp), %xmm3, %xmm3
1949; KNL-NEXT:    vpinsrb $14, 208(%rbp), %xmm3, %xmm3
1950; KNL-NEXT:    vpinsrb $15, 216(%rbp), %xmm3, %xmm3
1951; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1952; KNL-NEXT:    movl 744(%rbp), %eax
1953; KNL-NEXT:    andl $127, %eax
1954; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1955; KNL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
1956; KNL-NEXT:    vpternlogq $15, %zmm2, %zmm2, %zmm2
1957; KNL-NEXT:    vpcmpeqb %ymm3, %ymm1, %ymm1
1958; KNL-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
1959; KNL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
1960; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1961; KNL-NEXT:    cmpb $0, 736(%rbp)
1962; KNL-NEXT:    vmovdqa %ymm3, {{[0-9]+}}(%rsp)
1963; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
1964; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
1965; KNL-NEXT:    vmovdqa %ymm2, (%rsp)
1966; KNL-NEXT:    setne (%rsp,%rax)
1967; KNL-NEXT:    vmovdqa (%rsp), %ymm2
1968; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm3
1969; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
1970; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
1971; KNL-NEXT:    vpmovsxbd %xmm2, %zmm4
1972; KNL-NEXT:    vpslld $31, %zmm4, %zmm4
1973; KNL-NEXT:    vptestmd %zmm4, %zmm4, %k0
1974; KNL-NEXT:    kmovw %k0, %eax
1975; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm2
1976; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
1977; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
1978; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
1979; KNL-NEXT:    kmovw %k0, %ecx
1980; KNL-NEXT:    shll $16, %ecx
1981; KNL-NEXT:    orl %eax, %ecx
1982; KNL-NEXT:    vpmovsxbd %xmm3, %zmm2
1983; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
1984; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
1985; KNL-NEXT:    kmovw %k0, %edx
1986; KNL-NEXT:    vextracti128 $1, %ymm3, %xmm2
1987; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
1988; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
1989; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
1990; KNL-NEXT:    kmovw %k0, %eax
1991; KNL-NEXT:    shll $16, %eax
1992; KNL-NEXT:    orl %edx, %eax
1993; KNL-NEXT:    shlq $32, %rax
1994; KNL-NEXT:    orq %rcx, %rax
1995; KNL-NEXT:    vpmovsxbd %xmm1, %zmm2
1996; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
1997; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
1998; KNL-NEXT:    kmovw %k0, %ecx
1999; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm1
2000; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
2001; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
2002; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
2003; KNL-NEXT:    kmovw %k0, %esi
2004; KNL-NEXT:    shll $16, %esi
2005; KNL-NEXT:    orl %ecx, %esi
2006; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
2007; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
2008; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
2009; KNL-NEXT:    kmovw %k0, %ecx
2010; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
2011; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
2012; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2013; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2014; KNL-NEXT:    kmovw %k0, %edx
2015; KNL-NEXT:    shll $16, %edx
2016; KNL-NEXT:    orl %ecx, %edx
2017; KNL-NEXT:    shlq $32, %rdx
2018; KNL-NEXT:    orq %rsi, %rdx
2019; KNL-NEXT:    movq %rbp, %rsp
2020; KNL-NEXT:    popq %rbp
2021; KNL-NEXT:    vzeroupper
2022; KNL-NEXT:    retq
2023;
2024; SKX-LABEL: test_insertelement_variable_v96i1:
2025; SKX:       ## %bb.0:
2026; SKX-NEXT:    pushq %rbp
2027; SKX-NEXT:    .cfi_def_cfa_offset 16
2028; SKX-NEXT:    .cfi_offset %rbp, -16
2029; SKX-NEXT:    movq %rsp, %rbp
2030; SKX-NEXT:    .cfi_def_cfa_register %rbp
2031; SKX-NEXT:    andq $-128, %rsp
2032; SKX-NEXT:    subq $256, %rsp ## imm = 0x100
2033; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2034; SKX-NEXT:    vpinsrb $1, 232(%rbp), %xmm0, %xmm0
2035; SKX-NEXT:    vpinsrb $2, 240(%rbp), %xmm0, %xmm0
2036; SKX-NEXT:    vpinsrb $3, 248(%rbp), %xmm0, %xmm0
2037; SKX-NEXT:    vpinsrb $4, 256(%rbp), %xmm0, %xmm0
2038; SKX-NEXT:    vpinsrb $5, 264(%rbp), %xmm0, %xmm0
2039; SKX-NEXT:    vpinsrb $6, 272(%rbp), %xmm0, %xmm0
2040; SKX-NEXT:    vpinsrb $7, 280(%rbp), %xmm0, %xmm0
2041; SKX-NEXT:    vpinsrb $8, 288(%rbp), %xmm0, %xmm0
2042; SKX-NEXT:    vpinsrb $9, 296(%rbp), %xmm0, %xmm0
2043; SKX-NEXT:    vpinsrb $10, 304(%rbp), %xmm0, %xmm0
2044; SKX-NEXT:    vpinsrb $11, 312(%rbp), %xmm0, %xmm0
2045; SKX-NEXT:    vpinsrb $12, 320(%rbp), %xmm0, %xmm0
2046; SKX-NEXT:    vpinsrb $13, 328(%rbp), %xmm0, %xmm0
2047; SKX-NEXT:    vpinsrb $14, 336(%rbp), %xmm0, %xmm0
2048; SKX-NEXT:    vpinsrb $15, 344(%rbp), %xmm0, %xmm0
2049; SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2050; SKX-NEXT:    vpinsrb $1, 360(%rbp), %xmm1, %xmm1
2051; SKX-NEXT:    vpinsrb $2, 368(%rbp), %xmm1, %xmm1
2052; SKX-NEXT:    vpinsrb $3, 376(%rbp), %xmm1, %xmm1
2053; SKX-NEXT:    vpinsrb $4, 384(%rbp), %xmm1, %xmm1
2054; SKX-NEXT:    vpinsrb $5, 392(%rbp), %xmm1, %xmm1
2055; SKX-NEXT:    vpinsrb $6, 400(%rbp), %xmm1, %xmm1
2056; SKX-NEXT:    vpinsrb $7, 408(%rbp), %xmm1, %xmm1
2057; SKX-NEXT:    vpinsrb $8, 416(%rbp), %xmm1, %xmm1
2058; SKX-NEXT:    vpinsrb $9, 424(%rbp), %xmm1, %xmm1
2059; SKX-NEXT:    vpinsrb $10, 432(%rbp), %xmm1, %xmm1
2060; SKX-NEXT:    vpinsrb $11, 440(%rbp), %xmm1, %xmm1
2061; SKX-NEXT:    vpinsrb $12, 448(%rbp), %xmm1, %xmm1
2062; SKX-NEXT:    vpinsrb $13, 456(%rbp), %xmm1, %xmm1
2063; SKX-NEXT:    vpinsrb $14, 464(%rbp), %xmm1, %xmm1
2064; SKX-NEXT:    vpinsrb $15, 472(%rbp), %xmm1, %xmm1
2065; SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2066; SKX-NEXT:    vmovd %edi, %xmm1
2067; SKX-NEXT:    vpinsrb $1, %esi, %xmm1, %xmm1
2068; SKX-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
2069; SKX-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
2070; SKX-NEXT:    vpinsrb $4, %r8d, %xmm1, %xmm1
2071; SKX-NEXT:    vpinsrb $5, %r9d, %xmm1, %xmm1
2072; SKX-NEXT:    vpinsrb $6, 16(%rbp), %xmm1, %xmm1
2073; SKX-NEXT:    vpinsrb $7, 24(%rbp), %xmm1, %xmm1
2074; SKX-NEXT:    vpinsrb $8, 32(%rbp), %xmm1, %xmm1
2075; SKX-NEXT:    vpinsrb $9, 40(%rbp), %xmm1, %xmm1
2076; SKX-NEXT:    vpinsrb $10, 48(%rbp), %xmm1, %xmm1
2077; SKX-NEXT:    vpinsrb $11, 56(%rbp), %xmm1, %xmm1
2078; SKX-NEXT:    vpinsrb $12, 64(%rbp), %xmm1, %xmm1
2079; SKX-NEXT:    vpinsrb $13, 72(%rbp), %xmm1, %xmm1
2080; SKX-NEXT:    vpinsrb $14, 80(%rbp), %xmm1, %xmm1
2081; SKX-NEXT:    vpinsrb $15, 88(%rbp), %xmm1, %xmm1
2082; SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2083; SKX-NEXT:    vpinsrb $1, 104(%rbp), %xmm2, %xmm2
2084; SKX-NEXT:    vpinsrb $2, 112(%rbp), %xmm2, %xmm2
2085; SKX-NEXT:    vpinsrb $3, 120(%rbp), %xmm2, %xmm2
2086; SKX-NEXT:    vpinsrb $4, 128(%rbp), %xmm2, %xmm2
2087; SKX-NEXT:    vpinsrb $5, 136(%rbp), %xmm2, %xmm2
2088; SKX-NEXT:    vpinsrb $6, 144(%rbp), %xmm2, %xmm2
2089; SKX-NEXT:    vpinsrb $7, 152(%rbp), %xmm2, %xmm2
2090; SKX-NEXT:    vpinsrb $8, 160(%rbp), %xmm2, %xmm2
2091; SKX-NEXT:    vpinsrb $9, 168(%rbp), %xmm2, %xmm2
2092; SKX-NEXT:    vpinsrb $10, 176(%rbp), %xmm2, %xmm2
2093; SKX-NEXT:    vpinsrb $11, 184(%rbp), %xmm2, %xmm2
2094; SKX-NEXT:    vpinsrb $12, 192(%rbp), %xmm2, %xmm2
2095; SKX-NEXT:    vpinsrb $13, 200(%rbp), %xmm2, %xmm2
2096; SKX-NEXT:    vpinsrb $14, 208(%rbp), %xmm2, %xmm2
2097; SKX-NEXT:    vpinsrb $15, 216(%rbp), %xmm2, %xmm2
2098; SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2099; SKX-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2100; SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2101; SKX-NEXT:    vpinsrb $1, 488(%rbp), %xmm1, %xmm1
2102; SKX-NEXT:    vpinsrb $2, 496(%rbp), %xmm1, %xmm1
2103; SKX-NEXT:    vpinsrb $3, 504(%rbp), %xmm1, %xmm1
2104; SKX-NEXT:    vpinsrb $4, 512(%rbp), %xmm1, %xmm1
2105; SKX-NEXT:    vpinsrb $5, 520(%rbp), %xmm1, %xmm1
2106; SKX-NEXT:    vpinsrb $6, 528(%rbp), %xmm1, %xmm1
2107; SKX-NEXT:    vpinsrb $7, 536(%rbp), %xmm1, %xmm1
2108; SKX-NEXT:    vpinsrb $8, 544(%rbp), %xmm1, %xmm1
2109; SKX-NEXT:    vpinsrb $9, 552(%rbp), %xmm1, %xmm1
2110; SKX-NEXT:    vpinsrb $10, 560(%rbp), %xmm1, %xmm1
2111; SKX-NEXT:    vpinsrb $11, 568(%rbp), %xmm1, %xmm1
2112; SKX-NEXT:    vpinsrb $12, 576(%rbp), %xmm1, %xmm1
2113; SKX-NEXT:    vpinsrb $13, 584(%rbp), %xmm1, %xmm1
2114; SKX-NEXT:    vpinsrb $14, 592(%rbp), %xmm1, %xmm1
2115; SKX-NEXT:    vpinsrb $15, 600(%rbp), %xmm1, %xmm1
2116; SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2117; SKX-NEXT:    vpinsrb $1, 616(%rbp), %xmm2, %xmm2
2118; SKX-NEXT:    vpinsrb $2, 624(%rbp), %xmm2, %xmm2
2119; SKX-NEXT:    vpinsrb $3, 632(%rbp), %xmm2, %xmm2
2120; SKX-NEXT:    vpinsrb $4, 640(%rbp), %xmm2, %xmm2
2121; SKX-NEXT:    vpinsrb $5, 648(%rbp), %xmm2, %xmm2
2122; SKX-NEXT:    vpinsrb $6, 656(%rbp), %xmm2, %xmm2
2123; SKX-NEXT:    vpinsrb $7, 664(%rbp), %xmm2, %xmm2
2124; SKX-NEXT:    vpinsrb $8, 672(%rbp), %xmm2, %xmm2
2125; SKX-NEXT:    vpinsrb $9, 680(%rbp), %xmm2, %xmm2
2126; SKX-NEXT:    vpinsrb $10, 688(%rbp), %xmm2, %xmm2
2127; SKX-NEXT:    vpinsrb $11, 696(%rbp), %xmm2, %xmm2
2128; SKX-NEXT:    vpinsrb $12, 704(%rbp), %xmm2, %xmm2
2129; SKX-NEXT:    vpinsrb $13, 712(%rbp), %xmm2, %xmm2
2130; SKX-NEXT:    vpinsrb $14, 720(%rbp), %xmm2, %xmm2
2131; SKX-NEXT:    vpinsrb $15, 728(%rbp), %xmm2, %xmm2
2132; SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2133; SKX-NEXT:    movl 744(%rbp), %eax
2134; SKX-NEXT:    andl $127, %eax
2135; SKX-NEXT:    vptestmb %zmm0, %zmm0, %k0
2136; SKX-NEXT:    vptestmb %zmm1, %zmm1, %k1
2137; SKX-NEXT:    cmpb $0, 736(%rbp)
2138; SKX-NEXT:    vpmovm2b %k1, %zmm0
2139; SKX-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
2140; SKX-NEXT:    vpmovm2b %k0, %zmm0
2141; SKX-NEXT:    vmovdqa64 %zmm0, (%rsp)
2142; SKX-NEXT:    setne (%rsp,%rax)
2143; SKX-NEXT:    vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
2144; SKX-NEXT:    vpmovb2m %zmm0, %k0
2145; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0
2146; SKX-NEXT:    vpmovb2m %zmm0, %k1
2147; SKX-NEXT:    kmovq %k1, %rax
2148; SKX-NEXT:    kmovq %k0, %rdx
2149; SKX-NEXT:    movq %rbp, %rsp
2150; SKX-NEXT:    popq %rbp
2151; SKX-NEXT:    vzeroupper
2152; SKX-NEXT:    retq
  ; Unsigned "greater than zero" is the same predicate as "not equal to zero".
2153  %t1 = icmp ugt <96 x i8> %a, zeroinitializer
2154  %t2 = icmp ugt i8 %b, 0
  ; The lane index is a runtime value, so it cannot be folded to a constant.
2155  %t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index
2156  %t4 = bitcast <96 x i1> %t3 to i96
2157  ret i96 %t4
2158}
2159
; test_insertelement_variable_v128i1: 128-lane variant of the variable-index
; i1 insert test. Builds a <128 x i1> mask from the non-zero bytes of %a,
; overwrites the lane selected by the runtime value %index with (%b != 0),
; and returns the mask bitcast to i128.
;
; Expected lowering, as recorded in the assertions below: the mask is spilled
; to a 128-byte-aligned stack buffer as one byte per lane, the index is
; clamped with `andl $127`, and the selected byte is patched with `setne`.
; The KNL checks take the input in ymm0-ymm3 and rebuild the result from
; eight 16-lane pieces. The SKX checks take it in zmm0-zmm1 and use two
; vpmovb2m conversions. In both cases the result is returned in rax (low
; half) and rdx (high half).
2160define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) {
2161; KNL-LABEL: test_insertelement_variable_v128i1:
2162; KNL:       ## %bb.0:
2163; KNL-NEXT:    pushq %rbp
2164; KNL-NEXT:    .cfi_def_cfa_offset 16
2165; KNL-NEXT:    .cfi_offset %rbp, -16
2166; KNL-NEXT:    movq %rsp, %rbp
2167; KNL-NEXT:    .cfi_def_cfa_register %rbp
2168; KNL-NEXT:    andq $-128, %rsp
2169; KNL-NEXT:    subq $256, %rsp ## imm = 0x100
2170; KNL-NEXT:    ## kill: def $esi killed $esi def $rsi
2171; KNL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
2172; KNL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
2173; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
2174; KNL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm1
2175; KNL-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
2176; KNL-NEXT:    vpcmpeqb %ymm4, %ymm2, %ymm2
2177; KNL-NEXT:    vpternlogq $15, %zmm2, %zmm2, %zmm2
2178; KNL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm3
2179; KNL-NEXT:    vpternlogq $15, %zmm3, %zmm3, %zmm3
2180; KNL-NEXT:    andl $127, %esi
2181; KNL-NEXT:    testb %dil, %dil
2182; KNL-NEXT:    vmovdqa %ymm3, {{[0-9]+}}(%rsp)
2183; KNL-NEXT:    vmovdqa %ymm2, {{[0-9]+}}(%rsp)
2184; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
2185; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
2186; KNL-NEXT:    setne (%rsp,%rsi)
2187; KNL-NEXT:    vmovdqa (%rsp), %ymm2
2188; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm3
2189; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
2190; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
2191; KNL-NEXT:    vpmovsxbd %xmm2, %zmm4
2192; KNL-NEXT:    vpslld $31, %zmm4, %zmm4
2193; KNL-NEXT:    vptestmd %zmm4, %zmm4, %k0
2194; KNL-NEXT:    kmovw %k0, %eax
2195; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm2
2196; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
2197; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
2198; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
2199; KNL-NEXT:    kmovw %k0, %ecx
2200; KNL-NEXT:    shll $16, %ecx
2201; KNL-NEXT:    orl %eax, %ecx
2202; KNL-NEXT:    vpmovsxbd %xmm3, %zmm2
2203; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
2204; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
2205; KNL-NEXT:    kmovw %k0, %edx
2206; KNL-NEXT:    vextracti128 $1, %ymm3, %xmm2
2207; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
2208; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
2209; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
2210; KNL-NEXT:    kmovw %k0, %eax
2211; KNL-NEXT:    shll $16, %eax
2212; KNL-NEXT:    orl %edx, %eax
2213; KNL-NEXT:    shlq $32, %rax
2214; KNL-NEXT:    orq %rcx, %rax
2215; KNL-NEXT:    vpmovsxbd %xmm1, %zmm2
2216; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
2217; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
2218; KNL-NEXT:    kmovw %k0, %ecx
2219; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm1
2220; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
2221; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
2222; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
2223; KNL-NEXT:    kmovw %k0, %esi
2224; KNL-NEXT:    shll $16, %esi
2225; KNL-NEXT:    orl %ecx, %esi
2226; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
2227; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
2228; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
2229; KNL-NEXT:    kmovw %k0, %ecx
2230; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
2231; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
2232; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2233; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2234; KNL-NEXT:    kmovw %k0, %edx
2235; KNL-NEXT:    shll $16, %edx
2236; KNL-NEXT:    orl %ecx, %edx
2237; KNL-NEXT:    shlq $32, %rdx
2238; KNL-NEXT:    orq %rsi, %rdx
2239; KNL-NEXT:    movq %rbp, %rsp
2240; KNL-NEXT:    popq %rbp
2241; KNL-NEXT:    vzeroupper
2242; KNL-NEXT:    retq
2243;
2244; SKX-LABEL: test_insertelement_variable_v128i1:
2245; SKX:       ## %bb.0:
2246; SKX-NEXT:    pushq %rbp
2247; SKX-NEXT:    .cfi_def_cfa_offset 16
2248; SKX-NEXT:    .cfi_offset %rbp, -16
2249; SKX-NEXT:    movq %rsp, %rbp
2250; SKX-NEXT:    .cfi_def_cfa_register %rbp
2251; SKX-NEXT:    andq $-128, %rsp
2252; SKX-NEXT:    subq $256, %rsp ## imm = 0x100
2253; SKX-NEXT:    ## kill: def $esi killed $esi def $rsi
2254; SKX-NEXT:    vptestmb %zmm0, %zmm0, %k0
2255; SKX-NEXT:    vptestmb %zmm1, %zmm1, %k1
2256; SKX-NEXT:    andl $127, %esi
2257; SKX-NEXT:    testb %dil, %dil
2258; SKX-NEXT:    vpmovm2b %k1, %zmm0
2259; SKX-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
2260; SKX-NEXT:    vpmovm2b %k0, %zmm0
2261; SKX-NEXT:    vmovdqa64 %zmm0, (%rsp)
2262; SKX-NEXT:    setne (%rsp,%rsi)
2263; SKX-NEXT:    vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
2264; SKX-NEXT:    vpmovb2m %zmm0, %k0
2265; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0
2266; SKX-NEXT:    vpmovb2m %zmm0, %k1
2267; SKX-NEXT:    kmovq %k1, %rax
2268; SKX-NEXT:    kmovq %k0, %rdx
2269; SKX-NEXT:    movq %rbp, %rsp
2270; SKX-NEXT:    popq %rbp
2271; SKX-NEXT:    vzeroupper
2272; SKX-NEXT:    retq
  ; Unsigned "greater than zero" is the same predicate as "not equal to zero".
2273  %t1 = icmp ugt <128 x i8> %a, zeroinitializer
2274  %t2 = icmp ugt i8 %b, 0
  ; The lane index is a runtime value, so it cannot be folded to a constant.
2275  %t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
2276  %t4 = bitcast <128 x i1> %t3 to i128
2277  ret i128 %t4
2278}
2279