; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI

; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed to be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.

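; A minimal sketch of the mechanism under test (assuming the current X86
; behavior): the legalizer only treats 512-bit vector types as legal when the
; function's "min-legal-vector-width" attribute is 512 or larger, so an
; illustrative function like
;   define void @f(<16 x i32>* %p) "min-legal-vector-width"="256" { ... }
; has its <16 x i32> operations split into two 256-bit (ymm) halves, while the
; same body with "min-legal-vector-width"="512" can use a single zmm register,
; as the add256/add512 pair below demonstrates.
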
define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="256" {
; CHECK-LABEL: add256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpaddd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vpaddd (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %d = load <16 x i32>, <16 x i32>* %a
  %e = load <16 x i32>, <16 x i32>* %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, <16 x i32>* %c
  ret void
}

define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="512" {
; CHECK-LABEL: add512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddd (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %d = load <16 x i32>, <16 x i32>* %a
  %e = load <16 x i32>, <16 x i32>* %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, <16 x i32>* %c
  ret void
}

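; The zext -> add 1 -> add -> lshr 1 -> trunc sequence below is the standard
; unsigned rounding-average pattern, which X86 matches to vpavgb.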
define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" {
; CHECK-LABEL: avg_v64i8_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm1
; CHECK-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vpavgb 32(%rdi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, (%rax)
; CHECK-NEXT:    vmovdqu %ymm0, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}


define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" {
; CHECK-LABEL: avg_v64i8_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm0
; CHECK-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

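; Sign-extending to i32, multiplying, and summing adjacent odd/even pairs is
; the pattern X86 matches to vpmaddwd.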
define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="256" {
; CHECK-LABEL: pmaddwd_32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
   %A = load <32 x i16>, <32 x i16>* %APtr
   %B = load <32 x i16>, <32 x i16>* %BPtr
   %a = sext <32 x i16> %A to <32 x i32>
   %b = sext <32 x i16> %B to <32 x i32>
   %m = mul nsw <32 x i32> %a, %b
   %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   %ret = add <16 x i32> %odd, %even
   store <16 x i32> %ret, <16 x i32>* %CPtr
   ret void
}

define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="512" {
; CHECK-LABEL: pmaddwd_32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
   %A = load <32 x i16>, <32 x i16>* %APtr
   %B = load <32 x i16>, <32 x i16>* %BPtr
   %a = sext <32 x i16> %A to <32 x i32>
   %b = sext <32 x i16> %B to <32 x i32>
   %m = mul nsw <32 x i32> %a, %b
   %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   %ret = add <16 x i32> %odd, %even
   store <16 x i32> %ret, <16 x i32>* %CPtr
   ret void
}

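; sub(max(x, y), y) is the unsigned saturating-subtract pattern, which X86
; matches to vpsubusb.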
define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="256" {
; CHECK-LABEL: psubus_64i8_max_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpsubusb 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vpsubusb (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <64 x i8>, <64 x i8>* %xptr
  %y = load <64 x i8>, <64 x i8>* %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, <64 x i8>* %zptr
  ret void
}

define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="512" {
; CHECK-LABEL: psubus_64i8_max_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpsubusb (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <64 x i8>, <64 x i8>* %xptr
  %y = load <64 x i8>, <64 x i8>* %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, <64 x i8>* %zptr
  ret void
}

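; Dot-product reduction loops. With a 256-bit limit the loop keeps two ymm
; accumulators; with a 512-bit limit a single zmm accumulator suffices.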
define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="256" {
; CHECK-LABEL: _Z9test_charPcS_i_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB8_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm3
; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm4
; CHECK-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm5
; CHECK-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
; CHECK-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
; CHECK-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
; CHECK-NEXT:    addq $32, %rcx
; CHECK-NEXT:    cmpq %rcx, %rax
; CHECK-NEXT:    jne .LBB8_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="512" {
; CHECK-LABEL: _Z9test_charPcS_i_512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB9_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %zmm2
; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %zmm3
; CHECK-NEXT:    vpmaddwd %zmm2, %zmm3, %zmm2
; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    addq $32, %rcx
; CHECK-NEXT:    cmpq %rcx, %rax
; CHECK-NEXT:    jne .LBB9_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16

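; Sum-of-absolute-differences reductions: the zext/sub/abs/accumulate pattern
; is matched to vpsadbw either way; only the accumulator width differs.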
define i32 @sad_16i8_256() "min-legal-vector-width"="256" {
; CHECK-LABEL: sad_16i8_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB10_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm2
; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
; CHECK-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    jne .LBB10_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %10, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define i32 @sad_16i8_512() "min-legal-vector-width"="512" {
; CHECK-LABEL: sad_16i8_512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB11_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm1
; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    jne .LBB11_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %10, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

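; Conversions of an i1 mask to floating point. The sign-extended mask (0 or
; -1) feeds vcvtdq2ps/vcvtdq2pd directly in the signed (sbto) tests, while the
; unsigned (ubto) tests first shift the mask down to 0 or 1 with vpsrld $31.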
define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: sbto16f32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm1
; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: sbto16f32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res)  "min-legal-vector-width"="256" {
; CHECK-LABEL: sbto16f64_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res)  "min-legal-vector-width"="512" {
; CHECK-LABEL: sbto16f64_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: ubto16f32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm1
; CHECK-NEXT:    vpsrld $31, %ymm1, %ymm1
; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: ubto16f32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: ubto16f64_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm2
; CHECK-NEXT:    vpsrld $31, %ymm2, %ymm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: ubto16f64_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

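; Float-to-bool truncations used as a select mask. The 256-bit variants
; convert each ymm half separately and join the two mask registers with
; kunpckbw.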
define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
; CHECK-LABEL: test_16f32toub_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT:    vpslld $31, %ymm1, %ymm1
; CHECK-NEXT:    vpmovd2m %ymm1, %k0
; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm1
; CHECK-NEXT:    vpslld $31, %ymm1, %ymm1
; CHECK-NEXT:    vpmovd2m %ymm1, %k1
; CHECK-NEXT:    kunpckbw %k0, %k1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
; CHECK-LABEL: test_16f32toub_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT:    vpslld $31, %zmm1, %zmm1
; CHECK-NEXT:    vpmovd2m %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
; CHECK-LABEL: test_16f32tosb_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT:    vpmovd2m %ymm1, %k0
; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm1
; CHECK-NEXT:    vpmovd2m %ymm1, %k1
; CHECK-NEXT:    kunpckbw %k0, %k1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
; CHECK-LABEL: test_16f32tosb_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT:    vpmovd2m %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

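; There is no byte multiply instruction, so v64i8 multiplies are widened to
; i16 vpmullw and the low bytes are repacked; with VBMI the repacking is a
; single vpermt2b/vpermi2b shuffle instead of pand+vpackuswb.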
define void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="256" {
; CHECK-AVX512-LABEL: mul256:
; CHECK-AVX512:       # %bb.0:
; CHECK-AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-AVX512-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-AVX512-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
; CHECK-AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT:    vpand %ymm5, %ymm4, %ymm4
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-AVX512-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; CHECK-AVX512-NEXT:    vpand %ymm5, %ymm1, %ymm1
; CHECK-AVX512-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
; CHECK-AVX512-NEXT:    vpand %ymm5, %ymm3, %ymm3
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; CHECK-AVX512-NEXT:    vpand %ymm5, %ymm0, %ymm0
; CHECK-AVX512-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; CHECK-AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-AVX512-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-AVX512-NEXT:    vzeroupper
; CHECK-AVX512-NEXT:    retq
;
; CHECK-VBMI-LABEL: mul256:
; CHECK-VBMI:       # %bb.0:
; CHECK-VBMI-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-VBMI-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-VBMI-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-VBMI-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-VBMI-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-VBMI-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; CHECK-VBMI-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
; CHECK-VBMI-NEXT:    vpermt2b %ymm4, %ymm3, %ymm1
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-VBMI-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-VBMI-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; CHECK-VBMI-NEXT:    vpermt2b %ymm4, %ymm3, %ymm0
; CHECK-VBMI-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-VBMI-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-VBMI-NEXT:    vzeroupper
; CHECK-VBMI-NEXT:    retq
  %d = load <64 x i8>, <64 x i8>* %a
  %e = load <64 x i8>, <64 x i8>* %b
  %f = mul <64 x i8> %d, %e
  store <64 x i8> %f, <64 x i8>* %c
  ret void
}

define void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="512" {
; CHECK-AVX512-LABEL: mul512:
; CHECK-AVX512:       # %bb.0:
; CHECK-AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-AVX512-NEXT:    vmovdqa64 (%rsi), %zmm1
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-AVX512-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
; CHECK-AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT:    vpandq %zmm3, %zmm2, %zmm2
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; CHECK-AVX512-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; CHECK-AVX512-NEXT:    vpandq %zmm3, %zmm0, %zmm0
; CHECK-AVX512-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; CHECK-AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-AVX512-NEXT:    vzeroupper
; CHECK-AVX512-NEXT:    retq
;
; CHECK-VBMI-LABEL: mul512:
; CHECK-VBMI:       # %bb.0:
; CHECK-VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-VBMI-NEXT:    vmovdqa64 (%rsi), %zmm1
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-VBMI-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; CHECK-VBMI-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; CHECK-VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; CHECK-VBMI-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1
; CHECK-VBMI-NEXT:    vmovdqa64 %zmm1, (%rdx)
; CHECK-VBMI-NEXT:    vzeroupper
; CHECK-VBMI-NEXT:    retq
  %d = load <64 x i8>, <64 x i8>* %a
  %e = load <64 x i8>, <64 x i8>* %b
  %f = mul <64 x i8> %d, %e
  store <64 x i8> %f, <64 x i8>* %c
  ret void
}

; This threw an assertion at one point.
define <4 x i32> @mload_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) "min-legal-vector-width"="256" {
; CHECK-LABEL: mload_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

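; Truncate tests: with a 256-bit limit, wide sources are loaded as ymm halves,
; truncated per half with vpmov*, and recombined with unpack/insert shuffles.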
define <16 x i32> @trunc_v16i64_v16i32(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i64_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vmovdqa 64(%rdi), %ymm2
; CHECK-NEXT:    vmovdqa 96(%rdi), %ymm3
; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
; CHECK-NEXT:    vpmovqd %ymm1, %xmm1
; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    vpmovqd %ymm2, %xmm1
; CHECK-NEXT:    vpmovqd %ymm3, %xmm2
; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    retq
  %a = load <16 x i64>, <16 x i64>* %x
  %b = trunc <16 x i64> %a to <16 x i32>
  ret <16 x i32> %b
}

define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i64_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vmovdqa 64(%rdi), %ymm2
; CHECK-NEXT:    vmovdqa 96(%rdi), %ymm3
; CHECK-NEXT:    vpmovqb %ymm3, %xmm3
; CHECK-NEXT:    vpmovqb %ymm2, %xmm2
; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-NEXT:    vpmovqb %ymm1, %xmm1
; CHECK-NEXT:    vpmovqb %ymm0, %xmm0
; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <16 x i64>, <16 x i64>* %x
  %b = trunc <16 x i64> %a to <16 x i8>
  ret <16 x i8> %b
}

define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmovdb %ymm1, %xmm1
; CHECK-NEXT:    vpmovdb %ymm0, %xmm0
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %x
  %b = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %b
}

define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmovqb %ymm1, %xmm1
; CHECK-NEXT:    vpmovqb %ymm0, %xmm0
; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <8 x i64>, <8 x i64>* %x
  %b = trunc <8 x i64> %a to <8 x i8>
  ret <8 x i8> %b
}

define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmovqw %ymm1, %xmm1
; CHECK-NEXT:    vpmovqw %ymm0, %xmm0
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <8 x i64>, <8 x i64>* %x
  %b = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %b
}

define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $48, 32(%rdi), %ymm0
; CHECK-NEXT:    vpsrlq $48, (%rdi), %ymm1
; CHECK-NEXT:    vpackusdw %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT:    retq
  %a = load <8 x i64>, <8 x i64>* %x
  %b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
  %c = trunc <8 x i64> %b to <8 x i32>
  ret <8 x i32> %c
}

define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %x
  %b = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %c = trunc <16 x i32> %b to <16 x i16>
  ret <16 x i16> %c
}

define <32 x i8> @trunc_v32i16_v32i8_zeroes(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-AVX512:       # %bb.0:
; CHECK-AVX512-NEXT:    vpsrlw $8, 32(%rdi), %ymm0
; CHECK-AVX512-NEXT:    vpsrlw $8, (%rdi), %ymm1
; CHECK-AVX512-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
; CHECK-AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-AVX512-NEXT:    retq
;
; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-VBMI:       # %bb.0:
; CHECK-VBMI-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
; CHECK-VBMI-NEXT:    vpermi2b 32(%rdi), %ymm1, %ymm0
; CHECK-VBMI-NEXT:    retq
  %a = load <32 x i16>, <32 x i16>* %x
  %b = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %c = trunc <32 x i16> %b to <32 x i8>
  ret <32 x i8> %c
}

define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i32_sign:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraq $48, 32(%rdi), %ymm0
; CHECK-NEXT:    vpsraq $48, (%rdi), %ymm1
; CHECK-NEXT:    vpmovqd %ymm1, %xmm1
; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a = load <8 x i64>, <8 x i64>* %x
  %b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
  %c = trunc <8 x i64> %b to <8 x i32>
  ret <8 x i32> %c
}

define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i16_sign:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $16, 32(%rdi), %ymm0
; CHECK-NEXT:    vpsrad $16, (%rdi), %ymm1
; CHECK-NEXT:    vpackssdw %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %x
  %b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %c = trunc <16 x i32> %b to <16 x i16>
  ret <16 x i16> %c
}

define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v32i16_v32i8_sign:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $8, 32(%rdi), %ymm0
; CHECK-NEXT:    vpsraw $8, (%rdi), %ymm1
; CHECK-NEXT:    vpacksswb %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT:    retq
  %a = load <32 x i16>, <32 x i16>* %x
  %b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %c = trunc <32 x i16> %b to <32 x i8>
  ret <32 x i8> %c
}

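; Extension tests: widening v16i8 to v16i64 under a 256-bit limit produces
; four ymm pieces via vpmovzx*/vpmovsx* rather than a single zmm extend.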
define void @zext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: zext_v16i8_v16i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vmovdqa %ymm0, (%rdi)
; CHECK-NEXT:    vmovdqa %ymm1, 64(%rdi)
; CHECK-NEXT:    vmovdqa %ymm3, 96(%rdi)
; CHECK-NEXT:    vmovdqa %ymm2, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = zext <16 x i8> %x to <16 x i64>
  store <16 x i64> %a, <16 x i64>* %y
  ret void
}

define void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: sext_v16i8_v16i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; CHECK-NEXT:    vpmovsxwq %xmm2, %ymm2
; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; CHECK-NEXT:    vpmovsxwq %xmm3, %ymm3
; CHECK-NEXT:    vpmovsxwq %xmm1, %ymm1
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    vmovdqa %ymm0, (%rdi)
; CHECK-NEXT:    vmovdqa %ymm1, 64(%rdi)
; CHECK-NEXT:    vmovdqa %ymm3, 96(%rdi)
; CHECK-NEXT:    vmovdqa %ymm2, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = sext <16 x i8> %x to <16 x i64>
  store <16 x i64> %a, <16 x i64>* %y
  ret void
}

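; Splitting a vselect whose condition comes from a compare: the compare yields
; one mask register, and the upper half of the mask is extracted with kshiftr
; before the two masked moves.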
define void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" {
; CHECK-LABEL: vselect_split_v8i16_setcc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
; CHECK-NEXT:    kshiftrb $4, %k1, %k2
; CHECK-NEXT:    vmovdqa64 32(%rdi), %ymm3 {%k2}
; CHECK-NEXT:    vmovdqa64 (%rdi), %ymm2 {%k1}
; CHECK-NEXT:    vmovdqa %ymm2, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm3, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x i64>, <8 x i64>* %p
  %y = load <8 x i64>, <8 x i64>* %q
  %a = icmp eq <8 x i16> %s, %t
  %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y
  store <8 x i64> %b, <8 x i64>* %r
  ret void
}

define void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" {
; CHECK-LABEL: vselect_split_v8i32_setcc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
; CHECK-NEXT:    kshiftrb $4, %k1, %k2
; CHECK-NEXT:    vmovdqa64 32(%rdi), %ymm3 {%k2}
; CHECK-NEXT:    vmovdqa64 (%rdi), %ymm2 {%k1}
; CHECK-NEXT:    vmovdqa %ymm2, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm3, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x i64>, <8 x i64>* %p
  %y = load <8 x i64>, <8 x i64>* %q
  %a = icmp eq <8 x i32> %s, %t
  %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y
  store <8 x i64> %b, <8 x i64>* %r
  ret void
}

define void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" {
; CHECK-LABEL: vselect_split_v16i8_setcc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
; CHECK-NEXT:    kshiftrw $8, %k1, %k2
; CHECK-NEXT:    vmovdqa32 32(%rdi), %ymm3 {%k2}
; CHECK-NEXT:    vmovdqa32 (%rdi), %ymm2 {%k1}
; CHECK-NEXT:    vmovdqa %ymm2, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm3, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <16 x i32>, <16 x i32>* %p
  %y = load <16 x i32>, <16 x i32>* %q
  %a = icmp eq <16 x i8> %s, %t
  %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y
  store <16 x i32> %b, <16 x i32>* %r
  ret void
}

define void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" {
; CHECK-LABEL: vselect_split_v16i16_setcc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
; CHECK-NEXT:    kshiftrw $8, %k1, %k2
; CHECK-NEXT:    vmovdqa32 32(%rdi), %ymm3 {%k2}
; CHECK-NEXT:    vmovdqa32 (%rdi), %ymm2 {%k1}
; CHECK-NEXT:    vmovdqa %ymm2, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm3, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <16 x i32>, <16 x i32>* %p
  %y = load <16 x i32>, <16 x i32>* %q
  %a = icmp eq <16 x i16> %s, %t
  %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y
  store <16 x i32> %b, <16 x i32>* %r
  ret void
}

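; Make sure the saturating truncate to <16 x i8> is done with 256-bit
; vpackusdw/vpmovuswb rather than a 512-bit vpmovusdb.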
define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p) "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_packus_v16i32_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT:    vpmovuswb %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %p
  %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %d = icmp sgt <16 x i32> %c, zeroinitializer
  %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
  %f = trunc <16 x i32> %e to <16 x i8>
  ret <16 x i8> %f
}

define void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p, <16 x i8>* %q) "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_packus_v16i32_v16i8_store:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT:    vpmovuswb %ymm0, (%rsi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %p
  %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %d = icmp sgt <16 x i32> %c, zeroinitializer
  %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
  %f = trunc <16 x i32> %e to <16 x i8>
  store <16 x i8> %f, <16 x i8>* %q
  ret void
}

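; Make sure a <64 x i1> argument can be returned directly without needing any
; 512-bit vector instructions.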
define <64 x i1> @v64i1_argument_return(<64 x i1> %x) "min-legal-vector-width"="256" {
; CHECK-LABEL: v64i1_argument_return:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  ret <64 x i1> %x
}

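; With only 256-bit vectors legal, each vptestnmb compare below produces a
; 32-bit mask, so the shuffled <64 x i1> mask for the masked store has to be
; assembled bit by bit with kshift/kand/kor.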
define void @v64i1_shuffle(<64 x i8>* %x, <64 x i8>* %y) "min-legal-vector-width"="256" {
; CHECK-LABEL: v64i1_shuffle:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm0
; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k0
; CHECK-NEXT:    kshiftrd $1, %k0, %k1
; CHECK-NEXT:    movq $-3, %rax
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftlq $63, %k0, %k2
; CHECK-NEXT:    kshiftrq $62, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-5, %rax
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $3, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $61, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-9, %rax
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $2, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $60, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-17, %rax
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $5, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $59, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-33, %rax
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $4, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $58, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-65, %rax
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $7, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $57, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-129, %rax
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $6, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $56, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-257, %rax # imm = 0xFEFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $9, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $55, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-513, %rax # imm = 0xFDFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $8, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $54, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-1025, %rax # imm = 0xFBFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $11, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $53, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-2049, %rax # imm = 0xF7FF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $10, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $52, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-4097, %rax # imm = 0xEFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $13, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $51, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-8193, %rax # imm = 0xDFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $12, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $50, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-16385, %rax # imm = 0xBFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $15, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $49, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-32769, %rax # imm = 0xFFFF7FFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $14, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $48, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-65537, %rax # imm = 0xFFFEFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $17, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $47, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-131073, %rax # imm = 0xFFFDFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $16, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $46, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-262145, %rax # imm = 0xFFFBFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $19, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $45, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-524289, %rax # imm = 0xFFF7FFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $18, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $44, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-1048577, %rax # imm = 0xFFEFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $21, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $43, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-2097153, %rax # imm = 0xFFDFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $20, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $42, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-4194305, %rax # imm = 0xFFBFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $23, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $41, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-8388609, %rax # imm = 0xFF7FFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $22, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $40, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-16777217, %rax # imm = 0xFEFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $25, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $39, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-33554433, %rax # imm = 0xFDFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $24, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $38, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-67108865, %rax # imm = 0xFBFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $27, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $37, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-134217729, %rax # imm = 0xF7FFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $26, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $36, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-268435457, %rax # imm = 0xEFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $29, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $35, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-536870913, %rax # imm = 0xDFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $28, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $34, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movq $-1073741825, %rax # imm = 0xBFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k1
; CHECK-NEXT:    kshiftrd $31, %k0, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $33, %k2, %k2
; CHECK-NEXT:    korq %k2, %k1, %k1
; CHECK-NEXT:    movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k1, %k2
; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT:    kshiftrd $30, %k0, %k0
; CHECK-NEXT:    kshiftlq $63, %k0, %k0
; CHECK-NEXT:    kshiftrq $32, %k0, %k0
; CHECK-NEXT:    korq %k0, %k2, %k0
; CHECK-NEXT:    movabsq $-4294967297, %rax # imm = 0xFFFFFFFEFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $1, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $31, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-8589934593, %rax # imm = 0xFFFFFFFDFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftlq $63, %k1, %k2
; CHECK-NEXT:    kshiftrq $30, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-17179869185, %rax # imm = 0xFFFFFFFBFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $3, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $29, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-34359738369, %rax # imm = 0xFFFFFFF7FFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $2, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $28, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-68719476737, %rax # imm = 0xFFFFFFEFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $5, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $27, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-137438953473, %rax # imm = 0xFFFFFFDFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $4, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $26, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-274877906945, %rax # imm = 0xFFFFFFBFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $7, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $25, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-549755813889, %rax # imm = 0xFFFFFF7FFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $6, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $24, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-1099511627777, %rax # imm = 0xFFFFFEFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $9, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $23, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-2199023255553, %rax # imm = 0xFFFFFDFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $8, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $22, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-4398046511105, %rax # imm = 0xFFFFFBFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $11, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $21, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-8796093022209, %rax # imm = 0xFFFFF7FFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $10, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $20, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-17592186044417, %rax # imm = 0xFFFFEFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $13, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $19, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-35184372088833, %rax # imm = 0xFFFFDFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $12, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $18, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-70368744177665, %rax # imm = 0xFFFFBFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $15, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $17, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-140737488355329, %rax # imm = 0xFFFF7FFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $14, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $16, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-281474976710657, %rax # imm = 0xFFFEFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $17, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $15, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-562949953421313, %rax # imm = 0xFFFDFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $16, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $14, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-1125899906842625, %rax # imm = 0xFFFBFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $19, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $13, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-2251799813685249, %rax # imm = 0xFFF7FFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $18, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $12, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-4503599627370497, %rax # imm = 0xFFEFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $21, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $11, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-9007199254740993, %rax # imm = 0xFFDFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $20, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $10, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-18014398509481985, %rax # imm = 0xFFBFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $23, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $9, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-36028797018963969, %rax # imm = 0xFF7FFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $22, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $8, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-72057594037927937, %rax # imm = 0xFEFFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $25, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $7, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-144115188075855873, %rax # imm = 0xFDFFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $24, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $6, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-288230376151711745, %rax # imm = 0xFBFFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $27, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $5, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-576460752303423489, %rax # imm = 0xF7FFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $26, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $4, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-1152921504606846977, %rax # imm = 0xEFFFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $29, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $3, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-2305843009213693953, %rax # imm = 0xDFFFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $28, %k1, %k2
; CHECK-NEXT:    kshiftlq $63, %k2, %k2
; CHECK-NEXT:    kshiftrq $2, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    movabsq $-4611686018427387905, %rax # imm = 0xBFFFFFFFFFFFFFFF
; CHECK-NEXT:    kmovq %rax, %k2
; CHECK-NEXT:    kandq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $31, %k1, %k2
; CHECK-NEXT:    kshiftlq $62, %k2, %k2
; CHECK-NEXT:    korq %k2, %k0, %k0
; CHECK-NEXT:    kshiftrd $30, %k1, %k1
; CHECK-NEXT:    kshiftlq $1, %k0, %k0
; CHECK-NEXT:    kshiftrq $1, %k0, %k0
; CHECK-NEXT:    kshiftlq $63, %k1, %k1
; CHECK-NEXT:    korq %k1, %k0, %k1
; CHECK-NEXT:    vmovdqu8 %ymm1, (%rsi) {%k1}
; CHECK-NEXT:    kshiftrq $32, %k1, %k1
; CHECK-NEXT:    vmovdqu8 %ymm0, 32(%rsi) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %a = load <64 x i8>, <64 x i8>* %x
  %b = icmp eq <64 x i8> %a, zeroinitializer
  %shuf = shufflevector <64 x i1> %b, <64 x i1> undef, <64 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 25, i32 24, i32 27, i32 26, i32 29, i32 28, i32 31, i32 30, i32 33, i32 32, i32 35, i32 34, i32 37, i32 36, i32 39, i32 38, i32 41, i32 40, i32 43, i32 42, i32 45, i32 44, i32 47, i32 46, i32 49, i32 48, i32 51, i32 50, i32 53, i32 52, i32 55, i32 54, i32 57, i32 56, i32 59, i32 58, i32 61, i32 60, i32 63, i32 62>
  call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %a, <64 x i8>* %y, i32 1, <64 x i1> %shuf)
  ret void
}
declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>)

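; Make sure a 64-bit mask still works as an inline asm "k" constraint;
; the value round-trips through the mask register with kmovq.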
@mem64_dst = global i64 0, align 8
@mem64_src = global i64 0, align 8
define i32 @v64i1_inline_asm() "min-legal-vector-width"="256" {
; CHECK-LABEL: v64i1_inline_asm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovq {{.*}}(%rip), %k0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %k0, {{.*}}(%rip)
; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = load i64, i64* @mem64_src, align 8
  %3 = call i64 asm "", "=k,k,~{dirflag},~{fpsr},~{flags}"(i64 %2)
  store i64 %3, i64* @mem64_dst, align 8
  %4 = load i32, i32* %1, align 4
  ret i32 %4
}

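; Make sure compare plus extend of <8 x i64> splits into two 256-bit vpcmpgtq;
; the zext variant additionally shifts the sign masks right with vpsrlq $63.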
define void @cmp_v8i64_sext(<8 x i64>* %xptr, <8 x i64>* %yptr, <8 x i64>* %zptr) "min-legal-vector-width"="256" {
; CHECK-LABEL: cmp_v8i64_sext:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm1
; CHECK-NEXT:    vpcmpgtq 32(%rdi), %ymm1, %ymm1
; CHECK-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x i64>, <8 x i64>* %xptr
  %y = load <8 x i64>, <8 x i64>* %yptr
  %cmp = icmp slt <8 x i64> %x, %y
  %ext = sext <8 x i1> %cmp to <8 x i64>
  store <8 x i64> %ext, <8 x i64>* %zptr
  ret void
}

define void @cmp_v8i64_zext(<8 x i64>* %xptr, <8 x i64>* %yptr, <8 x i64>* %zptr) "min-legal-vector-width"="256" {
; CHECK-LABEL: cmp_v8i64_zext:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm1
; CHECK-NEXT:    vpcmpgtq 32(%rdi), %ymm1, %ymm1
; CHECK-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vpsrlq $63, %ymm1, %ymm1
; CHECK-NEXT:    vpsrlq $63, %ymm0, %ymm0
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x i64>, <8 x i64>* %xptr
  %y = load <8 x i64>, <8 x i64>* %yptr
  %cmp = icmp slt <8 x i64> %x, %y
  %ext = zext <8 x i1> %cmp to <8 x i64>
  store <8 x i64> %ext, <8 x i64>* %zptr
  ret void
}

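; Rotate tests. There is no byte rotate instruction, so the rotates below are
; expressed as shl/lshr/or; for v16i8 the inputs can be zero extended to
; v16i16 so the variable shifts use vpsllvw/vpsrlvw.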
define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: var_rotate_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; CHECK-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    vpsllvw %ymm1, %ymm0, %ymm1
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; CHECK-NEXT:    vpsrlvw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vpmovwb %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <16 x i8> %a, %b
  %lshr = lshr <16 x i8> %a, %b8
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: var_rotate_v32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $4, %ymm0, %ymm2
; CHECK-NEXT:    vpsrlw $4, %ymm0, %ymm3
; CHECK-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
; CHECK-NEXT:    vpsllw $5, %ymm1, %ymm1
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsllw $2, %ymm0, %ymm2
; CHECK-NEXT:    vpsrlw $6, %ymm0, %ymm3
; CHECK-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
; CHECK-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlw $7, %ymm0, %ymm2
; CHECK-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
; CHECK-NEXT:    vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3
; CHECK-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <32 x i8> %a, %b
  %lshr = lshr <32 x i8> %a, %b8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: splatvar_rotate_v32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm1, %xmm1
; CHECK-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; CHECK-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; CHECK-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
; CHECK-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vpsllw %xmm2, %xmm4, %xmm2
; CHECK-NEXT:    vpbroadcastb %xmm2, %ymm2
; CHECK-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm5
; CHECK-NEXT:    vpand %ymm2, %ymm3, %ymm2
; CHECK-NEXT:    vpsrlw %xmm1, %xmm4, %xmm0
; CHECK-NEXT:    vpsrlw $8, %xmm0, %xmm0
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    vpternlogq $236, %ymm5, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <32 x i8> %a, %splat
  %lshr = lshr <32 x i8> %a, %splat8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

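; With VBMI the two widened halves below are recombined with a single vpermi2b
; instead of vpsrlw $8 plus vpackuswb.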
define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
; CHECK-AVX512-LABEL: constant_rotate_v32i8:
; CHECK-AVX512:       # %bb.0:
; CHECK-AVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
; CHECK-AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; CHECK-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; CHECK-AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
; CHECK-AVX512-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; CHECK-AVX512-NEXT:    vpsllw $2, %ymm1, %ymm3
; CHECK-AVX512-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; CHECK-AVX512-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; CHECK-AVX512-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; CHECK-AVX512-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
; CHECK-AVX512-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; CHECK-AVX512-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; CHECK-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; CHECK-AVX512-NEXT:    vpsllvw {{.*}}(%rip), %ymm3, %ymm3
; CHECK-AVX512-NEXT:    vpsrlw $8, %ymm3, %ymm3
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; CHECK-AVX512-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; CHECK-AVX512-NEXT:    vpsrlw $8, %ymm0, %ymm0
; CHECK-AVX512-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; CHECK-AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-AVX512-NEXT:    retq
;
; CHECK-VBMI-LABEL: constant_rotate_v32i8:
; CHECK-VBMI:       # %bb.0:
; CHECK-VBMI-NEXT:    vpsllw $4, %ymm0, %ymm1
; CHECK-VBMI-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; CHECK-VBMI-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; CHECK-VBMI-NEXT:    # ymm2 = mem[0,1,0,1]
; CHECK-VBMI-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; CHECK-VBMI-NEXT:    vpsllw $2, %ymm1, %ymm3
; CHECK-VBMI-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; CHECK-VBMI-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; CHECK-VBMI-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; CHECK-VBMI-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
; CHECK-VBMI-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; CHECK-VBMI-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; CHECK-VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; CHECK-VBMI-NEXT:    vpsllvw {{.*}}(%rip), %ymm3, %ymm3
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; CHECK-VBMI-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; CHECK-VBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,33,35,37,39,41,43,45,47,17,19,21,23,25,27,29,31,49,51,53,55,57,59,61,63]
; CHECK-VBMI-NEXT:    vpermi2b %ymm3, %ymm0, %ymm2
; CHECK-VBMI-NEXT:    vpor %ymm2, %ymm1, %ymm0
; CHECK-VBMI-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

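; Splat-constant rotates should fold the final or of the two shifts into a
; single vpternlogq.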
define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: splatconstant_rotate_v32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $4, %ymm0, %ymm1
; CHECK-NEXT:    vpsrlw $4, %ymm0, %ymm0
; CHECK-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: splatconstant_rotate_mask_v32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $4, %ymm0, %ymm1
; CHECK-NEXT:    vpsrlw $4, %ymm0, %ymm0
; CHECK-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; CHECK-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}
