• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s
3
4; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed to be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
5
; <16 x i32> add with "required-vector-width"="256": the legalizer must split the
; 512-bit operation into two 256-bit (ymm) vpaddd ops instead of using zmm.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py -- do not hand-edit.
6define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="256" {
7; CHECK-LABEL: add256:
8; CHECK:       # %bb.0:
9; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
10; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
11; CHECK-NEXT:    vpaddd (%rsi), %ymm0, %ymm0
12; CHECK-NEXT:    vpaddd 32(%rsi), %ymm1, %ymm1
13; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
14; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
15; CHECK-NEXT:    vzeroupper
16; CHECK-NEXT:    retq
17  %d = load <16 x i32>, <16 x i32>* %a
18  %e = load <16 x i32>, <16 x i32>* %b
19  %f = add <16 x i32> %d, %e
20  store <16 x i32> %f, <16 x i32>* %c
21  ret void
22}
23
; Same <16 x i32> add as above, but "required-vector-width"="512" allows a single
; 512-bit (zmm) vpaddd despite the prefer-256-bit subtarget attribute.
24define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="512" {
25; CHECK-LABEL: add512:
26; CHECK:       # %bb.0:
27; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
28; CHECK-NEXT:    vpaddd (%rsi), %zmm0, %zmm0
29; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
30; CHECK-NEXT:    vzeroupper
31; CHECK-NEXT:    retq
32  %d = load <16 x i32>, <16 x i32>* %a
33  %e = load <16 x i32>, <16 x i32>* %b
34  %f = add <16 x i32> %d, %e
35  store <16 x i32> %f, <16 x i32>* %c
36  ret void
37}
38
; v64i8 rounding-average idiom (zext, +1, add, lshr 1, trunc): when limited to
; 256-bit vectors it must be recognized as two 256-bit vpavgb ops, not one zmm op.
39define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" {
40; CHECK-LABEL: avg_v64i8_256:
41; CHECK:       # %bb.0:
42; CHECK-NEXT:    vmovdqa (%rsi), %ymm0
43; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm1
44; CHECK-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
45; CHECK-NEXT:    vpavgb 32(%rdi), %ymm1, %ymm1
46; CHECK-NEXT:    vmovdqu %ymm1, (%rax)
47; CHECK-NEXT:    vmovdqu %ymm0, (%rax)
48; CHECK-NEXT:    vzeroupper
49; CHECK-NEXT:    retq
50  %1 = load <64 x i8>, <64 x i8>* %a
51  %2 = load <64 x i8>, <64 x i8>* %b
52  %3 = zext <64 x i8> %1 to <64 x i32>
53  %4 = zext <64 x i8> %2 to <64 x i32>
54  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
55  %6 = add nuw nsw <64 x i32> %5, %4
56  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
57  %8 = trunc <64 x i32> %7 to <64 x i8>
58  store <64 x i8> %8, <64 x i8>* undef, align 4
59  ret void
60}
61
62
; Same v64i8 average idiom, but with "required-vector-width"="512" a single
; 512-bit vpavgb is allowed.
63define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="512" {
64; CHECK-LABEL: avg_v64i8_512:
65; CHECK:       # %bb.0:
66; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm0
67; CHECK-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
68; CHECK-NEXT:    vmovdqu64 %zmm0, (%rax)
69; CHECK-NEXT:    vzeroupper
70; CHECK-NEXT:    retq
71  %1 = load <64 x i8>, <64 x i8>* %a
72  %2 = load <64 x i8>, <64 x i8>* %b
73  %3 = zext <64 x i8> %1 to <64 x i32>
74  %4 = zext <64 x i8> %2 to <64 x i32>
75  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
76  %6 = add nuw nsw <64 x i32> %5, %4
77  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
78  %8 = trunc <64 x i32> %7 to <64 x i8>
79  store <64 x i8> %8, <64 x i8>* undef, align 4
80  ret void
81}
82
; PMADDWD idiom (sext to i32, mul, add of odd/even shuffled halves): limited to
; 256-bit vectors, this must select two ymm vpmaddwd ops rather than one zmm op.
83define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="256" {
84; CHECK-LABEL: pmaddwd_32_256:
85; CHECK:       # %bb.0:
86; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
87; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
88; CHECK-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
89; CHECK-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
90; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
91; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
92; CHECK-NEXT:    vzeroupper
93; CHECK-NEXT:    retq
94   %A = load <32 x i16>, <32 x i16>* %APtr
95   %B = load <32 x i16>, <32 x i16>* %BPtr
96   %a = sext <32 x i16> %A to <32 x i32>
97   %b = sext <32 x i16> %B to <32 x i32>
98   %m = mul nsw <32 x i32> %a, %b
99   %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
100   %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
101   %ret = add <16 x i32> %odd, %even
102   store <16 x i32> %ret, <16 x i32>* %CPtr
103   ret void
104}
105
; Same PMADDWD idiom with "required-vector-width"="512": one zmm vpmaddwd is allowed.
106define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="512" {
107; CHECK-LABEL: pmaddwd_32_512:
108; CHECK:       # %bb.0:
109; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
110; CHECK-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
111; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
112; CHECK-NEXT:    vzeroupper
113; CHECK-NEXT:    retq
114   %A = load <32 x i16>, <32 x i16>* %APtr
115   %B = load <32 x i16>, <32 x i16>* %BPtr
116   %a = sext <32 x i16> %A to <32 x i32>
117   %b = sext <32 x i16> %B to <32 x i32>
118   %m = mul nsw <32 x i32> %a, %b
119   %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
120   %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
121   %ret = add <16 x i32> %odd, %even
122   store <16 x i32> %ret, <16 x i32>* %CPtr
123   ret void
124}
125
; Unsigned saturating-subtract idiom (umax select then sub): limited to 256-bit
; vectors it must form two ymm vpsubusb ops.
126define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="256" {
127; CHECK-LABEL: psubus_64i8_max_256:
128; CHECK:       # %bb.0:
129; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
130; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
131; CHECK-NEXT:    vpsubusb (%rsi), %ymm0, %ymm0
132; CHECK-NEXT:    vpsubusb 32(%rsi), %ymm1, %ymm1
133; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
134; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
135; CHECK-NEXT:    vzeroupper
136; CHECK-NEXT:    retq
137  %x = load <64 x i8>, <64 x i8>* %xptr
138  %y = load <64 x i8>, <64 x i8>* %yptr
139  %cmp = icmp ult <64 x i8> %x, %y
140  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
141  %res = sub <64 x i8> %max, %y
142  store <64 x i8> %res, <64 x i8>* %zptr
143  ret void
144}
145
; Same saturating-subtract idiom with "required-vector-width"="512": one zmm vpsubusb.
146define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="512" {
147; CHECK-LABEL: psubus_64i8_max_512:
148; CHECK:       # %bb.0:
149; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
150; CHECK-NEXT:    vpsubusb (%rsi), %zmm0, %zmm0
151; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
152; CHECK-NEXT:    vzeroupper
153; CHECK-NEXT:    retq
154  %x = load <64 x i8>, <64 x i8>* %xptr
155  %y = load <64 x i8>, <64 x i8>* %yptr
156  %cmp = icmp ult <64 x i8> %x, %y
157  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
158  %res = sub <64 x i8> %max, %y
159  store <64 x i8> %res, <64 x i8>* %zptr
160  ret void
161}
162
; Vectorized i8 dot-product loop plus horizontal reduction: with only 256-bit
; vectors available, the loop body must use ymm vpmaddwd/vpaddd (two accumulators)
; and the middle-block reduction must stay 256-bit.
163define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="256" {
164; CHECK-LABEL: _Z9test_charPcS_i_256:
165; CHECK:       # %bb.0: # %entry
166; CHECK-NEXT:    movl %edx, %eax
167; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
168; CHECK-NEXT:    xorl %ecx, %ecx
169; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
170; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
171; CHECK-NEXT:    .p2align 4, 0x90
172; CHECK-NEXT:  .LBB8_1: # %vector.body
173; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
174; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm3
175; CHECK-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm4
176; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm5
177; CHECK-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
178; CHECK-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
179; CHECK-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm3
180; CHECK-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
181; CHECK-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
182; CHECK-NEXT:    addq $32, %rcx
183; CHECK-NEXT:    cmpq %rcx, %rax
184; CHECK-NEXT:    jne .LBB8_1
185; CHECK-NEXT:  # %bb.2: # %middle.block
186; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
187; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
188; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
189; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
190; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
191; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
192; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
193; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
194; CHECK-NEXT:    vmovd %xmm0, %eax
195; CHECK-NEXT:    vzeroupper
196; CHECK-NEXT:    retq
197entry:
198  %3 = zext i32 %2 to i64
199  br label %vector.body
200
201vector.body:
202  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
203  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
204  %4 = getelementptr inbounds i8, i8* %0, i64 %index
205  %5 = bitcast i8* %4 to <32 x i8>*
206  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
207  %6 = sext <32 x i8> %wide.load to <32 x i32>
208  %7 = getelementptr inbounds i8, i8* %1, i64 %index
209  %8 = bitcast i8* %7 to <32 x i8>*
210  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
211  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
212  %10 = mul nsw <32 x i32> %9, %6
213  %11 = add nsw <32 x i32> %10, %vec.phi
214  %index.next = add i64 %index, 32
215  %12 = icmp eq i64 %index.next, %3
216  br i1 %12, label %middle.block, label %vector.body
217
218middle.block:
219  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
220  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
221  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
222  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
223  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
224  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
225  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
226  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
227  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
228  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
229  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
230  ret i32 %13
231}
232
; Same dot-product loop with "required-vector-width"="512": a single zmm
; vpmaddwd/vpaddd accumulator is allowed and the reduction uses zmm extracts.
233define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="512" {
234; CHECK-LABEL: _Z9test_charPcS_i_512:
235; CHECK:       # %bb.0: # %entry
236; CHECK-NEXT:    movl %edx, %eax
237; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
238; CHECK-NEXT:    xorl %ecx, %ecx
239; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
240; CHECK-NEXT:    .p2align 4, 0x90
241; CHECK-NEXT:  .LBB9_1: # %vector.body
242; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
243; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %zmm2
244; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %zmm3
245; CHECK-NEXT:    vpmaddwd %zmm2, %zmm3, %zmm2
246; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
247; CHECK-NEXT:    addq $32, %rcx
248; CHECK-NEXT:    cmpq %rcx, %rax
249; CHECK-NEXT:    jne .LBB9_1
250; CHECK-NEXT:  # %bb.2: # %middle.block
251; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
252; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
253; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
254; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
255; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
256; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
257; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
258; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
259; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
260; CHECK-NEXT:    vmovd %xmm0, %eax
261; CHECK-NEXT:    vzeroupper
262; CHECK-NEXT:    retq
263entry:
264  %3 = zext i32 %2 to i64
265  br label %vector.body
266
267vector.body:
268  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
269  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
270  %4 = getelementptr inbounds i8, i8* %0, i64 %index
271  %5 = bitcast i8* %4 to <32 x i8>*
272  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
273  %6 = sext <32 x i8> %wide.load to <32 x i32>
274  %7 = getelementptr inbounds i8, i8* %1, i64 %index
275  %8 = bitcast i8* %7 to <32 x i8>*
276  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
277  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
278  %10 = mul nsw <32 x i32> %9, %6
279  %11 = add nsw <32 x i32> %10, %vec.phi
280  %index.next = add i64 %index, 32
281  %12 = icmp eq i64 %index.next, %3
282  br i1 %12, label %middle.block, label %vector.body
283
284middle.block:
285  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
286  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
287  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
288  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
289  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
290  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
291  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
292  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
293  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
294  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
295  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
296  ret i32 %13
297}
298
; 1 KiB zero-initialized byte arrays read by the sad_16i8_* loop tests.
299@a = global [1024 x i8] zeroinitializer, align 16
300@b = global [1024 x i8] zeroinitializer, align 16
301
; Sum-of-absolute-differences idiom (sub, icmp sgt -1, neg, select, add): must be
; matched to vpsadbw with a 256-bit (ymm) accumulator/reduction under
; "required-vector-width"="256".
302define i32 @sad_16i8_256() "required-vector-width"="256" {
303; CHECK-LABEL: sad_16i8_256:
304; CHECK:       # %bb.0: # %entry
305; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
306; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
307; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
308; CHECK-NEXT:    .p2align 4, 0x90
309; CHECK-NEXT:  .LBB10_1: # %vector.body
310; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
311; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm2
312; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
313; CHECK-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
314; CHECK-NEXT:    addq $4, %rax
315; CHECK-NEXT:    jne .LBB10_1
316; CHECK-NEXT:  # %bb.2: # %middle.block
317; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
318; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
319; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
320; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
321; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
322; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
323; CHECK-NEXT:    vmovd %xmm0, %eax
324; CHECK-NEXT:    vzeroupper
325; CHECK-NEXT:    retq
326entry:
327  br label %vector.body
328
329vector.body:
330  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
331  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
332  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
333  %1 = bitcast i8* %0 to <16 x i8>*
334  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
335  %2 = zext <16 x i8> %wide.load to <16 x i32>
336  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
337  %4 = bitcast i8* %3 to <16 x i8>*
338  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
339  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
340  %6 = sub nsw <16 x i32> %2, %5
341  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
342  %8 = sub nsw <16 x i32> zeroinitializer, %6
343  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
344  %10 = add nsw <16 x i32> %9, %vec.phi
345  %index.next = add i64 %index, 4
346  %11 = icmp eq i64 %index.next, 1024
347  br i1 %11, label %middle.block, label %vector.body
348
349middle.block:
350  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
351  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
352  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
353  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
354  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
355  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
356  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
357  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
358  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
359  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
360  ret i32 %12
361}
362
; Same SAD idiom with "required-vector-width"="512": the accumulator and the
; horizontal reduction are allowed to use zmm registers.
363define i32 @sad_16i8_512() "required-vector-width"="512" {
364; CHECK-LABEL: sad_16i8_512:
365; CHECK:       # %bb.0: # %entry
366; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
367; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
368; CHECK-NEXT:    .p2align 4, 0x90
369; CHECK-NEXT:  .LBB11_1: # %vector.body
370; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
371; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm1
372; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
373; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
374; CHECK-NEXT:    addq $4, %rax
375; CHECK-NEXT:    jne .LBB11_1
376; CHECK-NEXT:  # %bb.2: # %middle.block
377; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
378; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
379; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
380; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
381; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
382; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
383; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
384; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
385; CHECK-NEXT:    vmovd %xmm0, %eax
386; CHECK-NEXT:    vzeroupper
387; CHECK-NEXT:    retq
388entry:
389  br label %vector.body
390
391vector.body:
392  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
393  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
394  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
395  %1 = bitcast i8* %0 to <16 x i8>*
396  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
397  %2 = zext <16 x i8> %wide.load to <16 x i32>
398  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
399  %4 = bitcast i8* %3 to <16 x i8>*
400  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
401  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
402  %6 = sub nsw <16 x i32> %2, %5
403  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
404  %8 = sub nsw <16 x i32> zeroinitializer, %6
405  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
406  %10 = add nsw <16 x i32> %9, %vec.phi
407  %index.next = add i64 %index, 4
408  %11 = icmp eq i64 %index.next, 1024
409  br i1 %11, label %middle.block, label %vector.body
410
411middle.block:
412  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
413  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
414  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
415  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
416  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
417  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
418  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
419  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
420  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
421  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
422  ret i32 %12
423}
424
; sitofp <16 x i1> -> <16 x float>: limited to 256-bit vectors, the mask is split
; with kshiftrw and converted as two ymm vcvtdq2ps ops.
425define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
426; CHECK-LABEL: sbto16f32_256:
427; CHECK:       # %bb.0:
428; CHECK-NEXT:    vpmovw2m %ymm0, %k0
429; CHECK-NEXT:    kshiftrw $8, %k0, %k1
430; CHECK-NEXT:    vpmovm2d %k1, %ymm0
431; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
432; CHECK-NEXT:    vpmovm2d %k0, %ymm1
433; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
434; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
435; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
436; CHECK-NEXT:    vzeroupper
437; CHECK-NEXT:    retq
438  %mask = icmp slt <16 x i16> %a, zeroinitializer
439  %1 = sitofp <16 x i1> %mask to <16 x float>
440  store <16 x float> %1, <16 x float>* %res
441  ret void
442}
443
; Same sitofp <16 x i1> -> <16 x float>, but "512" permits one zmm vcvtdq2ps.
444define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
445; CHECK-LABEL: sbto16f32_512:
446; CHECK:       # %bb.0:
447; CHECK-NEXT:    vpmovw2m %ymm0, %k0
448; CHECK-NEXT:    vpmovm2d %k0, %zmm0
449; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
450; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
451; CHECK-NEXT:    vzeroupper
452; CHECK-NEXT:    retq
453  %mask = icmp slt <16 x i16> %a, zeroinitializer
454  %1 = sitofp <16 x i1> %mask to <16 x float>
455  store <16 x float> %1, <16 x float>* %res
456  ret void
457}
458
; sitofp <16 x i1> -> <16 x double>: limited to 256-bit vectors, the result is
; produced as four ymm halves via xmm-source vcvtdq2pd.
459define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res)  "required-vector-width"="256" {
460; CHECK-LABEL: sbto16f64_256:
461; CHECK:       # %bb.0:
462; CHECK-NEXT:    vpmovw2m %ymm0, %k0
463; CHECK-NEXT:    kshiftrw $8, %k0, %k1
464; CHECK-NEXT:    vpmovm2d %k1, %ymm0
465; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
466; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
467; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
468; CHECK-NEXT:    vpmovm2d %k0, %ymm2
469; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
470; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
471; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
472; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
473; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
474; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
475; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
476; CHECK-NEXT:    vzeroupper
477; CHECK-NEXT:    retq
478  %mask = icmp slt <16 x i16> %a, zeroinitializer
479  %1 = sitofp <16 x i1> %mask to <16 x double>
480  store <16 x double> %1, <16 x double>* %res
481  ret void
482}
483
; Same sitofp <16 x i1> -> <16 x double>, but "512" allows two zmm vcvtdq2pd halves.
484define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res)  "required-vector-width"="512" {
485; CHECK-LABEL: sbto16f64_512:
486; CHECK:       # %bb.0:
487; CHECK-NEXT:    vpmovw2m %ymm0, %k0
488; CHECK-NEXT:    vpmovm2d %k0, %zmm0
489; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
490; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
491; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
492; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
493; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
494; CHECK-NEXT:    vzeroupper
495; CHECK-NEXT:    retq
496  %mask = icmp slt <16 x i16> %a, zeroinitializer
497  %1 = sitofp <16 x i1> %mask to <16 x double>
498  store <16 x double> %1, <16 x double>* %res
499  ret void
500}
501
; uitofp <16 x i1> -> <16 x float>: like sbto16f32_256 but the i1 values are
; zero-extended (vpsrld $31) before conversion; two ymm halves when limited to 256.
502define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
503; CHECK-LABEL: ubto16f32_256:
504; CHECK:       # %bb.0:
505; CHECK-NEXT:    vpmovw2m %ymm0, %k0
506; CHECK-NEXT:    kshiftrw $8, %k0, %k1
507; CHECK-NEXT:    vpmovm2d %k1, %ymm0
508; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
509; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
510; CHECK-NEXT:    vpmovm2d %k0, %ymm1
511; CHECK-NEXT:    vpsrld $31, %ymm1, %ymm1
512; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
513; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
514; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
515; CHECK-NEXT:    vzeroupper
516; CHECK-NEXT:    retq
517  %mask = icmp slt <16 x i16> %a, zeroinitializer
518  %1 = uitofp <16 x i1> %mask to <16 x float>
519  store <16 x float> %1, <16 x float>* %res
520  ret void
521}
522
; Same uitofp <16 x i1> -> <16 x float>, but "512" permits a single zmm sequence.
523define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
524; CHECK-LABEL: ubto16f32_512:
525; CHECK:       # %bb.0:
526; CHECK-NEXT:    vpmovw2m %ymm0, %k0
527; CHECK-NEXT:    vpmovm2d %k0, %zmm0
528; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
529; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
530; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
531; CHECK-NEXT:    vzeroupper
532; CHECK-NEXT:    retq
533  %mask = icmp slt <16 x i16> %a, zeroinitializer
534  %1 = uitofp <16 x i1> %mask to <16 x float>
535  store <16 x float> %1, <16 x float>* %res
536  ret void
537}
538
; uitofp <16 x i1> -> <16 x double>: zero-extend via vpsrld $31, then four ymm
; vcvtdq2pd halves when limited to 256-bit vectors.
539define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" {
540; CHECK-LABEL: ubto16f64_256:
541; CHECK:       # %bb.0:
542; CHECK-NEXT:    vpmovw2m %ymm0, %k0
543; CHECK-NEXT:    kshiftrw $8, %k0, %k1
544; CHECK-NEXT:    vpmovm2d %k1, %ymm0
545; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
546; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
547; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
548; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
549; CHECK-NEXT:    vpmovm2d %k0, %ymm2
550; CHECK-NEXT:    vpsrld $31, %ymm2, %ymm2
551; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
552; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
553; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
554; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
555; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
556; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
557; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
558; CHECK-NEXT:    vzeroupper
559; CHECK-NEXT:    retq
560  %mask = icmp slt <16 x i16> %a, zeroinitializer
561  %1 = uitofp <16 x i1> %mask to <16 x double>
562  store <16 x double> %1, <16 x double>* %res
563  ret void
564}
565
; Same uitofp <16 x i1> -> <16 x double>, but "512" allows two zmm vcvtdq2pd halves.
566define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" {
567; CHECK-LABEL: ubto16f64_512:
568; CHECK:       # %bb.0:
569; CHECK-NEXT:    vpmovw2m %ymm0, %k0
570; CHECK-NEXT:    vpmovm2d %k0, %zmm0
571; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
572; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
573; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
574; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
575; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
576; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
577; CHECK-NEXT:    vzeroupper
578; CHECK-NEXT:    retq
579  %mask = icmp slt <16 x i16> %a, zeroinitializer
580  %1 = uitofp <16 x i1> %mask to <16 x double>
581  store <16 x double> %1, <16 x double>* %res
582  ret void
583}
584
; fptoui <16 x float> -> <16 x i1> used as a select mask: limited to 256-bit
; vectors, the conversion is done as two ymm vcvttps2dq halves re-packed into a
; ymm word vector before building the k-mask.
585define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
586; CHECK-LABEL: test_16f32toub_256:
587; CHECK:       # %bb.0:
588; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
589; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
590; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm2
591; CHECK-NEXT:    vpmovdw %ymm2, %xmm2
592; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
593; CHECK-NEXT:    vpsllw $15, %ymm1, %ymm1
594; CHECK-NEXT:    vpmovw2m %ymm1, %k1
595; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
596; CHECK-NEXT:    retq
597  %a = load <16 x float>, <16 x float>* %ptr
598  %mask = fptoui <16 x float> %a to <16 x i1>
599  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
600  ret <16 x i16> %select
601}
602
; Same fptoui mask test, but "512" allows one zmm vcvttps2dq plus vpmovd2m.
603define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
604; CHECK-LABEL: test_16f32toub_512:
605; CHECK:       # %bb.0:
606; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
607; CHECK-NEXT:    vpslld $31, %zmm1, %zmm1
608; CHECK-NEXT:    vpmovd2m %zmm1, %k1
609; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
610; CHECK-NEXT:    retq
611  %a = load <16 x float>, <16 x float>* %ptr
612  %mask = fptoui <16 x float> %a to <16 x i1>
613  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
614  ret <16 x i16> %select
615}
616
; fptosi <16 x float> -> <16 x i1> used as a select mask: two ymm vcvttps2dq
; halves when limited to 256-bit vectors.
617define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
618; CHECK-LABEL: test_16f32tosb_256:
619; CHECK:       # %bb.0:
620; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
621; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
622; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm2
623; CHECK-NEXT:    vpmovdw %ymm2, %xmm2
624; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
625; CHECK-NEXT:    vpsllw $15, %ymm1, %ymm1
626; CHECK-NEXT:    vpmovw2m %ymm1, %k1
627; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
628; CHECK-NEXT:    retq
629  %a = load <16 x float>, <16 x float>* %ptr
630  %mask = fptosi <16 x float> %a to <16 x i1>
631  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
632  ret <16 x i16> %select
633}
634
; Same fptosi mask test, but "512" allows one zmm vcvttps2dq plus vpmovd2m.
635define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
636; CHECK-LABEL: test_16f32tosb_512:
637; CHECK:       # %bb.0:
638; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
639; CHECK-NEXT:    vpmovd2m %zmm1, %k1
640; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
641; CHECK-NEXT:    retq
642  %a = load <16 x float>, <16 x float>* %ptr
643  %mask = fptosi <16 x float> %a to <16 x i1>
644  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
645  ret <16 x i16> %select
646}
647