; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL

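; A note on the immediates checked below (imm8 of (v)round*/vrndscale*, per
; the Intel SDM): bits 1:0 select the rounding mode (01 = toward -inf,
; 10 = toward +inf, 11 = toward zero), bit 2 selects the MXCSR rounding mode
; instead, and bit 3 suppresses the precision (inexact) exception. Hence
; $9 = floor, $10 = ceil, $11 = trunc, $4 = rint, $12 = nearbyint.
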
define <2 x double> @floor_v2f64(<2 x double> %p) {
; SSE41-LABEL: floor_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)

define <4 x float> @floor_v4f32(<4 x float> %p) {
; SSE41-LABEL: floor_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)

define <4 x double> @floor_v4f64(<4 x double> %p) {
; SSE41-LABEL: floor_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)

define <8 x float> @floor_v8f32(<8 x float> %p) {
; SSE41-LABEL: floor_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)

define <8 x double> @floor_v8f64(<8 x double> %p) {
; SSE41-LABEL: floor_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $9, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $9, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.floor.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.floor.v8f64(<8 x double> %p)

define <16 x float> @floor_v16f32(<16 x float> %p) {
; SSE41-LABEL: floor_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    roundps $9, %xmm2, %xmm2
; SSE41-NEXT:    roundps $9, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vroundps $9, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $9, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.floor.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.floor.v16f32(<16 x float> %p)

define <2 x double> @ceil_v2f64(<2 x double> %p) {
; SSE41-LABEL: ceil_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)

define <2 x double> @ceil_v2f64_load(<2 x double>* %ptr) {
; SSE41-LABEL: ceil_v2f64_load:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movupd (%rdi), %xmm0
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v2f64_load:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, (%rdi), %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v2f64_load:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $10, (%rdi), %xmm0
; AVX512-NEXT:    retq
  %p = load <2 x double>, <2 x double>* %ptr, align 1
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
  ret <2 x double> %t
}

define <4 x float> @ceil_v4f32(<4 x float> %p) {
; SSE41-LABEL: ceil_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)

define <4 x float> @ceil_v4f32_load(<4 x float>* %ptr) {
; SSE41-LABEL: ceil_v4f32_load:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v4f32_load:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, (%rdi), %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v4f32_load:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $10, (%rdi), %xmm0
; AVX512-NEXT:    retq
  %p = load <4 x float>, <4 x float>* %ptr, align 1
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
  ret <4 x float> %t
}

define <4 x double> @ceil_v4f64(<4 x double> %p) {
; SSE41-LABEL: ceil_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)

define <8 x float> @ceil_v8f32(<8 x float> %p) {
; SSE41-LABEL: ceil_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)

define <8 x double> @ceil_v8f64(<8 x double> %p) {
; SSE41-LABEL: ceil_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $10, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $10, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $10, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p)

define <16 x float> @ceil_v16f32(<16 x float> %p) {
; SSE41-LABEL: ceil_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    roundps $10, %xmm2, %xmm2
; SSE41-NEXT:    roundps $10, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vroundps $10, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $10, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p)

define <2 x double> @trunc_v2f64(<2 x double> %p) {
; SSE41-LABEL: trunc_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $11, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)

define <4 x float> @trunc_v4f32(<4 x float> %p) {
; SSE41-LABEL: trunc_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $11, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)

define <4 x double> @trunc_v4f64(<4 x double> %p) {
; SSE41-LABEL: trunc_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $11, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $11, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)

define <8 x float> @trunc_v8f32(<8 x float> %p) {
; SSE41-LABEL: trunc_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $11, %xmm0, %xmm0
; SSE41-NEXT:    roundps $11, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)

define <8 x double> @trunc_v8f64(<8 x double> %p) {
; SSE41-LABEL: trunc_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $11, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $11, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $11, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $11, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $11, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.trunc.v8f64(<8 x double> %p)

define <16 x float> @trunc_v16f32(<16 x float> %p) {
; SSE41-LABEL: trunc_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $11, %xmm0, %xmm0
; SSE41-NEXT:    roundps $11, %xmm1, %xmm1
; SSE41-NEXT:    roundps $11, %xmm2, %xmm2
; SSE41-NEXT:    roundps $11, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX-NEXT:    vroundps $11, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.trunc.v16f32(<16 x float> %p)

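; llvm.rint lowers to imm $4 (round using the current MXCSR mode; the inexact
; exception may be raised), in contrast to llvm.nearbyint further down, which
; uses $12 to additionally suppress the precision exception.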
define <2 x double> @rint_v2f64(<2 x double> %p) {
; SSE41-LABEL: rint_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $4, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $4, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.rint.v2f64(<2 x double> %p)

define <4 x float> @rint_v4f32(<4 x float> %p) {
; SSE41-LABEL: rint_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $4, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $4, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $4, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.rint.v4f32(<4 x float> %p)

define <4 x double> @rint_v4f64(<4 x double> %p) {
; SSE41-LABEL: rint_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $4, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $4, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $4, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.rint.v4f64(<4 x double> %p)

define <8 x float> @rint_v8f32(<8 x float> %p) {
; SSE41-LABEL: rint_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $4, %xmm0, %xmm0
; SSE41-NEXT:    roundps $4, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $4, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $4, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.rint.v8f32(<8 x float> %p)

define <8 x double> @rint_v8f64(<8 x double> %p) {
; SSE41-LABEL: rint_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $4, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $4, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $4, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $4, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $4, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $4, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.rint.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.rint.v8f64(<8 x double> %p)

define <16 x float> @rint_v16f32(<16 x float> %p) {
; SSE41-LABEL: rint_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $4, %xmm0, %xmm0
; SSE41-NEXT:    roundps $4, %xmm1, %xmm1
; SSE41-NEXT:    roundps $4, %xmm2, %xmm2
; SSE41-NEXT:    roundps $4, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: rint_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $4, %ymm0, %ymm0
; AVX-NEXT:    vroundps $4, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: rint_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $4, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.rint.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.rint.v16f32(<16 x float> %p)

define <2 x double> @nearbyint_v2f64(<2 x double> %p) {
; SSE41-LABEL: nearbyint_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $12, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $12, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $12, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)

define <4 x float> @nearbyint_v4f32(<4 x float> %p) {
; SSE41-LABEL: nearbyint_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $12, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $12, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $12, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)

define <4 x double> @nearbyint_v4f64(<4 x double> %p) {
; SSE41-LABEL: nearbyint_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $12, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $12, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $12, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v4f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundpd $12, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)

define <8 x float> @nearbyint_v8f32(<8 x float> %p) {
; SSE41-LABEL: nearbyint_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $12, %xmm0, %xmm0
; SSE41-NEXT:    roundps $12, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $12, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v8f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundps $12, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)

define <8 x double> @nearbyint_v8f64(<8 x double> %p) {
; SSE41-LABEL: nearbyint_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $12, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $12, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $12, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $12, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $12, %ymm0, %ymm0
; AVX-NEXT:    vroundpd $12, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $12, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)

define <16 x float> @nearbyint_v16f32(<16 x float> %p) {
; SSE41-LABEL: nearbyint_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $12, %xmm0, %xmm0
; SSE41-NEXT:    roundps $12, %xmm1, %xmm1
; SSE41-NEXT:    roundps $12, %xmm2, %xmm2
; SSE41-NEXT:    roundps $12, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: nearbyint_v16f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $12, %ymm0, %ymm0
; AVX-NEXT:    vroundps $12, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: nearbyint_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $12, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %t = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)

;
; Constant Folding
;

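; Rounding of constant operands is evaluated at compile time: no round
; instruction is emitted, just a load of the folded vector constant.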
define <2 x double> @const_floor_v2f64() {
; SSE41-LABEL: const_floor_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_floor_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_floor_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_floor_v4f32() {
; SSE41-LABEL: const_floor_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_floor_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_floor_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

define <2 x double> @const_ceil_v2f64() {
; SSE41-LABEL: const_ceil_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_ceil_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_ceil_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_ceil_v4f32() {
; SSE41-LABEL: const_ceil_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_ceil_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_ceil_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

define <2 x double> @const_trunc_v2f64() {
; SSE41-LABEL: const_trunc_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_trunc_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_trunc_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
  ret <2 x double> %t
}

define <4 x float> @const_trunc_v4f32() {
; SSE41-LABEL: const_trunc_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: const_trunc_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: const_trunc_v4f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
; AVX512-NEXT:    retq
  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
  ret <4 x float> %t
}

;
; Scalar and masked instructions
;

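; For the masked and zero-masked forms below, AVX512VL folds the compare and
; select into a single masked vrndscale; SSE41/AVX emit an explicit
; compare + round + blend (or and) sequence, and plain AVX512F must first
; widen the 128/256-bit operands to 512 bits to use the mask registers.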
define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $9, %xmm0, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT:    retq
  %s = extractelement <4 x float> %x, i32 0
  %call = call float @llvm.floor.f32(float %s)
  %res = insertelement <4 x float> %y, float %call, i32 0
  ret <4 x float> %res
}
declare float @llvm.floor.f32(float %s)

define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT:    retq
  %s = extractelement <2 x double> %x, i32 0
  %call = call double @llvm.floor.f64(double %s)
  %res = insertelement <2 x double> %y, double %call, i32 0
  ret <2 x double> %res
}
declare double @llvm.floor.f64(double %s)

define <4 x float> @floor_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm0, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX-NEXT:    vandps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %xmm0, %xmm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <2 x double> @floor_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX-NEXT:    vandpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %xmm0, %xmm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
  ret <2 x double> %res
}

define <8 x float> @floor_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqps %xmm3, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovaps %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
  ret <8 x float> %res
}

define <8 x float> @floor_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm1, %xmm3
; SSE41-NEXT:    cmpeqps %xmm0, %xmm2
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    andps %xmm3, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $9, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <4 x double> @floor_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqpd %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    movapd %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_mask_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_mask_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovapd %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y
  ret <4 x double> %res
}

define <4 x double> @floor_maskz_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm3
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    andpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vandpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: floor_maskz_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: floor_maskz_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $9, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> zeroinitializer
  ret <4 x double> %res
}

define <16 x float> @floor_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: floor_mask_512_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $9, %xmm3, %xmm8
; SSE41-NEXT:    cmpeqps %xmm7, %xmm3
; SSE41-NEXT:    roundps $9, %xmm2, %xmm9
; SSE41-NEXT:    cmpeqps %xmm6, %xmm2
; SSE41-NEXT:    roundps $9, %xmm1, %xmm10
; SSE41-NEXT:    cmpeqps %xmm5, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm11
; SSE41-NEXT:    cmpeqps %xmm4, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm4
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm10, %xmm5
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm9, %xmm6
; SSE41-NEXT:    movaps %xmm3, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm8, %xmm7
; SSE41-NEXT:    movaps %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm5, %xmm1
; SSE41-NEXT:    movaps %xmm6, %xmm2
; SSE41-NEXT:    movaps %xmm7, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_512_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm3, %ymm1, %ymm4
; AVX-NEXT:    vcmpeqps %ymm2, %ymm0, %ymm5
; AVX-NEXT:    vroundps $9, %ymm1, %ymm1
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vblendvps %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_512_ps:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscaleps $9, %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, %zmm0
; AVX512-NEXT:    retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
  ret <16 x float> %res
}

define <16 x float> @floor_maskz_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_512_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm3, %xmm7
; SSE41-NEXT:    cmpeqps %xmm2, %xmm6
; SSE41-NEXT:    cmpeqps %xmm1, %xmm5
; SSE41-NEXT:    cmpeqps %xmm0, %xmm4
; SSE41-NEXT:    roundps $9, %xmm3, %xmm3
; SSE41-NEXT:    andps %xmm7, %xmm3
; SSE41-NEXT:    roundps $9, %xmm2, %xmm2
; SSE41-NEXT:    andps %xmm6, %xmm2
; SSE41-NEXT:    roundps $9, %xmm1, %xmm1
; SSE41-NEXT:    andps %xmm5, %xmm1
; SSE41-NEXT:    roundps $9, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_512_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm3, %ymm1, %ymm3
; AVX-NEXT:    vcmpeqps %ymm2, %ymm0, %ymm2
; AVX-NEXT:    vroundps $9, %ymm1, %ymm1
; AVX-NEXT:    vandps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    vroundps $9, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_512_ps:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscaleps $9, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <8 x double> @floor_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: floor_mask_512_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $9, %xmm3, %xmm8
; SSE41-NEXT:    cmpeqpd %xmm7, %xmm3
; SSE41-NEXT:    roundpd $9, %xmm2, %xmm9
; SSE41-NEXT:    cmpeqpd %xmm6, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm10
; SSE41-NEXT:    cmpeqpd %xmm5, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm11
; SSE41-NEXT:    cmpeqpd %xmm4, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm4
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm5
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm6
; SSE41-NEXT:    movapd %xmm3, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
; SSE41-NEXT:    movapd %xmm4, %xmm0
; SSE41-NEXT:    movapd %xmm5, %xmm1
; SSE41-NEXT:    movapd %xmm6, %xmm2
; SSE41-NEXT:    movapd %xmm7, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_512_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm3, %ymm1, %ymm4
; AVX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm5
; AVX-NEXT:    vroundpd $9, %ymm1, %ymm1
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_512_pd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscalepd $9, %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovapd %zmm1, %zmm0
; AVX512-NEXT:    retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y
  ret <8 x double> %res
}

define <8 x double> @floor_maskz_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_512_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm3, %xmm7
; SSE41-NEXT:    cmpeqpd %xmm2, %xmm6
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm5
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm4
; SSE41-NEXT:    roundpd $9, %xmm3, %xmm3
; SSE41-NEXT:    andpd %xmm7, %xmm3
; SSE41-NEXT:    roundpd $9, %xmm2, %xmm2
; SSE41-NEXT:    andpd %xmm6, %xmm2
; SSE41-NEXT:    roundpd $9, %xmm1, %xmm1
; SSE41-NEXT:    andpd %xmm5, %xmm1
; SSE41-NEXT:    roundpd $9, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_512_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm3, %ymm1, %ymm3
; AVX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm2
; AVX-NEXT:    vroundpd $9, %ymm1, %ymm1
; AVX-NEXT:    vandpd %ymm1, %ymm3, %ymm1
; AVX-NEXT:    vroundpd $9, %ymm0, %ymm0
; AVX-NEXT:    vandpd %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_512_pd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscalepd $9, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> zeroinitializer
  ret <8 x double> %res
}

1355define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i8 %k) nounwind {
1356; SSE41-LABEL: floor_mask_ss:
1357; SSE41:       ## %bb.0:
1358; SSE41-NEXT:    testb $1, %dil
1359; SSE41-NEXT:    je LBB52_2
1360; SSE41-NEXT:  ## %bb.1:
1361; SSE41-NEXT:    xorps %xmm2, %xmm2
1362; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
1363; SSE41-NEXT:  LBB52_2:
1364; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1365; SSE41-NEXT:    movaps %xmm1, %xmm0
1366; SSE41-NEXT:    retq
1367;
1368; AVX-LABEL: floor_mask_ss:
1369; AVX:       ## %bb.0:
1370; AVX-NEXT:    testb $1, %dil
1371; AVX-NEXT:    je LBB52_2
1372; AVX-NEXT:  ## %bb.1:
1373; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
1374; AVX-NEXT:  LBB52_2:
1375; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
1376; AVX-NEXT:    retq
1377;
1378; AVX512-LABEL: floor_mask_ss:
1379; AVX512:       ## %bb.0:
1380; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
1381; AVX512-NEXT:    kmovw %edi, %k1
1382; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
1383; AVX512-NEXT:    vmovaps %xmm2, %xmm0
1384; AVX512-NEXT:    retq
1385  %mask = and i8 %k, 1
1386  %nmask = icmp eq i8 %mask, 0
1387  %s = extractelement <4 x float> %x, i64 0
1388  %call = tail call float @llvm.floor.f32(float %s)
1389  %dst = extractelement <4 x float> %w, i64 0
1390  %low = select i1 %nmask, float %dst, float %call
1391  %res = insertelement <4 x float> %y, float %low, i64 0
1392  ret <4 x float> %res
1393}
1394
1395define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind {
1396; SSE41-LABEL: floor_maskz_ss:
1397; SSE41:       ## %bb.0:
1398; SSE41-NEXT:    testb $1, %dil
1399; SSE41-NEXT:    xorps %xmm2, %xmm2
1400; SSE41-NEXT:    je LBB53_2
1401; SSE41-NEXT:  ## %bb.1:
1402; SSE41-NEXT:    xorps %xmm2, %xmm2
1403; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
1404; SSE41-NEXT:  LBB53_2:
1405; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1406; SSE41-NEXT:    movaps %xmm1, %xmm0
1407; SSE41-NEXT:    retq
1408;
1409; AVX-LABEL: floor_maskz_ss:
1410; AVX:       ## %bb.0:
1411; AVX-NEXT:    testb $1, %dil
1412; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1413; AVX-NEXT:    je LBB53_2
1414; AVX-NEXT:  ## %bb.1:
1415; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
1416; AVX-NEXT:  LBB53_2:
1417; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
1418; AVX-NEXT:    retq
1419;
1420; AVX512-LABEL: floor_maskz_ss:
1421; AVX512:       ## %bb.0:
1422; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
1423; AVX512-NEXT:    kmovw %edi, %k1
1424; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
1425; AVX512-NEXT:    retq
1426  %mask = and i8 %k, 1
1427  %nmask = icmp eq i8 %mask, 0
1428  %s = extractelement <4 x float> %x, i64 0
1429  %call = tail call float @llvm.floor.f32(float %s)
1430  %low = select i1 %nmask, float zeroinitializer, float %call
1431  %res = insertelement <4 x float> %y, float %low, i64 0
1432  ret <4 x float> %res
1433}
1434
1435define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i8 %k) nounwind {
1436; SSE41-LABEL: floor_mask_sd:
1437; SSE41:       ## %bb.0:
1438; SSE41-NEXT:    testb $1, %dil
1439; SSE41-NEXT:    je LBB54_2
1440; SSE41-NEXT:  ## %bb.1:
1441; SSE41-NEXT:    xorps %xmm2, %xmm2
1442; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
1443; SSE41-NEXT:  LBB54_2:
1444; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1445; SSE41-NEXT:    movapd %xmm1, %xmm0
1446; SSE41-NEXT:    retq
1447;
1448; AVX-LABEL: floor_mask_sd:
1449; AVX:       ## %bb.0:
1450; AVX-NEXT:    testb $1, %dil
1451; AVX-NEXT:    je LBB54_2
1452; AVX-NEXT:  ## %bb.1:
1453; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
1454; AVX-NEXT:  LBB54_2:
1455; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
1456; AVX-NEXT:    retq
1457;
1458; AVX512-LABEL: floor_mask_sd:
1459; AVX512:       ## %bb.0:
1460; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
1461; AVX512-NEXT:    kmovw %edi, %k1
1462; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
1463; AVX512-NEXT:    vmovapd %xmm2, %xmm0
1464; AVX512-NEXT:    retq
1465  %mask = and i8 %k, 1
1466  %nmask = icmp eq i8 %mask, 0
1467  %s = extractelement <2 x double> %x, i64 0
1468  %call = tail call double @llvm.floor.f64(double %s)
1469  %dst = extractelement <2 x double> %w, i64 0
1470  %low = select i1 %nmask, double %dst, double %call
1471  %res = insertelement <2 x double> %y, double %low, i64 0
1472  ret <2 x double> %res
1473}
1474
1475define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind {
1476; SSE41-LABEL: floor_maskz_sd:
1477; SSE41:       ## %bb.0:
1478; SSE41-NEXT:    testb $1, %dil
1479; SSE41-NEXT:    xorpd %xmm2, %xmm2
1480; SSE41-NEXT:    je LBB55_2
1481; SSE41-NEXT:  ## %bb.1:
1482; SSE41-NEXT:    xorps %xmm2, %xmm2
1483; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
1484; SSE41-NEXT:  LBB55_2:
1485; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1486; SSE41-NEXT:    movapd %xmm1, %xmm0
1487; SSE41-NEXT:    retq
1488;
1489; AVX-LABEL: floor_maskz_sd:
1490; AVX:       ## %bb.0:
1491; AVX-NEXT:    testb $1, %dil
1492; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1493; AVX-NEXT:    je LBB55_2
1494; AVX-NEXT:  ## %bb.1:
1495; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
1496; AVX-NEXT:  LBB55_2:
1497; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
1498; AVX-NEXT:    retq
1499;
1500; AVX512-LABEL: floor_maskz_sd:
1501; AVX512:       ## %bb.0:
1502; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
1503; AVX512-NEXT:    kmovw %edi, %k1
1504; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
1505; AVX512-NEXT:    retq
1506  %mask = and i8 %k, 1
1507  %nmask = icmp eq i8 %mask, 0
1508  %s = extractelement <2 x double> %x, i64 0
1509  %call = tail call double @llvm.floor.f64(double %s)
1510  %low = select i1 %nmask, double zeroinitializer, double %call
1511  %res = insertelement <2 x double> %y, double %low, i64 0
1512  ret <2 x double> %res
1513}
1514
define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind {
; SSE41-LABEL: floor_mask_ss_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB56_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
; SSE41-NEXT:  LBB56_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_ss_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB56_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB56_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_ss_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k) nounwind {
; SSE41-LABEL: floor_maskz_ss_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    jne LBB57_1
; SSE41-NEXT:  ## %bb.2:
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    jmp LBB57_3
; SSE41-NEXT:  LBB57_1:
; SSE41-NEXT:    roundss $9, %xmm0, %xmm0
; SSE41-NEXT:  LBB57_3:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_ss_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    jne LBB57_1
; AVX-NEXT:  ## %bb.2:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
; AVX-NEXT:  LBB57_1:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_ss_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 %k) nounwind {
; SSE41-LABEL: floor_mask_sd_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB58_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
; SSE41-NEXT:  LBB58_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_sd_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB58_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB58_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_sd_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %k) nounwind {
; SSE41-LABEL: floor_maskz_sd_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    jne LBB59_1
; SSE41-NEXT:  ## %bb.2:
; SSE41-NEXT:    xorpd %xmm0, %xmm0
; SSE41-NEXT:    jmp LBB59_3
; SSE41-NEXT:  LBB59_1:
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm0
; SSE41-NEXT:  LBB59_3:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_sd_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    jne LBB59_1
; AVX-NEXT:  ## %bb.2:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
; AVX-NEXT:  LBB59_1:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_sd_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
; SSE41-LABEL: floor_mask_ss_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $9, %xmm0, %xmm3
; SSE41-NEXT:    cmpeqss %xmm1, %xmm0
; SSE41-NEXT:    andps %xmm0, %xmm3
; SSE41-NEXT:    andnps %xmm2, %xmm0
; SSE41-NEXT:    orps %xmm3, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_ss_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_ss_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm3
; AVX512-NEXT:    vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_maskz_ss_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqss %xmm1, %xmm0
; SSE41-NEXT:    andps %xmm2, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_ss_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_ss_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.floor.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind {
; SSE41-LABEL: floor_mask_sd_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm3
; SSE41-NEXT:    cmpeqsd %xmm1, %xmm0
; SSE41-NEXT:    andpd %xmm0, %xmm3
; SSE41-NEXT:    andnpd %xmm2, %xmm0
; SSE41-NEXT:    orpd %xmm3, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_mask_sd_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_mask_sd_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm3
; AVX512-NEXT:    vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_maskz_sd_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqsd %xmm1, %xmm0
; SSE41-NEXT:    andpd %xmm2, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: floor_maskz_sd_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: floor_maskz_sd_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.floor.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

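; NOTE: The ceil variants below repeat the same patterns with immediate $10:
; bits 1:0 = 10 select round-toward-positive-infinity, again with the
; precision exception suppressed (bit 3), so only the immediate differs from
; the floor tests above.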
define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $10, %xmm0, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT:    retq
  %s = extractelement <4 x float> %x, i32 0
  %call = call float @llvm.ceil.f32(float %s)
  %res = insertelement <4 x float> %y, float %call, i32 0
  ret <4 x float> %res
}
declare float @llvm.ceil.f32(float %s)

define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT:    retq
  %s = extractelement <2 x double> %x, i32 0
  %call = call double @llvm.ceil.f64(double %s)
  %res = insertelement <2 x double> %y, double %call, i32 0
  ret <2 x double> %res
}
declare double @llvm.ceil.f64(double %s)

define <4 x float> @ceil_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_mask_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_mask_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_mask_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $10, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_128_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm0, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_128_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX-NEXT:    vandps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_maskz_128_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $10, %xmm0, %xmm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_maskz_128_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscaleps $10, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x float> %x, %y
  %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_mask_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_mask_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_mask_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $10, %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_128_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_128_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX-NEXT:    vandpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_maskz_128_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $10, %xmm0, %xmm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_maskz_128_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; AVX512VL-NEXT:    vrndscalepd $10, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <2 x double> %x, %y
  %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
  %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
  ret <2 x double> %res
}

define <8 x float> @ceil_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: ceil_mask_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqps %xmm3, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_mask_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_mask_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $10, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovaps %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
  ret <8 x float> %res
}

define <8 x float> @ceil_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_256_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm1, %xmm3
; SSE41-NEXT:    cmpeqps %xmm0, %xmm2
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    andps %xmm3, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_256_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_maskz_256_ps:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_maskz_256_ps:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscaleps $10, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <8 x float> %x, %y
  %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
  %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <4 x double> @ceil_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: ceil_mask_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm4
; SSE41-NEXT:    cmpeqpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm5
; SSE41-NEXT:    cmpeqpd %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    movapd %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_mask_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_mask_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $10, %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT:    vmovapd %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y
  ret <4 x double> %res
}

define <4 x double> @ceil_maskz_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_256_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm3
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm2
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    andpd %xmm3, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_256_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vandpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: ceil_maskz_256_pd:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX512F-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: ceil_maskz_256_pd:
; AVX512VL:       ## %bb.0:
; AVX512VL-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; AVX512VL-NEXT:    vrndscalepd $10, %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %k = fcmp oeq <4 x double> %x, %y
  %call = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %res = select <4 x i1> %k, <4 x double> %call, <4 x double> zeroinitializer
  ret <4 x double> %res
}

define <16 x float> @ceil_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: ceil_mask_512_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $10, %xmm3, %xmm8
; SSE41-NEXT:    cmpeqps %xmm7, %xmm3
; SSE41-NEXT:    roundps $10, %xmm2, %xmm9
; SSE41-NEXT:    cmpeqps %xmm6, %xmm2
; SSE41-NEXT:    roundps $10, %xmm1, %xmm10
; SSE41-NEXT:    cmpeqps %xmm5, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm11
; SSE41-NEXT:    cmpeqps %xmm4, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm11, %xmm4
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm10, %xmm5
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm9, %xmm6
; SSE41-NEXT:    movaps %xmm3, %xmm0
; SSE41-NEXT:    blendvps %xmm0, %xmm8, %xmm7
; SSE41-NEXT:    movaps %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm5, %xmm1
; SSE41-NEXT:    movaps %xmm6, %xmm2
; SSE41-NEXT:    movaps %xmm7, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_512_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm3, %ymm1, %ymm4
; AVX-NEXT:    vcmpeqps %ymm2, %ymm0, %ymm5
; AVX-NEXT:    vroundps $10, %ymm1, %ymm1
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vblendvps %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vblendvps %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_512_ps:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscaleps $10, %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, %zmm0
; AVX512-NEXT:    retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
  ret <16 x float> %res
}

define <16 x float> @ceil_maskz_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_512_ps:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqps %xmm3, %xmm7
; SSE41-NEXT:    cmpeqps %xmm2, %xmm6
; SSE41-NEXT:    cmpeqps %xmm1, %xmm5
; SSE41-NEXT:    cmpeqps %xmm0, %xmm4
; SSE41-NEXT:    roundps $10, %xmm3, %xmm3
; SSE41-NEXT:    andps %xmm7, %xmm3
; SSE41-NEXT:    roundps $10, %xmm2, %xmm2
; SSE41-NEXT:    andps %xmm6, %xmm2
; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
; SSE41-NEXT:    andps %xmm5, %xmm1
; SSE41-NEXT:    roundps $10, %xmm0, %xmm0
; SSE41-NEXT:    andps %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_512_ps:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqps %ymm3, %ymm1, %ymm3
; AVX-NEXT:    vcmpeqps %ymm2, %ymm0, %ymm2
; AVX-NEXT:    vroundps $10, %ymm1, %ymm1
; AVX-NEXT:    vandps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_512_ps:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscaleps $10, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %k = fcmp oeq <16 x float> %x, %y
  %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x)
  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> zeroinitializer
  ret <16 x float> %res
}

define <8 x double> @ceil_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: ceil_mask_512_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $10, %xmm3, %xmm8
; SSE41-NEXT:    cmpeqpd %xmm7, %xmm3
; SSE41-NEXT:    roundpd $10, %xmm2, %xmm9
; SSE41-NEXT:    cmpeqpd %xmm6, %xmm2
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm10
; SSE41-NEXT:    cmpeqpd %xmm5, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm11
; SSE41-NEXT:    cmpeqpd %xmm4, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm4
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm5
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm6
; SSE41-NEXT:    movapd %xmm3, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm7
; SSE41-NEXT:    movapd %xmm4, %xmm0
; SSE41-NEXT:    movapd %xmm5, %xmm1
; SSE41-NEXT:    movapd %xmm6, %xmm2
; SSE41-NEXT:    movapd %xmm7, %xmm3
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_512_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm3, %ymm1, %ymm4
; AVX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm5
; AVX-NEXT:    vroundpd $10, %ymm1, %ymm1
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_512_pd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscalepd $10, %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovapd %zmm1, %zmm0
; AVX512-NEXT:    retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y
  ret <8 x double> %res
}

define <8 x double> @ceil_maskz_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_512_pd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    cmpeqpd %xmm3, %xmm7
; SSE41-NEXT:    cmpeqpd %xmm2, %xmm6
; SSE41-NEXT:    cmpeqpd %xmm1, %xmm5
; SSE41-NEXT:    cmpeqpd %xmm0, %xmm4
; SSE41-NEXT:    roundpd $10, %xmm3, %xmm3
; SSE41-NEXT:    andpd %xmm7, %xmm3
; SSE41-NEXT:    roundpd $10, %xmm2, %xmm2
; SSE41-NEXT:    andpd %xmm6, %xmm2
; SSE41-NEXT:    roundpd $10, %xmm1, %xmm1
; SSE41-NEXT:    andpd %xmm5, %xmm1
; SSE41-NEXT:    roundpd $10, %xmm0, %xmm0
; SSE41-NEXT:    andpd %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_512_pd:
; AVX:       ## %bb.0:
; AVX-NEXT:    vcmpeqpd %ymm3, %ymm1, %ymm3
; AVX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm2
; AVX-NEXT:    vroundpd $10, %ymm1, %ymm1
; AVX-NEXT:    vandpd %ymm1, %ymm3, %ymm1
; AVX-NEXT:    vroundpd $10, %ymm0, %ymm0
; AVX-NEXT:    vandpd %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_512_pd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; AVX512-NEXT:    vrndscalepd $10, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %k = fcmp oeq <8 x double> %x, %y
  %call = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x)
  %res = select <8 x i1> %k, <8 x double> %call, <8 x double> zeroinitializer
  ret <8 x double> %res
}

define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i8 %k) nounwind {
; SSE41-LABEL: ceil_mask_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB78_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB78_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB78_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB78_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %nmask, float %dst, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind {
; SSE41-LABEL: ceil_maskz_ss:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    je LBB79_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB79_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_ss:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT:    je LBB79_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB79_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_ss:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %low = select i1 %nmask, float zeroinitializer, float %call
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i8 %k) nounwind {
; SSE41-LABEL: ceil_mask_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB80_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB80_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB80_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB80_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %nmask, double %dst, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind {
; SSE41-LABEL: ceil_maskz_sd:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    xorpd %xmm2, %xmm2
; SSE41-NEXT:    je LBB81_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB81_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_sd:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    je LBB81_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB81_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_sd:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = and i8 %k, 1
  %nmask = icmp eq i8 %mask, 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %low = select i1 %nmask, double zeroinitializer, double %call
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

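; NOTE: In the *_trunc tests the i1 mask is the low bit of an i16 argument.
; Without AVX512 mask registers this lowers to a testb $1 plus a branch
; around the round; with AVX512 the bit is moved into %k1 with kmovw and the
; select becomes a masked vmovss/vmovsd.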
define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind {
; SSE41-LABEL: ceil_mask_ss_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB82_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB82_2:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_ss_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB82_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB82_2:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_ss_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k) nounwind {
; SSE41-LABEL: ceil_maskz_ss_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    jne LBB83_1
; SSE41-NEXT:  ## %bb.2:
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    jmp LBB83_3
; SSE41-NEXT:  LBB83_1:
; SSE41-NEXT:    roundss $10, %xmm0, %xmm0
; SSE41-NEXT:  LBB83_3:
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_ss_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    jne LBB83_1
; AVX-NEXT:  ## %bb.2:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
; AVX-NEXT:  LBB83_1:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_ss_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 %k) nounwind {
; SSE41-LABEL: ceil_mask_sd_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    je LBB84_2
; SSE41-NEXT:  ## %bb.1:
; SSE41-NEXT:    xorps %xmm2, %xmm2
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
; SSE41-NEXT:  LBB84_2:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_sd_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    je LBB84_2
; AVX-NEXT:  ## %bb.1:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:  LBB84_2:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_sd_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %k) nounwind {
; SSE41-LABEL: ceil_maskz_sd_trunc:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    testb $1, %dil
; SSE41-NEXT:    jne LBB85_1
; SSE41-NEXT:  ## %bb.2:
; SSE41-NEXT:    xorpd %xmm0, %xmm0
; SSE41-NEXT:    jmp LBB85_3
; SSE41-NEXT:  LBB85_1:
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm0
; SSE41-NEXT:  LBB85_3:
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_sd_trunc:
; AVX:       ## %bb.0:
; AVX-NEXT:    testb $1, %dil
; AVX-NEXT:    jne LBB85_1
; AVX-NEXT:  ## %bb.2:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
; AVX-NEXT:  LBB85_1:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_sd_trunc:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    kmovw %edi, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = trunc i16 %k to i1
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
; SSE41-LABEL: ceil_mask_ss_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $10, %xmm0, %xmm3
; SSE41-NEXT:    cmpeqss %xmm1, %xmm0
; SSE41-NEXT:    andps %xmm0, %xmm3
; SSE41-NEXT:    andnps %xmm2, %xmm0
; SSE41-NEXT:    orps %xmm3, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_ss_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_ss_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm3
; AVX512-NEXT:    vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %dst = extractelement <4 x float> %w, i64 0
  %low = select i1 %mask, float %call, float %dst
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_maskz_ss_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqss %xmm1, %xmm0
; SSE41-NEXT:    andps %xmm2, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_ss_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_ss_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <4 x float> %x, %y
  %mask = extractelement <4 x i1> %mask1, i64 0
  %s = extractelement <4 x float> %x, i64 0
  %call = tail call float @llvm.ceil.f32(float %s)
  %low = select i1 %mask, float %call, float zeroinitializer
  %res = insertelement <4 x float> %y, float %low, i64 0
  ret <4 x float> %res
}

define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind {
; SSE41-LABEL: ceil_mask_sd_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm3
; SSE41-NEXT:    cmpeqsd %xmm1, %xmm0
; SSE41-NEXT:    andpd %xmm0, %xmm3
; SSE41-NEXT:    andnpd %xmm2, %xmm0
; SSE41-NEXT:    orpd %xmm3, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_mask_sd_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_mask_sd_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm3
; AVX512-NEXT:    vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm3, %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovapd %xmm2, %xmm0
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %dst = extractelement <2 x double> %w, i64 0
  %low = select i1 %mask, double %call, double %dst
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}

define <2 x double> @ceil_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_maskz_sd_mask8:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
; SSE41-NEXT:    cmpeqsd %xmm1, %xmm0
; SSE41-NEXT:    andpd %xmm2, %xmm0
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: ceil_maskz_sd_mask8:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
;
; AVX512-LABEL: ceil_maskz_sd_mask8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask1 = fcmp oeq <2 x double> %x, %y
  %mask = extractelement <2 x i1> %mask1, i64 0
  %s = extractelement <2 x double> %x, i64 0
  %call = tail call double @llvm.ceil.f64(double %s)
  %low = select i1 %mask, double %call, double zeroinitializer
  %res = insertelement <2 x double> %y, double %low, i64 0
  ret <2 x double> %res
}
