; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X32 -check-prefix=X32-KNL
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X32 -check-prefix=X32-SKX
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-KNL
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-SKX
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN64 -check-prefix=WIN64-KNL
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN64 -check-prefix=WIN64-SKX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X64 -check-prefix=X64-KNL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X64 -check-prefix=X64-SKX

; External callees used by the tests below.
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)

;test calling conventions - input parameters
; Calls @func_float16_ptr with the intel_ocl_bicc calling convention and checks
; the generated argument/stack setup for each target.
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_inp:
; X32:       ## %bb.0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-64, %esp
; X32-NEXT:    subl $192, %esp
; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    calll _func_float16_ptr
; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; WIN32-LABEL: testf16_inp:
; WIN32:       # %bb.0:
; WIN32-NEXT:    pushl %ebp
; WIN32-NEXT:    movl %esp, %ebp
; WIN32-NEXT:    andl $-64, %esp
; WIN32-NEXT:    subl $128, %esp
; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT:    movl %esp, %eax
; WIN32-NEXT:    pushl %eax
; WIN32-NEXT:    calll _func_float16_ptr
; WIN32-NEXT:    addl $4, %esp
; WIN32-NEXT:    vaddps (%esp), %zmm0, %zmm0
; WIN32-NEXT:    movl %ebp, %esp
; WIN32-NEXT:    popl %ebp
; WIN32-NEXT:    retl
;
; WIN64-LABEL: testf16_inp:
; WIN64:       # %bb.0:
; WIN64-NEXT:    pushq %rbp
; WIN64-NEXT:    subq $176, %rsp
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT:    andq $-64, %rsp
; WIN64-NEXT:    vmovaps (%rcx), %zmm0
; WIN64-NEXT:    vaddps (%rdx), %zmm0, %zmm0
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT:    callq func_float16_ptr
; WIN64-NEXT:    vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT:    leaq 48(%rbp), %rsp
; WIN64-NEXT:    popq %rbp
; WIN64-NEXT:    retq
;
; X64-LABEL: testf16_inp:
; X64:       ## %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    andq $-64, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT:    movq %rsp, %rdi
; X64-NEXT:    callq _func_float16_ptr
; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT:    leaq -16(%rbp), %rsp
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
}

;test calling conventions - preserved registers

define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_regs:
; X32:       ## %bb.0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-64, %esp
; X32-NEXT:    subl $256, %esp ## imm = 0x100
; X32-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill
; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    calll _func_float16_ptr
; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0 ## 64-byte Folded Reload
; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; WIN32-LABEL: testf16_regs:
; WIN32:       # %bb.0:
; WIN32-NEXT:    pushl %ebp
; WIN32-NEXT:    movl %esp, %ebp
; WIN32-NEXT:    andl $-64, %esp
; WIN32-NEXT:    subl $192, %esp
; WIN32-NEXT:    vmovaps %zmm1, (%esp) # 64-byte Spill
; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
; WIN32-NEXT:    pushl %eax
; WIN32-NEXT:    calll _func_float16_ptr
; WIN32-NEXT:    addl $4, %esp
; WIN32-NEXT:    vaddps (%esp), %zmm0, %zmm0 # 64-byte Folded Reload
; WIN32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; WIN32-NEXT:    movl %ebp, %esp
; WIN32-NEXT:    popl %ebp
; WIN32-NEXT:    retl
;
; WIN64-LABEL: testf16_regs:
; WIN64:       # %bb.0:
; WIN64-NEXT:    pushq %rbp
; WIN64-NEXT:    subq $176, %rsp
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT:    andq $-64, %rsp
; WIN64-NEXT:    vmovaps (%rdx), %zmm16
; WIN64-NEXT:    vaddps (%rcx), %zmm16, %zmm0
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT:    callq func_float16_ptr
; WIN64-NEXT:    vaddps %zmm16, %zmm0, %zmm0
; WIN64-NEXT:    vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT:    leaq 48(%rbp), %rsp
; WIN64-NEXT:    popq %rbp
; WIN64-NEXT:    retq
;
; X64-LABEL: testf16_regs:
; X64:       ## %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    andq $-64, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vmovaps %zmm1, %zmm16
; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT:    movq %rsp, %rdi
; X64-NEXT:    callq _func_float16_ptr
; X64-NEXT:    vaddps %zmm16, %zmm0, %zmm0
; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT:    leaq -16(%rbp), %rsp
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
  ret <16 x float> %4
}

; test calling conventions - prolog and epilog
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: test_prolog_epilog:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    calll _func_float16
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; WIN32-LABEL: test_prolog_epilog:
; WIN32:       # %bb.0:
; WIN32-NEXT:    calll _func_float16
; WIN32-NEXT:    retl
;
; WIN64-KNL-LABEL: test_prolog_epilog:
; WIN64-KNL:       # %bb.0:
; WIN64-KNL-NEXT:    pushq %rbp
; WIN64-KNL-NEXT:    subq $1328, %rsp # imm = 0x530
; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-KNL-NEXT:    kmovw %k7, 1198(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT:    kmovw %k6, 1196(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT:    kmovw %k5, 1194(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT:    kmovw %k4, 1192(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm21, 1104(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm20, 992(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm19, 896(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm18, 832(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm17, 768(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm16, 704(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm15, 640(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm14, 576(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm13, 512(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm12, 448(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm11, 384(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm10, 320(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm9, 256(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm8, 192(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm7, 128(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm6, 64(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT:    andq $-64, %rsp
; WIN64-KNL-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-KNL-NEXT:    callq func_float16
; WIN64-KNL-NEXT:    vmovaps 64(%rbp), %zmm6 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 128(%rbp), %zmm7 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 192(%rbp), %zmm8 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 256(%rbp), %zmm9 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 320(%rbp), %zmm10 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 384(%rbp), %zmm11 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 448(%rbp), %zmm12 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 512(%rbp), %zmm13 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 576(%rbp), %zmm14 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 640(%rbp), %zmm15 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 704(%rbp), %zmm16 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 768(%rbp), %zmm17 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 832(%rbp), %zmm18 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 896(%rbp), %zmm19 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 992(%rbp), %zmm20 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps 1104(%rbp), %zmm21 # 64-byte Reload
; WIN64-KNL-NEXT:    kmovw 1192(%rbp), %k4 # 2-byte Reload
; WIN64-KNL-NEXT:    kmovw 1194(%rbp), %k5 # 2-byte Reload
; WIN64-KNL-NEXT:    kmovw 1196(%rbp), %k6 # 2-byte Reload
; WIN64-KNL-NEXT:    kmovw 1198(%rbp), %k7 # 2-byte Reload
; WIN64-KNL-NEXT:    leaq 1200(%rbp), %rsp
; WIN64-KNL-NEXT:    popq %rbp
; WIN64-KNL-NEXT:    retq
;
; WIN64-SKX-LABEL: test_prolog_epilog:
; WIN64-SKX:       # %bb.0:
; WIN64-SKX-NEXT:    pushq %rbp
; WIN64-SKX-NEXT:    subq $1328, %rsp # imm = 0x530
; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-SKX-NEXT:    kmovq %k7, 1192(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT:    kmovq %k6, 1184(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT:    kmovq %k5, 1176(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT:    kmovq %k4, 1168(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm21, 1056(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm20, 960(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm19, 896(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm18, 832(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm17, 768(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm16, 704(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm15, 640(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm14, 576(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm13, 512(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm12, 448(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm11, 384(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm10, 320(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm9, 256(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm8, 192(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm7, 128(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm6, 64(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT:    andq $-64, %rsp
; WIN64-SKX-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-SKX-NEXT:    callq func_float16
; WIN64-SKX-NEXT:    vmovaps 64(%rbp), %zmm6 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 128(%rbp), %zmm7 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 192(%rbp), %zmm8 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 256(%rbp), %zmm9 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 320(%rbp), %zmm10 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 384(%rbp), %zmm11 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 448(%rbp), %zmm12 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 512(%rbp), %zmm13 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 576(%rbp), %zmm14 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 640(%rbp), %zmm15 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 704(%rbp), %zmm16 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 768(%rbp), %zmm17 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 832(%rbp), %zmm18 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 896(%rbp), %zmm19 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 960(%rbp), %zmm20 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps 1056(%rbp), %zmm21 # 64-byte Reload
; WIN64-SKX-NEXT:    kmovq 1168(%rbp), %k4 # 8-byte Reload
; WIN64-SKX-NEXT:    kmovq 1176(%rbp), %k5 # 8-byte Reload
; WIN64-SKX-NEXT:    kmovq 1184(%rbp), %k6 # 8-byte Reload
; WIN64-SKX-NEXT:    kmovq 1192(%rbp), %k7 # 8-byte Reload
; WIN64-SKX-NEXT:    leaq 1200(%rbp), %rsp
; WIN64-SKX-NEXT:    popq %rbp
; WIN64-SKX-NEXT:    retq
;
; X64-KNL-LABEL: test_prolog_epilog:
; X64-KNL:       ## %bb.0:
; X64-KNL-NEXT:    pushq %rsi
; X64-KNL-NEXT:    pushq %rdi
; X64-KNL-NEXT:    subq $1064, %rsp ## imm = 0x428
; X64-KNL-NEXT:    kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT:    kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT:    kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT:    kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT:    vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    callq _func_float16
; X64-KNL-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload
; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload
; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload
; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload
; X64-KNL-NEXT:    addq $1064, %rsp ## imm = 0x428
; X64-KNL-NEXT:    popq %rdi
; X64-KNL-NEXT:    popq %rsi
; X64-KNL-NEXT:    retq
;
; X64-SKX-LABEL: test_prolog_epilog:
; X64-SKX:       ## %bb.0:
; X64-SKX-NEXT:    pushq %rsi
; X64-SKX-NEXT:    pushq %rdi
; X64-SKX-NEXT:    subq $1192, %rsp ## imm = 0x4A8
; X64-SKX-NEXT:    kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT:    kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT:    kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT:    kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT:    vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    callq _func_float16
; X64-SKX-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload
; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload
; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload
; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload
; X64-SKX-NEXT:    addq $1192, %rsp ## imm = 0x4A8
; X64-SKX-NEXT:    popq %rdi
; X64-SKX-NEXT:    popq %rsi
; X64-SKX-NEXT:    retq
   %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
   ret <16 x float> %c
}


declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)

; Checks how an i16 mask argument is moved into a %k register before an
; intel_ocl_bicc call on each target.
define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask)  {
; X32-LABEL: testf16_inp_mask:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    .cfi_def_cfa_offset 16
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    calll _func_float16_mask
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; WIN32-LABEL: testf16_inp_mask:
; WIN32:       # %bb.0:
; WIN32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; WIN32-NEXT:    calll _func_float16_mask
; WIN32-NEXT:    retl
;
; WIN64-KNL-LABEL: testf16_inp_mask:
; WIN64-KNL:       # %bb.0:
; WIN64-KNL-NEXT:    subq $40, %rsp
; WIN64-KNL-NEXT:    .seh_stackalloc 40
; WIN64-KNL-NEXT:    .seh_endprologue
; WIN64-KNL-NEXT:    # kill: def $dx killed $dx def $edx
; WIN64-KNL-NEXT:    vmovaps (%rcx), %zmm0
; WIN64-KNL-NEXT:    kmovw %edx, %k1
; WIN64-KNL-NEXT:    callq func_float16_mask
; WIN64-KNL-NEXT:    nop
; WIN64-KNL-NEXT:    addq $40, %rsp
; WIN64-KNL-NEXT:    retq
; WIN64-KNL-NEXT:    .seh_handlerdata
; WIN64-KNL-NEXT:    .text
; WIN64-KNL-NEXT:    .seh_endproc
;
; WIN64-SKX-LABEL: testf16_inp_mask:
; WIN64-SKX:       # %bb.0:
; WIN64-SKX-NEXT:    subq $40, %rsp
; WIN64-SKX-NEXT:    .seh_stackalloc 40
; WIN64-SKX-NEXT:    .seh_endprologue
; WIN64-SKX-NEXT:    # kill: def $dx killed $dx def $edx
; WIN64-SKX-NEXT:    vmovaps (%rcx), %zmm0
; WIN64-SKX-NEXT:    kmovd %edx, %k1
; WIN64-SKX-NEXT:    callq func_float16_mask
; WIN64-SKX-NEXT:    nop
; WIN64-SKX-NEXT:    addq $40, %rsp
; WIN64-SKX-NEXT:    retq
; WIN64-SKX-NEXT:    .seh_handlerdata
; WIN64-SKX-NEXT:    .text
; WIN64-SKX-NEXT:    .seh_endproc
;
; X64-KNL-LABEL: testf16_inp_mask:
; X64-KNL:       ## %bb.0:
; X64-KNL-NEXT:    pushq %rbp
; X64-KNL-NEXT:    .cfi_def_cfa_offset 16
; X64-KNL-NEXT:    pushq %r13
; X64-KNL-NEXT:    .cfi_def_cfa_offset 24
; X64-KNL-NEXT:    pushq %r12
; X64-KNL-NEXT:    .cfi_def_cfa_offset 32
; X64-KNL-NEXT:    .cfi_offset %r12, -32
; X64-KNL-NEXT:    .cfi_offset %r13, -24
; X64-KNL-NEXT:    .cfi_offset %rbp, -16
; X64-KNL-NEXT:    kmovw %edi, %k1
; X64-KNL-NEXT:    callq _func_float16_mask
; X64-KNL-NEXT:    popq %r12
; X64-KNL-NEXT:    popq %r13
; X64-KNL-NEXT:    popq %rbp
; X64-KNL-NEXT:    retq
;
; X64-SKX-LABEL: testf16_inp_mask:
; X64-SKX:       ## %bb.0:
; X64-SKX-NEXT:    pushq %rbp
; X64-SKX-NEXT:    .cfi_def_cfa_offset 16
; X64-SKX-NEXT:    pushq %r13
; X64-SKX-NEXT:    .cfi_def_cfa_offset 24
; X64-SKX-NEXT:    pushq %r12
; X64-SKX-NEXT:    .cfi_def_cfa_offset 32
; X64-SKX-NEXT:    .cfi_offset %r12, -32
; X64-SKX-NEXT:    .cfi_offset %r13, -24
; X64-SKX-NEXT:    .cfi_offset %rbp, -16
; X64-SKX-NEXT:    kmovd %edi, %k1
; X64-SKX-NEXT:    callq _func_float16_mask
; X64-SKX-NEXT:    popq %r12
; X64-SKX-NEXT:    popq %r13
; X64-SKX-NEXT:    popq %rbp
; X64-SKX-NEXT:    retq
  %imask = bitcast i16 %mask to <16 x i1>
  %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
  ret <16 x float> %1
}

; Checks that a computed <16 x i1> mask is combined (kxorw) into %k1 before the
; intel_ocl_bicc call.
define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32>%x2, <16 x i1> %mask) nounwind {
; X32-LABEL: test_prolog_epilog_with_mask:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; X32-NEXT:    kxorw %k1, %k0, %k1
; X32-NEXT:    calll _func_float16_mask
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; WIN32-LABEL: test_prolog_epilog_with_mask:
; WIN32:       # %bb.0:
; WIN32-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; WIN32-NEXT:    kxorw %k1, %k0, %k1
; WIN32-NEXT:    calll _func_float16_mask
; WIN32-NEXT:    retl
;
; WIN64-LABEL: test_prolog_epilog_with_mask:
; WIN64:       # %bb.0:
; WIN64-NEXT:    subq $40, %rsp
; WIN64-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; WIN64-NEXT:    kxorw %k1, %k0, %k1
; WIN64-NEXT:    callq func_float16_mask
; WIN64-NEXT:    addq $40, %rsp
; WIN64-NEXT:    retq
;
; X64-LABEL: test_prolog_epilog_with_mask:
; X64:       ## %bb.0:
; X64-NEXT:    pushq %rax
; X64-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; X64-NEXT:    kxorw %k1, %k0, %k1
; X64-NEXT:    callq _func_float16_mask
; X64-NEXT:    popq %rax
; X64-NEXT:    retq
   %cmp_res = icmp eq <16 x i32>%x1, %x2
   %mask1 = xor <16 x i1> %cmp_res, %mask
   %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
   ret <16 x float> %c
}