; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck -check-prefix=X86 %s
; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx | FileCheck -check-prefix=X86 %s
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx | FileCheck -check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck -check-prefix=X64 %s

; Code generation checks for the intel_ocl_bicc calling convention with AVX
; enabled. Each function below is compiled for two 32-bit triples and two
; 64-bit triples (Win64 and Darwin); the expected assembly for every check
; prefix is listed immediately before the function it applies to.

; External callees used by the tests.
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)

; WIN64-LABEL: testf16_inp
; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN64: leaq {{.*}}(%rsp), %rcx
; WIN64: call
; WIN64: ret

; X86-LABEL: testf16_inp
; X86: vaddps {{.*}}, {{%ymm[0-1]}}
; X86: vaddps {{.*}}, {{%ymm[0-1]}}
; Push is not deemed profitable if we're realigning the stack.
; X86: {{pushl|movl}} %eax
; X86: call
; X86: ret

; X64-LABEL: testf16_inp
; X64: vaddps {{.*}}, {{%ymm[0-1]}}
; X64: vaddps {{.*}}, {{%ymm[0-1]}}
; X64: movq %rsp, %rdi
; X64: call
; X64: ret

; test calling conventions - input parameters
; The <16 x float> sum is passed by value and the address of the local
; alloca is passed as the pointer argument of the intel_ocl_bicc call.
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
}

; test calling conventions - preserved registers
; %b is used again after the call, so it has to stay live across it.

; preserved ymm6-ymm15
; WIN64-LABEL: testf16_regs
; WIN64: call
; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; WIN64: ret

; preserved ymm8-ymm15
; X64-LABEL: testf16_regs
; X64: call
; X64: vaddps {{%ymm[0-1]}}, {{%ymm[8-9]}}, {{%ymm[0-1]}}
; X64: vaddps {{%ymm[0-1]}}, {{%ymm[8-9]}}, {{%ymm[0-1]}}
; X64: ret

define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
  ret <16 x float> %4
}

; test calling conventions - prolog and epilog
; Here the function itself is intel_ocl_bicc and it calls a callee with the
; default convention, so the checks expect the ymm registers in the preserved
; range to be spilled before the call and reloaded after it.
; WIN64-LABEL: test_prolog_epilog
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: call
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload

; X64-LABEL: test_prolog_epilog
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: call
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
  %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
  ret <16 x float> %c
}

; test functions with integer parameters
; pass parameters on stack for 32-bit platform
; X86-LABEL: test_int
; X86: pushl {{.*}}
; X86: pushl {{.*}}
; X86: call
; X86: addl {{.*}}, %eax

; pass parameters in registers for 64-bit platform
; X64-LABEL: test_int
; X64: movl {{.*}}, %esi
; X64: leal {{.*}}, %edi
; X64: call
; X64: addl {{.*}}, %eax
define i32 @test_int(i32 %a, i32 %b) nounwind {
  %c1 = add i32 %a, %b
  %c2 = call intel_ocl_bicc i32 @func_int(i32 %c1, i32 %a)
  %c = add i32 %c2, %b
  ret i32 %c
}

; test vzeroupper placement around intel_ocl_bicc calls
; The 64-bit prefixes require that no vzeroupper precedes either call; the
; 32-bit prefix expects one before each call.
; WIN64-LABEL: test_float4
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64: ret

; X64-LABEL: test_float4
; X64-NOT: vzeroupper
; X64: call
; X64-NOT: vzeroupper
; X64: call
; X64: ret

; X86-LABEL: test_float4
; X86: vzeroupper
; X86: call
; X86: vzeroupper
; X86: call
; X86: ret

declare <4 x float> @func_float4(<4 x float>, <4 x float>, <4 x float>)

; Splits each <8 x float> argument into its low and high <4 x float> halves,
; calls @func_float4 once per half, and concatenates the two results.
define <8 x float> @test_float4(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone {
entry:
  %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %call.i = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %0, <4 x float> %1, <4 x float> %2) nounwind
  %3 = shufflevector <4 x float> %call.i, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %call.i2 = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %4, <4 x float> %5, <4 x float> %6) nounwind
  %7 = shufflevector <4 x float> %call.i2, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %8
}