; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefix=NOFMA
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=FMA,FMA-AVX1
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma4 < %s | FileCheck %s --check-prefix=FMA4
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=FMA,FMA-AVX512

; Strict fma(-a, b, c) on float. With +fma/+avx512f or +fma4 the negated
; multiplicand is expected to fold into one negated-multiply-add instruction;
; without them the sign is flipped with xorps ahead of the fmaf libcall.
define float @f1(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f1:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f1:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f1:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %neg.a = fneg float %0
  %fma = call float @llvm.experimental.constrained.fma.f32(float %neg.a, float %1, float %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret float %fma
}

; Strict fma(-a, b, c) on double; same expectations as the float case, using
; the sd instruction forms and the fma libcall.
define double @f2(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f2:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f2:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f2:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmaddsd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; FMA4-NEXT:    retq
entry:
  %neg.a = fneg double %0
  %fma = call double @llvm.experimental.constrained.fma.f64(double %neg.a, double %1, double %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret double %fma
}

; Strict fma(a, b, -c) on float. The negated addend is expected to fold into
; a multiply-subtract instruction when fma hardware is available.
define float @f3(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f3:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f3:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f3:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %neg.c = fneg float %2
  %fma = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %neg.c, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret float %fma
}

; Strict fma(a, b, -c) on double.
define double @f4(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f4:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f4:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f4:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %neg.c = fneg double %2
  %fma = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %neg.c, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret double %fma
}

; Strict fma(-a, b, -c) on float. Both operand negations are expected to fold
; into a single negated-multiply-subtract instruction when fma hardware is
; available; otherwise both signs are flipped before the libcall.
define float @f5(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f5:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f5:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f5:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %neg.a = fneg float %0
  %neg.c = fneg float %2
  %fma = call float @llvm.experimental.constrained.fma.f32(float %neg.a, float %1, float %neg.c, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret float %fma
}

; Strict fma(-a, b, -c) on double.
define double @f6(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f6:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f6:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    retq
;
; FMA4-LABEL: f6:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    retq
entry:
  %neg.a = fneg double %0
  %neg.c = fneg double %2
  %fma = call double @llvm.experimental.constrained.fma.f64(double %neg.a, double %1, double %neg.c, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret double %fma
}

; -fma(a, b, c) on float. The negation of the strict fma result is expected to
; stay a separate xor after the fma in every configuration.
define float @f7(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f7:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f7:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f7:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f7:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %fma = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %negated = fneg float %fma
  ret float %negated
}

; -fma(a, b, c) on double; the result negation stays a separate xor.
define double @f8(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f8:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f8:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f8:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %fma = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %negated = fneg double %fma
  ret double %negated
}

; -fma(-a, b, -c) on float. The operand negations are expected to fold into
; the fma instruction, while the negation of the result remains a separate xor.
define float @f9(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f9:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fmaf
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-AVX1-LABEL: f9:
; FMA-AVX1:       # %bb.0: # %entry
; FMA-AVX1-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT:    retq
;
; FMA4-LABEL: f9:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
;
; FMA-AVX512-LABEL: f9:
; FMA-AVX512:       # %bb.0: # %entry
; FMA-AVX512-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT:    retq
entry:
  %neg.a = fneg float %0
  %neg.c = fneg float %2
  %fma = call float @llvm.experimental.constrained.fma.f32(float %neg.a, float %1, float %neg.c, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %negated = fneg float %fma
  ret float %negated
}

; -fma(-a, b, -c) on double.
define double @f10(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f10:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    pushq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 16
; NOFMA-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT:    xorps %xmm3, %xmm0
; NOFMA-NEXT:    xorps %xmm3, %xmm2
; NOFMA-NEXT:    callq fma
; NOFMA-NEXT:    xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT:    popq %rax
; NOFMA-NEXT:    .cfi_def_cfa_offset 8
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f10:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f10:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %neg.a = fneg double %0
  %neg.c = fneg double %2
  %fma = call double @llvm.experimental.constrained.fma.f64(double %neg.a, double %1, double %neg.c, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %negated = fneg double %fma
  ret double %negated
}

; Verify constrained fmul and fadd aren't fused.
; Float case: every llc configuration is expected to emit a separate multiply
; followed by a separate add, never a fused multiply-add.
define float @f11(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f11:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    mulss %xmm1, %xmm0
; NOFMA-NEXT:    addss %xmm2, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f11:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; FMA-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f11:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %product = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %sum = call float @llvm.experimental.constrained.fadd.f32(float %product, float %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret float %sum
}

; Verify constrained fmul and fadd aren't fused.
; Double case: same expectation, using the sd instruction forms.
define double @f12(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f12:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    mulsd %xmm1, %xmm0
; NOFMA-NEXT:    addsd %xmm2, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f12:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; FMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f12:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %product = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %sum = call double @llvm.experimental.constrained.fadd.f64(double %product, double %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret double %sum
}

; Verify that fmuladd(3.5) isn't simplified when the rounding mode is
; unknown.
; All three operands are the same constant, so the expected code loads it once
; and reuses the register. Without fma hardware the fmuladd is expected to be
; emitted as a separate multiply and add rather than a libcall.
define float @f15() #0 {
; NOFMA-LABEL: f15:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; NOFMA-NEXT:    movaps %xmm1, %xmm0
; NOFMA-NEXT:    mulss %xmm1, %xmm0
; NOFMA-NEXT:    addss %xmm1, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f15:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f15:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %muladd = call float @llvm.experimental.constrained.fmuladd.f32(float 3.5, float 3.5, float 3.5, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret float %muladd
}

; Verify that fmuladd(42.1) isn't simplified when the rounding mode is
; unknown.
; Double counterpart of the float fmuladd test above: one constant load, then
; either a multiply/add pair or a single fused instruction.
define double @f16() #0 {
; NOFMA-LABEL: f16:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; NOFMA-NEXT:    movapd %xmm1, %xmm0
; NOFMA-NEXT:    mulsd %xmm1, %xmm0
; NOFMA-NEXT:    addsd %xmm1, %xmm0
; NOFMA-NEXT:    retq
;
; FMA-LABEL: f16:
; FMA:       # %bb.0: # %entry
; FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT:    retq
;
; FMA4-LABEL: f16:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA4-NEXT:    retq
entry:
  %muladd = call double @llvm.experimental.constrained.fmuladd.f64(double 42.1, double 42.1, double 42.1, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  ret double %muladd
}

; Verify that fma(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f17() #0 { ; NOFMA-LABEL: f17: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: pushq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 16 ; NOFMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; NOFMA-NEXT: movaps %xmm0, %xmm1 ; NOFMA-NEXT: movaps %xmm0, %xmm2 ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: popq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; ; FMA-LABEL: f17: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: f17: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 ; FMA4-NEXT: retq entry: %result = call float @llvm.experimental.constrained.fma.f32( float 3.5, float 3.5, float 3.5, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret float %result } ; Verify that fma(42.1) isn't simplified when the rounding mode is ; unknown. define double @f18() #0 { ; NOFMA-LABEL: f18: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: pushq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 16 ; NOFMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NOFMA-NEXT: movaps %xmm0, %xmm1 ; NOFMA-NEXT: movaps %xmm0, %xmm2 ; NOFMA-NEXT: callq fma ; NOFMA-NEXT: popq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; ; FMA-LABEL: f18: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: f18: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 ; FMA4-NEXT: retq entry: %result = call double @llvm.experimental.constrained.fma.f64( double 42.1, double 42.1, double 42.1, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret double %result } define <4 x float> @f19(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 { ; NOFMA-LABEL: f19: ; NOFMA: # %bb.0: # %entry ; 
NOFMA-NEXT: subq $88, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 96 ; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm0 ; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[1,1,1,1] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm1 
= xmm1[0],mem[0] ; NOFMA-NEXT: movdqa %xmm1, %xmm0 ; NOFMA-NEXT: addq $88, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; ; FMA-LABEL: f19: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; FMA-NEXT: retq ; ; FMA4-LABEL: f19: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 ; FMA4-NEXT: retq entry: %3 = fneg <4 x float> %0 %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret <4 x float> %result } define <2 x double> @f20(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 { ; NOFMA-LABEL: f20: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: subq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 80 ; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0 ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: callq fma ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] ; NOFMA-NEXT: callq fma ; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; NOFMA-NEXT: movdqa %xmm1, %xmm0 ; NOFMA-NEXT: addq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; ; FMA-LABEL: f20: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; FMA-NEXT: retq ; ; FMA4-LABEL: f20: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 ; FMA4-NEXT: retq entry: 
%3 = fneg <2 x double> %0 %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret <2 x double> %result } define <4 x float> @f21(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 { ; NOFMA-LABEL: f21: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: subq $88, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 96 ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm2 ; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; 
NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[1,1,1,1] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; NOFMA-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0] ; NOFMA-NEXT: movaps %xmm1, %xmm0 ; NOFMA-NEXT: addq $88, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; ; FMA-LABEL: f21: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; FMA-NEXT: retq ; ; FMA4-LABEL: f21: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 ; FMA4-NEXT: retq entry: %3 = fneg <4 x float> %2 %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %3, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret <4 x float> %result } define <2 x double> @f22(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 { ; NOFMA-LABEL: f22: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: subq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 80 ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm2 ; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: callq fma ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] ; NOFMA-NEXT: callq fma ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; NOFMA-NEXT: 
movaps %xmm1, %xmm0 ; NOFMA-NEXT: addq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; ; FMA-LABEL: f22: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; FMA-NEXT: retq ; ; FMA4-LABEL: f22: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 ; FMA4-NEXT: retq entry: %3 = fneg <2 x double> %2 %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %3, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret <2 x double> %result } define <4 x float> @f23(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 { ; NOFMA-LABEL: f23: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: subq $88, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 96 ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; NOFMA-NEXT: pxor %xmm3, %xmm0 ; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: pxor %xmm3, %xmm2 ; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[1,1,1,1] ; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[1,1,1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0] ; NOFMA-NEXT: movdqa %xmm1, %xmm0 ; NOFMA-NEXT: addq $88, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; ; FMA-LABEL: f23: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; FMA-NEXT: retq ; ; FMA4-LABEL: f23: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 ; FMA4-NEXT: retq entry: %3 = fneg <4 x float> %0 %4 = fneg <4 x float> %2 %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret <4 x float> %result } define <2 x double> @f24(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 { ; NOFMA-LABEL: f24: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: subq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 80 ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] ; NOFMA-NEXT: xorps %xmm3, %xmm0 ; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: xorps %xmm3, %xmm2 ; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: callq fma ; 
NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; NOFMA-NEXT: callq fma ; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; NOFMA-NEXT: movdqa %xmm1, %xmm0 ; NOFMA-NEXT: addq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; ; FMA-LABEL: f24: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; FMA-NEXT: retq ; ; FMA4-LABEL: f24: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 ; FMA4-NEXT: retq entry: %3 = fneg <2 x double> %0 %4 = fneg <2 x double> %2 %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret <2 x double> %result } define <4 x float> @f25(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 { ; NOFMA-LABEL: f25: ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: subq $88, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 96 ; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; NOFMA-NEXT: callq fmaf ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; NOFMA-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0] ; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm1 ; NOFMA-NEXT: movaps %xmm1, %xmm0 ; NOFMA-NEXT: addq $88, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; ; FMA-AVX1-LABEL: f25: ; FMA-AVX1: # %bb.0: # %entry ; FMA-AVX1-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; FMA-AVX1-NEXT: retq ; ; FMA4-LABEL: f25: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 ; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: retq ; ; FMA-AVX512-LABEL: f25: ; FMA-AVX512: # %bb.0: # %entry ; FMA-AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT: retq
entry:
  %3 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %result = fneg <4 x float> %3
  ret <4 x float> %result
}

; f26 - <2 x double> strict fma whose result is negated, i.e. the IR computes
; -(%0 * %1 + %2). The three operands are passed to the constrained fma
; intrinsic unchanged (round.dynamic / fpexcept.strict) and the only fneg is
; applied to the intrinsic's result.
; What the autogenerated expectations below pin down:
;  * without FMA support, each of the two lanes goes through a call to the
;    fma libcall, the lanes are recombined with movlhps, and the vector is
;    xor'ed with a constant-pool operand;
;  * with FMA3 / AVX-512F (shared prefix) a plain vfmadd213pd is selected and
;    the result negation stays a separate vxorpd;
;  * with FMA4 a plain vfmaddpd is selected, again followed by a vxorpd.
define <2 x double> @f26(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f26:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 80
; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm1
; NOFMA-NEXT: movaps %xmm1, %xmm0
; NOFMA-NEXT: addq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f26:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f26:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
  %3 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %result = fneg <2 x double> %3
  ret <2 x double> %result
}

; f27 - <4 x float> strict fma with a negated multiplicand (%0), a negated
; addend (%2) and a negated result, i.e. the IR computes
; -((-%0) * %1 + (-%2)).
; What the autogenerated expectations below pin down:
;  * without FMA support, %0 and %2 are xor'ed with a -0.0 splat up front,
;    each of the four lanes goes through a call to the fmaf libcall, and the
;    reassembled vector is xor'ed once more with a constant-pool operand;
;  * with FMA3 and with AVX-512F, vfnmsub213ps is selected, so both operand
;    negations are absorbed by the instruction, while the result negation is
;    still a separate vxorps (AVX-512F materialises the -0.0 splat with
;    vbroadcastss first);
;  * with FMA4, vfnmsubps is selected, followed by the same separate vxorps.
define <4 x float> @f27(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f27:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 96
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT: pxor %xmm3, %xmm0
; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pxor %xmm3, %xmm2
; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[1,1,1,1]
; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[1,1,1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm1
; NOFMA-NEXT: movdqa %xmm1, %xmm0
; NOFMA-NEXT: addq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-AVX1-LABEL: f27:
; FMA-AVX1: # %bb.0: # %entry
; FMA-AVX1-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT: retq
;
; FMA4-LABEL: f27:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
;
; FMA-AVX512-LABEL: f27:
; FMA-AVX512: # %bb.0: # %entry
; FMA-AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT: retq
entry:
  %3 = fneg <4 x float> %0
  %4 = fneg <4 x float> %2
  %5 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %result = fneg <4 x float> %5
  ret <4 x float> %result
}

; f28 - <2 x double> counterpart of f27: negated multiplicand (%0), negated
; addend (%2) and negated result, i.e. the IR computes
; -((-%0) * %1 + (-%2)).
; What the autogenerated expectations below pin down:
;  * without FMA support, %0 and %2 are xor'ed with a -0.0 splat up front,
;    each of the two lanes goes through a call to the fma libcall, and the
;    recombined vector is xor'ed once more with a constant-pool operand;
;  * with FMA3 / AVX-512F (shared prefix) vfnmsub213pd is selected, so both
;    operand negations are absorbed by the instruction, while the result
;    negation is still a separate vxorpd;
;  * with FMA4, vfnmsubpd is selected, followed by the same separate vxorpd.
define <2 x double> @f28(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f28:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 80
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT: xorps %xmm3, %xmm0
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: xorps %xmm3, %xmm2
; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm1
; NOFMA-NEXT: movdqa %xmm1, %xmm0
; NOFMA-NEXT: addq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f28:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f28:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
  %3 = fneg <2 x double> %0
  %4 = fneg <2 x double> %2
  %5 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %result = fneg <2 x double> %5
  ret <2 x double> %result
}

; Attribute group #0 is referenced by the function definitions and by the
; constrained-intrinsic call sites above; it marks them strictfp.
attributes #0 = { strictfp }

; Declarations of the constrained floating-point intrinsics.
; The fmul, fadd and fmuladd declarations are not referenced by f26-f28.
; Presumably they are used by tests in the earlier part of the file; confirm
; before removing any of them.
declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)