; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE,SSE-X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE,SSE-X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma -O3 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -O3 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX

declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float>, metadata, metadata)
declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
declare <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double>, metadata, metadata)
declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)
declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float>, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)

define <2 x double> @f1(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: f1:
; SSE:       # %bb.0:
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f1:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %a, <2 x double> %b,
                                                                     metadata !"round.dynamic",
                                                                     metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}

define <4 x float> @f2(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: f2:
; SSE:       # %bb.0:
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f2:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %a, <4 x float> %b,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <4 x float> %ret
}

define <2 x double> @f3(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: f3:
; SSE:       # %bb.0:
; SSE-NEXT:    subpd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f3:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %a, <2 x double> %b,
                                                                     metadata !"round.dynamic",
                                                                     metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}

define <4 x float> @f4(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: f4:
; SSE:       # %bb.0:
; SSE-NEXT:    subps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f4:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %a, <4 x float> %b,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <4 x float> %ret
}

define <2 x double> @f5(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: f5:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %a, <2 x double> %b,
                                                                     metadata !"round.dynamic",
                                                                     metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}

define <4 x float> @f6(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: f6:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %a, <4 x float> %b,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <4 x float> %ret
}

define <2 x double> @f7(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: f7:
; SSE:       # %bb.0:
; SSE-NEXT:    divpd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f7:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> %a, <2 x double> %b,
                                                                     metadata !"round.dynamic",
                                                                     metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}

define <4 x float> @f8(<4 x float> %a, <4 x float> %b) #0 {
; SSE-LABEL: f8:
; SSE:       # %bb.0:
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f8:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> %a, <4 x float> %b,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <4 x float> %ret
}

define <2 x double> @f9(<2 x double> %a) #0 {
; SSE-LABEL: f9:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f9:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %sqrt = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(
                              <2 x double> %a,
                              metadata !"round.dynamic",
                              metadata !"fpexcept.strict") #0
  ret <2 x double> %sqrt
}

define <4 x float> @f10(<4 x float> %a) #0 {
; SSE-LABEL: f10:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f10:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %sqrt = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(
                              <4 x float> %a,
                              metadata !"round.dynamic",
                              metadata !"fpexcept.strict") #0
  ret <4 x float> %sqrt
}

define <4 x float> @f11(<2 x double> %a0, <4 x float> %a1) #0 {
; SSE-LABEL: f11:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtsd2ss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f11:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ext = extractelement <2 x double> %a0, i32 0
  %cvt = call float @llvm.experimental.constrained.fptrunc.f32.f64(double %ext,
                                                                   metadata !"round.dynamic",
                                                                   metadata !"fpexcept.strict") #0
  %res = insertelement <4 x float> %a1, float %cvt, i32 0
  ret <4 x float> %res
}

define <2 x double> @f12(<2 x double> %a0, <4 x float> %a1) #0 {
; SSE-LABEL: f12:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtss2sd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f12:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ext = extractelement <4 x float> %a1, i32 0
  %cvt = call double @llvm.experimental.constrained.fpext.f64.f32(float %ext,
                                                                  metadata !"fpexcept.strict") #0
  %res = insertelement <2 x double> %a0, double %cvt, i32 0
  ret <2 x double> %res
}

define <4 x float> @f13(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
; SSE-X86-LABEL: f13:
; SSE-X86:       # %bb.0:
; SSE-X86-NEXT:    subl $108, %esp
; SSE-X86-NEXT:    .cfi_def_cfa_offset 112
; SSE-X86-NEXT:    movups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movss %xmm2, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movss %xmm0, (%esp)
; SSE-X86-NEXT:    calll fmaf
; SSE-X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; SSE-X86-NEXT:    wait
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X86-NEXT:    movss %xmm0, (%esp)
; SSE-X86-NEXT:    calll fmaf
; SSE-X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; SSE-X86-NEXT:    wait
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-X86-NEXT:    movss %xmm0, (%esp)
; SSE-X86-NEXT:    calll fmaf
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-X86-NEXT:    movss %xmm0, (%esp)
; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    wait
; SSE-X86-NEXT:    calll fmaf
; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    wait
; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-X86-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-X86-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-X86-NEXT:    addl $108, %esp
; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
; SSE-X86-NEXT:    retl
;
; SSE-X64-LABEL: f13:
; SSE-X64:       # %bb.0:
; SSE-X64-NEXT:    subq $88, %rsp
; SSE-X64-NEXT:    .cfi_def_cfa_offset 96
; SSE-X64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE-X64-NEXT:    callq fmaf
; SSE-X64-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-X64-NEXT:    callq fmaf
; SSE-X64-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-X64-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-X64-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-X64-NEXT:    callq fmaf
; SSE-X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
; SSE-X64-NEXT:    callq fmaf
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-X64-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-X64-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE-X64-NEXT:    movaps %xmm1, %xmm0
; SSE-X64-NEXT:    addq $88, %rsp
; SSE-X64-NEXT:    .cfi_def_cfa_offset 8
; SSE-X64-NEXT:    retq
;
; AVX-LABEL: f13:
; AVX:       # %bb.0:
; AVX-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c,
                                                                   metadata !"round.dynamic",
                                                                   metadata !"fpexcept.strict") #0
  ret <4 x float> %res
}

define <2 x double> @f14(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
; SSE-X86-LABEL: f14:
; SSE-X86:       # %bb.0:
; SSE-X86-NEXT:    pushl %ebp
; SSE-X86-NEXT:    .cfi_def_cfa_offset 8
; SSE-X86-NEXT:    .cfi_offset %ebp, -8
; SSE-X86-NEXT:    movl %esp, %ebp
; SSE-X86-NEXT:    .cfi_def_cfa_register %ebp
; SSE-X86-NEXT:    andl $-16, %esp
; SSE-X86-NEXT:    subl $112, %esp
; SSE-X86-NEXT:    movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; SSE-X86-NEXT:    movlps %xmm2, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movlps %xmm1, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movlps %xmm0, (%esp)
; SSE-X86-NEXT:    calll fma
; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhps %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhps %xmm0, {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-X86-NEXT:    movhps %xmm0, (%esp)
; SSE-X86-NEXT:    fstpl {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    wait
; SSE-X86-NEXT:    calll fma
; SSE-X86-NEXT:    fstpl {{[0-9]+}}(%esp)
; SSE-X86-NEXT:    wait
; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-X86-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-X86-NEXT:    movl %ebp, %esp
; SSE-X86-NEXT:    popl %ebp
; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
; SSE-X86-NEXT:    retl
;
; SSE-X64-LABEL: f14:
; SSE-X64:       # %bb.0:
; SSE-X64-NEXT:    subq $72, %rsp
; SSE-X64-NEXT:    .cfi_def_cfa_offset 80
; SSE-X64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-X64-NEXT:    callq fma
; SSE-X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-X64-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-X64-NEXT:    callq fma
; SSE-X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-X64-NEXT:    movaps %xmm1, %xmm0
; SSE-X64-NEXT:    addq $72, %rsp
; SSE-X64-NEXT:    .cfi_def_cfa_offset 8
; SSE-X64-NEXT:    retq
;
; AVX-LABEL: f14:
; AVX:       # %bb.0:
; AVX-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c,
                                                                    metadata !"round.dynamic",
                                                                    metadata !"fpexcept.strict") #0
  ret <2 x double> %res
}

define <2 x double> @f15(<2 x float> %a) #0 {
; SSE-LABEL: f15:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtps2pd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f15:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtps2pd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(
                              <2 x float> %a,
                              metadata !"fpexcept.strict") #0
  ret <2 x double> %ret
}

define <2 x float> @f16(<2 x double> %a) #0 {
; SSE-LABEL: f16:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %ret = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(
                              <2 x double> %a,
                              metadata !"round.dynamic",
                              metadata !"fpexcept.strict") #0
  ret <2 x float> %ret
}

attributes #0 = { strictfp }