; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 | FileCheck %s --check-prefix=FMA4

; Codegen test for x86 fused multiply-add intrinsics when the same value is
; used for more than one operand of the intrinsic call.
;
; Every function takes two vector arguments, %a and %b, and passes them to one
; intrinsic. The three letters in the function name give the operand order used
; in the call:
;   baa = (%b, %a, %a)    aba = (%a, %b, %a)    bba = (%b, %b, %a)
;
; In each expected sequence only one value is loaded into a register; the other
; value appears as the "mem" operand of the fused instruction:
;   baa, aba -> (reg * mem) +/- reg
;   bba      -> (reg * reg) +/- mem
; The packed variants load from (%rcx) for baa/aba and from (%rdx) for bba.
; NOTE(review): this presumably reflects %a and %b being passed by address in
; rcx and rdx under the win32 triple - confirm against the calling convention.
;
; The assertions are tool-generated; regenerate them with the script named on
; the first line instead of editing them by hand.

attributes #0 = { nounwind }

; vfmadd.ss (fma4 intrinsic, <4 x float>): expected form is (x * y) + z.
; The expected load fills only element 0 of xmm0 and zeroes the other elements.
declare <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_baa_ss:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_aba_ss:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_bba_ss:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

; vfmadd.ps on <4 x float>: (x * y) + z, expected in xmm0.
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_baa_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %xmm0
; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_aba_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %xmm0
; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_bba_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rdx), %xmm0
; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

; vfmadd.ps.256 on <8 x float>: same pattern, expected in ymm0.
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_baa_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %ymm0
; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
  ret <8 x float> %res
}

define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_aba_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %ymm0
; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
  ret <8 x float> %res
}

define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_bba_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rdx), %ymm0
; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm0) + mem
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
  ret <8 x float> %res
}

; vfmadd.sd (fma4 intrinsic, <2 x double>): (x * y) + z.
; The expected load fills only element 0 of xmm0 and zeroes the other element.
declare <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fmadd_baa_sd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_baa_sd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fmadd_aba_sd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_aba_sd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fmadd_bba_sd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_bba_sd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

; vfmadd.pd on <2 x double>: (x * y) + z, expected in xmm0.
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_baa_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %xmm0
; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_aba_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %xmm0
; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_bba_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rdx), %xmm0
; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

; vfmadd.pd.256 on <4 x double>: same pattern, expected in ymm0.
declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_baa_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %ymm0
; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
  ret <4 x double> %res
}

define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_aba_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %ymm0
; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
  ret <4 x double> %res
}

define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmadd_bba_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rdx), %ymm0
; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm0) + mem
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
  ret <4 x double> %res
}

; vfnmadd.ps on <4 x float>: expected form is -(x * y) + z, in xmm0.
declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_baa_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %xmm0
; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_aba_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %xmm0
; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_bba_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rdx), %xmm0
; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

; vfnmadd.ps.256 on <8 x float>: same pattern, expected in ymm0.
declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_baa_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %ymm0
; FMA4-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
  ret <8 x float> %res
}

define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_aba_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %ymm0
; FMA4-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
  ret <8 x float> %res
}

define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_bba_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rdx), %ymm0
; FMA4-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm0 * ymm0) + mem
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
  ret <8 x float> %res
}

; vfnmadd.pd on <2 x double>: -(x * y) + z, expected in xmm0.
declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_baa_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %xmm0
; FMA4-NEXT: vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_aba_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %xmm0
; FMA4-NEXT: vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_bba_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rdx), %xmm0
; FMA4-NEXT: vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

; vfnmadd.pd.256 on <4 x double>: same pattern, expected in ymm0.
declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_baa_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %ymm0
; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
  ret <4 x double> %res
}

define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_aba_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %ymm0
; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
  ret <4 x double> %res
}

define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmadd_bba_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rdx), %ymm0
; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm0) + mem
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
  ret <4 x double> %res
}

; vfmsub.ps on <4 x float>: expected form is (x * y) - z, in xmm0.
declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_baa_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %xmm0
; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_aba_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %xmm0
; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_bba_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rdx), %xmm0
; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm0) - mem
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

; vfmsub.ps.256 on <8 x float>: same pattern, expected in ymm0.
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_baa_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %ymm0
; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
  ret <8 x float> %res
}

define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_aba_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %ymm0
; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
  ret <8 x float> %res
}
; Naming rule for the functions below: the three letters in each name give the
; operand order passed to the intrinsic:
;   baa = (%b, %a, %a)    aba = (%a, %b, %a)    bba = (%b, %b, %a)
; Expected sequences load one value into a register and take the other as the
; "mem" operand: (reg * mem) -/+ reg for baa/aba, (reg * reg) -/+ mem for bba.

; Last vfmsub.ps.256 case; @llvm.x86.fma.vfmsub.ps.256 is declared earlier in
; this file.
define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_bba_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rdx), %ymm0
; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm0) - mem
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
  ret <8 x float> %res
}

; vfmsub.pd on <2 x double>: expected form is (x * y) - z, in xmm0.
declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_baa_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %xmm0
; FMA4-NEXT: vfmsubpd {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_aba_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %xmm0
; FMA4-NEXT: vfmsubpd {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_bba_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rdx), %xmm0
; FMA4-NEXT: vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm0) - mem
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

; vfmsub.pd.256 on <4 x double>: same pattern, expected in ymm0.
declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_baa_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %ymm0
; FMA4-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
  ret <4 x double> %res
}

define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_aba_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %ymm0
; FMA4-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
  ret <4 x double> %res
}

define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fmsub_bba_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rdx), %ymm0
; FMA4-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm0) - mem
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
  ret <4 x double> %res
}

; vfnmsub.ps on <4 x float>: expected form is -(x * y) - z, in xmm0.
declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_baa_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %xmm0
; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_aba_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %xmm0
; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_bba_ps:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rdx), %xmm0
; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem
; FMA4-NEXT: retq
  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %res
}

; vfnmsub.ps.256 on <8 x float>: same pattern, expected in ymm0.
declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_baa_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %ymm0
; FMA4-NEXT: vfnmsubps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
  ret <8 x float> %res
}

define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_aba_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rcx), %ymm0
; FMA4-NEXT: vfnmsubps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
  ret <8 x float> %res
}

define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_bba_ps_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovaps (%rdx), %ymm0
; FMA4-NEXT: vfnmsubps {{.*#+}} ymm0 = -(ymm0 * ymm0) - mem
; FMA4-NEXT: retq
  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
  ret <8 x float> %res
}

; vfnmsub.pd on <2 x double>: -(x * y) - z, expected in xmm0.
declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_baa_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %xmm0
; FMA4-NEXT: vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_aba_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %xmm0
; FMA4-NEXT: vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_bba_pd:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rdx), %xmm0
; FMA4-NEXT: vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem
; FMA4-NEXT: retq
  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %res
}

; vfnmsub.pd.256 on <4 x double>: same pattern, expected in ymm0.
declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_baa_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %ymm0
; FMA4-NEXT: vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
  ret <4 x double> %res
}

define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_aba_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rcx), %ymm0
; FMA4-NEXT: vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
  ret <4 x double> %res
}

define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
; FMA4-LABEL: test_x86_fnmsub_bba_pd_y:
; FMA4: # %bb.0:
; FMA4-NEXT: vmovapd (%rdx), %ymm0
; FMA4-NEXT: vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm0) - mem
; FMA4-NEXT: retq
  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
  ret <4 x double> %res
}