; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s

; Loop-carried FMA intrinsic tests for x86-64 with AVX2 + FMA.
;
; Every function below has the same shape:
;   * %iter is the loop trip bound, %a and %b are loop-invariant multiplicands,
;     and %c is the initial value of the accumulator.
;   * The accumulator (%c.addr.0) is a phi that is fed back from the intrinsic
;     result and is passed as the third operand of the FMA intrinsic.
;   * The accumulator phi itself is what the function returns.
;
; The expected assembly uses the "231" instruction form, so the accumulator
; stays in xmm2/ymm2 for the whole loop and the only vector register copy is
; the vmovaps/vmovapd into the return register after the loop.
;
; The four groups cover <2 x double>, <4 x float>, <4 x double> and
; <8 x float>; each group exercises fmaddsub, fmsubadd, fmadd, fmsub, fnmadd
; and fnmsub in that order.

;
; 128-bit packed double (<2 x double>, xmm registers)
;

; vfmaddsub.pd with the loop-carried value as the third operand.
define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: fmaddsubpd_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB0_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB0_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB0_2
; CHECK-NEXT:  .LBB0_3: # %for.end
; CHECK-NEXT:    vmovapd %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; vfmsubadd.pd with the loop-carried value as the third operand.
define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: fmsubaddpd_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB1_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB1_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB1_2
; CHECK-NEXT:  .LBB1_3: # %for.end
; CHECK-NEXT:    vmovapd %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; vfmadd.pd with the loop-carried value as the third operand.
define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: fmaddpd_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB2_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB2_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB2_2
; CHECK-NEXT:  .LBB2_3: # %for.end
; CHECK-NEXT:    vmovapd %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; vfmsub.pd with the loop-carried value as the third operand.
define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: fmsubpd_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB3_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB3_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB3_2
; CHECK-NEXT:  .LBB3_3: # %for.end
; CHECK-NEXT:    vmovapd %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; vfnmadd.pd with the loop-carried value as the third operand.
define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: fnmaddpd_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB4_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB4_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB4_2
; CHECK-NEXT:  .LBB4_3: # %for.end
; CHECK-NEXT:    vmovapd %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; vfnmsub.pd with the loop-carried value as the third operand.
define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: fnmsubpd_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB5_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB5_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB5_2
; CHECK-NEXT:  .LBB5_3: # %for.end
; CHECK-NEXT:    vmovapd %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; Intrinsics used by the 128-bit packed double tests above.
declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)

;
; 128-bit packed single (<4 x float>, xmm registers)
;

; vfmaddsub.ps with the loop-carried value as the third operand.
define <4 x float> @fmaddsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: fmaddsubps_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB6_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB6_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB6_2
; CHECK-NEXT:  .LBB6_3: # %for.end
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; vfmsubadd.ps with the loop-carried value as the third operand.
define <4 x float> @fmsubaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: fmsubaddps_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB7_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB7_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB7_2
; CHECK-NEXT:  .LBB7_3: # %for.end
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; vfmadd.ps with the loop-carried value as the third operand.
define <4 x float> @fmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: fmaddps_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB8_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB8_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB8_2
; CHECK-NEXT:  .LBB8_3: # %for.end
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; vfmsub.ps with the loop-carried value as the third operand.
define <4 x float> @fmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: fmsubps_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB9_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB9_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB9_2
; CHECK-NEXT:  .LBB9_3: # %for.end
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; vfnmadd.ps with the loop-carried value as the third operand.
define <4 x float> @fnmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: fnmaddps_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB10_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB10_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB10_2
; CHECK-NEXT:  .LBB10_3: # %for.end
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; vfnmsub.ps with the loop-carried value as the third operand.
define <4 x float> @fnmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: fnmsubps_loop_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB11_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB11_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB11_2
; CHECK-NEXT:  .LBB11_3: # %for.end
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; Intrinsics used by the 128-bit packed single tests above.
declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)

;
; 256-bit packed double (<4 x double>, ymm registers)
;

; vfmaddsub.pd.256 with the loop-carried value as the third operand.
define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
; CHECK-LABEL: fmaddsubpd_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB12_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB12_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB12_2
; CHECK-NEXT:  .LBB12_3: # %for.end
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; vfmsubadd.pd.256 with the loop-carried value as the third operand.
define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
; CHECK-LABEL: fmsubaddpd_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB13_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB13_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB13_2
; CHECK-NEXT:  .LBB13_3: # %for.end
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; vfmadd.pd.256 with the loop-carried value as the third operand.
define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
; CHECK-LABEL: fmaddpd_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB14_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB14_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB14_2
; CHECK-NEXT:  .LBB14_3: # %for.end
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; vfmsub.pd.256 with the loop-carried value as the third operand.
define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
; CHECK-LABEL: fmsubpd_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB15_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB15_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB15_2
; CHECK-NEXT:  .LBB15_3: # %for.end
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; vfnmadd.pd.256 with the loop-carried value as the third operand.
define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
; CHECK-LABEL: fnmaddpd_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB16_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB16_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB16_2
; CHECK-NEXT:  .LBB16_3: # %for.end
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; vfnmsub.pd.256 with the loop-carried value as the third operand.
define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
; CHECK-LABEL: fnmsubpd_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB17_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB17_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB17_2
; CHECK-NEXT:  .LBB17_3: # %for.end
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; Intrinsics used by the 256-bit packed double tests above.
declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)

;
; 256-bit packed single (<8 x float>, ymm registers)
;

; vfmaddsub.ps.256 with the loop-carried value as the third operand.
define <8 x float> @fmaddsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
; CHECK-LABEL: fmaddsubps_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB18_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB18_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB18_2
; CHECK-NEXT:  .LBB18_3: # %for.end
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; vfmsubadd.ps.256 with the loop-carried value as the third operand.
define <8 x float> @fmsubaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
; CHECK-LABEL: fmsubaddps_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB19_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB19_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB19_2
; CHECK-NEXT:  .LBB19_3: # %for.end
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; vfmadd.ps.256 with the loop-carried value as the third operand.
define <8 x float> @fmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
; CHECK-LABEL: fmaddps_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB20_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB20_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB20_2
; CHECK-NEXT:  .LBB20_3: # %for.end
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; vfmsub.ps.256 with the loop-carried value as the third operand.
define <8 x float> @fmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
; CHECK-LABEL: fmsubps_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB21_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB21_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB21_2
; CHECK-NEXT:  .LBB21_3: # %for.end
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; vfnmadd.ps.256 with the loop-carried value as the third operand.
define <8 x float> @fnmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
; CHECK-LABEL: fnmaddps_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB22_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB22_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB22_2
; CHECK-NEXT:  .LBB22_3: # %for.end
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; vfnmsub.ps.256 with the loop-carried value as the third operand.
define <8 x float> @fnmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
; CHECK-LABEL: fnmsubps_loop_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jge .LBB23_3
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB23_2: # %for.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    cmpl %edi, %eax
; CHECK-NEXT:    jl .LBB23_2
; CHECK-NEXT:  .LBB23_3: # %for.end
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; Intrinsics used by the 256-bit packed single tests above.
declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)