; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s

; Every function in this part of the file has the same shape: a counted loop
; (i = 0; i < %iter; ++i) whose loop-carried vector %c.addr.0 starts as %c and
; is replaced on each iteration by the result of an x86 FMA intrinsic called
; with (%a, %b, %c.addr.0).  %a and %b never change inside the loop.
;
; The FileCheck directives pin the "231" form of each FMA instruction.  In
; AT&T operand order the last register of a 231-form FMA is both the operand
; that is not multiplied and the destination, so the expected code keeps the
; running value in register 2 and copies it to register 0 with one vmovaps
; immediately before retq.
; NOTE(review): presumably the point is that no extra register copy is needed
; inside the loop; the directives themselves only pin the FMA form and the
; vmovaps/retq pair.

; ---- 128-bit packed double (<2 x double>, xmm registers) ----

; fmaddsub variant.
; CHECK-LABEL: fmaddsubpd_loop_128:
; CHECK: vfmaddsub231pd %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
  br label %for.cond

for.cond:
  ; Loop-carried value: %c on entry, the previous FMA result afterwards.
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Induction variable, starts at 0 and is bumped in for.inc.
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  ; Empty body; all the work happens in for.inc.
  br label %for.inc

for.inc:
  ; Only the third operand changes from one iteration to the next.
  %0 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; fmsubadd variant.
; CHECK-LABEL: fmsubaddpd_loop_128:
; CHECK: vfmsubadd231pd %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; fmadd variant.
; CHECK-LABEL: fmaddpd_loop_128:
; CHECK: vfmadd231pd %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; fmsub variant.
; CHECK-LABEL: fmsubpd_loop_128:
; CHECK: vfmsub231pd %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; fnmadd variant.
; CHECK-LABEL: fnmaddpd_loop_128:
; CHECK: vfnmadd231pd %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; fnmsub variant.
; CHECK-LABEL: fnmsubpd_loop_128:
; CHECK: vfnmsub231pd %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <2 x double> %c.addr.0
}

; Intrinsics used by the 128-bit packed-double tests above.
declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)


; ---- 128-bit packed single (<4 x float>, xmm registers) ----

; fmaddsub variant.
; CHECK-LABEL: fmaddsubps_loop_128:
; CHECK: vfmaddsub231ps %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <4 x float> @fmaddsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; fmsubadd variant.
; CHECK-LABEL: fmsubaddps_loop_128:
; CHECK: vfmsubadd231ps %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <4 x float> @fmsubaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; fmadd variant.
; CHECK-LABEL: fmaddps_loop_128:
; CHECK: vfmadd231ps %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <4 x float> @fmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; fmsub variant.
; CHECK-LABEL: fmsubps_loop_128:
; CHECK: vfmsub231ps %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <4 x float> @fmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; fnmadd variant.
; CHECK-LABEL: fnmaddps_loop_128:
; CHECK: vfnmadd231ps %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <4 x float> @fnmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; fnmsub variant.
; CHECK-LABEL: fnmsubps_loop_128:
; CHECK: vfnmsub231ps %xmm1, %xmm0, %xmm2
; CHECK: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
define <4 x float> @fnmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x float> %c.addr.0
}

; Intrinsics used by the 128-bit packed-single tests above.
declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)

; ---- 256-bit packed double (<4 x double>, ymm registers) ----

; fmaddsub variant.
; CHECK-LABEL: fmaddsubpd_loop_256:
; CHECK: vfmaddsub231pd %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; fmsubadd variant.
; CHECK-LABEL: fmsubaddpd_loop_256:
; CHECK: vfmsubadd231pd %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; fmadd variant (directives for the definition that follows).
; CHECK-LABEL: fmaddpd_loop_256:
; CHECK: vfmadd231pd %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
; Shape shared by every function in this part of the file: a counted loop
; (i = 0; i < %iter; ++i) whose loop-carried vector %c.addr.0 starts as %c and
; is replaced each iteration by an x86 FMA intrinsic called with
; (%a, %b, %c.addr.0).  The FileCheck directives ahead of each definition pin
; the 231-form FMA on registers 1, 0, 2 (AT&T order, so register 2 is both the
; non-multiplied operand and the destination), then a vmovaps of register 2
; into register 0 immediately before retq.
entry:
  br label %for.cond

for.cond:
  ; Loop-carried value: %c on entry, the previous FMA result afterwards.
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Induction variable, starts at 0 and is bumped in for.inc.
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  ; Empty body; all the work happens in for.inc.
  br label %for.inc

for.inc:
  ; Only the third operand changes from one iteration to the next.
  %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; 256-bit packed double, fmsub variant.
; CHECK-LABEL: fmsubpd_loop_256:
; CHECK: vfmsub231pd %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; 256-bit packed double, fnmadd variant.
; CHECK-LABEL: fnmaddpd_loop_256:
; CHECK: vfnmadd231pd %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; 256-bit packed double, fnmsub variant.
; CHECK-LABEL: fnmsubpd_loop_256:
; CHECK: vfnmsub231pd %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; Intrinsics used by the 256-bit packed-double tests.
declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)


; ---- 256-bit packed single (<8 x float>, ymm registers) ----

; fmaddsub variant.
; CHECK-LABEL: fmaddsubps_loop_256:
; CHECK: vfmaddsub231ps %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <8 x float> @fmaddsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; fmsubadd variant.
; CHECK-LABEL: fmsubaddps_loop_256:
; CHECK: vfmsubadd231ps %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <8 x float> @fmsubaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; fmadd variant.
; CHECK-LABEL: fmaddps_loop_256:
; CHECK: vfmadd231ps %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <8 x float> @fmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; fmsub variant (directives for the definition that follows).
; CHECK-LABEL: fmsubps_loop_256:
; CHECK: vfmsub231ps %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <8 x float> @fmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
; Shape shared by every function in this part of the file: a counted loop
; (i = 0; i < %iter; ++i) whose loop-carried vector %c.addr.0 starts as %c and
; is replaced each iteration by an x86 FMA intrinsic called with
; (%a, %b, %c.addr.0).  The FileCheck directives ahead of each definition pin
; the 231-form FMA on ymm1, ymm0, ymm2 (AT&T order, so ymm2 is both the
; non-multiplied operand and the destination), then a vmovaps of ymm2 into
; ymm0 immediately before retq.
entry:
  br label %for.cond

for.cond:
  ; Loop-carried value: %c on entry, the previous FMA result afterwards.
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Induction variable, starts at 0 and is bumped in for.inc.
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  ; Empty body; all the work happens in for.inc.
  br label %for.inc

for.inc:
  ; Only the third operand changes from one iteration to the next.
  %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; 256-bit packed single, fnmadd variant.
; CHECK-LABEL: fnmaddps_loop_256:
; CHECK: vfnmadd231ps %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <8 x float> @fnmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; 256-bit packed single, fnmsub variant.
; CHECK-LABEL: fnmsubps_loop_256:
; CHECK: vfnmsub231ps %ymm1, %ymm0, %ymm2
; CHECK: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
define <8 x float> @fnmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %for.cond

for.cond:
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  %0 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; Intrinsics used by the 256-bit packed-single tests.
declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)