; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s

; This test checks the fusing of MUL + ADDSUB to FMADDSUB, and of
; MUL + SUBADD to FMSUBADD.
;
; Three groups of tests follow, each covering 128-, 256- and 512-bit vectors:
;   * mul_addsub_*             - fmul, then fsub/fadd blended by a shufflevector
;                                (even lanes take the fsub, odd lanes the fadd).
;   * buildvector_mul_addsub_* - the same pattern written as per-lane scalar
;                                fsub/fadd re-assembled with insertelement.
;   * buildvector_mul_subadd_* - the inverse lane assignment (fadd in even
;                                lanes, fsub in odd lanes).

define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B,  <2 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %Addsub
}

define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %Addsub
}

define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %Addsub
}

define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %Addsub
}

define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213pd %ymm4, %ymm2, %ymm0
; FMA3_256-NEXT:    vfmaddsub213pd %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %Addsub
}

define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213ps %ymm4, %ymm2, %ymm0
; FMA3_256-NEXT:    vfmaddsub213ps %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %Addsub
}

; Build-vector form of ADDSUB: even lanes are fsub, odd lanes are fadd.

define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

; In the 512-bit build-vector tests some result lanes are deliberately never
; inserted (left undef); the fused instruction is still expected.

define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213ps %ymm4, %ymm2, %ymm0
; FMA3_256-NEXT:    vfmaddsub213ps %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fsub float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fsub float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fsub float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fsub float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fadd float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fadd float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fadd float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fadd float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213pd %ymm4, %ymm2, %ymm0
; FMA3_256-NEXT:    vfmaddsub213pd %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fsub double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fsub double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fadd double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

; Build-vector form of SUBADD: even lanes are fadd, odd lanes are fsub.
; NOTE: the %subN / %addN value names below are kept from the ADDSUB tests and
; therefore name the opposite operation (%subN holds an fadd, %addN an fsub).

define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_ps512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmsubaddps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fadd float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fadd float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fadd float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fadd float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fsub float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fsub float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fsub float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fsub float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmsubaddpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fadd double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fadd double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fsub double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }