; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=FMA4

; This test checks the fusing of MUL + ADDSUB to FMADDSUB.

define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %Addsub
}

define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %Addsub
}

define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %Addsub
}

define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %Addsub
}

define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %Addsub
}

define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %Addsub
}

define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_addsub_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fsub float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fsub float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fsub float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fsub float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fadd float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fadd float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fadd float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fadd float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fsub double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fsub double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fadd double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; FMA3-LABEL: buildvector_mul_subadd_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_ps512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fadd float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fadd float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fadd float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fadd float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fsub float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fsub float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fsub float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fsub float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fadd double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fadd double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fsub double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }