; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.

; Lanes 0,1 from %a and lane 3 from %b: still a single haddps.
define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test1_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
  ret <4 x float> %vecinit13
}

; Result lane 1 is undef; haddps is still formed from the remaining lanes.
define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

; Result lane 3 is undef; haddps is still formed from the remaining lanes.
define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test3_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  ret <4 x float> %vecinit9
}

; Only one horizontal sum is live: a scalar addss is preferred over haddps.
define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test4_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test4_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  ret <4 x float> %vecinit
}

; Single live double-precision sum: scalar addsd instead of haddpd.
define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test5_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test5_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %a, i32 0
  %vecext1 = extractelement <2 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  ret <2 x double> %vecinit
}

; Both live sums come from %a: haddps with the source register duplicated.
define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test6_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test6_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  ret <4 x float> %vecinit5
}

; Both live sums come from %b and land in the upper result lanes.
define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test7_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test7_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %b, i32 0
  %vecext1 = extractelement <4 x float> %b, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 2
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

; Lane placement (0 and 2) does not match any haddps lane pattern, so the
; sums stay scalar.
define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test8_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    addss %xmm2, %xmm0
; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test8_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
  ret <4 x float> %vecinit5
}

; One sum from %a (lane 0) and one from %b (lane 3): single haddps.
define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test9_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test9_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

; 256-bit case: on SSE only the low 128 bits are live; on AVX a ymm vhaddps
; covers both lanes.
define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test10_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test10_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 2
  %vecext3 = extractelement <8 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
  ret <8 x float> %vecinit5
}

; Cross-128-bit-lane placement: SSE falls back to scalar adds, AVX can still
; use a ymm vhaddps.
define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test11_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE-NEXT:    addss %xmm3, %xmm1
; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: test11_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 4
  %vecext3 = extractelement <8 x float> %b, i32 5
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
  ret <8 x float> %vecinit5
}

; Only the low two result lanes are live, all sourced from %a.
define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test12_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test12_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  ret <8 x float> %vecinit5
}

; All four pairwise sums of %a packed into the low half of the result.
define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test13_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test13_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <8 x float> %a, i32 4
  %vecext5 = extractelement <8 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <8 x float> %a, i32 6
  %vecext7 = extractelement <8 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
  ret <8 x float> %vecinit4
}

; Integer variant of test10: phaddd / vphaddd.
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test14_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test14_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; On AVX2, the following sequence can be folded into a single horizontal add.
; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
; integer horizontal adds instead of two scalar adds followed by vector inserts.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test15_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    movd %xmm0, %ecx
; SSE-NEXT:    addl %eax, %ecx
; SSE-NEXT:    movd %xmm3, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    addl %eax, %edx
; SSE-NEXT:    movd %ecx, %xmm0
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test15_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX1-NEXT:    addl %eax, %ecx
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vpextrd $1, %xmm0, %edx
; AVX1-NEXT:    addl %eax, %edx
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    vmovd %edx, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test15_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}

; Integer variant of test12: both live sums from %a's low half.
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}

; Integer variant of test13: all four pairwise sums of %a in the low half.
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}