; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
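
; These tests cover fast-math fadd reductions via
; llvm.experimental.vector.reduce.fadd. The 'fast' flag allows the reduction
; to be reassociated into a pairwise shuffle/add tree (haddps/haddpd where
; available); note that the scalar accumulator operand never appears in the
; generated code below.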

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: haddps %xmm1, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddps %xmm1, %xmm1, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: haddps %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: haddps %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm4, %xmm2
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm4, %xmm2
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: haddps %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT: vaddps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (zero)
;
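; With a 0.0 accumulator the scalar operand folds away entirely under fast
; math, so only the vector operand is reduced.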

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: haddps %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: haddps %xmm1, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: haddps %xmm1, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: haddps %xmm1, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;
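; An undef accumulator is dropped in the same way; codegen matches the
; zero-accumulator tests above.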

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: haddps %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: haddps %xmm1, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: haddps %xmm1, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: haddps %xmm1, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;
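; The f64 reductions use the same halving strategy: SSE2 splits with movhlps,
; SSE4.1 uses haddpd, AVX extracts the upper 128-bit lane first, and AVX512
; narrows 512 -> 256 -> 128 bits before the final add.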

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE2-LABEL: test_v2f64:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f64:
; SSE41: # %bb.0:
; SSE41-NEXT: haddpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddpd %xmm1, %xmm1, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE2-LABEL: test_v4f64:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm2, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f64:
; SSE41: # %bb.0:
; SSE41-NEXT: addpd %xmm2, %xmm1
; SSE41-NEXT: haddpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE2-LABEL: test_v8f64:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm4, %xmm2
; SSE2-NEXT: addpd %xmm3, %xmm1
; SSE2-NEXT: addpd %xmm2, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f64:
; SSE41: # %bb.0:
; SSE41-NEXT: addpd %xmm4, %xmm2
; SSE41-NEXT: addpd %xmm3, %xmm1
; SSE41-NEXT: addpd %xmm2, %xmm1
; SSE41-NEXT: haddpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE2-LABEL: test_v16f64:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm6, %xmm2
; SSE2-NEXT: addpd %xmm7, %xmm3
; SSE2-NEXT: addpd %xmm5, %xmm1
; SSE2-NEXT: addpd %xmm3, %xmm1
; SSE2-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT: addpd %xmm2, %xmm4
; SSE2-NEXT: addpd %xmm1, %xmm4
; SSE2-NEXT: movapd %xmm4, %xmm0
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; SSE2-NEXT: addpd %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f64:
; SSE41: # %bb.0:
; SSE41-NEXT: addpd %xmm6, %xmm2
; SSE41-NEXT: addpd %xmm7, %xmm3
; SSE41-NEXT: addpd %xmm5, %xmm1
; SSE41-NEXT: addpd %xmm3, %xmm1
; SSE41-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
; SSE41-NEXT: addpd %xmm2, %xmm4
; SSE41-NEXT: addpd %xmm1, %xmm4
; SSE41-NEXT: haddpd %xmm4, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm4, %ymm2, %ymm0
; AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (zero)
;
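; As with f32, the 0.0 accumulator folds away and only the vector operand is
; reduced.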

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE2-LABEL: test_v2f64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f64_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: haddpd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE2-LABEL: test_v4f64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f64_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addpd %xmm1, %xmm0
; SSE41-NEXT: haddpd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE2-LABEL: test_v8f64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm3, %xmm1
; SSE2-NEXT: addpd %xmm2, %xmm0
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f64_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addpd %xmm3, %xmm1
; SSE41-NEXT: addpd %xmm2, %xmm0
; SSE41-NEXT: addpd %xmm1, %xmm0
; SSE41-NEXT: haddpd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE2-LABEL: test_v16f64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm6, %xmm2
; SSE2-NEXT: addpd %xmm4, %xmm0
; SSE2-NEXT: addpd %xmm2, %xmm0
; SSE2-NEXT: addpd %xmm7, %xmm3
; SSE2-NEXT: addpd %xmm5, %xmm1
; SSE2-NEXT: addpd %xmm3, %xmm1
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f64_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addpd %xmm6, %xmm2
; SSE41-NEXT: addpd %xmm4, %xmm0
; SSE41-NEXT: addpd %xmm2, %xmm0
; SSE41-NEXT: addpd %xmm7, %xmm3
; SSE41-NEXT: addpd %xmm5, %xmm1
; SSE41-NEXT: addpd %xmm3, %xmm1
; SSE41-NEXT: addpd %xmm0, %xmm1
; SSE41-NEXT: haddpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;
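; Undef accumulators lower identically to the zero-accumulator f64 tests.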

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE2-LABEL: test_v2f64_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f64_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: haddpd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE2-LABEL: test_v4f64_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f64_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addpd %xmm1, %xmm0
; SSE41-NEXT: haddpd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE2-LABEL: test_v8f64_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm3, %xmm1
; SSE2-NEXT: addpd %xmm2, %xmm0
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f64_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addpd %xmm3, %xmm1
; SSE41-NEXT: addpd %xmm2, %xmm0
; SSE41-NEXT: addpd %xmm1, %xmm0
; SSE41-NEXT: haddpd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE2-LABEL: test_v16f64_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm6, %xmm2
; SSE2-NEXT: addpd %xmm4, %xmm0
; SSE2-NEXT: addpd %xmm2, %xmm0
; SSE2-NEXT: addpd %xmm7, %xmm3
; SSE2-NEXT: addpd %xmm5, %xmm1
; SSE2-NEXT: addpd %xmm3, %xmm1
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f64_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addpd %xmm6, %xmm2
; SSE41-NEXT: addpd %xmm4, %xmm0
; SSE41-NEXT: addpd %xmm2, %xmm0
; SSE41-NEXT: addpd %xmm7, %xmm3
; SSE41-NEXT: addpd %xmm5, %xmm1
; SSE41-NEXT: addpd %xmm3, %xmm1
; SSE41-NEXT: addpd %xmm0, %xmm1
; SSE41-NEXT: haddpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)