1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL 8 9; 10; vXf32 (accum) 11; 12 13define float @test_v2f32(float %a0, <2 x float> %a1) { 14; SSE2-LABEL: test_v2f32: 15; SSE2: # %bb.0: 16; SSE2-NEXT: addss %xmm1, %xmm0 17; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] 18; SSE2-NEXT: addss %xmm1, %xmm0 19; SSE2-NEXT: retq 20; 21; SSE41-LABEL: test_v2f32: 22; SSE41: # %bb.0: 23; SSE41-NEXT: addss %xmm1, %xmm0 24; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 25; SSE41-NEXT: addss %xmm1, %xmm0 26; SSE41-NEXT: retq 27; 28; AVX-LABEL: test_v2f32: 29; AVX: # %bb.0: 30; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 31; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 32; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 33; AVX-NEXT: retq 34; 35; AVX512-LABEL: test_v2f32: 36; AVX512: # %bb.0: 37; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 38; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 39; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 40; AVX512-NEXT: retq 41 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) 42 ret float %1 43} 44 45define float @test_v4f32(float %a0, <4 x float> %a1) { 46; SSE2-LABEL: test_v4f32: 47; SSE2: # %bb.0: 48; SSE2-NEXT: addss %xmm1, %xmm0 49; SSE2-NEXT: movaps %xmm1, %xmm2 50; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] 51; SSE2-NEXT: addss %xmm2, %xmm0 52; SSE2-NEXT: movaps %xmm1, %xmm2 53; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 54; SSE2-NEXT: addss %xmm2, %xmm0 55; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 56; SSE2-NEXT: addss %xmm1, %xmm0 57; SSE2-NEXT: retq 58; 59; SSE41-LABEL: test_v4f32: 60; SSE41: # %bb.0: 61; SSE41-NEXT: addss %xmm1, %xmm0 62; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 63; SSE41-NEXT: addss %xmm2, %xmm0 64; SSE41-NEXT: movaps %xmm1, %xmm2 65; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 66; SSE41-NEXT: addss %xmm2, %xmm0 67; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 68; SSE41-NEXT: addss %xmm1, %xmm0 69; SSE41-NEXT: retq 70; 71; AVX-LABEL: test_v4f32: 72; AVX: # %bb.0: 73; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 74; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 75; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 76; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 77; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 78; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 79; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 80; AVX-NEXT: retq 81; 82; AVX512-LABEL: test_v4f32: 83; AVX512: # %bb.0: 84; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 85; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 86; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 87; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 88; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 89; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 90; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 91; AVX512-NEXT: retq 92 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) 93 ret float %1 94} 95 96define float @test_v8f32(float %a0, <8 x float> %a1) { 97; SSE2-LABEL: test_v8f32: 98; SSE2: # %bb.0: 99; SSE2-NEXT: addss %xmm1, %xmm0 100; SSE2-NEXT: movaps %xmm1, %xmm3 101; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[2,3] 102; SSE2-NEXT: addss %xmm3, %xmm0 103; SSE2-NEXT: movaps %xmm1, %xmm3 104; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] 105; SSE2-NEXT: addss %xmm3, %xmm0 106; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 107; SSE2-NEXT: addss %xmm1, %xmm0 108; SSE2-NEXT: addss %xmm2, %xmm0 109; SSE2-NEXT: movaps %xmm2, %xmm1 110; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] 111; SSE2-NEXT: addss %xmm1, %xmm0 112; SSE2-NEXT: movaps %xmm2, %xmm1 113; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 114; SSE2-NEXT: addss %xmm1, %xmm0 115; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 116; SSE2-NEXT: addss %xmm2, %xmm0 117; SSE2-NEXT: retq 118; 119; SSE41-LABEL: test_v8f32: 120; SSE41: # %bb.0: 121; SSE41-NEXT: addss %xmm1, %xmm0 122; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 123; SSE41-NEXT: addss %xmm3, %xmm0 124; SSE41-NEXT: movaps %xmm1, %xmm3 125; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] 126; SSE41-NEXT: addss %xmm3, %xmm0 127; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 128; SSE41-NEXT: addss %xmm1, %xmm0 129; SSE41-NEXT: addss %xmm2, %xmm0 130; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 131; SSE41-NEXT: addss %xmm1, %xmm0 132; SSE41-NEXT: movaps %xmm2, %xmm1 133; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 134; SSE41-NEXT: addss %xmm1, %xmm0 135; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 136; SSE41-NEXT: addss %xmm2, %xmm0 137; SSE41-NEXT: retq 138; 139; AVX-LABEL: test_v8f32: 140; AVX: # %bb.0: 141; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 142; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 143; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 144; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 145; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 146; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 147; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 148; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 149; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 150; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 151; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 152; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 153; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 154; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 155; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 156; AVX-NEXT: vzeroupper 157; AVX-NEXT: retq 158; 159; AVX512-LABEL: test_v8f32: 160; AVX512: # %bb.0: 161; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 162; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 163; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 164; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 165; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 166; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 167; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 168; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 169; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 170; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 171; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 172; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 173; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 174; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 175; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 176; AVX512-NEXT: vzeroupper 177; AVX512-NEXT: retq 178 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) 179 ret float %1 180} 181 182define float @test_v16f32(float %a0, <16 x float> %a1) { 183; SSE2-LABEL: test_v16f32: 184; SSE2: # %bb.0: 185; SSE2-NEXT: addss %xmm1, %xmm0 186; SSE2-NEXT: movaps %xmm1, %xmm5 187; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[2,3] 188; SSE2-NEXT: addss %xmm5, %xmm0 189; SSE2-NEXT: movaps %xmm1, %xmm5 190; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1] 191; SSE2-NEXT: addss %xmm5, %xmm0 192; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 193; SSE2-NEXT: addss %xmm1, %xmm0 194; SSE2-NEXT: addss %xmm2, %xmm0 195; SSE2-NEXT: movaps %xmm2, %xmm1 196; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] 197; SSE2-NEXT: addss %xmm1, %xmm0 198; SSE2-NEXT: movaps %xmm2, %xmm1 199; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 200; SSE2-NEXT: addss %xmm1, %xmm0 201; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 202; SSE2-NEXT: addss %xmm2, %xmm0 203; SSE2-NEXT: addss %xmm3, %xmm0 204; SSE2-NEXT: movaps %xmm3, %xmm1 205; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3] 206; SSE2-NEXT: addss %xmm1, %xmm0 207; SSE2-NEXT: movaps %xmm3, %xmm1 208; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 209; SSE2-NEXT: addss %xmm1, %xmm0 210; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 211; SSE2-NEXT: addss %xmm3, %xmm0 212; SSE2-NEXT: addss %xmm4, %xmm0 213; SSE2-NEXT: movaps %xmm4, %xmm1 214; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[2,3] 215; SSE2-NEXT: addss %xmm1, %xmm0 216; SSE2-NEXT: movaps %xmm4, %xmm1 217; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] 218; SSE2-NEXT: addss %xmm1, %xmm0 219; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3] 220; SSE2-NEXT: addss %xmm4, %xmm0 221; SSE2-NEXT: retq 222; 223; SSE41-LABEL: test_v16f32: 224; SSE41: # %bb.0: 225; SSE41-NEXT: addss %xmm1, %xmm0 226; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] 227; SSE41-NEXT: addss %xmm5, %xmm0 228; SSE41-NEXT: movaps %xmm1, %xmm5 229; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1] 230; SSE41-NEXT: addss %xmm5, %xmm0 231; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 232; SSE41-NEXT: addss %xmm1, %xmm0 233; SSE41-NEXT: addss %xmm2, %xmm0 234; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 235; SSE41-NEXT: addss %xmm1, %xmm0 236; SSE41-NEXT: movaps %xmm2, %xmm1 237; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 238; SSE41-NEXT: addss %xmm1, %xmm0 239; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 240; SSE41-NEXT: addss %xmm2, %xmm0 241; SSE41-NEXT: addss %xmm3, %xmm0 242; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 243; SSE41-NEXT: addss %xmm1, %xmm0 244; SSE41-NEXT: movaps %xmm3, %xmm1 245; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 246; SSE41-NEXT: addss %xmm1, %xmm0 247; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 248; SSE41-NEXT: addss %xmm3, %xmm0 249; SSE41-NEXT: addss %xmm4, %xmm0 250; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] 251; SSE41-NEXT: addss %xmm1, %xmm0 252; SSE41-NEXT: movaps %xmm4, %xmm1 253; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] 254; SSE41-NEXT: addss %xmm1, %xmm0 255; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3] 256; SSE41-NEXT: addss %xmm4, %xmm0 257; SSE41-NEXT: retq 258; 259; AVX-LABEL: test_v16f32: 260; AVX: # %bb.0: 261; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 262; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 263; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 264; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 265; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 266; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 267; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 268; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 269; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 270; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 271; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 272; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 273; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 274; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 275; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 276; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 277; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 278; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 279; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 280; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 281; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 282; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 283; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 284; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 285; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 286; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 287; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 288; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 289; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 290; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 291; AVX-NEXT: vzeroupper 292; AVX-NEXT: retq 293; 294; AVX512-LABEL: test_v16f32: 295; AVX512: # %bb.0: 296; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 297; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 298; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 299; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 300; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 301; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 302; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 303; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 304; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 305; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 306; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 307; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 308; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 309; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 310; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 311; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 312; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 313; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 314; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 315; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 316; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 317; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 318; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 319; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 320; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 321; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 322; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 323; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 324; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 325; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 326; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 327; AVX512-NEXT: vzeroupper 328; AVX512-NEXT: retq 329 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) 330 ret float %1 331} 332 333; 334; vXf32 (zero) 335; 336 337define float @test_v2f32_zero(<2 x float> %a0) { 338; SSE2-LABEL: test_v2f32_zero: 339; SSE2: # %bb.0: 340; SSE2-NEXT: xorps %xmm1, %xmm1 341; SSE2-NEXT: addss %xmm0, %xmm1 342; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] 343; SSE2-NEXT: addss %xmm1, %xmm0 344; SSE2-NEXT: retq 345; 346; SSE41-LABEL: test_v2f32_zero: 347; SSE41: # %bb.0: 348; SSE41-NEXT: xorps %xmm1, %xmm1 349; SSE41-NEXT: addss %xmm0, %xmm1 350; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 351; SSE41-NEXT: addss %xmm1, %xmm0 352; SSE41-NEXT: retq 353; 354; AVX-LABEL: test_v2f32_zero: 355; AVX: # %bb.0: 356; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 357; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 358; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 359; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 360; AVX-NEXT: retq 361; 362; AVX512-LABEL: test_v2f32_zero: 363; AVX512: # %bb.0: 364; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 365; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 366; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 367; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 368; AVX512-NEXT: retq 369 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) 370 ret float %1 371} 372 373define float @test_v4f32_zero(<4 x float> %a0) { 374; SSE2-LABEL: test_v4f32_zero: 375; SSE2: # %bb.0: 376; SSE2-NEXT: xorps %xmm1, %xmm1 377; SSE2-NEXT: addss %xmm0, %xmm1 378; SSE2-NEXT: movaps %xmm0, %xmm2 379; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] 380; SSE2-NEXT: addss %xmm1, %xmm2 381; SSE2-NEXT: movaps %xmm0, %xmm1 382; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] 383; SSE2-NEXT: addss %xmm2, %xmm1 384; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 385; SSE2-NEXT: addss %xmm1, %xmm0 386; SSE2-NEXT: retq 387; 388; SSE41-LABEL: test_v4f32_zero: 389; SSE41: # %bb.0: 390; SSE41-NEXT: xorps %xmm1, %xmm1 391; SSE41-NEXT: addss %xmm0, %xmm1 392; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 393; SSE41-NEXT: addss %xmm1, %xmm2 394; SSE41-NEXT: movaps %xmm0, %xmm1 395; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] 396; SSE41-NEXT: addss %xmm2, %xmm1 397; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 398; SSE41-NEXT: addss %xmm1, %xmm0 399; SSE41-NEXT: retq 400; 401; AVX-LABEL: test_v4f32_zero: 402; AVX: # %bb.0: 403; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 404; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 405; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 406; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 407; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 408; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 409; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 410; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 411; AVX-NEXT: retq 412; 413; AVX512-LABEL: test_v4f32_zero: 414; AVX512: # %bb.0: 415; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 416; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 417; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 418; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 419; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 420; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 421; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 422; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 423; AVX512-NEXT: retq 424 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) 425 ret float %1 426} 427 428define float @test_v8f32_zero(<8 x float> %a0) { 429; SSE2-LABEL: test_v8f32_zero: 430; SSE2: # %bb.0: 431; SSE2-NEXT: xorps %xmm2, %xmm2 432; SSE2-NEXT: addss %xmm0, %xmm2 433; SSE2-NEXT: movaps %xmm0, %xmm3 434; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] 435; SSE2-NEXT: addss %xmm2, %xmm3 436; SSE2-NEXT: movaps %xmm0, %xmm2 437; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] 438; SSE2-NEXT: addss %xmm3, %xmm2 439; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 440; SSE2-NEXT: addss %xmm2, %xmm0 441; SSE2-NEXT: addss %xmm1, %xmm0 442; SSE2-NEXT: movaps %xmm1, %xmm2 443; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] 444; SSE2-NEXT: addss %xmm2, %xmm0 445; SSE2-NEXT: movaps %xmm1, %xmm2 446; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 447; SSE2-NEXT: addss %xmm2, %xmm0 448; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 449; SSE2-NEXT: addss %xmm1, %xmm0 450; SSE2-NEXT: retq 451; 452; SSE41-LABEL: test_v8f32_zero: 453; SSE41: # %bb.0: 454; SSE41-NEXT: xorps %xmm2, %xmm2 455; SSE41-NEXT: addss %xmm0, %xmm2 456; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 457; SSE41-NEXT: addss %xmm2, %xmm3 458; SSE41-NEXT: movaps %xmm0, %xmm2 459; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] 460; SSE41-NEXT: addss %xmm3, %xmm2 461; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 462; SSE41-NEXT: addss %xmm2, %xmm0 463; SSE41-NEXT: addss %xmm1, %xmm0 464; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 465; SSE41-NEXT: addss %xmm2, %xmm0 466; SSE41-NEXT: movaps %xmm1, %xmm2 467; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 468; SSE41-NEXT: addss %xmm2, %xmm0 469; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 470; SSE41-NEXT: addss %xmm1, %xmm0 471; SSE41-NEXT: retq 472; 473; AVX-LABEL: test_v8f32_zero: 474; AVX: # %bb.0: 475; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 476; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 477; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 478; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 479; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 480; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 481; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 482; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 483; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 484; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1 485; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 486; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 487; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 488; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 489; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 490; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 491; AVX-NEXT: vzeroupper 492; AVX-NEXT: retq 493; 494; AVX512-LABEL: test_v8f32_zero: 495; AVX512: # %bb.0: 496; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 497; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 498; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 499; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 500; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 501; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 502; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 503; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 504; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 505; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 506; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 507; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 508; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 509; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 510; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 511; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 512; AVX512-NEXT: vzeroupper 513; AVX512-NEXT: retq 514 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) 515 ret float %1 516} 517 518define float @test_v16f32_zero(<16 x float> %a0) { 519; SSE2-LABEL: test_v16f32_zero: 520; SSE2: # %bb.0: 521; SSE2-NEXT: xorps %xmm4, %xmm4 522; SSE2-NEXT: addss %xmm0, %xmm4 523; SSE2-NEXT: movaps %xmm0, %xmm5 524; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[2,3] 525; SSE2-NEXT: addss %xmm4, %xmm5 526; SSE2-NEXT: movaps %xmm0, %xmm4 527; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1] 528; SSE2-NEXT: addss %xmm5, %xmm4 529; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 530; SSE2-NEXT: addss %xmm4, %xmm0 531; SSE2-NEXT: addss %xmm1, %xmm0 532; SSE2-NEXT: movaps %xmm1, %xmm4 533; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3] 534; SSE2-NEXT: addss %xmm4, %xmm0 535; SSE2-NEXT: movaps %xmm1, %xmm4 536; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] 537; SSE2-NEXT: addss %xmm4, %xmm0 538; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 539; SSE2-NEXT: addss %xmm1, %xmm0 540; SSE2-NEXT: addss %xmm2, %xmm0 541; SSE2-NEXT: movaps %xmm2, %xmm1 542; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] 543; SSE2-NEXT: addss %xmm1, %xmm0 544; SSE2-NEXT: movaps %xmm2, %xmm1 545; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 546; SSE2-NEXT: addss %xmm1, %xmm0 547; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 548; SSE2-NEXT: addss %xmm2, %xmm0 549; SSE2-NEXT: addss %xmm3, %xmm0 550; SSE2-NEXT: movaps %xmm3, %xmm1 551; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3] 552; SSE2-NEXT: addss %xmm1, %xmm0 553; SSE2-NEXT: movaps %xmm3, %xmm1 554; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 555; SSE2-NEXT: addss %xmm1, %xmm0 556; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 557; SSE2-NEXT: addss %xmm3, %xmm0 558; SSE2-NEXT: retq 559; 560; SSE41-LABEL: test_v16f32_zero: 561; SSE41: # %bb.0: 562; SSE41-NEXT: xorps %xmm4, %xmm4 563; SSE41-NEXT: addss %xmm0, %xmm4 564; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] 565; SSE41-NEXT: addss %xmm4, %xmm5 566; SSE41-NEXT: movaps %xmm0, %xmm4 567; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1] 568; SSE41-NEXT: addss %xmm5, %xmm4 569; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 570; SSE41-NEXT: addss %xmm4, %xmm0 571; SSE41-NEXT: addss %xmm1, %xmm0 572; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 573; SSE41-NEXT: addss %xmm4, %xmm0 574; SSE41-NEXT: movaps %xmm1, %xmm4 575; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] 576; SSE41-NEXT: addss %xmm4, %xmm0 577; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 578; SSE41-NEXT: addss %xmm1, %xmm0 579; SSE41-NEXT: addss %xmm2, %xmm0 580; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 581; SSE41-NEXT: addss %xmm1, %xmm0 582; SSE41-NEXT: movaps %xmm2, %xmm1 583; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 584; SSE41-NEXT: addss %xmm1, %xmm0 585; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 586; SSE41-NEXT: addss %xmm2, %xmm0 587; SSE41-NEXT: addss %xmm3, %xmm0 588; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 589; SSE41-NEXT: addss %xmm1, %xmm0 590; SSE41-NEXT: movaps %xmm3, %xmm1 591; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 592; SSE41-NEXT: addss %xmm1, %xmm0 593; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 594; SSE41-NEXT: addss %xmm3, %xmm0 595; SSE41-NEXT: retq 596; 597; AVX-LABEL: test_v16f32_zero: 598; AVX: # %bb.0: 599; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 600; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm2 601; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 602; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 603; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 604; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 605; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] 606; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 607; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 608; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2 609; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 610; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 611; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 612; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 613; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 614; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 615; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 616; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 617; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 618; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 619; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 620; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 621; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 622; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 623; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 624; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 625; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 626; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 627; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 628; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 629; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 630; AVX-NEXT: vzeroupper 631; AVX-NEXT: retq 632; 633; AVX512-LABEL: test_v16f32_zero: 634; AVX512: # %bb.0: 635; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 636; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 637; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 638; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 639; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 640; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 641; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 642; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 643; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 644; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 645; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 646; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 647; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 648; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 649; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 650; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 651; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 652; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 653; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 654; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 655; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 656; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 657; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 658; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 659; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 660; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 661; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 662; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 663; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 664; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 665; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 666; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 667; AVX512-NEXT: vzeroupper 668; AVX512-NEXT: retq 669 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) 670 ret float %1 671} 672 673; 674; vXf32 (undef) 675; 676 677define float @test_v2f32_undef(<2 x float> %a0) { 678; SSE2-LABEL: test_v2f32_undef: 679; SSE2: # %bb.0: 680; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] 681; SSE2-NEXT: addss {{.*}}(%rip), %xmm0 682; SSE2-NEXT: retq 683; 684; SSE41-LABEL: test_v2f32_undef: 685; SSE41: # %bb.0: 686; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 687; SSE41-NEXT: addss {{.*}}(%rip), %xmm0 688; SSE41-NEXT: retq 689; 690; AVX-LABEL: test_v2f32_undef: 691; AVX: # %bb.0: 692; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 693; AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 694; AVX-NEXT: retq 695; 696; AVX512-LABEL: test_v2f32_undef: 697; AVX512: # %bb.0: 698; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 699; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 700; AVX512-NEXT: retq 701 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) 702 ret float %1 703} 704 705define float @test_v4f32_undef(<4 x float> %a0) { 706; SSE2-LABEL: test_v4f32_undef: 707; SSE2: # %bb.0: 708; SSE2-NEXT: movaps %xmm0, %xmm1 709; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] 710; SSE2-NEXT: addss {{.*}}(%rip), %xmm1 711; SSE2-NEXT: movaps %xmm0, %xmm2 712; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] 713; SSE2-NEXT: addss %xmm1, %xmm2 714; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 715; SSE2-NEXT: addss %xmm2, %xmm0 716; SSE2-NEXT: retq 717; 718; SSE41-LABEL: test_v4f32_undef: 719; SSE41: # %bb.0: 720; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 721; SSE41-NEXT: addss {{.*}}(%rip), %xmm1 722; SSE41-NEXT: movaps %xmm0, %xmm2 723; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] 724; SSE41-NEXT: addss %xmm1, %xmm2 725; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 726; SSE41-NEXT: addss %xmm2, %xmm0 727; SSE41-NEXT: retq 728; 729; AVX-LABEL: test_v4f32_undef: 730; AVX: # %bb.0: 731; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 732; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 733; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 734; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 735; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 736; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 737; AVX-NEXT: retq 738; 739; AVX512-LABEL: test_v4f32_undef: 740; AVX512: # %bb.0: 741; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 742; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 743; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 744; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 745; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 746; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 747; AVX512-NEXT: retq 748 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) 749 ret float %1 750} 751 752define float @test_v8f32_undef(<8 x float> %a0) { 753; SSE2-LABEL: test_v8f32_undef: 754; SSE2: # %bb.0: 755; SSE2-NEXT: movaps %xmm0, %xmm2 756; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] 757; SSE2-NEXT: addss {{.*}}(%rip), %xmm2 758; SSE2-NEXT: movaps %xmm0, %xmm3 759; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] 760; SSE2-NEXT: addss %xmm2, %xmm3 761; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 762; SSE2-NEXT: addss %xmm3, %xmm0 763; SSE2-NEXT: addss %xmm1, %xmm0 764; SSE2-NEXT: movaps %xmm1, %xmm2 765; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] 766; SSE2-NEXT: addss %xmm2, %xmm0 767; SSE2-NEXT: movaps %xmm1, %xmm2 768; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 769; SSE2-NEXT: addss %xmm2, %xmm0 770; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 771; SSE2-NEXT: addss %xmm1, %xmm0 772; SSE2-NEXT: retq 773; 774; SSE41-LABEL: test_v8f32_undef: 775; SSE41: # %bb.0: 776; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 777; SSE41-NEXT: addss {{.*}}(%rip), %xmm2 778; SSE41-NEXT: movaps %xmm0, %xmm3 779; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] 780; SSE41-NEXT: addss %xmm2, %xmm3 781; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 782; SSE41-NEXT: addss %xmm3, %xmm0 783; SSE41-NEXT: addss %xmm1, %xmm0 784; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 785; SSE41-NEXT: addss %xmm2, %xmm0 786; SSE41-NEXT: movaps %xmm1, %xmm2 787; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 788; SSE41-NEXT: addss %xmm2, %xmm0 789; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 790; SSE41-NEXT: addss %xmm1, %xmm0 791; SSE41-NEXT: retq 792; 793; AVX-LABEL: test_v8f32_undef: 794; AVX: # %bb.0: 795; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 796; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 797; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 798; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 799; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 800; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 801; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 802; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1 803; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 804; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 805; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 806; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 807; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 808; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 809; AVX-NEXT: vzeroupper 810; AVX-NEXT: retq 811; 812; AVX512-LABEL: test_v8f32_undef: 813; AVX512: # %bb.0: 814; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 815; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 816; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 817; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 818; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 819; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 820; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 821; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 822; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 823; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 824; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 825; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 826; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 827; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 828; AVX512-NEXT: vzeroupper 829; AVX512-NEXT: retq 830 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) 831 ret float %1 832} 833 834define float @test_v16f32_undef(<16 x float> %a0) { 835; SSE2-LABEL: test_v16f32_undef: 836; SSE2: # %bb.0: 837; SSE2-NEXT: movaps %xmm0, %xmm4 838; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3] 839; SSE2-NEXT: addss {{.*}}(%rip), %xmm4 840; SSE2-NEXT: movaps %xmm0, %xmm5 841; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] 842; SSE2-NEXT: addss %xmm4, %xmm5 843; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 844; SSE2-NEXT: addss %xmm5, %xmm0 845; SSE2-NEXT: addss %xmm1, %xmm0 846; SSE2-NEXT: movaps %xmm1, %xmm4 847; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3] 848; SSE2-NEXT: addss %xmm4, %xmm0 849; SSE2-NEXT: movaps %xmm1, %xmm4 850; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] 851; SSE2-NEXT: addss %xmm4, %xmm0 852; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 853; SSE2-NEXT: addss %xmm1, %xmm0 854; SSE2-NEXT: addss %xmm2, %xmm0 855; SSE2-NEXT: movaps %xmm2, %xmm1 856; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] 857; SSE2-NEXT: addss %xmm1, %xmm0 858; SSE2-NEXT: movaps %xmm2, %xmm1 859; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 860; SSE2-NEXT: addss %xmm1, %xmm0 861; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 862; SSE2-NEXT: addss %xmm2, %xmm0 863; SSE2-NEXT: addss %xmm3, %xmm0 864; SSE2-NEXT: movaps %xmm3, %xmm1 865; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3] 866; SSE2-NEXT: addss %xmm1, %xmm0 867; SSE2-NEXT: movaps %xmm3, %xmm1 868; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 869; SSE2-NEXT: addss %xmm1, %xmm0 870; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 871; SSE2-NEXT: addss %xmm3, %xmm0 872; SSE2-NEXT: retq 873; 874; SSE41-LABEL: test_v16f32_undef: 875; SSE41: # %bb.0: 876; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 877; SSE41-NEXT: addss {{.*}}(%rip), %xmm4 878; SSE41-NEXT: movaps %xmm0, %xmm5 879; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] 880; SSE41-NEXT: addss %xmm4, %xmm5 881; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 882; SSE41-NEXT: addss %xmm5, %xmm0 883; SSE41-NEXT: addss %xmm1, %xmm0 884; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 885; SSE41-NEXT: addss %xmm4, %xmm0 886; SSE41-NEXT: movaps %xmm1, %xmm4 887; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] 888; SSE41-NEXT: addss %xmm4, %xmm0 889; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 890; SSE41-NEXT: addss %xmm1, %xmm0 891; SSE41-NEXT: addss %xmm2, %xmm0 892; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 893; SSE41-NEXT: addss %xmm1, %xmm0 894; SSE41-NEXT: movaps %xmm2, %xmm1 895; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 896; SSE41-NEXT: addss %xmm1, %xmm0 897; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 898; SSE41-NEXT: addss %xmm2, %xmm0 899; SSE41-NEXT: addss %xmm3, %xmm0 900; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 901; SSE41-NEXT: addss %xmm1, %xmm0 902; SSE41-NEXT: movaps %xmm3, %xmm1 903; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 904; SSE41-NEXT: addss %xmm1, %xmm0 905; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 906; SSE41-NEXT: addss %xmm3, %xmm0 907; SSE41-NEXT: retq 908; 909; AVX-LABEL: test_v16f32_undef: 910; AVX: # %bb.0: 911; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 912; AVX-NEXT: vaddss {{.*}}(%rip), %xmm2, %xmm2 913; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 914; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 915; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] 916; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 917; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 918; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2 919; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 920; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 921; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 922; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 923; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 924; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 925; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 926; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 927; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 928; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 929; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 930; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 931; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 932; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 933; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 934; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 935; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 936; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 937; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 938; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 939; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 940; AVX-NEXT: vzeroupper 941; AVX-NEXT: retq 942; 943; AVX512-LABEL: test_v16f32_undef: 944; AVX512: # %bb.0: 945; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 946; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 947; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 948; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 949; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 950; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 951; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 952; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 953; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 954; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 955; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 956; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 957; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 958; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 959; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 960; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 961; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 962; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 963; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 964; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 965; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 966; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 967; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 968; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 969; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 970; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 971; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 972; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 973; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 974; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 975; AVX512-NEXT: vzeroupper 976; AVX512-NEXT: retq 977 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) 978 ret float %1 979} 980 981; 982; vXf64 (accum) 983; 984 985define double @test_v2f64(double %a0, <2 x double> %a1) { 986; SSE-LABEL: test_v2f64: 987; SSE: # %bb.0: 988; SSE-NEXT: addsd %xmm1, %xmm0 989; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 990; SSE-NEXT: addsd %xmm1, %xmm0 991; SSE-NEXT: retq 992; 993; AVX-LABEL: test_v2f64: 994; AVX: # %bb.0: 995; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 996; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 997; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 998; AVX-NEXT: retq 999; 1000; AVX512-LABEL: test_v2f64: 1001; AVX512: # %bb.0: 1002; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1003; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1004; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1005; AVX512-NEXT: retq 1006 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) 1007 ret double %1 1008} 1009 1010define double @test_v4f64(double %a0, <4 x double> %a1) { 1011; SSE-LABEL: test_v4f64: 1012; SSE: # %bb.0: 1013; SSE-NEXT: addsd %xmm1, %xmm0 1014; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1015; SSE-NEXT: addsd %xmm1, %xmm0 1016; SSE-NEXT: addsd %xmm2, %xmm0 1017; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1018; SSE-NEXT: addsd %xmm2, %xmm0 1019; SSE-NEXT: retq 1020; 1021; AVX-LABEL: test_v4f64: 1022; AVX: # %bb.0: 1023; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1024; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1025; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1026; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1027; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1028; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1029; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1030; AVX-NEXT: vzeroupper 1031; AVX-NEXT: retq 1032; 1033; AVX512-LABEL: test_v4f64: 1034; AVX512: # %bb.0: 1035; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1036; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1037; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1038; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 1039; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1040; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1041; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1042; AVX512-NEXT: vzeroupper 1043; AVX512-NEXT: retq 1044 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) 1045 ret double %1 1046} 1047 1048define double @test_v8f64(double %a0, <8 x double> %a1) { 1049; SSE-LABEL: test_v8f64: 1050; SSE: # %bb.0: 1051; SSE-NEXT: addsd %xmm1, %xmm0 1052; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1053; SSE-NEXT: addsd %xmm1, %xmm0 1054; SSE-NEXT: addsd %xmm2, %xmm0 1055; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1056; SSE-NEXT: addsd %xmm2, %xmm0 1057; SSE-NEXT: addsd %xmm3, %xmm0 1058; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] 1059; SSE-NEXT: addsd %xmm3, %xmm0 1060; SSE-NEXT: addsd %xmm4, %xmm0 1061; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] 1062; SSE-NEXT: addsd %xmm4, %xmm0 1063; SSE-NEXT: retq 1064; 1065; AVX-LABEL: test_v8f64: 1066; AVX: # %bb.0: 1067; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1068; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 1069; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1070; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1071; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1072; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1073; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1074; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1075; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1076; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1077; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1078; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1079; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1080; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1081; AVX-NEXT: vzeroupper 1082; AVX-NEXT: retq 1083; 1084; AVX512-LABEL: test_v8f64: 1085; AVX512: # %bb.0: 1086; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1087; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1088; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1089; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1090; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1091; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1092; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1093; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 1094; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1095; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1096; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1097; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1098; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1099; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1100; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1101; AVX512-NEXT: vzeroupper 1102; AVX512-NEXT: retq 1103 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) 1104 ret double %1 1105} 1106 1107define double @test_v16f64(double %a0, <16 x double> %a1) { 1108; SSE-LABEL: test_v16f64: 1109; SSE: # %bb.0: 1110; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 1111; SSE-NEXT: addsd %xmm1, %xmm0 1112; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1113; SSE-NEXT: addsd %xmm1, %xmm0 1114; SSE-NEXT: addsd %xmm2, %xmm0 1115; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1116; SSE-NEXT: addsd %xmm2, %xmm0 1117; SSE-NEXT: addsd %xmm3, %xmm0 1118; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] 1119; SSE-NEXT: addsd %xmm3, %xmm0 1120; SSE-NEXT: addsd %xmm4, %xmm0 1121; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] 1122; SSE-NEXT: addsd %xmm4, %xmm0 1123; SSE-NEXT: addsd %xmm5, %xmm0 1124; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1] 1125; SSE-NEXT: addsd %xmm5, %xmm0 1126; SSE-NEXT: addsd %xmm6, %xmm0 1127; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1] 1128; SSE-NEXT: addsd %xmm6, %xmm0 1129; SSE-NEXT: addsd %xmm7, %xmm0 1130; SSE-NEXT: movhlps {{.*#+}} xmm7 = xmm7[1,1] 1131; SSE-NEXT: addsd %xmm7, %xmm0 1132; SSE-NEXT: addsd %xmm8, %xmm0 1133; SSE-NEXT: movhlps {{.*#+}} xmm8 = xmm8[1,1] 1134; SSE-NEXT: addsd %xmm8, %xmm0 1135; SSE-NEXT: retq 1136; 1137; AVX-LABEL: test_v16f64: 1138; AVX: # %bb.0: 1139; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1140; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] 1141; AVX-NEXT: vaddsd %xmm5, %xmm0, %xmm0 1142; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1143; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1144; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1145; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1146; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1147; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1148; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1149; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1150; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1151; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1152; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1153; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1154; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] 1155; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1156; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 1157; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1158; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1159; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1160; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0 1161; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm4[1,0] 1162; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1163; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1 1164; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1165; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1166; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1167; AVX-NEXT: vzeroupper 1168; AVX-NEXT: retq 1169; 1170; AVX512-LABEL: test_v16f64: 1171; AVX512: # %bb.0: 1172; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1173; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 1174; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1175; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 1176; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1177; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1178; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1179; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3 1180; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1181; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1182; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1183; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1184; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1185; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1186; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1187; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1188; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1189; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1190; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1 1191; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1192; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1193; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1194; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1 1195; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1196; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1197; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1198; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1 1199; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1200; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1201; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1202; AVX512-NEXT: vzeroupper 1203; AVX512-NEXT: retq 1204 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) 1205 ret double %1 1206} 1207 1208; 1209; vXf64 (zero) 1210; 1211 1212define double @test_v2f64_zero(<2 x double> %a0) { 1213; SSE-LABEL: test_v2f64_zero: 1214; SSE: # %bb.0: 1215; SSE-NEXT: xorpd %xmm1, %xmm1 1216; SSE-NEXT: addsd %xmm0, %xmm1 1217; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1218; SSE-NEXT: addsd %xmm1, %xmm0 1219; SSE-NEXT: retq 1220; 1221; AVX-LABEL: test_v2f64_zero: 1222; AVX: # %bb.0: 1223; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1224; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1225; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1226; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1227; AVX-NEXT: retq 1228; 1229; AVX512-LABEL: test_v2f64_zero: 1230; AVX512: # %bb.0: 1231; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1232; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1233; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1234; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1235; AVX512-NEXT: retq 1236 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) 1237 ret double %1 1238} 1239 1240define double @test_v4f64_zero(<4 x double> %a0) { 1241; SSE-LABEL: test_v4f64_zero: 1242; SSE: # %bb.0: 1243; SSE-NEXT: xorpd %xmm2, %xmm2 1244; SSE-NEXT: addsd %xmm0, %xmm2 1245; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1246; SSE-NEXT: addsd %xmm2, %xmm0 1247; SSE-NEXT: addsd %xmm1, %xmm0 1248; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1249; SSE-NEXT: addsd %xmm1, %xmm0 1250; SSE-NEXT: retq 1251; 1252; AVX-LABEL: test_v4f64_zero: 1253; AVX: # %bb.0: 1254; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1255; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1256; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1257; AVX-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1258; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1259; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1260; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1261; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1262; AVX-NEXT: vzeroupper 1263; AVX-NEXT: retq 1264; 1265; AVX512-LABEL: test_v4f64_zero: 1266; AVX512: # %bb.0: 1267; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1268; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1269; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1270; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1271; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 1272; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1273; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1274; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1275; AVX512-NEXT: vzeroupper 1276; AVX512-NEXT: retq 1277 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) 1278 ret double %1 1279} 1280 1281define double @test_v8f64_zero(<8 x double> %a0) { 1282; SSE-LABEL: test_v8f64_zero: 1283; SSE: # %bb.0: 1284; SSE-NEXT: xorpd %xmm4, %xmm4 1285; SSE-NEXT: addsd %xmm0, %xmm4 1286; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1287; SSE-NEXT: addsd %xmm4, %xmm0 1288; SSE-NEXT: addsd %xmm1, %xmm0 1289; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1290; SSE-NEXT: addsd %xmm1, %xmm0 1291; SSE-NEXT: addsd %xmm2, %xmm0 1292; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1293; SSE-NEXT: addsd %xmm2, %xmm0 1294; SSE-NEXT: addsd %xmm3, %xmm0 1295; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] 1296; SSE-NEXT: addsd %xmm3, %xmm0 1297; SSE-NEXT: retq 1298; 1299; AVX-LABEL: test_v8f64_zero: 1300; AVX: # %bb.0: 1301; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 1302; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm2 1303; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 1304; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1305; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1306; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm2 1307; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1308; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm0 1309; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1310; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1311; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1312; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1313; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1314; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1315; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1316; AVX-NEXT: vzeroupper 1317; AVX-NEXT: retq 1318; 1319; AVX512-LABEL: test_v8f64_zero: 1320; AVX512: # %bb.0: 1321; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1322; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1323; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1324; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1325; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 1326; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1327; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1328; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1329; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 1330; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1331; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1332; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1333; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 1334; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1335; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1336; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1337; AVX512-NEXT: vzeroupper 1338; AVX512-NEXT: retq 1339 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) 1340 ret double %1 1341} 1342 1343define double @test_v16f64_zero(<16 x double> %a0) { 1344; SSE-LABEL: test_v16f64_zero: 1345; SSE: # %bb.0: 1346; SSE-NEXT: xorpd %xmm8, %xmm8 1347; SSE-NEXT: addsd %xmm0, %xmm8 1348; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1349; SSE-NEXT: addsd %xmm8, %xmm0 1350; SSE-NEXT: addsd %xmm1, %xmm0 1351; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1352; SSE-NEXT: addsd %xmm1, %xmm0 1353; SSE-NEXT: addsd %xmm2, %xmm0 1354; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1355; SSE-NEXT: addsd %xmm2, %xmm0 1356; SSE-NEXT: addsd %xmm3, %xmm0 1357; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] 1358; SSE-NEXT: addsd %xmm3, %xmm0 1359; SSE-NEXT: addsd %xmm4, %xmm0 1360; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] 1361; SSE-NEXT: addsd %xmm4, %xmm0 1362; SSE-NEXT: addsd %xmm5, %xmm0 1363; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1] 1364; SSE-NEXT: addsd %xmm5, %xmm0 1365; SSE-NEXT: addsd %xmm6, %xmm0 1366; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1] 1367; SSE-NEXT: addsd %xmm6, %xmm0 1368; SSE-NEXT: addsd %xmm7, %xmm0 1369; SSE-NEXT: movhlps {{.*#+}} xmm7 = xmm7[1,1] 1370; SSE-NEXT: addsd %xmm7, %xmm0 1371; SSE-NEXT: retq 1372; 1373; AVX-LABEL: test_v16f64_zero: 1374; AVX: # %bb.0: 1375; AVX-NEXT: vxorpd %xmm4, %xmm4, %xmm4 1376; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm4 1377; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0] 1378; AVX-NEXT: vaddsd %xmm5, %xmm4, %xmm4 1379; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1380; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm4 1381; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1382; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm0 1383; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1384; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] 1385; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0 1386; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1387; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1388; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1389; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1390; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1391; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1392; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1393; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1394; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1395; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1396; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1397; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1398; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] 1399; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1400; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 1401; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1402; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1403; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1404; AVX-NEXT: vzeroupper 1405; AVX-NEXT: retq 1406; 1407; AVX512-LABEL: test_v16f64_zero: 1408; AVX512: # %bb.0: 1409; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2 1410; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm2 1411; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 1412; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1413; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 1414; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1415; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1416; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1417; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 1418; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1419; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1420; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1421; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 1422; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2 1423; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1424; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0 1425; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1426; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1427; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1428; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1429; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1430; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1431; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1432; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 1433; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1434; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1435; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1436; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1437; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1438; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1439; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1440; AVX512-NEXT: vzeroupper 1441; AVX512-NEXT: retq 1442 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0) 1443 ret double %1 1444} 1445 1446; 1447; vXf64 (undef) 1448; 1449 1450define double @test_v2f64_undef(<2 x double> %a0) { 1451; SSE-LABEL: test_v2f64_undef: 1452; SSE: # %bb.0: 1453; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1454; SSE-NEXT: addsd {{.*}}(%rip), %xmm0 1455; SSE-NEXT: retq 1456; 1457; AVX-LABEL: test_v2f64_undef: 1458; AVX: # %bb.0: 1459; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1460; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 1461; AVX-NEXT: retq 1462; 1463; AVX512-LABEL: test_v2f64_undef: 1464; AVX512: # %bb.0: 1465; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1466; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 1467; AVX512-NEXT: retq 1468 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0) 1469 ret double %1 1470} 1471 1472define double @test_v4f64_undef(<4 x double> %a0) { 1473; SSE-LABEL: test_v4f64_undef: 1474; SSE: # %bb.0: 1475; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1476; SSE-NEXT: addsd {{.*}}(%rip), %xmm0 1477; SSE-NEXT: addsd %xmm1, %xmm0 1478; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1479; SSE-NEXT: addsd %xmm1, %xmm0 1480; SSE-NEXT: retq 1481; 1482; AVX-LABEL: test_v4f64_undef: 1483; AVX: # %bb.0: 1484; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1485; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1 1486; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1487; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1488; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1489; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1490; AVX-NEXT: vzeroupper 1491; AVX-NEXT: retq 1492; 1493; AVX512-LABEL: test_v4f64_undef: 1494; AVX512: # %bb.0: 1495; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1496; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1 1497; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 1498; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1499; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1500; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1501; AVX512-NEXT: vzeroupper 1502; AVX512-NEXT: retq 1503 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0) 1504 ret double %1 1505} 1506 1507define double @test_v8f64_undef(<8 x double> %a0) { 1508; SSE-LABEL: test_v8f64_undef: 1509; SSE: # %bb.0: 1510; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1511; SSE-NEXT: addsd {{.*}}(%rip), %xmm0 1512; SSE-NEXT: addsd %xmm1, %xmm0 1513; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1514; SSE-NEXT: addsd %xmm1, %xmm0 1515; SSE-NEXT: addsd %xmm2, %xmm0 1516; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1517; SSE-NEXT: addsd %xmm2, %xmm0 1518; SSE-NEXT: addsd %xmm3, %xmm0 1519; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] 1520; SSE-NEXT: addsd %xmm3, %xmm0 1521; SSE-NEXT: retq 1522; 1523; AVX-LABEL: test_v8f64_undef: 1524; AVX: # %bb.0: 1525; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1526; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm2, %xmm2 1527; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1528; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm2 1529; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1530; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm0 1531; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1532; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1533; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1534; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1535; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1536; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1537; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1538; AVX-NEXT: vzeroupper 1539; AVX-NEXT: retq 1540; 1541; AVX512-LABEL: test_v8f64_undef: 1542; AVX512: # %bb.0: 1543; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1544; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1 1545; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 1546; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1547; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1548; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1549; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 1550; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1551; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1552; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1553; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 1554; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1555; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1556; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1557; AVX512-NEXT: vzeroupper 1558; AVX512-NEXT: retq 1559 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0) 1560 ret double %1 1561} 1562 1563define double @test_v16f64_undef(<16 x double> %a0) { 1564; SSE-LABEL: test_v16f64_undef: 1565; SSE: # %bb.0: 1566; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1567; SSE-NEXT: addsd {{.*}}(%rip), %xmm0 1568; SSE-NEXT: addsd %xmm1, %xmm0 1569; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1570; SSE-NEXT: addsd %xmm1, %xmm0 1571; SSE-NEXT: addsd %xmm2, %xmm0 1572; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1573; SSE-NEXT: addsd %xmm2, %xmm0 1574; SSE-NEXT: addsd %xmm3, %xmm0 1575; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] 1576; SSE-NEXT: addsd %xmm3, %xmm0 1577; SSE-NEXT: addsd %xmm4, %xmm0 1578; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] 1579; SSE-NEXT: addsd %xmm4, %xmm0 1580; SSE-NEXT: addsd %xmm5, %xmm0 1581; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1] 1582; SSE-NEXT: addsd %xmm5, %xmm0 1583; SSE-NEXT: addsd %xmm6, %xmm0 1584; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1] 1585; SSE-NEXT: addsd %xmm6, %xmm0 1586; SSE-NEXT: addsd %xmm7, %xmm0 1587; SSE-NEXT: movhlps {{.*#+}} xmm7 = xmm7[1,1] 1588; SSE-NEXT: addsd %xmm7, %xmm0 1589; SSE-NEXT: retq 1590; 1591; AVX-LABEL: test_v16f64_undef: 1592; AVX: # %bb.0: 1593; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] 1594; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm4, %xmm4 1595; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1596; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm4 1597; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1598; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm0 1599; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1600; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] 1601; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0 1602; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1603; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1604; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1605; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1606; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1607; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1608; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1609; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1610; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1611; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1612; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1613; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1614; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] 1615; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1616; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 1617; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1618; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1619; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1620; AVX-NEXT: vzeroupper 1621; AVX-NEXT: retq 1622; 1623; AVX512-LABEL: test_v16f64_undef: 1624; AVX512: # %bb.0: 1625; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1626; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm2, %xmm2 1627; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 1628; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1629; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1630; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1631; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 1632; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1633; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1634; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1635; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 1636; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2 1637; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1638; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0 1639; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1640; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1641; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1642; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1643; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1644; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1645; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1646; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 1647; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1648; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1649; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1650; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1651; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1652; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1653; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1654; AVX512-NEXT: vzeroupper 1655; AVX512-NEXT: retq 1656 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0) 1657 ret double %1 1658} 1659 1660declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>) 1661declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>) 1662declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>) 1663declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>) 1664 1665declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>) 1666declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>) 1667declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>) 1668declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>) 1669