; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm4, %xmm2
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm4, %xmm2
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (one)
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vmulpd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm4, %xmm2
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: mulpd %xmm2, %xmm4
; SSE-NEXT: mulpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm0
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (one)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)