; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL

;
; vXi64
;

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v2i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v2i64:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a0)
  ret i64 %1
}
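; x86 has no packed 64-bit multiply before AVX512DQ, so the sequences above
; synthesize it from 32-bit halves. A sketch of the identity being
; implemented (a_hi/a_lo are illustrative names, not part of the test):
;   a * b = a_lo*b_lo + ((a_hi*b_lo + a_lo*b_hi) << 32)   (mod 2^64)
; pmuludq forms the 32x32->64 partial products, psrlq exposes the high
; halves, psllq repositions the cross terms, and paddq combines them.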

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v4i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v4i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v4i64:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> %a0)
  ret i64 %1
}
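; Wider reductions halve the vector until a single lane is left: extract the
; upper 128-bit half and multiply it into the lower half, then use pshufd to
; bring the remaining upper element down. Each halving step repeats the same
; 32-bit-halves expansion shown for v2i64 above.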

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm0, %xmm4
; SSE-NEXT: paddq %xmm3, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v8i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v8i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v8i64:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> %a0)
  ret i64 %1
}
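; With AVX512DQ the expansion collapses: vpmullq is a native packed 64-bit
; multiply, so each halving step becomes one instruction. Without AVX512VL it
; is only available on zmm registers, which is why xmm/ymm inputs are widened
; to zmm (the "kill" implicit-def comments) and vzeroupper is needed.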

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm2, %xmm8
; SSE-NEXT: psrlq $32, %xmm8
; SSE-NEXT: pmuludq %xmm6, %xmm8
; SSE-NEXT: movdqa %xmm6, %xmm9
; SSE-NEXT: psrlq $32, %xmm9
; SSE-NEXT: pmuludq %xmm2, %xmm9
; SSE-NEXT: paddq %xmm8, %xmm9
; SSE-NEXT: psllq $32, %xmm9
; SSE-NEXT: pmuludq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm9, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: psrlq $32, %xmm8
; SSE-NEXT: pmuludq %xmm4, %xmm8
; SSE-NEXT: movdqa %xmm4, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm0, %xmm6
; SSE-NEXT: paddq %xmm8, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: pmuludq %xmm4, %xmm0
; SSE-NEXT: paddq %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm7, %xmm4
; SSE-NEXT: movdqa %xmm7, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm3, %xmm6
; SSE-NEXT: paddq %xmm4, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: pmuludq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm5, %xmm4
; SSE-NEXT: movdqa %xmm5, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm1, %xmm6
; SSE-NEXT: paddq %xmm4, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: pmuludq %xmm5, %xmm1
; SSE-NEXT: paddq %xmm6, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm0, %xmm4
; SSE-NEXT: paddq %xmm3, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5
; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm6
; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6
; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm7
; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2
; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2
; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4
; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4
; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v16i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v16i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v16i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v16i64:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a0)
  ret i32 %1
}
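; SSE2 lacks a packed 32-bit low multiply (pmulld is SSE4.1), so the SSE2
; paths emulate it: pmuludq multiplies the even lanes, a pshufd/pmuludq pair
; handles the odd lanes, and the [0,2,2,3] shuffles plus punpckldq gather the
; low 32 bits of each product before the next halving step.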

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-LABEL: test_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-LABEL: test_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld %xmm3, %xmm1
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> %a0)
  ret i32 %1
}
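; Note that the AVX2/AVX512 runs keep using full-width ymm/zmm multiplies
; even once the live data fits in xmm; only lane 0 is read by the final
; vmovd, so whatever the upper lanes hold is harmless, if wasteful.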

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-LABEL: test_v32i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm8, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld %xmm6, %xmm2
; SSE41-NEXT: pmulld %xmm4, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm7, %xmm3
; SSE41-NEXT: pmulld %xmm5, %xmm1
; SSE41-NEXT: pmulld %xmm3, %xmm1
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;

define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> %a0)
  ret i16 %1
}
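; i16 reductions can use pmullw directly. The final pairwise step shifts the
; odd element down with psrld $16 rather than a shuffle, and the result is
; implicitly truncated from %eax to $ax (the trailing "kill" comment).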

define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> %a0)
  ret i16 %1
}

define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-LABEL: test_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pmullw %xmm3, %xmm1
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v32i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vmovd %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v32i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovd %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v32i16:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovd %xmm0, %eax
; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: pmullw %xmm6, %xmm2
; SSE-NEXT: pmullw %xmm4, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm7, %xmm3
; SSE-NEXT: pmullw %xmm5, %xmm1
; SSE-NEXT: pmullw %xmm3, %xmm1
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v64i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v64i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vmovd %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v64i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovd %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v64i16:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovd %xmm0, %eax
; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;

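; There is no packed i8 multiply at any ISA level, so the vXi8 reductions
; widen to i16: punpcklbw/punpckhbw (or pmovzxbw on SSE4.1+) spread the bytes
; to words, pmullw multiplies, and pand/packuswb mask and repack the low
; bytes between halving steps.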
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm0, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmullw %xmm3, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmullw %xmm3, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v16i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> %a0)
  ret i8 %1
}

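; With AVX512BW the full 32-byte vector fits in one zmm after vpmovsxbw, so
; each reduction step is a single vpmullw plus a vpmovwb truncation instead of
; the unpack/multiply/mask/pack sequence required on SSE and AVX1.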
define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: pand %xmm1, %xmm3
; SSE41-NEXT: packuswb %xmm0, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm0, %xmm3
; SSE41-NEXT: pand %xmm1, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: pand %xmm1, %xmm3
; SSE41-NEXT: packuswb %xmm0, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: pand %xmm1, %xmm3
; SSE41-NEXT: packuswb %xmm0, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: pmullw %xmm0, %xmm3
; SSE41-NEXT: pextrb $0, %xmm3, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v32i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v32i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> %a0)
  ret i8 %1
}

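; v64i8 multiplies the two input halves together first, then runs the same
; shuffle-and-multiply ladder as v32i8 on the surviving register.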
define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-LABEL: test_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm2, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm0, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v64i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pmullw %xmm5, %xmm4
; SSE41-NEXT: pand %xmm2, %xmm4
; SSE41-NEXT: packuswb %xmm0, %xmm4
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pmullw %xmm0, %xmm5
; SSE41-NEXT: pand %xmm2, %xmm5
; SSE41-NEXT: packuswb %xmm1, %xmm5
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm5, %xmm4
; SSE41-NEXT: pand %xmm2, %xmm4
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: packuswb %xmm4, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: packuswb %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: packuswb %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm3
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm3, %ymm1
; AVX2-NEXT: vpmullw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v64i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm2, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v64i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX512DQVL-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512DQVL-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQVL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm2, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm0, %ymm1, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> %a0)
  ret i8 %1
}

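; v128i8 needs eight xmm input registers on SSE; the extra inputs just add more
; rounds of pairwise multiplies before the same single-register reduction tail.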
AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2482; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2483; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2484; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2485; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2486; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2487; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2488; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 2489; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2490; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2491; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2492; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2493; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2494; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 2495; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2496; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2497; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2498; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2499; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2500; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax 2501; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax 2502; AVX512DQVL-NEXT: vzeroupper 2503; AVX512DQVL-NEXT: retq 2504 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> %a0) 2505 ret i8 %1 2506} 2507 2508define i8 @test_v128i8(<128 x i8> %a0) { 2509; SSE2-LABEL: test_v128i8: 2510; SSE2: # %bb.0: 2511; SSE2-NEXT: movdqa %xmm5, %xmm8 2512; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] 2513; SSE2-NEXT: movdqa %xmm1, %xmm9 2514; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] 2515; SSE2-NEXT: pmullw %xmm8, %xmm9 2516; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 2517; SSE2-NEXT: pand %xmm8, %xmm9 2518; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 2519; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2520; SSE2-NEXT: pmullw %xmm5, %xmm1 2521; SSE2-NEXT: pand %xmm8, %xmm1 2522; SSE2-NEXT: packuswb %xmm9, %xmm1 2523; SSE2-NEXT: movdqa %xmm7, %xmm9 2524; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] 2525; SSE2-NEXT: movdqa %xmm3, %xmm5 2526; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2527; SSE2-NEXT: pmullw %xmm9, %xmm5 2528; SSE2-NEXT: pand %xmm8, %xmm5 2529; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] 2530; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 2531; SSE2-NEXT: pmullw %xmm7, %xmm3 2532; SSE2-NEXT: pand %xmm8, %xmm3 2533; SSE2-NEXT: packuswb %xmm5, %xmm3 2534; SSE2-NEXT: movdqa %xmm4, %xmm5 2535; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2536; SSE2-NEXT: movdqa %xmm0, %xmm7 2537; SSE2-NEXT: punpckhbw 
{{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] 2538; SSE2-NEXT: pmullw %xmm5, %xmm7 2539; SSE2-NEXT: pand %xmm8, %xmm7 2540; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 2541; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2542; SSE2-NEXT: pmullw %xmm4, %xmm0 2543; SSE2-NEXT: pand %xmm8, %xmm0 2544; SSE2-NEXT: packuswb %xmm7, %xmm0 2545; SSE2-NEXT: movdqa %xmm6, %xmm4 2546; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 2547; SSE2-NEXT: movdqa %xmm2, %xmm5 2548; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2549; SSE2-NEXT: pmullw %xmm4, %xmm5 2550; SSE2-NEXT: pand %xmm8, %xmm5 2551; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 2552; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 2553; SSE2-NEXT: pmullw %xmm6, %xmm2 2554; SSE2-NEXT: pand %xmm8, %xmm2 2555; SSE2-NEXT: packuswb %xmm5, %xmm2 2556; SSE2-NEXT: movdqa %xmm2, %xmm4 2557; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 2558; SSE2-NEXT: movdqa %xmm0, %xmm5 2559; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2560; SSE2-NEXT: pmullw %xmm4, %xmm5 2561; SSE2-NEXT: pand %xmm8, %xmm5 2562; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 2563; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2564; SSE2-NEXT: pmullw %xmm2, %xmm0 2565; SSE2-NEXT: pand %xmm8, %xmm0 2566; SSE2-NEXT: packuswb %xmm5, %xmm0 2567; SSE2-NEXT: movdqa %xmm3, %xmm2 2568; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2569; SSE2-NEXT: movdqa %xmm1, %xmm4 2570; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 2571; SSE2-NEXT: pmullw %xmm2, %xmm4 2572; SSE2-NEXT: pand %xmm8, %xmm4 2573; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 2574; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2575; SSE2-NEXT: pmullw %xmm3, %xmm1 2576; SSE2-NEXT: pand %xmm8, %xmm1 2577; SSE2-NEXT: packuswb %xmm4, %xmm1 2578; SSE2-NEXT: movdqa %xmm1, %xmm2 2579; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = 
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm0, %xmm2
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v128i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm5, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm1
; SSE41-NEXT: pmullw %xmm9, %xmm8
; SSE41-NEXT: pand %xmm5, %xmm8
; SSE41-NEXT: packuswb %xmm1, %xmm8
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm7, %xmm3
; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: pmullw %xmm9, %xmm1
; SSE41-NEXT: pand %xmm5, %xmm1
; SSE41-NEXT: packuswb %xmm3, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm4, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: pmullw %xmm7, %xmm3
; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: packuswb %xmm0, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm6, %xmm2
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: pmullw %xmm0, %xmm4
; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: packuswb %xmm2, %xmm4
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm4, %xmm3
; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: packuswb %xmm3, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm1, %xmm8
; SSE41-NEXT: pand %xmm5, %xmm8
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: packuswb %xmm8, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm3, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: pmullw %xmm1, %xmm2
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm0, %xmm2
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT: pmullw %xmm1, %xmm2
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; SSE41-NEXT: pmullw %xmm1, %xmm2
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pmullw %xmm0, %xmm2
; SSE41-NEXT: pextrb $0, %xmm2, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v128i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm9, %xmm7, %xmm7
; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm6
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v128i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpmovsxbw %xmm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5
; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX2-NEXT: vpmovsxbw %xmm6, %ymm6
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
; AVX2-NEXT: vpmovsxbw %xmm7, %ymm7
; AVX2-NEXT: vpmullw %ymm6, %ymm7, %ymm6
; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm7
; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vpmovsxbw %xmm3, %ymm2
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpmovsxbw %xmm6, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm5, %ymm2
; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v128i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm3
; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm2, %zmm1
; AVX512BW-NEXT: vpmullw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v128i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm2
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm3
; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm2, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm0, %zmm1, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v128i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512DQ-NEXT: vpmovsxbw %xmm4, %ymm4
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX512DQ-NEXT: vpmovsxbw %xmm5, %ymm5
; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
; AVX512DQ-NEXT: vpmovsxwd %ymm4, %zmm4
; AVX512DQ-NEXT: vpmovdb %zmm4, %xmm4
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX512DQ-NEXT: vpmovsxbw %xmm5, %ymm5
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm6
; AVX512DQ-NEXT: vpmovsxbw %xmm6, %ymm6
; AVX512DQ-NEXT: vpmullw %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT: vpmovsxwd %ymm5, %zmm5
; AVX512DQ-NEXT: vpmovdb %zmm5, %xmm5
; AVX512DQ-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbw %xmm3, %ymm2
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbw %xmm5, %ymm1
; AVX512DQ-NEXT: vpmovsxbw %xmm4, %ymm2
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v128i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512DQVL-NEXT: vpmovsxbw %xmm4, %ymm4
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX512DQVL-NEXT: vpmovsxbw %xmm5, %ymm5
; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
; AVX512DQVL-NEXT: vpmovsxwd %ymm4, %zmm4
; AVX512DQVL-NEXT: vpmovdb %zmm4, %xmm4
; AVX512DQVL-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX512DQVL-NEXT: vpmovsxbw %xmm5, %ymm5
; AVX512DQVL-NEXT: vextracti128 $1, %ymm1, %xmm6
; AVX512DQVL-NEXT: vpmovsxbw %xmm6, %ymm6
; AVX512DQVL-NEXT: vpmullw %ymm5, %ymm6, %ymm5
; AVX512DQVL-NEXT: vpmovsxwd %ymm5, %zmm5
; AVX512DQVL-NEXT: vpmovdb %zmm5, %xmm5
; AVX512DQVL-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm3, %ymm2
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm5, %ymm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm4, %ymm2
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQVL-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> %a0)
  ret i8 %1
}

declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>)

declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)

declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)

declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>)
declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>)
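
; A minimal hand-written reference sketch, not part of the autogenerated
; checks; the function name below is made up for illustration. For integer
; element types these reduction intrinsics are exact and order-independent,
; so the v2i64 case computes the same value as extracting both lanes and
; multiplying them scalarly:
;
; define i64 @mul_reduce_v2i64_ref(<2 x i64> %a0) {
;   %e0 = extractelement <2 x i64> %a0, i32 0
;   %e1 = extractelement <2 x i64> %a0, i32 1
;   %r = mul i64 %e0, %e1
;   ret i64 %r
; }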