; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

;
; add
;

define <4 x i32> @trunc_add_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_4i32:
; SSE: # BB#0:
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v4i64_4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v4i64_4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v4i64_4i32:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_8i16:
; SSE: # BB#0:
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: pextrw $4, %xmm1, %eax
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: pextrw $4, %xmm0, %ecx
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pextrw $4, %xmm3, %edx
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: pextrw $4, %xmm2, %eax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movd %ecx, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i64_8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i64_8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i64_8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_8i16:
; SSE: # BB#0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: paddd %xmm5, %xmm1
; SSE-NEXT: paddd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: retq
  %1 = add <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; add to constant
;

define <4 x i32> @trunc_add_const_v4i64_4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_4i32:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movd %rax, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; SSE-NEXT: paddq %xmm0, %xmm2
; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v4i64_4i32:
; AVX1: # BB#0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v4i64_4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v4i64_4i32:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i16:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movd %rax, %xmm0
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
; SSE-NEXT: pextrw $4, %xmm1, %eax
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: pextrw $4, %xmm0, %ecx
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pextrw $4, %xmm3, %edx
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: movd %ecx, %xmm1
; SSE-NEXT: pextrw $4, %xmm2, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i16:
; SSE: # BB#0:
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movd %rax, %xmm8
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
; SSE-NEXT: paddq %xmm8, %xmm0
; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
; SSE-NEXT: paddq {{.*}}(%rip), %xmm4
; SSE-NEXT: paddq {{.*}}(%rip), %xmm5
; SSE-NEXT: paddq {{.*}}(%rip), %xmm6
; SSE-NEXT: paddq {{.*}}(%rip), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm4
; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: paddd {{.*}}(%rip), %xmm2
; SSE-NEXT: paddd {{.*}}(%rip), %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: retq
  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; sub
;

define <4 x i32> @trunc_sub_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v4i64_4i32:
; SSE: # BB#0:
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: psubq %xmm3, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v4i64_4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v4i64_4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v4i64_4i32:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: retq
  %1 = sub <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_8i16:
; SSE: # BB#0:
; SSE-NEXT: psubq %xmm6, %xmm2
; SSE-NEXT: psubq %xmm4, %xmm0
; SSE-NEXT: psubq %xmm7, %xmm3
; SSE-NEXT: psubq %xmm5, %xmm1
; SSE-NEXT: pextrw $4, %xmm1, %eax
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: pextrw $4, %xmm0, %ecx
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pextrw $4, %xmm3, %edx
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: pextrw $4, %xmm2, %eax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movd %ecx, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i64_8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i64_8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i64_8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = sub <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i32_8i16:
; SSE: # BB#0:
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: psubd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i32_8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i32_8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i32_8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: retq
  %1 = sub <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = sub <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i32_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psubd %xmm4, %xmm0
; SSE-NEXT: psubd %xmm5, %xmm1
; SSE-NEXT: psubd %xmm6, %xmm2
; SSE-NEXT: psubd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i32_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i32_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i32_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = sub <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i16_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i16_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i16_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: retq
  %1 = sub <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; sub to constant
;

define <4 x i32> @trunc_sub_const_v4i64_4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v4i64_4i32:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movd %rax, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v4i64_4i32:
; AVX1: # BB#0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v4i64_4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v4i64_4i32:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: retq
  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i64_v16i16:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movd %rax, %xmm4
; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
; SSE-NEXT: psubq %xmm4, %xmm0
; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
; SSE-NEXT: pextrw $4, %xmm1, %eax
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: pextrw $4, %xmm0, %ecx
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pextrw $4, %xmm3, %edx
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: movd %ecx, %xmm1
; SSE-NEXT: pextrw $4, %xmm2, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i64_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i64_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i32_v16i16:
; SSE: # BB#0:
; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i32_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i32_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: retq
  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movd %rax, %xmm8
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
; SSE-NEXT: psubq %xmm8, %xmm0
; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
; SSE-NEXT: psubq {{.*}}(%rip), %xmm4
; SSE-NEXT: psubq {{.*}}(%rip), %xmm5
; SSE-NEXT: psubq {{.*}}(%rip), %xmm6
; SSE-NEXT: psubq {{.*}}(%rip), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm4
; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 1427; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 1428; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 1429; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 1430; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 1431; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1432; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1433; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1434; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 1435; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1436; AVX2-NEXT: vzeroupper 1437; AVX2-NEXT: retq 1438; 1439; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: 1440; AVX512: # BB#0: 1441; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1 1442; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 1443; AVX512-NEXT: vpmovqd %zmm0, %ymm0 1444; AVX512-NEXT: vpmovqd %zmm1, %ymm1 1445; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1446; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1447; AVX512-NEXT: retq 1448 %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 1449 %2 = trunc <16 x i64> %1 to <16 x i8> 1450 ret <16 x i8> %2 1451} 1452 1453define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 1454; SSE-LABEL: trunc_sub_const_v16i32_v16i8: 1455; SSE: # BB#0: 1456; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 1457; SSE-NEXT: psubd {{.*}}(%rip), %xmm1 1458; SSE-NEXT: psubd {{.*}}(%rip), %xmm2 1459; SSE-NEXT: psubd {{.*}}(%rip), %xmm3 1460; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1461; SSE-NEXT: pand %xmm4, %xmm3 1462; SSE-NEXT: pand %xmm4, %xmm2 1463; SSE-NEXT: packuswb %xmm3, %xmm2 1464; SSE-NEXT: pand %xmm4, %xmm1 1465; SSE-NEXT: pand %xmm4, %xmm0 1466; SSE-NEXT: packuswb %xmm1, %xmm0 1467; SSE-NEXT: packuswb %xmm2, %xmm0 1468; SSE-NEXT: retq 1469; 1470; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: 1471; AVX1: # BB#0: 1472; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2 1473; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1474; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 1475; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3 1476; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1477; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1 1478; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1479; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1480; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 1481; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 1482; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1483; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1484; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 1485; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1486; AVX1-NEXT: vzeroupper 1487; AVX1-NEXT: retq 1488; 1489; AVX2-LABEL: trunc_sub_const_v16i32_v16i8: 1490; AVX2: # BB#0: 1491; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 1492; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1 1493; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 1494; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1495; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1496; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1497; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1498; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1499; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1500; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1501; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1502; AVX2-NEXT: vzeroupper 1503; AVX2-NEXT: retq 1504; 1505; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: 1506; AVX512: # BB#0: 1507; 
AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0 1508; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1509; AVX512-NEXT: retq 1510 %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1511 %2 = trunc <16 x i32> %1 to <16 x i8> 1512 ret <16 x i8> %2 1513} 1514 1515define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 1516; SSE-LABEL: trunc_sub_const_v16i16_v16i8: 1517; SSE: # BB#0: 1518; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 1519; SSE-NEXT: psubw {{.*}}(%rip), %xmm1 1520; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1521; SSE-NEXT: pand %xmm2, %xmm1 1522; SSE-NEXT: pand %xmm2, %xmm0 1523; SSE-NEXT: packuswb %xmm1, %xmm0 1524; SSE-NEXT: retq 1525; 1526; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: 1527; AVX1: # BB#0: 1528; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1 1529; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1530; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 1531; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1532; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1533; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1534; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1535; AVX1-NEXT: vzeroupper 1536; AVX1-NEXT: retq 1537; 1538; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: 1539; AVX2: # BB#0: 1540; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 1541; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1542; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1543; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1544; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1545; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1546; AVX2-NEXT: vzeroupper 1547; AVX2-NEXT: retq 1548; 1549; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: 1550; AVX512F: # BB#0: 1551; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 1552; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 1553; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1554; AVX512F-NEXT: retq 1555; 1556; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: 1557; AVX512BW: # BB#0: 1558; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 1559; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1560; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 1561; AVX512BW-NEXT: retq 1562 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1563 %2 = trunc <16 x i16> %1 to <16 x i8> 1564 ret <16 x i8> %2 1565} 1566 1567; 1568; mul 1569; 1570 1571define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 1572; SSE-LABEL: trunc_mul_v4i64_4i32: 1573; SSE: # BB#0: 1574; SSE-NEXT: movdqa %xmm0, %xmm4 1575; SSE-NEXT: pmuludq %xmm2, %xmm4 1576; SSE-NEXT: movdqa %xmm2, %xmm5 1577; SSE-NEXT: psrlq $32, %xmm5 1578; SSE-NEXT: pmuludq %xmm0, %xmm5 1579; SSE-NEXT: psllq $32, %xmm5 1580; SSE-NEXT: paddq %xmm4, %xmm5 1581; SSE-NEXT: psrlq $32, %xmm0 1582; SSE-NEXT: pmuludq %xmm2, %xmm0 1583; SSE-NEXT: psllq $32, %xmm0 1584; SSE-NEXT: paddq %xmm5, %xmm0 1585; SSE-NEXT: movdqa %xmm1, %xmm2 1586; SSE-NEXT: pmuludq %xmm3, %xmm2 1587; SSE-NEXT: movdqa %xmm3, %xmm4 1588; SSE-NEXT: psrlq $32, %xmm4 1589; SSE-NEXT: pmuludq %xmm1, %xmm4 1590; SSE-NEXT: psllq $32, %xmm4 1591; SSE-NEXT: paddq %xmm2, %xmm4 1592; SSE-NEXT: psrlq $32, %xmm1 1593; SSE-NEXT: pmuludq %xmm3, %xmm1 1594; SSE-NEXT: psllq $32, %xmm1 1595; SSE-NEXT: paddq %xmm4, %xmm1 1596; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1597; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1598; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] 1599; SSE-NEXT: retq 1600; 1601; AVX1-LABEL: trunc_mul_v4i64_4i32: 1602; AVX1: # BB#0: 1603; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 1604; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 1605; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 1606; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 1607; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1608; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 1609; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 1610; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 1611; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 1612; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1613; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1614; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 1615; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 1616; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 1617; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 1618; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 1619; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 1620; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1621; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 1622; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 1623; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] 1624; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] 1625; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1626; AVX1-NEXT: vzeroupper 1627; AVX1-NEXT: retq 1628; 1629; AVX2-LABEL: trunc_mul_v4i64_4i32: 1630; AVX2: # BB#0: 1631; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 1632; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 1633; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 1634; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 1635; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 1636; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 1637; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 1638; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 1639; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 1640; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 1641; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 1642; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 1643; AVX2-NEXT: vzeroupper 1644; AVX2-NEXT: retq 1645; 1646; AVX512-LABEL: trunc_mul_v4i64_4i32: 1647; AVX512: # BB#0: 1648; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 1649; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3 1650; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 1651; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3 1652; AVX512-NEXT: vpaddq %ymm3, %ymm2, %ymm2 1653; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0 1654; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 1655; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0 1656; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0 1657; AVX512-NEXT: vpmovqd %zmm0, %ymm0 1658; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 1659; AVX512-NEXT: retq 1660 %1 = mul <4 x i64> %a0, %a1 1661 %2 = trunc <4 x i64> %1 to <4 x i32> 1662 ret <4 x i32> %2 1663} 1664 1665define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 1666; SSE-LABEL: trunc_mul_v8i64_8i16: 1667; SSE: # BB#0: 1668; SSE-NEXT: movdqa %xmm2, %xmm8 1669; SSE-NEXT: pmuludq %xmm6, %xmm8 1670; SSE-NEXT: movdqa %xmm6, %xmm9 1671; SSE-NEXT: psrlq $32, %xmm9 1672; SSE-NEXT: pmuludq %xmm2, %xmm9 1673; SSE-NEXT: psllq $32, %xmm9 1674; SSE-NEXT: paddq %xmm8, %xmm9 1675; SSE-NEXT: psrlq $32, %xmm2 1676; SSE-NEXT: pmuludq %xmm6, %xmm2 1677; SSE-NEXT: psllq $32, %xmm2 1678; SSE-NEXT: paddq %xmm9, %xmm2 1679; SSE-NEXT: movdqa %xmm0, %xmm8 1680; SSE-NEXT: pmuludq %xmm4, %xmm8 1681; SSE-NEXT: movdqa %xmm4, %xmm6 1682; SSE-NEXT: psrlq $32, %xmm6 1683; SSE-NEXT: pmuludq %xmm0, %xmm6 1684; SSE-NEXT: psllq $32, %xmm6 1685; SSE-NEXT: paddq %xmm8, %xmm6 1686; SSE-NEXT: psrlq $32, %xmm0 1687; SSE-NEXT: pmuludq %xmm4, %xmm0 1688; SSE-NEXT: psllq $32, %xmm0 1689; SSE-NEXT: paddq %xmm6, %xmm0 1690; SSE-NEXT: movdqa 
%xmm3, %xmm4 1691; SSE-NEXT: pmuludq %xmm7, %xmm4 1692; SSE-NEXT: movdqa %xmm7, %xmm6 1693; SSE-NEXT: psrlq $32, %xmm6 1694; SSE-NEXT: pmuludq %xmm3, %xmm6 1695; SSE-NEXT: psllq $32, %xmm6 1696; SSE-NEXT: paddq %xmm4, %xmm6 1697; SSE-NEXT: psrlq $32, %xmm3 1698; SSE-NEXT: pmuludq %xmm7, %xmm3 1699; SSE-NEXT: psllq $32, %xmm3 1700; SSE-NEXT: paddq %xmm6, %xmm3 1701; SSE-NEXT: movdqa %xmm1, %xmm4 1702; SSE-NEXT: pmuludq %xmm5, %xmm4 1703; SSE-NEXT: movdqa %xmm5, %xmm6 1704; SSE-NEXT: psrlq $32, %xmm6 1705; SSE-NEXT: pmuludq %xmm1, %xmm6 1706; SSE-NEXT: psllq $32, %xmm6 1707; SSE-NEXT: paddq %xmm4, %xmm6 1708; SSE-NEXT: psrlq $32, %xmm1 1709; SSE-NEXT: pmuludq %xmm5, %xmm1 1710; SSE-NEXT: psllq $32, %xmm1 1711; SSE-NEXT: paddq %xmm6, %xmm1 1712; SSE-NEXT: pextrw $4, %xmm1, %eax 1713; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 1714; SSE-NEXT: pextrw $4, %xmm0, %ecx 1715; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1716; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1717; SSE-NEXT: pextrw $4, %xmm3, %edx 1718; SSE-NEXT: movd %edx, %xmm1 1719; SSE-NEXT: movd %eax, %xmm3 1720; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 1721; SSE-NEXT: pextrw $4, %xmm2, %eax 1722; SSE-NEXT: movd %eax, %xmm1 1723; SSE-NEXT: movd %ecx, %xmm2 1724; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1725; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 1726; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1727; SSE-NEXT: retq 1728; 1729; AVX1-LABEL: trunc_mul_v8i64_8i16: 1730; AVX1: # BB#0: 1731; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm4 1732; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 1733; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5 1734; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 1735; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4 1736; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 1737; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 1738; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 1739; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4 1740; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1741; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1742; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5 1743; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 1744; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 1745; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 1746; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 1747; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 1748; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 1749; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 1750; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 1751; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm2 1752; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 1753; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 1754; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 1755; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 1756; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 1757; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5 1758; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 1759; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 1760; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1761; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1762; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 1763; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6 1764; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6 1765; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 1766; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 1767; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 1768; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 1769; 
AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 1770; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 1771; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1772; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 1773; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 1774; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 1775; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 1776; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] 1777; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 1778; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1779; AVX1-NEXT: vzeroupper 1780; AVX1-NEXT: retq 1781; 1782; AVX2-LABEL: trunc_mul_v8i64_8i16: 1783; AVX2: # BB#0: 1784; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm4 1785; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5 1786; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5 1787; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 1788; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4 1789; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 1790; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 1791; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 1792; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1 1793; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3 1794; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 1795; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 1796; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 1797; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3 1798; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 1799; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 1800; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 1801; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 1802; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 1803; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 1804; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 1805; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 1806; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1807; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 1808; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1809; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 1810; AVX2-NEXT: vzeroupper 1811; AVX2-NEXT: retq 1812; 1813; AVX512-LABEL: trunc_mul_v8i64_8i16: 1814; AVX512: # BB#0: 1815; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 1816; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3 1817; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 1818; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3 1819; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm2 1820; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 1821; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 1822; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0 1823; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0 1824; AVX512-NEXT: vpmovqw %zmm0, %xmm0 1825; AVX512-NEXT: retq 1826 %1 = mul <8 x i64> %a0, %a1 1827 %2 = trunc <8 x i64> %1 to <8 x i16> 1828 ret <8 x i16> %2 1829} 1830 1831define <8 x i16> @trunc_mul_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 1832; SSE-LABEL: trunc_mul_v8i32_8i16: 1833; SSE: # BB#0: 1834; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 1835; SSE-NEXT: pmuludq %xmm2, %xmm0 1836; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1837; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1838; SSE-NEXT: pmuludq %xmm4, %xmm2 1839; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1840; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1841; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1842; SSE-NEXT: pmuludq %xmm3, %xmm1 1843; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1844; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1845; SSE-NEXT: pmuludq %xmm2, %xmm3 1846; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm3[0,2,2,3] 1847; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1848; SSE-NEXT: pslld $16, %xmm1 1849; SSE-NEXT: psrad $16, %xmm1 1850; SSE-NEXT: pslld $16, %xmm0 1851; SSE-NEXT: psrad $16, %xmm0 1852; SSE-NEXT: packssdw %xmm1, %xmm0 1853; SSE-NEXT: retq 1854; 1855; AVX1-LABEL: trunc_mul_v8i32_8i16: 1856; AVX1: # BB#0: 1857; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 1858; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1859; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1860; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1861; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1862; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1863; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 1864; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1865; AVX1-NEXT: vzeroupper 1866; AVX1-NEXT: retq 1867; 1868; AVX2-LABEL: trunc_mul_v8i32_8i16: 1869; AVX2: # BB#0: 1870; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1871; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 1872; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1873; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 1874; AVX2-NEXT: vzeroupper 1875; AVX2-NEXT: retq 1876; 1877; AVX512-LABEL: trunc_mul_v8i32_8i16: 1878; AVX512: # BB#0: 1879; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1880; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1881; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 1882; AVX512-NEXT: retq 1883 %1 = mul <8 x i32> %a0, %a1 1884 %2 = trunc <8 x i32> %1 to <8 x i16> 1885 ret <8 x i16> %2 1886} 1887 1888define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 1889; SSE-LABEL: trunc_mul_v16i64_v16i8: 1890; SSE: # BB#0: 1891; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1892; SSE-NEXT: movdqa %xmm0, %xmm9 1893; SSE-NEXT: pmuludq %xmm8, %xmm9 1894; SSE-NEXT: movdqa %xmm8, %xmm10 1895; SSE-NEXT: psrlq $32, %xmm10 1896; SSE-NEXT: pmuludq %xmm0, %xmm10 1897; SSE-NEXT: psllq $32, %xmm10 1898; SSE-NEXT: paddq %xmm10, %xmm9 1899; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 1900; SSE-NEXT: psrlq $32, %xmm0 1901; SSE-NEXT: pmuludq %xmm8, %xmm0 1902; SSE-NEXT: psllq $32, %xmm0 1903; SSE-NEXT: paddq %xmm9, %xmm0 1904; SSE-NEXT: movdqa %xmm1, %xmm8 1905; SSE-NEXT: pmuludq %xmm10, %xmm8 1906; SSE-NEXT: movdqa %xmm10, %xmm9 1907; SSE-NEXT: psrlq $32, %xmm9 1908; SSE-NEXT: pmuludq %xmm1, %xmm9 1909; SSE-NEXT: psllq $32, %xmm9 1910; SSE-NEXT: paddq %xmm8, %xmm9 1911; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1912; SSE-NEXT: psrlq $32, %xmm1 1913; SSE-NEXT: pmuludq %xmm10, %xmm1 1914; SSE-NEXT: psllq $32, %xmm1 1915; SSE-NEXT: paddq %xmm9, %xmm1 1916; SSE-NEXT: movdqa %xmm2, %xmm9 1917; SSE-NEXT: pmuludq %xmm8, %xmm9 1918; SSE-NEXT: movdqa %xmm8, %xmm10 1919; SSE-NEXT: psrlq $32, %xmm10 1920; SSE-NEXT: pmuludq %xmm2, %xmm10 1921; SSE-NEXT: psllq $32, %xmm10 1922; SSE-NEXT: paddq %xmm9, %xmm10 1923; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 1924; SSE-NEXT: psrlq $32, %xmm2 1925; SSE-NEXT: pmuludq %xmm8, %xmm2 1926; SSE-NEXT: psllq $32, %xmm2 1927; SSE-NEXT: paddq %xmm10, %xmm2 1928; SSE-NEXT: movdqa %xmm3, %xmm8 1929; SSE-NEXT: pmuludq %xmm9, %xmm8 1930; SSE-NEXT: movdqa %xmm9, %xmm10 1931; SSE-NEXT: psrlq $32, %xmm10 1932; SSE-NEXT: pmuludq %xmm3, %xmm10 1933; SSE-NEXT: psllq $32, %xmm10 1934; SSE-NEXT: paddq %xmm8, %xmm10 1935; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1936; SSE-NEXT: psrlq $32, %xmm3 1937; SSE-NEXT: pmuludq %xmm9, %xmm3 1938; SSE-NEXT: psllq $32, %xmm3 1939; SSE-NEXT: paddq %xmm10, 
%xmm3 1940; SSE-NEXT: movdqa %xmm4, %xmm9 1941; SSE-NEXT: pmuludq %xmm8, %xmm9 1942; SSE-NEXT: movdqa %xmm8, %xmm10 1943; SSE-NEXT: psrlq $32, %xmm10 1944; SSE-NEXT: pmuludq %xmm4, %xmm10 1945; SSE-NEXT: psllq $32, %xmm10 1946; SSE-NEXT: paddq %xmm9, %xmm10 1947; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 1948; SSE-NEXT: psrlq $32, %xmm4 1949; SSE-NEXT: pmuludq %xmm8, %xmm4 1950; SSE-NEXT: psllq $32, %xmm4 1951; SSE-NEXT: paddq %xmm10, %xmm4 1952; SSE-NEXT: movdqa %xmm5, %xmm8 1953; SSE-NEXT: pmuludq %xmm9, %xmm8 1954; SSE-NEXT: movdqa %xmm9, %xmm10 1955; SSE-NEXT: psrlq $32, %xmm10 1956; SSE-NEXT: pmuludq %xmm5, %xmm10 1957; SSE-NEXT: psllq $32, %xmm10 1958; SSE-NEXT: paddq %xmm8, %xmm10 1959; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1960; SSE-NEXT: psrlq $32, %xmm5 1961; SSE-NEXT: pmuludq %xmm9, %xmm5 1962; SSE-NEXT: psllq $32, %xmm5 1963; SSE-NEXT: paddq %xmm10, %xmm5 1964; SSE-NEXT: movdqa %xmm6, %xmm9 1965; SSE-NEXT: pmuludq %xmm8, %xmm9 1966; SSE-NEXT: movdqa %xmm8, %xmm10 1967; SSE-NEXT: psrlq $32, %xmm10 1968; SSE-NEXT: pmuludq %xmm6, %xmm10 1969; SSE-NEXT: psllq $32, %xmm10 1970; SSE-NEXT: paddq %xmm9, %xmm10 1971; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 1972; SSE-NEXT: psrlq $32, %xmm6 1973; SSE-NEXT: pmuludq %xmm8, %xmm6 1974; SSE-NEXT: psllq $32, %xmm6 1975; SSE-NEXT: paddq %xmm10, %xmm6 1976; SSE-NEXT: movdqa %xmm7, %xmm8 1977; SSE-NEXT: pmuludq %xmm9, %xmm8 1978; SSE-NEXT: movdqa %xmm9, %xmm10 1979; SSE-NEXT: psrlq $32, %xmm10 1980; SSE-NEXT: pmuludq %xmm7, %xmm10 1981; SSE-NEXT: psllq $32, %xmm10 1982; SSE-NEXT: paddq %xmm8, %xmm10 1983; SSE-NEXT: psrlq $32, %xmm7 1984; SSE-NEXT: pmuludq %xmm9, %xmm7 1985; SSE-NEXT: psllq $32, %xmm7 1986; SSE-NEXT: paddq %xmm10, %xmm7 1987; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1988; SSE-NEXT: pand %xmm8, %xmm7 1989; SSE-NEXT: pand %xmm8, %xmm6 1990; SSE-NEXT: packuswb %xmm7, %xmm6 1991; SSE-NEXT: pand %xmm8, %xmm5 1992; SSE-NEXT: pand %xmm8, %xmm4 1993; SSE-NEXT: packuswb %xmm5, %xmm4 1994; SSE-NEXT: packuswb %xmm6, %xmm4 1995; SSE-NEXT: pand %xmm8, %xmm3 1996; SSE-NEXT: pand %xmm8, %xmm2 1997; SSE-NEXT: packuswb %xmm3, %xmm2 1998; SSE-NEXT: pand %xmm8, %xmm1 1999; SSE-NEXT: pand %xmm8, %xmm0 2000; SSE-NEXT: packuswb %xmm1, %xmm0 2001; SSE-NEXT: packuswb %xmm2, %xmm0 2002; SSE-NEXT: packuswb %xmm4, %xmm0 2003; SSE-NEXT: retq 2004; 2005; AVX1-LABEL: trunc_mul_v16i64_v16i8: 2006; AVX1: # BB#0: 2007; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8 2008; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9 2009; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9 2010; AVX1-NEXT: vpsllq $32, %xmm9, %xmm9 2011; AVX1-NEXT: vpaddq %xmm9, %xmm8, %xmm8 2012; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm9 2013; AVX1-NEXT: vpmuludq %xmm4, %xmm9, %xmm9 2014; AVX1-NEXT: vpsllq $32, %xmm9, %xmm9 2015; AVX1-NEXT: vpaddq %xmm9, %xmm8, %xmm8 2016; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm10 2017; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2018; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm9 2019; AVX1-NEXT: vpsrlq $32, %xmm10, %xmm4 2020; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 2021; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2022; AVX1-NEXT: vpaddq %xmm4, %xmm9, %xmm4 2023; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 2024; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm0 2025; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2026; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm9 2027; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 2028; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0 2029; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 2030; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2031; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 2032; AVX1-NEXT: vpsrlq $32, 
%xmm1, %xmm4 2033; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4 2034; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2035; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm10 2036; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0 2037; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2038; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm5 2039; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 2040; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4 2041; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2042; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 2043; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 2044; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 2045; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2046; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm1 2047; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm0 2048; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4 2049; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 2050; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2051; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 2052; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 2053; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4 2054; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2055; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm5 2056; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0 2057; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2058; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm4 2059; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6 2060; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6 2061; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 2062; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4 2063; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 2064; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0 2065; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2066; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 2067; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm2 2068; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4 2069; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 2070; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2071; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 2072; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 2073; AVX1-NEXT: vpmuludq %xmm7, %xmm4, %xmm4 2074; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2075; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 2076; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 2077; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2078; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm6 2079; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7 2080; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7 2081; AVX1-NEXT: vpsllq $32, %xmm7, %xmm7 2082; AVX1-NEXT: vpaddq %xmm7, %xmm6, %xmm6 2083; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3 2084; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 2085; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 2086; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3 2087; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2088; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2089; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2090; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 2091; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2092; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 2093; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0 2094; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2095; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2096; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm2 2097; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 2098; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm2 2099; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 2100; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 2101; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 2102; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2103; AVX1-NEXT: vzeroupper 2104; AVX1-NEXT: retq 2105; 2106; AVX2-LABEL: trunc_mul_v16i64_v16i8: 2107; AVX2: # BB#0: 2108; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm8 2109; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9 2110; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9 2111; AVX2-NEXT: vpsllq $32, %ymm9, %ymm9 2112; AVX2-NEXT: vpaddq %ymm9, %ymm8, %ymm8 2113; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 2114; AVX2-NEXT: 
vpmuludq %ymm5, %ymm1, %ymm1 2115; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 2116; AVX2-NEXT: vpaddq %ymm1, %ymm8, %ymm1 2117; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5 2118; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8 2119; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8 2120; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8 2121; AVX2-NEXT: vpaddq %ymm8, %ymm5, %ymm5 2122; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 2123; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 2124; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 2125; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0 2126; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm4 2127; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5 2128; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5 2129; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 2130; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4 2131; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 2132; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3 2133; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 2134; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 2135; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm4 2136; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5 2137; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5 2138; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 2139; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4 2140; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 2141; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2 2142; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 2143; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm2 2144; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] 2145; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] 2146; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] 2147; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] 2148; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2149; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 2150; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2151; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2152; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2153; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 2154; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 2155; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 2156; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 2157; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 2158; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2159; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2160; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2161; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 2162; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2163; AVX2-NEXT: vzeroupper 2164; AVX2-NEXT: retq 2165; 2166; AVX512-LABEL: trunc_mul_v16i64_v16i8: 2167; AVX512: # BB#0: 2168; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 2169; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm5 2170; AVX512-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 2171; AVX512-NEXT: vpsllq $32, %zmm5, %zmm5 2172; AVX512-NEXT: vpaddq %zmm5, %zmm4, %zmm4 2173; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1 2174; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 2175; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1 2176; AVX512-NEXT: vpaddq %zmm1, %zmm4, %zmm1 2177; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3 2178; AVX512-NEXT: vpsrlq $32, %zmm2, %zmm4 2179; AVX512-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 2180; AVX512-NEXT: vpsllq $32, %zmm4, %zmm4 2181; AVX512-NEXT: vpaddq %zmm4, %zmm3, %zmm3 2182; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 2183; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 2184; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0 2185; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0 2186; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2187; AVX512-NEXT: vpmovqd %zmm1, %ymm1 2188; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2189; AVX512-NEXT: vpmovdb 
%zmm0, %xmm0 2190; AVX512-NEXT: retq 2191 %1 = mul <16 x i64> %a0, %a1 2192 %2 = trunc <16 x i64> %1 to <16 x i8> 2193 ret <16 x i8> %2 2194} 2195 2196define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 2197; SSE-LABEL: trunc_mul_v16i32_v16i8: 2198; SSE: # BB#0: 2199; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] 2200; SSE-NEXT: pmuludq %xmm4, %xmm0 2201; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2202; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2203; SSE-NEXT: pmuludq %xmm8, %xmm4 2204; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2205; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2206; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 2207; SSE-NEXT: pmuludq %xmm5, %xmm1 2208; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2209; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 2210; SSE-NEXT: pmuludq %xmm4, %xmm5 2211; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2212; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2213; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 2214; SSE-NEXT: pmuludq %xmm6, %xmm2 2215; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2216; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] 2217; SSE-NEXT: pmuludq %xmm4, %xmm5 2218; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2219; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2220; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 2221; SSE-NEXT: pmuludq %xmm7, %xmm3 2222; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2223; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 2224; SSE-NEXT: pmuludq %xmm4, %xmm5 2225; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2226; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2227; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2228; SSE-NEXT: pand %xmm4, %xmm3 2229; SSE-NEXT: pand %xmm4, %xmm2 2230; SSE-NEXT: packuswb %xmm3, %xmm2 2231; SSE-NEXT: pand %xmm4, %xmm1 2232; SSE-NEXT: pand %xmm4, %xmm0 2233; SSE-NEXT: packuswb %xmm1, %xmm0 2234; SSE-NEXT: packuswb %xmm2, %xmm0 2235; SSE-NEXT: retq 2236; 2237; AVX1-LABEL: trunc_mul_v16i32_v16i8: 2238; AVX1: # BB#0: 2239; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 2240; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2241; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2242; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 2243; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 2244; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2245; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2246; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 2247; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2248; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2249; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2250; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 2251; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 2252; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 2253; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 2254; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2255; AVX1-NEXT: vzeroupper 2256; AVX1-NEXT: retq 2257; 2258; AVX2-LABEL: trunc_mul_v16i32_v16i8: 2259; AVX2: # BB#0: 2260; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 2261; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 2262; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 2263; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2264; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2265; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2266; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2267; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2268; AVX2-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2269; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2270; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2271; AVX2-NEXT: vzeroupper 2272; AVX2-NEXT: retq 2273; 2274; AVX512-LABEL: trunc_mul_v16i32_v16i8: 2275; AVX512: # BB#0: 2276; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 2277; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2278; AVX512-NEXT: retq 2279 %1 = mul <16 x i32> %a0, %a1 2280 %2 = trunc <16 x i32> %1 to <16 x i8> 2281 ret <16 x i8> %2 2282} 2283 2284define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 2285; SSE-LABEL: trunc_mul_v16i16_v16i8: 2286; SSE: # BB#0: 2287; SSE-NEXT: pmullw %xmm2, %xmm0 2288; SSE-NEXT: pmullw %xmm3, %xmm1 2289; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2290; SSE-NEXT: pand %xmm2, %xmm1 2291; SSE-NEXT: pand %xmm2, %xmm0 2292; SSE-NEXT: packuswb %xmm1, %xmm0 2293; SSE-NEXT: retq 2294; 2295; AVX1-LABEL: trunc_mul_v16i16_v16i8: 2296; AVX1: # BB#0: 2297; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2298; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2299; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2300; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2301; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2302; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 2303; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 2304; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2305; AVX1-NEXT: vzeroupper 2306; AVX1-NEXT: retq 2307; 2308; AVX2-LABEL: trunc_mul_v16i16_v16i8: 2309; AVX2: # BB#0: 2310; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2311; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2312; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2313; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2314; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2315; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2316; AVX2-NEXT: vzeroupper 2317; AVX2-NEXT: retq 2318; 2319; AVX512F-LABEL: trunc_mul_v16i16_v16i8: 2320; AVX512F: # BB#0: 2321; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2322; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 2323; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2324; AVX512F-NEXT: retq 2325; 2326; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: 2327; AVX512BW: # BB#0: 2328; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2329; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2330; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2331; AVX512BW-NEXT: retq 2332 %1 = mul <16 x i16> %a0, %a1 2333 %2 = trunc <16 x i16> %1 to <16 x i8> 2334 ret <16 x i8> %2 2335} 2336 2337; 2338; mul to constant 2339; 2340 2341define <4 x i32> @trunc_mul_const_v4i64_4i32(<4 x i64> %a0) nounwind { 2342; SSE-LABEL: trunc_mul_const_v4i64_4i32: 2343; SSE: # BB#0: 2344; SSE-NEXT: movl $1, %eax 2345; SSE-NEXT: movd %rax, %xmm2 2346; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 2347; SSE-NEXT: movdqa %xmm0, %xmm3 2348; SSE-NEXT: pmuludq %xmm2, %xmm3 2349; SSE-NEXT: psrlq $32, %xmm0 2350; SSE-NEXT: pmuludq %xmm2, %xmm0 2351; SSE-NEXT: psllq $32, %xmm0 2352; SSE-NEXT: paddq %xmm3, %xmm0 2353; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3] 2354; SSE-NEXT: movdqa %xmm1, %xmm3 2355; SSE-NEXT: pmuludq %xmm2, %xmm3 2356; SSE-NEXT: psrlq $32, %xmm1 2357; SSE-NEXT: pmuludq %xmm2, %xmm1 2358; SSE-NEXT: psllq $32, %xmm1 2359; SSE-NEXT: paddq %xmm3, %xmm1 2360; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2361; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2362; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2363; SSE-NEXT: retq 2364; 2365; AVX1-LABEL: trunc_mul_const_v4i64_4i32: 2366; AVX1: # BB#0: 2367; 
AVX1-NEXT: movl $1, %eax 2368; AVX1-NEXT: vmovq %rax, %xmm1 2369; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] 2370; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 2371; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 2372; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 2373; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 2374; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 2375; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2376; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3] 2377; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 2378; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 2379; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 2380; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2381; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 2382; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] 2383; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2384; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 2385; AVX1-NEXT: vzeroupper 2386; AVX1-NEXT: retq 2387; 2388; AVX2-LABEL: trunc_mul_const_v4i64_4i32: 2389; AVX2: # BB#0: 2390; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] 2391; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 2392; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 2393; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 2394; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 2395; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 2396; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 2397; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 2398; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2399; AVX2-NEXT: vzeroupper 2400; AVX2-NEXT: retq 2401; 2402; AVX512-LABEL: trunc_mul_const_v4i64_4i32: 2403; AVX512: # BB#0: 2404; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] 2405; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 2406; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0 2407; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 2408; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0 2409; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0 2410; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2411; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2412; AVX512-NEXT: retq 2413 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2414 %2 = trunc <4 x i64> %1 to <4 x i32> 2415 ret <4 x i32> %2 2416} 2417 2418define <8 x i16> @trunc_mul_const_v16i64_v16i16(<8 x i64> %a0) nounwind { 2419; SSE-LABEL: trunc_mul_const_v16i64_v16i16: 2420; SSE: # BB#0: 2421; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5] 2422; SSE-NEXT: movdqa %xmm2, %xmm5 2423; SSE-NEXT: pmuludq %xmm4, %xmm5 2424; SSE-NEXT: psrlq $32, %xmm2 2425; SSE-NEXT: pmuludq %xmm4, %xmm2 2426; SSE-NEXT: psllq $32, %xmm2 2427; SSE-NEXT: paddq %xmm5, %xmm2 2428; SSE-NEXT: movl $1, %eax 2429; SSE-NEXT: movd %rax, %xmm4 2430; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] 2431; SSE-NEXT: movdqa %xmm0, %xmm5 2432; SSE-NEXT: pmuludq %xmm4, %xmm5 2433; SSE-NEXT: psrlq $32, %xmm0 2434; SSE-NEXT: pmuludq %xmm4, %xmm0 2435; SSE-NEXT: psllq $32, %xmm0 2436; SSE-NEXT: paddq %xmm5, %xmm0 2437; SSE-NEXT: movdqa {{.*#+}} xmm4 = [6,7] 2438; SSE-NEXT: movdqa %xmm3, %xmm5 2439; SSE-NEXT: pmuludq %xmm4, %xmm5 2440; SSE-NEXT: psrlq $32, %xmm3 2441; SSE-NEXT: pmuludq %xmm4, %xmm3 2442; SSE-NEXT: psllq $32, %xmm3 2443; SSE-NEXT: paddq %xmm5, %xmm3 2444; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,3] 2445; SSE-NEXT: movdqa %xmm1, %xmm5 2446; SSE-NEXT: pmuludq %xmm4, %xmm5 2447; SSE-NEXT: psrlq $32, %xmm1 2448; SSE-NEXT: pmuludq %xmm4, %xmm1 2449; SSE-NEXT: psllq $32, %xmm1 2450; SSE-NEXT: paddq %xmm5, %xmm1 2451; SSE-NEXT: pextrw $4, %xmm1, %eax 2452; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 2453; 
SSE-NEXT: pextrw $4, %xmm0, %ecx 2454; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2455; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2456; SSE-NEXT: pextrw $4, %xmm3, %edx 2457; SSE-NEXT: movd %edx, %xmm1 2458; SSE-NEXT: movd %eax, %xmm3 2459; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 2460; SSE-NEXT: pextrw $4, %xmm2, %eax 2461; SSE-NEXT: movd %eax, %xmm1 2462; SSE-NEXT: movd %ecx, %xmm2 2463; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 2464; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 2465; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2466; SSE-NEXT: retq 2467; 2468; AVX1-LABEL: trunc_mul_const_v16i64_v16i16: 2469; AVX1: # BB#0: 2470; AVX1-NEXT: movl $1, %eax 2471; AVX1-NEXT: vmovq %rax, %xmm2 2472; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 2473; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 2474; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 2475; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 2476; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 2477; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 2478; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2479; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] 2480; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm4 2481; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 2482; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 2483; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2484; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 2485; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5] 2486; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4 2487; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 2488; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3 2489; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 2490; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 2491; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2492; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7] 2493; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5 2494; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 2495; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 2496; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 2497; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 2498; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 2499; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] 2500; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] 2501; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2502; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] 2503; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] 2504; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 2505; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2506; AVX1-NEXT: vzeroupper 2507; AVX1-NEXT: retq 2508; 2509; AVX2-LABEL: trunc_mul_const_v16i64_v16i16: 2510; AVX2: # BB#0: 2511; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7] 2512; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3 2513; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 2514; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 2515; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 2516; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 2517; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3] 2518; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3 2519; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 2520; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 2521; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 2522; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 2523; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 2524; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 
2525; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 2526; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 2527; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2528; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 2529; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2530; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2531; AVX2-NEXT: vzeroupper 2532; AVX2-NEXT: retq 2533; 2534; AVX512-LABEL: trunc_mul_const_v16i64_v16i16: 2535; AVX512: # BB#0: 2536; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7] 2537; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 2538; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 2539; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 2540; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0 2541; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0 2542; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2543; AVX512-NEXT: retq 2544 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 2545 %2 = trunc <8 x i64> %1 to <8 x i16> 2546 ret <8 x i16> %2 2547} 2548 2549define <8 x i16> @trunc_mul_const_v16i32_v16i16(<8 x i32> %a0) nounwind { 2550; SSE-LABEL: trunc_mul_const_v16i32_v16i16: 2551; SSE: # BB#0: 2552; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3] 2553; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 2554; SSE-NEXT: pmuludq %xmm2, %xmm0 2555; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2556; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 2557; SSE-NEXT: pmuludq %xmm3, %xmm2 2558; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2559; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2560; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,5,6,7] 2561; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 2562; SSE-NEXT: pmuludq %xmm2, %xmm1 2563; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2564; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 2565; SSE-NEXT: pmuludq %xmm3, %xmm2 2566; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2567; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2568; SSE-NEXT: pslld $16, %xmm1 2569; SSE-NEXT: psrad $16, %xmm1 2570; SSE-NEXT: pslld $16, %xmm0 2571; SSE-NEXT: psrad $16, %xmm0 2572; SSE-NEXT: packssdw %xmm1, %xmm0 2573; SSE-NEXT: retq 2574; 2575; AVX1-LABEL: trunc_mul_const_v16i32_v16i16: 2576; AVX1: # BB#0: 2577; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 2578; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2579; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2580; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2581; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2582; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2583; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2584; AVX1-NEXT: vzeroupper 2585; AVX1-NEXT: retq 2586; 2587; AVX2-LABEL: trunc_mul_const_v16i32_v16i16: 2588; AVX2: # BB#0: 2589; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 2590; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 2591; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2592; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2593; AVX2-NEXT: vzeroupper 2594; AVX2-NEXT: retq 2595; 2596; AVX512-LABEL: trunc_mul_const_v16i32_v16i16: 2597; AVX512: # BB#0: 2598; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 2599; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2600; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2601; AVX512-NEXT: retq 2602 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, 
i32 4, i32 5, i32 6, i32 7> 2603 %2 = trunc <8 x i32> %1 to <8 x i16> 2604 ret <8 x i16> %2 2605} 2606 2607define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 2608; SSE-LABEL: trunc_mul_const_v16i64_v16i8: 2609; SSE: # BB#0: 2610; SSE-NEXT: movl $1, %eax 2611; SSE-NEXT: movd %rax, %xmm8 2612; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] 2613; SSE-NEXT: movdqa %xmm0, %xmm9 2614; SSE-NEXT: pmuludq %xmm8, %xmm9 2615; SSE-NEXT: psrlq $32, %xmm0 2616; SSE-NEXT: pmuludq %xmm8, %xmm0 2617; SSE-NEXT: psllq $32, %xmm0 2618; SSE-NEXT: paddq %xmm9, %xmm0 2619; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3] 2620; SSE-NEXT: movdqa %xmm1, %xmm9 2621; SSE-NEXT: pmuludq %xmm8, %xmm9 2622; SSE-NEXT: psrlq $32, %xmm1 2623; SSE-NEXT: pmuludq %xmm8, %xmm1 2624; SSE-NEXT: psllq $32, %xmm1 2625; SSE-NEXT: paddq %xmm9, %xmm1 2626; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5] 2627; SSE-NEXT: movdqa %xmm2, %xmm9 2628; SSE-NEXT: pmuludq %xmm8, %xmm9 2629; SSE-NEXT: psrlq $32, %xmm2 2630; SSE-NEXT: pmuludq %xmm8, %xmm2 2631; SSE-NEXT: psllq $32, %xmm2 2632; SSE-NEXT: paddq %xmm9, %xmm2 2633; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7] 2634; SSE-NEXT: movdqa %xmm3, %xmm9 2635; SSE-NEXT: pmuludq %xmm8, %xmm9 2636; SSE-NEXT: psrlq $32, %xmm3 2637; SSE-NEXT: pmuludq %xmm8, %xmm3 2638; SSE-NEXT: psllq $32, %xmm3 2639; SSE-NEXT: paddq %xmm9, %xmm3 2640; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9] 2641; SSE-NEXT: movdqa %xmm4, %xmm9 2642; SSE-NEXT: pmuludq %xmm8, %xmm9 2643; SSE-NEXT: psrlq $32, %xmm4 2644; SSE-NEXT: pmuludq %xmm8, %xmm4 2645; SSE-NEXT: psllq $32, %xmm4 2646; SSE-NEXT: paddq %xmm9, %xmm4 2647; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11] 2648; SSE-NEXT: movdqa %xmm5, %xmm9 2649; SSE-NEXT: pmuludq %xmm8, %xmm9 2650; SSE-NEXT: psrlq $32, %xmm5 2651; SSE-NEXT: pmuludq %xmm8, %xmm5 2652; SSE-NEXT: psllq $32, %xmm5 2653; SSE-NEXT: paddq %xmm9, %xmm5 2654; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13] 2655; SSE-NEXT: movdqa %xmm6, %xmm9 2656; SSE-NEXT: pmuludq %xmm8, %xmm9 2657; SSE-NEXT: psrlq $32, %xmm6 2658; SSE-NEXT: pmuludq %xmm8, %xmm6 2659; SSE-NEXT: psllq $32, %xmm6 2660; SSE-NEXT: paddq %xmm9, %xmm6 2661; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15] 2662; SSE-NEXT: movdqa %xmm7, %xmm9 2663; SSE-NEXT: pmuludq %xmm8, %xmm9 2664; SSE-NEXT: psrlq $32, %xmm7 2665; SSE-NEXT: pmuludq %xmm8, %xmm7 2666; SSE-NEXT: psllq $32, %xmm7 2667; SSE-NEXT: paddq %xmm9, %xmm7 2668; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2669; SSE-NEXT: pand %xmm8, %xmm7 2670; SSE-NEXT: pand %xmm8, %xmm6 2671; SSE-NEXT: packuswb %xmm7, %xmm6 2672; SSE-NEXT: pand %xmm8, %xmm5 2673; SSE-NEXT: pand %xmm8, %xmm4 2674; SSE-NEXT: packuswb %xmm5, %xmm4 2675; SSE-NEXT: packuswb %xmm6, %xmm4 2676; SSE-NEXT: pand %xmm8, %xmm3 2677; SSE-NEXT: pand %xmm8, %xmm2 2678; SSE-NEXT: packuswb %xmm3, %xmm2 2679; SSE-NEXT: pand %xmm8, %xmm1 2680; SSE-NEXT: pand %xmm8, %xmm0 2681; SSE-NEXT: packuswb %xmm1, %xmm0 2682; SSE-NEXT: packuswb %xmm2, %xmm0 2683; SSE-NEXT: packuswb %xmm4, %xmm0 2684; SSE-NEXT: retq 2685; 2686; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: 2687; AVX1: # BB#0: 2688; AVX1-NEXT: movl $1, %eax 2689; AVX1-NEXT: vmovq %rax, %xmm4 2690; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] 2691; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm5 2692; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6 2693; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 2694; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2695; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm8 2696; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm0 2697; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3] 2698; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm6 2699; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 2700; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0 2701; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2702; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm9 2703; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5] 2704; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm6 2705; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm7 2706; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 2707; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 2708; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 2709; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2710; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7] 2711; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm7 2712; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 2713; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm1 2714; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 2715; AVX1-NEXT: vpaddq %xmm1, %xmm7, %xmm1 2716; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9] 2717; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7 2718; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 2719; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4 2720; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2721; AVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4 2722; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2723; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11] 2724; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7 2725; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 2726; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 2727; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 2728; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2 2729; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13] 2730; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7 2731; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm0 2732; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0 2733; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2734; AVX1-NEXT: vpaddq %xmm0, %xmm7, %xmm0 2735; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2736; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [14,15] 2737; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7 2738; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3 2739; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3 2740; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 2741; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3 2742; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2743; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 2744; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 2745; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 2746; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 2747; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm3 2748; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 2749; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 2750; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 2751; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm2 2752; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 2753; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm2 2754; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm3 2755; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 2756; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 2757; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2758; AVX1-NEXT: vzeroupper 2759; AVX1-NEXT: retq 2760; 2761; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: 2762; AVX2: # BB#0: 2763; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,6,7] 2764; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5 2765; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 2766; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1 2767; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 2768; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1 2769; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3] 2770; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5 2771; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 2772; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 2773; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 2774; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0 2775; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15] 2776; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm5 2777; 
AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 2778; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3 2779; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 2780; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3 2781; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,10,11] 2782; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5 2783; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 2784; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2 2785; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 2786; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2 2787; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] 2788; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] 2789; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] 2790; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] 2791; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2792; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 2793; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2794; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2795; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2796; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 2797; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 2798; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 2799; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 2800; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 2801; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2802; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2803; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2804; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 2805; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2806; AVX2-NEXT: vzeroupper 2807; AVX2-NEXT: retq 2808; 2809; AVX512-LABEL: trunc_mul_const_v16i64_v16i8: 2810; AVX512: # BB#0: 2811; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15] 2812; AVX512-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 2813; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1 2814; AVX512-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 2815; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1 2816; AVX512-NEXT: vpaddq %zmm1, %zmm3, %zmm1 2817; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7] 2818; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3 2819; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 2820; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 2821; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0 2822; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0 2823; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2824; AVX512-NEXT: vpmovqd %zmm1, %ymm1 2825; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2826; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2827; AVX512-NEXT: retq 2828 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 2829 %2 = trunc <16 x i64> %1 to <16 x i8> 2830 ret <16 x i8> %2 2831} 2832 2833define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 2834; SSE-LABEL: trunc_mul_const_v16i32_v16i8: 2835; SSE: # BB#0: 2836; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3] 2837; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 2838; SSE-NEXT: pmuludq %xmm4, %xmm0 2839; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2840; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2841; SSE-NEXT: pmuludq %xmm5, %xmm4 2842; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2843; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2844; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7] 2845; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 2846; SSE-NEXT: pmuludq %xmm4, %xmm1 2847; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2848; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2849; SSE-NEXT: pmuludq 
%xmm5, %xmm4 2850; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2851; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2852; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11] 2853; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] 2854; SSE-NEXT: pmuludq %xmm4, %xmm2 2855; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2856; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2857; SSE-NEXT: pmuludq %xmm5, %xmm4 2858; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2859; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2860; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15] 2861; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 2862; SSE-NEXT: pmuludq %xmm4, %xmm3 2863; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2864; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2865; SSE-NEXT: pmuludq %xmm5, %xmm4 2866; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2867; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2868; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2869; SSE-NEXT: pand %xmm4, %xmm3 2870; SSE-NEXT: pand %xmm4, %xmm2 2871; SSE-NEXT: packuswb %xmm3, %xmm2 2872; SSE-NEXT: pand %xmm4, %xmm1 2873; SSE-NEXT: pand %xmm4, %xmm0 2874; SSE-NEXT: packuswb %xmm1, %xmm0 2875; SSE-NEXT: packuswb %xmm2, %xmm0 2876; SSE-NEXT: retq 2877; 2878; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: 2879; AVX1: # BB#0: 2880; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 2881; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2882; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2883; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3 2884; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2885; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 2886; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2887; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2888; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2889; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 2890; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2891; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2892; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 2893; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2894; AVX1-NEXT: vzeroupper 2895; AVX1-NEXT: retq 2896; 2897; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: 2898; AVX2: # BB#0: 2899; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 2900; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 2901; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 2902; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2903; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2904; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2905; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2906; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2907; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2908; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2909; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2910; AVX2-NEXT: vzeroupper 2911; AVX2-NEXT: retq 2912; 2913; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: 2914; AVX512: # BB#0: 2915; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0 2916; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2917; AVX512-NEXT: retq 2918 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2919 %2 = trunc <16 x i32> %1 to <16 x i8> 2920 ret <16 x i8> %2 2921} 2922 2923define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 2924; SSE-LABEL: trunc_mul_const_v16i16_v16i8: 2925; SSE: # BB#0: 2926; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 2927; 
SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 2928; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2929; SSE-NEXT: pand %xmm2, %xmm1 2930; SSE-NEXT: pand %xmm2, %xmm0 2931; SSE-NEXT: packuswb %xmm1, %xmm0 2932; SSE-NEXT: retq 2933; 2934; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: 2935; AVX1: # BB#0: 2936; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2937; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2938; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 2939; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2940; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2941; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2942; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2943; AVX1-NEXT: vzeroupper 2944; AVX1-NEXT: retq 2945; 2946; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: 2947; AVX2: # BB#0: 2948; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 2949; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2950; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2951; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2952; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2953; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2954; AVX2-NEXT: vzeroupper 2955; AVX2-NEXT: retq 2956; 2957; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: 2958; AVX512F: # BB#0: 2959; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 2960; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 2961; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2962; AVX512F-NEXT: retq 2963; 2964; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: 2965; AVX512BW: # BB#0: 2966; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 2967; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2968; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2969; AVX512BW-NEXT: retq 2970 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 2971 %2 = trunc <16 x i16> %1 to <16 x i8> 2972 ret <16 x i8> %2 2973} 2974 2975; 2976; and 2977; 2978 2979define <4 x i32> @trunc_and_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2980; SSE-LABEL: trunc_and_v4i64_4i32: 2981; SSE: # BB#0: 2982; SSE-NEXT: pand %xmm2, %xmm0 2983; SSE-NEXT: pand %xmm3, %xmm1 2984; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2985; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2986; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2987; SSE-NEXT: retq 2988; 2989; AVX1-LABEL: trunc_and_v4i64_4i32: 2990; AVX1: # BB#0: 2991; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2992; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2993; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 2994; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2995; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2996; AVX1-NEXT: vzeroupper 2997; AVX1-NEXT: retq 2998; 2999; AVX2-LABEL: trunc_and_v4i64_4i32: 3000; AVX2: # BB#0: 3001; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 3002; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 3003; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 3004; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3005; AVX2-NEXT: vzeroupper 3006; AVX2-NEXT: retq 3007; 3008; AVX512-LABEL: trunc_and_v4i64_4i32: 3009; AVX512: # BB#0: 3010; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0 3011; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3012; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3013; AVX512-NEXT: retq 3014 %1 = and <4 x i64> %a0, %a1 3015 %2 = trunc <4 x i64> %1 to <4 x i32> 3016 ret <4 x i32> %2 3017} 3018 3019define <8 x i16> @trunc_and_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 3020; SSE-LABEL: trunc_and_v8i64_8i16: 
; SSE: # BB#0:
; SSE-NEXT: pand %xmm6, %xmm2
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pand %xmm7, %xmm3
; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: pextrw $4, %xmm1, %eax
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: pextrw $4, %xmm0, %ecx
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pextrw $4, %xmm3, %edx
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: pextrw $4, %xmm2, %eax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movd %ecx, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v8i64_8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v8i64_8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_v8i64_8i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = and <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_and_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_and_v8i32_8i16:
; SSE: # BB#0:
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v8i32_8i16:
; AVX1: # BB#0:
3099; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 3100; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3101; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 3102; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3103; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3104; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3105; AVX1-NEXT: vzeroupper 3106; AVX1-NEXT: retq 3107; 3108; AVX2-LABEL: trunc_and_v8i32_8i16: 3109; AVX2: # BB#0: 3110; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 3111; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 3112; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3113; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3114; AVX2-NEXT: vzeroupper 3115; AVX2-NEXT: retq 3116; 3117; AVX512-LABEL: trunc_and_v8i32_8i16: 3118; AVX512: # BB#0: 3119; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0 3120; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3121; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3122; AVX512-NEXT: retq 3123 %1 = and <8 x i32> %a0, %a1 3124 %2 = trunc <8 x i32> %1 to <8 x i16> 3125 ret <8 x i16> %2 3126} 3127 3128define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 3129; SSE-LABEL: trunc_and_v16i64_v16i8: 3130; SSE: # BB#0: 3131; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0 3132; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1 3133; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2 3134; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3 3135; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4 3136; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5 3137; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6 3138; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7 3139; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3140; SSE-NEXT: pand %xmm8, %xmm7 3141; SSE-NEXT: pand %xmm8, %xmm6 3142; SSE-NEXT: packuswb %xmm7, %xmm6 3143; SSE-NEXT: pand %xmm8, %xmm5 3144; SSE-NEXT: pand %xmm8, %xmm4 3145; SSE-NEXT: packuswb %xmm5, %xmm4 3146; SSE-NEXT: packuswb %xmm6, %xmm4 3147; SSE-NEXT: pand %xmm8, %xmm3 3148; SSE-NEXT: pand %xmm8, %xmm2 3149; SSE-NEXT: packuswb %xmm3, %xmm2 3150; SSE-NEXT: pand %xmm8, %xmm1 3151; SSE-NEXT: pand %xmm8, %xmm0 3152; SSE-NEXT: packuswb %xmm1, %xmm0 3153; SSE-NEXT: packuswb %xmm2, %xmm0 3154; SSE-NEXT: packuswb %xmm4, %xmm0 3155; SSE-NEXT: retq 3156; 3157; AVX1-LABEL: trunc_and_v16i64_v16i8: 3158; AVX1: # BB#0: 3159; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3160; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 3161; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 3162; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 3163; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 3164; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3165; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 3166; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3167; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 3168; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 3169; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 3170; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 3171; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 3172; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 3173; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3174; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3175; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 3176; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 3177; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3178; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3179; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 3180; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 3181; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3182; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3183; AVX1-NEXT: vzeroupper 3184; AVX1-NEXT: 
retq 3185; 3186; AVX2-LABEL: trunc_and_v16i64_v16i8: 3187; AVX2: # BB#0: 3188; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1 3189; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3190; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3 3191; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2 3192; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] 3193; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] 3194; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] 3195; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] 3196; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3197; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 3198; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3199; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3200; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3201; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 3202; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 3203; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 3204; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 3205; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 3206; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3207; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3208; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3209; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 3210; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3211; AVX2-NEXT: vzeroupper 3212; AVX2-NEXT: retq 3213; 3214; AVX512-LABEL: trunc_and_v16i64_v16i8: 3215; AVX512: # BB#0: 3216; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 3217; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 3218; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3219; AVX512-NEXT: vpmovqd %zmm1, %ymm1 3220; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3221; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3222; AVX512-NEXT: retq 3223 %1 = and <16 x i64> %a0, %a1 3224 %2 = trunc <16 x i64> %1 to <16 x i8> 3225 ret <16 x i8> %2 3226} 3227 3228define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 3229; SSE-LABEL: trunc_and_v16i32_v16i8: 3230; SSE: # BB#0: 3231; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3232; SSE-NEXT: pand %xmm8, %xmm7 3233; SSE-NEXT: pand %xmm3, %xmm7 3234; SSE-NEXT: pand %xmm8, %xmm6 3235; SSE-NEXT: pand %xmm2, %xmm6 3236; SSE-NEXT: packuswb %xmm7, %xmm6 3237; SSE-NEXT: pand %xmm8, %xmm5 3238; SSE-NEXT: pand %xmm1, %xmm5 3239; SSE-NEXT: pand %xmm8, %xmm4 3240; SSE-NEXT: pand %xmm4, %xmm0 3241; SSE-NEXT: packuswb %xmm5, %xmm0 3242; SSE-NEXT: packuswb %xmm6, %xmm0 3243; SSE-NEXT: retq 3244; 3245; AVX1-LABEL: trunc_and_v16i32_v16i8: 3246; AVX1: # BB#0: 3247; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3248; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 3249; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3250; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3251; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 3252; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 3253; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 3254; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3255; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 3256; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 3257; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3258; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3259; AVX1-NEXT: vzeroupper 3260; AVX1-NEXT: retq 3261; 3262; AVX2-LABEL: trunc_and_v16i32_v16i8: 3263; AVX2: # BB#0: 3264; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3265; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3266; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 3267; 
AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3268; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3269; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3270; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3271; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3272; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3273; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3274; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3275; AVX2-NEXT: vzeroupper 3276; AVX2-NEXT: retq 3277; 3278; AVX512-LABEL: trunc_and_v16i32_v16i8: 3279; AVX512: # BB#0: 3280; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 3281; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3282; AVX512-NEXT: retq 3283 %1 = and <16 x i32> %a0, %a1 3284 %2 = trunc <16 x i32> %1 to <16 x i8> 3285 ret <16 x i8> %2 3286} 3287 3288define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 3289; SSE-LABEL: trunc_and_v16i16_v16i8: 3290; SSE: # BB#0: 3291; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 3292; SSE-NEXT: pand %xmm4, %xmm3 3293; SSE-NEXT: pand %xmm1, %xmm3 3294; SSE-NEXT: pand %xmm4, %xmm2 3295; SSE-NEXT: pand %xmm2, %xmm0 3296; SSE-NEXT: packuswb %xmm3, %xmm0 3297; SSE-NEXT: retq 3298; 3299; AVX1-LABEL: trunc_and_v16i16_v16i8: 3300; AVX1: # BB#0: 3301; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 3302; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3303; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3304; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3305; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3306; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3307; AVX1-NEXT: vzeroupper 3308; AVX1-NEXT: retq 3309; 3310; AVX2-LABEL: trunc_and_v16i16_v16i8: 3311; AVX2: # BB#0: 3312; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 3313; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3314; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3315; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3316; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3317; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3318; AVX2-NEXT: vzeroupper 3319; AVX2-NEXT: retq 3320; 3321; AVX512F-LABEL: trunc_and_v16i16_v16i8: 3322; AVX512F: # BB#0: 3323; AVX512F-NEXT: vandps %ymm1, %ymm0, %ymm0 3324; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 3325; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3326; AVX512F-NEXT: retq 3327; 3328; AVX512BW-LABEL: trunc_and_v16i16_v16i8: 3329; AVX512BW: # BB#0: 3330; AVX512BW-NEXT: vandps %ymm1, %ymm0, %ymm0 3331; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3332; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3333; AVX512BW-NEXT: retq 3334 %1 = and <16 x i16> %a0, %a1 3335 %2 = trunc <16 x i16> %1 to <16 x i8> 3336 ret <16 x i8> %2 3337} 3338 3339; 3340; and to constant 3341; 3342 3343define <4 x i32> @trunc_and_const_v4i64_4i32(<4 x i64> %a0) nounwind { 3344; SSE-LABEL: trunc_and_const_v4i64_4i32: 3345; SSE: # BB#0: 3346; SSE-NEXT: movl $1, %eax 3347; SSE-NEXT: movd %rax, %xmm2 3348; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 3349; SSE-NEXT: pand %xmm0, %xmm2 3350; SSE-NEXT: pand {{.*}}(%rip), %xmm1 3351; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3352; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3353; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3354; SSE-NEXT: retq 3355; 3356; AVX1-LABEL: trunc_and_const_v4i64_4i32: 3357; AVX1: # BB#0: 3358; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3359; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3360; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 3361; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3362; AVX1-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 3363; AVX1-NEXT: vzeroupper 3364; AVX1-NEXT: retq 3365; 3366; AVX2-LABEL: trunc_and_const_v4i64_4i32: 3367; AVX2: # BB#0: 3368; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 3369; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 3370; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 3371; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3372; AVX2-NEXT: vzeroupper 3373; AVX2-NEXT: retq 3374; 3375; AVX512-LABEL: trunc_and_const_v4i64_4i32: 3376; AVX512: # BB#0: 3377; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3378; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3379; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3380; AVX512-NEXT: retq 3381 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 3382 %2 = trunc <4 x i64> %1 to <4 x i32> 3383 ret <4 x i32> %2 3384} 3385 3386define <8 x i16> @trunc_and_const_v16i64_v16i16(<8 x i64> %a0) nounwind { 3387; SSE-LABEL: trunc_and_const_v16i64_v16i16: 3388; SSE: # BB#0: 3389; SSE-NEXT: movdqa %xmm0, %xmm4 3390; SSE-NEXT: movl $1, %eax 3391; SSE-NEXT: movd %rax, %xmm0 3392; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 3393; SSE-NEXT: pand %xmm4, %xmm0 3394; SSE-NEXT: pand {{.*}}(%rip), %xmm2 3395; SSE-NEXT: pand {{.*}}(%rip), %xmm3 3396; SSE-NEXT: pand {{.*}}(%rip), %xmm1 3397; SSE-NEXT: pextrw $4, %xmm1, %eax 3398; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 3399; SSE-NEXT: pextrw $4, %xmm0, %ecx 3400; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 3401; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3402; SSE-NEXT: pextrw $4, %xmm3, %edx 3403; SSE-NEXT: movd %edx, %xmm1 3404; SSE-NEXT: movd %eax, %xmm3 3405; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 3406; SSE-NEXT: movd %ecx, %xmm1 3407; SSE-NEXT: pextrw $4, %xmm2, %eax 3408; SSE-NEXT: movd %eax, %xmm2 3409; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 3410; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 3411; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3412; SSE-NEXT: retq 3413; 3414; AVX1-LABEL: trunc_and_const_v16i64_v16i16: 3415; AVX1: # BB#0: 3416; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3417; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 3418; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3419; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 3420; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3421; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 3422; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 3423; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3424; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3425; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 3426; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3427; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3428; AVX1-NEXT: vzeroupper 3429; AVX1-NEXT: retq 3430; 3431; AVX2-LABEL: trunc_and_const_v16i64_v16i16: 3432; AVX2: # BB#0: 3433; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 3434; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 3435; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 3436; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 3437; AVX2-NEXT: vpshufd {{.*#+}} 
ymm1 = ymm1[0,2,0,2,4,6,4,6] 3438; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 3439; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3440; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 3441; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3442; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3443; AVX2-NEXT: vzeroupper 3444; AVX2-NEXT: retq 3445; 3446; AVX512-LABEL: trunc_and_const_v16i64_v16i16: 3447; AVX512: # BB#0: 3448; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 3449; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3450; AVX512-NEXT: retq 3451 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 3452 %2 = trunc <8 x i64> %1 to <8 x i16> 3453 ret <8 x i16> %2 3454} 3455 3456define <8 x i16> @trunc_and_const_v16i32_v16i16(<8 x i32> %a0) nounwind { 3457; SSE-LABEL: trunc_and_const_v16i32_v16i16: 3458; SSE: # BB#0: 3459; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3460; SSE-NEXT: pand {{.*}}(%rip), %xmm1 3461; SSE-NEXT: pslld $16, %xmm1 3462; SSE-NEXT: psrad $16, %xmm1 3463; SSE-NEXT: pslld $16, %xmm0 3464; SSE-NEXT: psrad $16, %xmm0 3465; SSE-NEXT: packssdw %xmm1, %xmm0 3466; SSE-NEXT: retq 3467; 3468; AVX1-LABEL: trunc_and_const_v16i32_v16i16: 3469; AVX1: # BB#0: 3470; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3471; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3472; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 3473; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3474; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3475; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3476; AVX1-NEXT: vzeroupper 3477; AVX1-NEXT: retq 3478; 3479; AVX2-LABEL: trunc_and_const_v16i32_v16i16: 3480; AVX2: # BB#0: 3481; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 3482; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 3483; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3484; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3485; AVX2-NEXT: vzeroupper 3486; AVX2-NEXT: retq 3487; 3488; AVX512-LABEL: trunc_and_const_v16i32_v16i16: 3489; AVX512: # BB#0: 3490; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3491; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3492; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3493; AVX512-NEXT: retq 3494 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3495 %2 = trunc <8 x i32> %1 to <8 x i16> 3496 ret <8 x i16> %2 3497} 3498 3499define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 3500; SSE-LABEL: trunc_and_const_v16i64_v16i8: 3501; SSE: # BB#0: 3502; SSE-NEXT: movl $1, %eax 3503; SSE-NEXT: movd %rax, %xmm8 3504; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] 3505; SSE-NEXT: pand {{.*}}(%rip), %xmm1 3506; SSE-NEXT: pand {{.*}}(%rip), %xmm2 3507; SSE-NEXT: pand {{.*}}(%rip), %xmm3 3508; SSE-NEXT: pand {{.*}}(%rip), %xmm4 3509; SSE-NEXT: pand {{.*}}(%rip), %xmm5 3510; SSE-NEXT: pand {{.*}}(%rip), %xmm6 3511; SSE-NEXT: pand {{.*}}(%rip), %xmm7 3512; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3513; SSE-NEXT: pand %xmm9, %xmm7 3514; SSE-NEXT: pand %xmm9, %xmm6 3515; SSE-NEXT: packuswb %xmm7, %xmm6 3516; SSE-NEXT: pand %xmm9, %xmm5 3517; SSE-NEXT: pand %xmm9, %xmm4 3518; SSE-NEXT: packuswb %xmm5, %xmm4 3519; SSE-NEXT: packuswb %xmm6, %xmm4 3520; SSE-NEXT: pand %xmm9, %xmm3 3521; 
SSE-NEXT: pand %xmm9, %xmm2 3522; SSE-NEXT: packuswb %xmm3, %xmm2 3523; SSE-NEXT: pand %xmm9, %xmm1 3524; SSE-NEXT: pand %xmm9, %xmm8 3525; SSE-NEXT: pand %xmm8, %xmm0 3526; SSE-NEXT: packuswb %xmm1, %xmm0 3527; SSE-NEXT: packuswb %xmm2, %xmm0 3528; SSE-NEXT: packuswb %xmm4, %xmm0 3529; SSE-NEXT: retq 3530; 3531; AVX1-LABEL: trunc_and_const_v16i64_v16i8: 3532; AVX1: # BB#0: 3533; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3534; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 3535; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 3536; AVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm3 3537; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 3538; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3539; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 3540; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3541; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 3542; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 3543; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 3544; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 3545; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 3546; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 3547; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3548; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3549; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 3550; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 3551; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3552; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3553; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 3554; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 3555; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3556; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3557; AVX1-NEXT: vzeroupper 3558; AVX1-NEXT: retq 3559; 3560; AVX2-LABEL: trunc_and_const_v16i64_v16i8: 3561; AVX2: # BB#0: 3562; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 3563; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 3564; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 3565; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 3566; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] 3567; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] 3568; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] 3569; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] 3570; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3571; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 3572; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3573; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3574; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3575; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 3576; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 3577; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 3578; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 3579; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 3580; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3581; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3582; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3583; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 3584; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3585; AVX2-NEXT: vzeroupper 3586; AVX2-NEXT: retq 3587; 3588; AVX512-LABEL: trunc_and_const_v16i64_v16i8: 3589; AVX512: # BB#0: 3590; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 3591; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 3592; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3593; AVX512-NEXT: vpmovqd %zmm1, %ymm1 3594; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3595; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3596; AVX512-NEXT: retq 3597 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 
12, i64 13, i64 14, i64 15> 3598 %2 = trunc <16 x i64> %1 to <16 x i8> 3599 ret <16 x i8> %2 3600} 3601 3602define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 3603; SSE-LABEL: trunc_and_const_v16i32_v16i8: 3604; SSE: # BB#0: 3605; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3606; SSE-NEXT: pand {{.*}}(%rip), %xmm1 3607; SSE-NEXT: pand {{.*}}(%rip), %xmm2 3608; SSE-NEXT: pand {{.*}}(%rip), %xmm3 3609; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3610; SSE-NEXT: pand %xmm4, %xmm3 3611; SSE-NEXT: pand %xmm4, %xmm2 3612; SSE-NEXT: packuswb %xmm3, %xmm2 3613; SSE-NEXT: pand %xmm4, %xmm1 3614; SSE-NEXT: pand %xmm4, %xmm0 3615; SSE-NEXT: packuswb %xmm1, %xmm0 3616; SSE-NEXT: packuswb %xmm2, %xmm0 3617; SSE-NEXT: retq 3618; 3619; AVX1-LABEL: trunc_and_const_v16i32_v16i8: 3620; AVX1: # BB#0: 3621; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3622; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 3623; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3624; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3625; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 3626; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 3627; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 3628; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3629; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 3630; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 3631; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3632; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3633; AVX1-NEXT: vzeroupper 3634; AVX1-NEXT: retq 3635; 3636; AVX2-LABEL: trunc_and_const_v16i32_v16i8: 3637; AVX2: # BB#0: 3638; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 3639; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 3640; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 3641; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3642; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3643; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3644; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3645; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3646; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3647; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3648; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3649; AVX2-NEXT: vzeroupper 3650; AVX2-NEXT: retq 3651; 3652; AVX512-LABEL: trunc_and_const_v16i32_v16i8: 3653; AVX512: # BB#0: 3654; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 3655; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3656; AVX512-NEXT: retq 3657 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3658 %2 = trunc <16 x i32> %1 to <16 x i8> 3659 ret <16 x i8> %2 3660} 3661 3662define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3663; SSE-LABEL: trunc_and_const_v16i16_v16i8: 3664; SSE: # BB#0: 3665; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3666; SSE-NEXT: pand {{.*}}(%rip), %xmm1 3667; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 3668; SSE-NEXT: pand %xmm2, %xmm1 3669; SSE-NEXT: pand %xmm2, %xmm0 3670; SSE-NEXT: packuswb %xmm1, %xmm0 3671; SSE-NEXT: retq 3672; 3673; AVX1-LABEL: trunc_and_const_v16i16_v16i8: 3674; AVX1: # BB#0: 3675; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3676; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3677; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3678; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3679; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3680; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3681; AVX1-NEXT: vzeroupper 
3682; AVX1-NEXT: retq 3683; 3684; AVX2-LABEL: trunc_and_const_v16i16_v16i8: 3685; AVX2: # BB#0: 3686; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 3687; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3688; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3689; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3690; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3691; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3692; AVX2-NEXT: vzeroupper 3693; AVX2-NEXT: retq 3694; 3695; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: 3696; AVX512F: # BB#0: 3697; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3698; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 3699; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3700; AVX512F-NEXT: retq 3701; 3702; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: 3703; AVX512BW: # BB#0: 3704; AVX512BW-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 3705; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3706; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3707; AVX512BW-NEXT: retq 3708 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3709 %2 = trunc <16 x i16> %1 to <16 x i8> 3710 ret <16 x i8> %2 3711} 3712 3713; 3714; xor 3715; 3716 3717define <4 x i32> @trunc_xor_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3718; SSE-LABEL: trunc_xor_v4i64_4i32: 3719; SSE: # BB#0: 3720; SSE-NEXT: pxor %xmm2, %xmm0 3721; SSE-NEXT: pxor %xmm3, %xmm1 3722; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3723; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3724; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3725; SSE-NEXT: retq 3726; 3727; AVX1-LABEL: trunc_xor_v4i64_4i32: 3728; AVX1: # BB#0: 3729; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3730; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3731; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 3732; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3733; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 3734; AVX1-NEXT: vzeroupper 3735; AVX1-NEXT: retq 3736; 3737; AVX2-LABEL: trunc_xor_v4i64_4i32: 3738; AVX2: # BB#0: 3739; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3740; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 3741; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 3742; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3743; AVX2-NEXT: vzeroupper 3744; AVX2-NEXT: retq 3745; 3746; AVX512-LABEL: trunc_xor_v4i64_4i32: 3747; AVX512: # BB#0: 3748; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0 3749; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3750; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3751; AVX512-NEXT: retq 3752 %1 = xor <4 x i64> %a0, %a1 3753 %2 = trunc <4 x i64> %1 to <4 x i32> 3754 ret <4 x i32> %2 3755} 3756 3757define <8 x i16> @trunc_xor_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 3758; SSE-LABEL: trunc_xor_v8i64_8i16: 3759; SSE: # BB#0: 3760; SSE-NEXT: pxor %xmm6, %xmm2 3761; SSE-NEXT: pxor %xmm4, %xmm0 3762; SSE-NEXT: pxor %xmm7, %xmm3 3763; SSE-NEXT: pxor %xmm5, %xmm1 3764; SSE-NEXT: pextrw $4, %xmm1, %eax 3765; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 3766; SSE-NEXT: pextrw $4, %xmm0, %ecx 3767; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 3768; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3769; SSE-NEXT: pextrw $4, %xmm3, %edx 3770; SSE-NEXT: movd %edx, %xmm1 3771; SSE-NEXT: movd %eax, %xmm3 3772; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 3773; SSE-NEXT: pextrw $4, %xmm2, %eax 3774; SSE-NEXT: movd %eax, %xmm1 3775; SSE-NEXT: movd %ecx, %xmm2 3776; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3777; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 3778; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 3779; SSE-NEXT: retq 3780; 3781; AVX1-LABEL: trunc_xor_v8i64_8i16: 3782; AVX1: # BB#0: 3783; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3784; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3785; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3786; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 3787; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3788; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 3789; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 3790; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3791; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3792; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 3793; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3794; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3795; AVX1-NEXT: vzeroupper 3796; AVX1-NEXT: retq 3797; 3798; AVX2-LABEL: trunc_xor_v8i64_8i16: 3799; AVX2: # BB#0: 3800; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 3801; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 3802; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 3803; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 3804; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 3805; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 3806; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3807; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 3808; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3809; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3810; AVX2-NEXT: vzeroupper 3811; AVX2-NEXT: retq 3812; 3813; AVX512-LABEL: trunc_xor_v8i64_8i16: 3814; AVX512: # BB#0: 3815; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 3816; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3817; AVX512-NEXT: retq 3818 %1 = xor <8 x i64> %a0, %a1 3819 %2 = trunc <8 x i64> %1 to <8 x i16> 3820 ret <8 x i16> %2 3821} 3822 3823define <8 x i16> @trunc_xor_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 3824; SSE-LABEL: trunc_xor_v8i32_8i16: 3825; SSE: # BB#0: 3826; SSE-NEXT: pxor %xmm2, %xmm0 3827; SSE-NEXT: pxor %xmm3, %xmm1 3828; SSE-NEXT: pslld $16, %xmm1 3829; SSE-NEXT: psrad $16, %xmm1 3830; SSE-NEXT: pslld $16, %xmm0 3831; SSE-NEXT: psrad $16, %xmm0 3832; SSE-NEXT: packssdw %xmm1, %xmm0 3833; SSE-NEXT: retq 3834; 3835; AVX1-LABEL: trunc_xor_v8i32_8i16: 3836; AVX1: # BB#0: 3837; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3838; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3839; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 3840; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3841; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3842; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3843; AVX1-NEXT: vzeroupper 3844; AVX1-NEXT: retq 3845; 3846; AVX2-LABEL: trunc_xor_v8i32_8i16: 3847; AVX2: # BB#0: 3848; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3849; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 3850; AVX2-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,2,2,3] 3851; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3852; AVX2-NEXT: vzeroupper 3853; AVX2-NEXT: retq 3854; 3855; AVX512-LABEL: trunc_xor_v8i32_8i16: 3856; AVX512: # BB#0: 3857; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0 3858; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3859; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3860; AVX512-NEXT: retq 3861 %1 = xor <8 x i32> %a0, %a1 3862 %2 = trunc <8 x i32> %1 to <8 x i16> 3863 ret <8 x i16> %2 3864} 3865 3866define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 3867; SSE-LABEL: trunc_xor_v16i64_v16i8: 3868; SSE: # BB#0: 3869; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0 3870; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1 3871; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2 3872; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3 3873; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4 3874; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5 3875; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6 3876; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7 3877; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3878; SSE-NEXT: pand %xmm8, %xmm7 3879; SSE-NEXT: pand %xmm8, %xmm6 3880; SSE-NEXT: packuswb %xmm7, %xmm6 3881; SSE-NEXT: pand %xmm8, %xmm5 3882; SSE-NEXT: pand %xmm8, %xmm4 3883; SSE-NEXT: packuswb %xmm5, %xmm4 3884; SSE-NEXT: packuswb %xmm6, %xmm4 3885; SSE-NEXT: pand %xmm8, %xmm3 3886; SSE-NEXT: pand %xmm8, %xmm2 3887; SSE-NEXT: packuswb %xmm3, %xmm2 3888; SSE-NEXT: pand %xmm8, %xmm1 3889; SSE-NEXT: pand %xmm8, %xmm0 3890; SSE-NEXT: packuswb %xmm1, %xmm0 3891; SSE-NEXT: packuswb %xmm2, %xmm0 3892; SSE-NEXT: packuswb %xmm4, %xmm0 3893; SSE-NEXT: retq 3894; 3895; AVX1-LABEL: trunc_xor_v16i64_v16i8: 3896; AVX1: # BB#0: 3897; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 3898; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 3899; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 3900; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 3901; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 3902; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3903; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 3904; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3905; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 3906; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 3907; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 3908; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 3909; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 3910; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 3911; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3912; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3913; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 3914; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 3915; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3916; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3917; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 3918; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 3919; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3920; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3921; AVX1-NEXT: vzeroupper 3922; AVX1-NEXT: retq 3923; 3924; AVX2-LABEL: trunc_xor_v16i64_v16i8: 3925; AVX2: # BB#0: 3926; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1 3927; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 3928; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3 3929; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 3930; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] 3931; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] 3932; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] 3933; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] 3934; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3935; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 3936; AVX2-NEXT: vpshufb 
%ymm3, %ymm2, %ymm2 3937; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3938; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3939; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 3940; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 3941; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 3942; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 3943; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 3944; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3945; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3946; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3947; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 3948; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3949; AVX2-NEXT: vzeroupper 3950; AVX2-NEXT: retq 3951; 3952; AVX512-LABEL: trunc_xor_v16i64_v16i8: 3953; AVX512: # BB#0: 3954; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 3955; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 3956; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3957; AVX512-NEXT: vpmovqd %zmm1, %ymm1 3958; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3959; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3960; AVX512-NEXT: retq 3961 %1 = xor <16 x i64> %a0, %a1 3962 %2 = trunc <16 x i64> %1 to <16 x i8> 3963 ret <16 x i8> %2 3964} 3965 3966define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 3967; SSE-LABEL: trunc_xor_v16i32_v16i8: 3968; SSE: # BB#0: 3969; SSE-NEXT: pxor %xmm4, %xmm0 3970; SSE-NEXT: pxor %xmm5, %xmm1 3971; SSE-NEXT: pxor %xmm6, %xmm2 3972; SSE-NEXT: pxor %xmm7, %xmm3 3973; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3974; SSE-NEXT: pand %xmm4, %xmm3 3975; SSE-NEXT: pand %xmm4, %xmm2 3976; SSE-NEXT: packuswb %xmm3, %xmm2 3977; SSE-NEXT: pand %xmm4, %xmm1 3978; SSE-NEXT: pand %xmm4, %xmm0 3979; SSE-NEXT: packuswb %xmm1, %xmm0 3980; SSE-NEXT: packuswb %xmm2, %xmm0 3981; SSE-NEXT: retq 3982; 3983; AVX1-LABEL: trunc_xor_v16i32_v16i8: 3984; AVX1: # BB#0: 3985; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3986; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3987; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3988; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3989; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 3990; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 3991; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 3992; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3993; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 3994; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 3995; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3996; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3997; AVX1-NEXT: vzeroupper 3998; AVX1-NEXT: retq 3999; 4000; AVX2-LABEL: trunc_xor_v16i32_v16i8: 4001; AVX2: # BB#0: 4002; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 4003; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 4004; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 4005; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 4006; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 4007; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4008; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 4009; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 4010; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4011; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 4012; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4013; AVX2-NEXT: vzeroupper 4014; AVX2-NEXT: retq 4015; 4016; AVX512-LABEL: trunc_xor_v16i32_v16i8: 4017; AVX512: # BB#0: 4018; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 4019; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4020; AVX512-NEXT: retq 4021 %1 = xor <16 x i32> %a0, %a1 4022 %2 = trunc 
<16 x i32> %1 to <16 x i8> 4023 ret <16 x i8> %2 4024} 4025 4026define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 4027; SSE-LABEL: trunc_xor_v16i16_v16i8: 4028; SSE: # BB#0: 4029; SSE-NEXT: pxor %xmm2, %xmm0 4030; SSE-NEXT: pxor %xmm3, %xmm1 4031; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 4032; SSE-NEXT: pand %xmm2, %xmm1 4033; SSE-NEXT: pand %xmm2, %xmm0 4034; SSE-NEXT: packuswb %xmm1, %xmm0 4035; SSE-NEXT: retq 4036; 4037; AVX1-LABEL: trunc_xor_v16i16_v16i8: 4038; AVX1: # BB#0: 4039; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 4040; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4041; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4042; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4043; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4044; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4045; AVX1-NEXT: vzeroupper 4046; AVX1-NEXT: retq 4047; 4048; AVX2-LABEL: trunc_xor_v16i16_v16i8: 4049; AVX2: # BB#0: 4050; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 4051; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4052; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4053; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4054; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4055; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4056; AVX2-NEXT: vzeroupper 4057; AVX2-NEXT: retq 4058; 4059; AVX512F-LABEL: trunc_xor_v16i16_v16i8: 4060; AVX512F: # BB#0: 4061; AVX512F-NEXT: vxorps %ymm1, %ymm0, %ymm0 4062; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 4063; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 4064; AVX512F-NEXT: retq 4065; 4066; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: 4067; AVX512BW: # BB#0: 4068; AVX512BW-NEXT: vxorps %ymm1, %ymm0, %ymm0 4069; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 4070; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4071; AVX512BW-NEXT: retq 4072 %1 = xor <16 x i16> %a0, %a1 4073 %2 = trunc <16 x i16> %1 to <16 x i8> 4074 ret <16 x i8> %2 4075} 4076 4077; 4078; xor to constant 4079; 4080 4081define <4 x i32> @trunc_xor_const_v4i64_4i32(<4 x i64> %a0) nounwind { 4082; SSE-LABEL: trunc_xor_const_v4i64_4i32: 4083; SSE: # BB#0: 4084; SSE-NEXT: movl $1, %eax 4085; SSE-NEXT: movd %rax, %xmm2 4086; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 4087; SSE-NEXT: pxor %xmm0, %xmm2 4088; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 4089; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 4090; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 4091; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4092; SSE-NEXT: retq 4093; 4094; AVX1-LABEL: trunc_xor_const_v4i64_4i32: 4095; AVX1: # BB#0: 4096; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 4097; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4098; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 4099; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4100; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 4101; AVX1-NEXT: vzeroupper 4102; AVX1-NEXT: retq 4103; 4104; AVX2-LABEL: trunc_xor_const_v4i64_4i32: 4105; AVX2: # BB#0: 4106; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 4107; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 4108; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 4109; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4110; AVX2-NEXT: vzeroupper 4111; AVX2-NEXT: retq 4112; 4113; AVX512-LABEL: trunc_xor_const_v4i64_4i32: 4114; AVX512: # BB#0: 4115; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 4116; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4117; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4118; AVX512-NEXT: retq 
  %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_xor_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i64_v16i16:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movd %rax, %xmm0
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
; SSE-NEXT: pextrw $4, %xmm1, %eax
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: pextrw $4, %xmm0, %ecx
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pextrw $4, %xmm3, %edx
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: movd %ecx, %xmm1
; SSE-NEXT: pextrw $4, %xmm2, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i64_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i64_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v16i64_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
  %1 = xor <8 x i64> %a0, <i64 0,
i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 4190 %2 = trunc <8 x i64> %1 to <8 x i16> 4191 ret <8 x i16> %2 4192} 4193 4194define <8 x i16> @trunc_xor_const_v16i32_v16i16(<8 x i32> %a0) nounwind { 4195; SSE-LABEL: trunc_xor_const_v16i32_v16i16: 4196; SSE: # BB#0: 4197; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 4198; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 4199; SSE-NEXT: pslld $16, %xmm1 4200; SSE-NEXT: psrad $16, %xmm1 4201; SSE-NEXT: pslld $16, %xmm0 4202; SSE-NEXT: psrad $16, %xmm0 4203; SSE-NEXT: packssdw %xmm1, %xmm0 4204; SSE-NEXT: retq 4205; 4206; AVX1-LABEL: trunc_xor_const_v16i32_v16i16: 4207; AVX1: # BB#0: 4208; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 4209; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4210; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 4211; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4212; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4213; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4214; AVX1-NEXT: vzeroupper 4215; AVX1-NEXT: retq 4216; 4217; AVX2-LABEL: trunc_xor_const_v16i32_v16i16: 4218; AVX2: # BB#0: 4219; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 4220; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 4221; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4222; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4223; AVX2-NEXT: vzeroupper 4224; AVX2-NEXT: retq 4225; 4226; AVX512-LABEL: trunc_xor_const_v16i32_v16i16: 4227; AVX512: # BB#0: 4228; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 4229; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4230; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4231; AVX512-NEXT: retq 4232 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4233 %2 = trunc <8 x i32> %1 to <8 x i16> 4234 ret <8 x i16> %2 4235} 4236 4237define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 4238; SSE-LABEL: trunc_xor_const_v16i64_v16i8: 4239; SSE: # BB#0: 4240; SSE-NEXT: movl $1, %eax 4241; SSE-NEXT: movd %rax, %xmm8 4242; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] 4243; SSE-NEXT: pxor %xmm8, %xmm0 4244; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 4245; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 4246; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 4247; SSE-NEXT: pxor {{.*}}(%rip), %xmm4 4248; SSE-NEXT: pxor {{.*}}(%rip), %xmm5 4249; SSE-NEXT: pxor {{.*}}(%rip), %xmm6 4250; SSE-NEXT: pxor {{.*}}(%rip), %xmm7 4251; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4252; SSE-NEXT: pand %xmm8, %xmm7 4253; SSE-NEXT: pand %xmm8, %xmm6 4254; SSE-NEXT: packuswb %xmm7, %xmm6 4255; SSE-NEXT: pand %xmm8, %xmm5 4256; SSE-NEXT: pand %xmm8, %xmm4 4257; SSE-NEXT: packuswb %xmm5, %xmm4 4258; SSE-NEXT: packuswb %xmm6, %xmm4 4259; SSE-NEXT: pand %xmm8, %xmm3 4260; SSE-NEXT: pand %xmm8, %xmm2 4261; SSE-NEXT: packuswb %xmm3, %xmm2 4262; SSE-NEXT: pand %xmm8, %xmm1 4263; SSE-NEXT: pand %xmm8, %xmm0 4264; SSE-NEXT: packuswb %xmm1, %xmm0 4265; SSE-NEXT: packuswb %xmm2, %xmm0 4266; SSE-NEXT: packuswb %xmm4, %xmm0 4267; SSE-NEXT: retq 4268; 4269; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: 4270; AVX1: # BB#0: 4271; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 4272; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 4273; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm2, %ymm2 4274; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm3, %ymm3 4275; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 4276; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 
4277; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 4278; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4279; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 4280; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 4281; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 4282; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 4283; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 4284; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 4285; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4286; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4287; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 4288; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 4289; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 4290; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4291; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 4292; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 4293; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4294; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 4295; AVX1-NEXT: vzeroupper 4296; AVX1-NEXT: retq 4297; 4298; AVX2-LABEL: trunc_xor_const_v16i64_v16i8: 4299; AVX2: # BB#0: 4300; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 4301; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 4302; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm3, %ymm3 4303; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm2, %ymm2 4304; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] 4305; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] 4306; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] 4307; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] 4308; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4309; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 4310; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 4311; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4312; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4313; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 4314; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 4315; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 4316; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 4317; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 4318; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4319; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4320; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4321; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 4322; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 4323; AVX2-NEXT: vzeroupper 4324; AVX2-NEXT: retq 4325; 4326; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: 4327; AVX512: # BB#0: 4328; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 4329; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 4330; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4331; AVX512-NEXT: vpmovqd %zmm1, %ymm1 4332; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4333; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4334; AVX512-NEXT: retq 4335 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 4336 %2 = trunc <16 x i64> %1 to <16 x i8> 4337 ret <16 x i8> %2 4338} 4339 4340define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 4341; SSE-LABEL: trunc_xor_const_v16i32_v16i8: 4342; SSE: # BB#0: 4343; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 4344; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 4345; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 4346; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 4347; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4348; SSE-NEXT: pand %xmm4, %xmm3 4349; SSE-NEXT: pand %xmm4, %xmm2 4350; SSE-NEXT: packuswb %xmm3, %xmm2 4351; SSE-NEXT: pand %xmm4, %xmm1 4352; SSE-NEXT: pand %xmm4, %xmm0 4353; SSE-NEXT: packuswb %xmm1, %xmm0 
4354; SSE-NEXT: packuswb %xmm2, %xmm0 4355; SSE-NEXT: retq 4356; 4357; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: 4358; AVX1: # BB#0: 4359; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 4360; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 4361; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 4362; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4363; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 4364; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 4365; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 4366; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4367; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 4368; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 4369; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 4370; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4371; AVX1-NEXT: vzeroupper 4372; AVX1-NEXT: retq 4373; 4374; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: 4375; AVX2: # BB#0: 4376; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 4377; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 4378; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 4379; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 4380; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 4381; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4382; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 4383; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 4384; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4385; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 4386; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4387; AVX2-NEXT: vzeroupper 4388; AVX2-NEXT: retq 4389; 4390; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: 4391; AVX512: # BB#0: 4392; AVX512-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0 4393; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4394; AVX512-NEXT: retq 4395 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 4396 %2 = trunc <16 x i32> %1 to <16 x i8> 4397 ret <16 x i8> %2 4398} 4399 4400define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 4401; SSE-LABEL: trunc_xor_const_v16i16_v16i8: 4402; SSE: # BB#0: 4403; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 4404; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 4405; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 4406; SSE-NEXT: pand %xmm2, %xmm1 4407; SSE-NEXT: pand %xmm2, %xmm0 4408; SSE-NEXT: packuswb %xmm1, %xmm0 4409; SSE-NEXT: retq 4410; 4411; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: 4412; AVX1: # BB#0: 4413; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 4414; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4415; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4416; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4417; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4418; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4419; AVX1-NEXT: vzeroupper 4420; AVX1-NEXT: retq 4421; 4422; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: 4423; AVX2: # BB#0: 4424; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 4425; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4426; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4427; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4428; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4429; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4430; AVX2-NEXT: vzeroupper 4431; AVX2-NEXT: retq 4432; 4433; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: 4434; AVX512F: # BB#0: 4435; AVX512F-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 4436; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 4437; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 4438; 
AVX512F-NEXT: retq 4439; 4440; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: 4441; AVX512BW: # BB#0: 4442; AVX512BW-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 4443; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 4444; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4445; AVX512BW-NEXT: retq 4446 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 4447 %2 = trunc <16 x i16> %1 to <16 x i8> 4448 ret <16 x i8> %2 4449} 4450 4451; 4452; or 4453; 4454 4455define <4 x i32> @trunc_or_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 4456; SSE-LABEL: trunc_or_v4i64_4i32: 4457; SSE: # BB#0: 4458; SSE-NEXT: por %xmm2, %xmm0 4459; SSE-NEXT: por %xmm3, %xmm1 4460; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 4461; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4462; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4463; SSE-NEXT: retq 4464; 4465; AVX1-LABEL: trunc_or_v4i64_4i32: 4466; AVX1: # BB#0: 4467; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 4468; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4469; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 4470; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4471; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 4472; AVX1-NEXT: vzeroupper 4473; AVX1-NEXT: retq 4474; 4475; AVX2-LABEL: trunc_or_v4i64_4i32: 4476; AVX2: # BB#0: 4477; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 4478; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 4479; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 4480; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4481; AVX2-NEXT: vzeroupper 4482; AVX2-NEXT: retq 4483; 4484; AVX512-LABEL: trunc_or_v4i64_4i32: 4485; AVX512: # BB#0: 4486; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0 4487; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4488; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4489; AVX512-NEXT: retq 4490 %1 = or <4 x i64> %a0, %a1 4491 %2 = trunc <4 x i64> %1 to <4 x i32> 4492 ret <4 x i32> %2 4493} 4494 4495define <8 x i16> @trunc_or_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 4496; SSE-LABEL: trunc_or_v8i64_8i16: 4497; SSE: # BB#0: 4498; SSE-NEXT: por %xmm6, %xmm2 4499; SSE-NEXT: por %xmm4, %xmm0 4500; SSE-NEXT: por %xmm7, %xmm3 4501; SSE-NEXT: por %xmm5, %xmm1 4502; SSE-NEXT: pextrw $4, %xmm1, %eax 4503; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 4504; SSE-NEXT: pextrw $4, %xmm0, %ecx 4505; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 4506; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 4507; SSE-NEXT: pextrw $4, %xmm3, %edx 4508; SSE-NEXT: movd %edx, %xmm1 4509; SSE-NEXT: movd %eax, %xmm3 4510; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 4511; SSE-NEXT: pextrw $4, %xmm2, %eax 4512; SSE-NEXT: movd %eax, %xmm1 4513; SSE-NEXT: movd %ecx, %xmm2 4514; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 4515; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 4516; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 4517; SSE-NEXT: retq 4518; 4519; AVX1-LABEL: trunc_or_v8i64_8i16: 4520; AVX1: # BB#0: 4521; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 4522; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 4523; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 4524; AVX1-NEXT: vxorps %xmm3, 
%xmm3, %xmm3 4525; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 4526; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 4527; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 4528; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4529; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 4530; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 4531; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4532; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4533; AVX1-NEXT: vzeroupper 4534; AVX1-NEXT: retq 4535; 4536; AVX2-LABEL: trunc_or_v8i64_8i16: 4537; AVX2: # BB#0: 4538; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 4539; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 4540; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 4541; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 4542; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 4543; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 4544; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4545; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 4546; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4547; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4548; AVX2-NEXT: vzeroupper 4549; AVX2-NEXT: retq 4550; 4551; AVX512-LABEL: trunc_or_v8i64_8i16: 4552; AVX512: # BB#0: 4553; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 4554; AVX512-NEXT: vpmovqw %zmm0, %xmm0 4555; AVX512-NEXT: retq 4556 %1 = or <8 x i64> %a0, %a1 4557 %2 = trunc <8 x i64> %1 to <8 x i16> 4558 ret <8 x i16> %2 4559} 4560 4561define <8 x i16> @trunc_or_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 4562; SSE-LABEL: trunc_or_v8i32_8i16: 4563; SSE: # BB#0: 4564; SSE-NEXT: por %xmm2, %xmm0 4565; SSE-NEXT: por %xmm3, %xmm1 4566; SSE-NEXT: pslld $16, %xmm1 4567; SSE-NEXT: psrad $16, %xmm1 4568; SSE-NEXT: pslld $16, %xmm0 4569; SSE-NEXT: psrad $16, %xmm0 4570; SSE-NEXT: packssdw %xmm1, %xmm0 4571; SSE-NEXT: retq 4572; 4573; AVX1-LABEL: trunc_or_v8i32_8i16: 4574; AVX1: # BB#0: 4575; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 4576; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4577; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 4578; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4579; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4580; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4581; AVX1-NEXT: vzeroupper 4582; AVX1-NEXT: retq 4583; 4584; AVX2-LABEL: trunc_or_v8i32_8i16: 4585; AVX2: # BB#0: 4586; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 4587; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 4588; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4589; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4590; AVX2-NEXT: vzeroupper 4591; AVX2-NEXT: retq 4592; 4593; AVX512-LABEL: trunc_or_v8i32_8i16: 4594; AVX512: # BB#0: 4595; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0 4596; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4597; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4598; AVX512-NEXT: retq 4599 %1 = or <8 x i32> %a0, %a1 4600 %2 = trunc <8 x i32> %1 to <8 x i16> 4601 ret <8 x i16> %2 4602} 4603 4604define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 4605; SSE-LABEL: trunc_or_v16i64_v16i8: 4606; SSE: # BB#0: 4607; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0 4608; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1 4609; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2 4610; 
SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3 4611; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4 4612; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5 4613; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6 4614; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7 4615; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4616; SSE-NEXT: pand %xmm8, %xmm7 4617; SSE-NEXT: pand %xmm8, %xmm6 4618; SSE-NEXT: packuswb %xmm7, %xmm6 4619; SSE-NEXT: pand %xmm8, %xmm5 4620; SSE-NEXT: pand %xmm8, %xmm4 4621; SSE-NEXT: packuswb %xmm5, %xmm4 4622; SSE-NEXT: packuswb %xmm6, %xmm4 4623; SSE-NEXT: pand %xmm8, %xmm3 4624; SSE-NEXT: pand %xmm8, %xmm2 4625; SSE-NEXT: packuswb %xmm3, %xmm2 4626; SSE-NEXT: pand %xmm8, %xmm1 4627; SSE-NEXT: pand %xmm8, %xmm0 4628; SSE-NEXT: packuswb %xmm1, %xmm0 4629; SSE-NEXT: packuswb %xmm2, %xmm0 4630; SSE-NEXT: packuswb %xmm4, %xmm0 4631; SSE-NEXT: retq 4632; 4633; AVX1-LABEL: trunc_or_v16i64_v16i8: 4634; AVX1: # BB#0: 4635; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 4636; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 4637; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 4638; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 4639; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 4640; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4641; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 4642; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4643; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 4644; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 4645; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 4646; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 4647; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 4648; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 4649; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4650; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4651; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 4652; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 4653; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 4654; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4655; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 4656; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 4657; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4658; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 4659; AVX1-NEXT: vzeroupper 4660; AVX1-NEXT: retq 4661; 4662; AVX2-LABEL: trunc_or_v16i64_v16i8: 4663; AVX2: # BB#0: 4664; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1 4665; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0 4666; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3 4667; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2 4668; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] 4669; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] 4670; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] 4671; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] 4672; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4673; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 4674; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 4675; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4676; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4677; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 4678; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 4679; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 4680; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 4681; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 4682; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4683; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4684; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4685; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 4686; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 4687; AVX2-NEXT: vzeroupper 4688; AVX2-NEXT: retq 4689; 4690; AVX512-LABEL: 
trunc_or_v16i64_v16i8: 4691; AVX512: # BB#0: 4692; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 4693; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0 4694; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4695; AVX512-NEXT: vpmovqd %zmm1, %ymm1 4696; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4697; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4698; AVX512-NEXT: retq 4699 %1 = or <16 x i64> %a0, %a1 4700 %2 = trunc <16 x i64> %1 to <16 x i8> 4701 ret <16 x i8> %2 4702} 4703 4704define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 4705; SSE-LABEL: trunc_or_v16i32_v16i8: 4706; SSE: # BB#0: 4707; SSE-NEXT: por %xmm4, %xmm0 4708; SSE-NEXT: por %xmm5, %xmm1 4709; SSE-NEXT: por %xmm6, %xmm2 4710; SSE-NEXT: por %xmm7, %xmm3 4711; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4712; SSE-NEXT: pand %xmm4, %xmm3 4713; SSE-NEXT: pand %xmm4, %xmm2 4714; SSE-NEXT: packuswb %xmm3, %xmm2 4715; SSE-NEXT: pand %xmm4, %xmm1 4716; SSE-NEXT: pand %xmm4, %xmm0 4717; SSE-NEXT: packuswb %xmm1, %xmm0 4718; SSE-NEXT: packuswb %xmm2, %xmm0 4719; SSE-NEXT: retq 4720; 4721; AVX1-LABEL: trunc_or_v16i32_v16i8: 4722; AVX1: # BB#0: 4723; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 4724; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 4725; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 4726; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4727; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 4728; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 4729; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 4730; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4731; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 4732; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 4733; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 4734; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4735; AVX1-NEXT: vzeroupper 4736; AVX1-NEXT: retq 4737; 4738; AVX2-LABEL: trunc_or_v16i32_v16i8: 4739; AVX2: # BB#0: 4740; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 4741; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 4742; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 4743; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 4744; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 4745; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4746; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 4747; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 4748; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4749; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 4750; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4751; AVX2-NEXT: vzeroupper 4752; AVX2-NEXT: retq 4753; 4754; AVX512-LABEL: trunc_or_v16i32_v16i8: 4755; AVX512: # BB#0: 4756; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 4757; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4758; AVX512-NEXT: retq 4759 %1 = or <16 x i32> %a0, %a1 4760 %2 = trunc <16 x i32> %1 to <16 x i8> 4761 ret <16 x i8> %2 4762} 4763 4764define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 4765; SSE-LABEL: trunc_or_v16i16_v16i8: 4766; SSE: # BB#0: 4767; SSE-NEXT: por %xmm2, %xmm0 4768; SSE-NEXT: por %xmm3, %xmm1 4769; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 4770; SSE-NEXT: pand %xmm2, %xmm1 4771; SSE-NEXT: pand %xmm2, %xmm0 4772; SSE-NEXT: packuswb %xmm1, %xmm0 4773; SSE-NEXT: retq 4774; 4775; AVX1-LABEL: trunc_or_v16i16_v16i8: 4776; AVX1: # BB#0: 4777; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 4778; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4779; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4780; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4781; AVX1-NEXT: vpshufb %xmm2, 
%xmm0, %xmm0 4782; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4783; AVX1-NEXT: vzeroupper 4784; AVX1-NEXT: retq 4785; 4786; AVX2-LABEL: trunc_or_v16i16_v16i8: 4787; AVX2: # BB#0: 4788; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 4789; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4790; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4791; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4792; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4793; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4794; AVX2-NEXT: vzeroupper 4795; AVX2-NEXT: retq 4796; 4797; AVX512F-LABEL: trunc_or_v16i16_v16i8: 4798; AVX512F: # BB#0: 4799; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0 4800; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 4801; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 4802; AVX512F-NEXT: retq 4803; 4804; AVX512BW-LABEL: trunc_or_v16i16_v16i8: 4805; AVX512BW: # BB#0: 4806; AVX512BW-NEXT: vorps %ymm1, %ymm0, %ymm0 4807; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 4808; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4809; AVX512BW-NEXT: retq 4810 %1 = or <16 x i16> %a0, %a1 4811 %2 = trunc <16 x i16> %1 to <16 x i8> 4812 ret <16 x i8> %2 4813} 4814 4815; 4816; or to constant 4817; 4818 4819define <4 x i32> @trunc_or_const_v4i64_4i32(<4 x i64> %a0) nounwind { 4820; SSE-LABEL: trunc_or_const_v4i64_4i32: 4821; SSE: # BB#0: 4822; SSE-NEXT: movl $1, %eax 4823; SSE-NEXT: movd %rax, %xmm2 4824; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 4825; SSE-NEXT: por %xmm0, %xmm2 4826; SSE-NEXT: por {{.*}}(%rip), %xmm1 4827; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 4828; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 4829; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4830; SSE-NEXT: retq 4831; 4832; AVX1-LABEL: trunc_or_const_v4i64_4i32: 4833; AVX1: # BB#0: 4834; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 4835; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4836; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 4837; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4838; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 4839; AVX1-NEXT: vzeroupper 4840; AVX1-NEXT: retq 4841; 4842; AVX2-LABEL: trunc_or_const_v4i64_4i32: 4843; AVX2: # BB#0: 4844; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 4845; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 4846; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 4847; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4848; AVX2-NEXT: vzeroupper 4849; AVX2-NEXT: retq 4850; 4851; AVX512-LABEL: trunc_or_const_v4i64_4i32: 4852; AVX512: # BB#0: 4853; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 4854; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4855; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4856; AVX512-NEXT: retq 4857 %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 4858 %2 = trunc <4 x i64> %1 to <4 x i32> 4859 ret <4 x i32> %2 4860} 4861 4862define <8 x i16> @trunc_or_const_v16i64_v16i16(<8 x i64> %a0) nounwind { 4863; SSE-LABEL: trunc_or_const_v16i64_v16i16: 4864; SSE: # BB#0: 4865; SSE-NEXT: movdqa %xmm0, %xmm4 4866; SSE-NEXT: movl $1, %eax 4867; SSE-NEXT: movd %rax, %xmm0 4868; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] 4869; SSE-NEXT: por %xmm4, %xmm0 4870; SSE-NEXT: por {{.*}}(%rip), %xmm2 4871; SSE-NEXT: por {{.*}}(%rip), %xmm3 4872; SSE-NEXT: por {{.*}}(%rip), %xmm1 4873; SSE-NEXT: pextrw $4, %xmm1, %eax 4874; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 4875; SSE-NEXT: pextrw $4, %xmm0, 
%ecx 4876; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 4877; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 4878; SSE-NEXT: pextrw $4, %xmm3, %edx 4879; SSE-NEXT: movd %edx, %xmm1 4880; SSE-NEXT: movd %eax, %xmm3 4881; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 4882; SSE-NEXT: movd %ecx, %xmm1 4883; SSE-NEXT: pextrw $4, %xmm2, %eax 4884; SSE-NEXT: movd %eax, %xmm2 4885; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 4886; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 4887; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 4888; SSE-NEXT: retq 4889; 4890; AVX1-LABEL: trunc_or_const_v16i64_v16i16: 4891; AVX1: # BB#0: 4892; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 4893; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 4894; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 4895; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 4896; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 4897; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 4898; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 4899; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4900; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 4901; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 4902; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4903; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4904; AVX1-NEXT: vzeroupper 4905; AVX1-NEXT: retq 4906; 4907; AVX2-LABEL: trunc_or_const_v16i64_v16i16: 4908; AVX2: # BB#0: 4909; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 4910; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 4911; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 4912; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 4913; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 4914; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 4915; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4916; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 4917; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4918; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4919; AVX2-NEXT: vzeroupper 4920; AVX2-NEXT: retq 4921; 4922; AVX512-LABEL: trunc_or_const_v16i64_v16i16: 4923; AVX512: # BB#0: 4924; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 4925; AVX512-NEXT: vpmovqw %zmm0, %xmm0 4926; AVX512-NEXT: retq 4927 %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 4928 %2 = trunc <8 x i64> %1 to <8 x i16> 4929 ret <8 x i16> %2 4930} 4931 4932define <8 x i16> @trunc_or_const_v16i32_v16i16(<8 x i32> %a0) nounwind { 4933; SSE-LABEL: trunc_or_const_v16i32_v16i16: 4934; SSE: # BB#0: 4935; SSE-NEXT: por {{.*}}(%rip), %xmm0 4936; SSE-NEXT: por {{.*}}(%rip), %xmm1 4937; SSE-NEXT: pslld $16, %xmm1 4938; SSE-NEXT: psrad $16, %xmm1 4939; SSE-NEXT: pslld $16, %xmm0 4940; SSE-NEXT: psrad $16, %xmm0 4941; SSE-NEXT: packssdw %xmm1, %xmm0 4942; SSE-NEXT: retq 4943; 4944; AVX1-LABEL: trunc_or_const_v16i32_v16i16: 4945; AVX1: # BB#0: 4946; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 4947; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4948; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 4949; AVX1-NEXT: vpshufb %xmm2, 
%xmm1, %xmm1 4950; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4951; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4952; AVX1-NEXT: vzeroupper 4953; AVX1-NEXT: retq 4954; 4955; AVX2-LABEL: trunc_or_const_v16i32_v16i16: 4956; AVX2: # BB#0: 4957; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 4958; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 4959; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4960; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4961; AVX2-NEXT: vzeroupper 4962; AVX2-NEXT: retq 4963; 4964; AVX512-LABEL: trunc_or_const_v16i32_v16i16: 4965; AVX512: # BB#0: 4966; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 4967; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4968; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 4969; AVX512-NEXT: retq 4970 %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4971 %2 = trunc <8 x i32> %1 to <8 x i16> 4972 ret <8 x i16> %2 4973} 4974 4975define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 4976; SSE-LABEL: trunc_or_const_v16i64_v16i8: 4977; SSE: # BB#0: 4978; SSE-NEXT: movl $1, %eax 4979; SSE-NEXT: movd %rax, %xmm8 4980; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] 4981; SSE-NEXT: por %xmm8, %xmm0 4982; SSE-NEXT: por {{.*}}(%rip), %xmm1 4983; SSE-NEXT: por {{.*}}(%rip), %xmm2 4984; SSE-NEXT: por {{.*}}(%rip), %xmm3 4985; SSE-NEXT: por {{.*}}(%rip), %xmm4 4986; SSE-NEXT: por {{.*}}(%rip), %xmm5 4987; SSE-NEXT: por {{.*}}(%rip), %xmm6 4988; SSE-NEXT: por {{.*}}(%rip), %xmm7 4989; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4990; SSE-NEXT: pand %xmm8, %xmm7 4991; SSE-NEXT: pand %xmm8, %xmm6 4992; SSE-NEXT: packuswb %xmm7, %xmm6 4993; SSE-NEXT: pand %xmm8, %xmm5 4994; SSE-NEXT: pand %xmm8, %xmm4 4995; SSE-NEXT: packuswb %xmm5, %xmm4 4996; SSE-NEXT: packuswb %xmm6, %xmm4 4997; SSE-NEXT: pand %xmm8, %xmm3 4998; SSE-NEXT: pand %xmm8, %xmm2 4999; SSE-NEXT: packuswb %xmm3, %xmm2 5000; SSE-NEXT: pand %xmm8, %xmm1 5001; SSE-NEXT: pand %xmm8, %xmm0 5002; SSE-NEXT: packuswb %xmm1, %xmm0 5003; SSE-NEXT: packuswb %xmm2, %xmm0 5004; SSE-NEXT: packuswb %xmm4, %xmm0 5005; SSE-NEXT: retq 5006; 5007; AVX1-LABEL: trunc_or_const_v16i64_v16i8: 5008; AVX1: # BB#0: 5009; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 5010; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 5011; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2 5012; AVX1-NEXT: vorps {{.*}}(%rip), %ymm3, %ymm3 5013; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 5014; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 5015; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 5016; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 5017; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 5018; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 5019; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 5020; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 5021; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 5022; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 5023; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 5024; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 5025; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 5026; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 5027; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 5028; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 5029; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 5030; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 5031; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 5032; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 5033; AVX1-NEXT: vzeroupper 
5034; AVX1-NEXT: retq 5035; 5036; AVX2-LABEL: trunc_or_const_v16i64_v16i8: 5037; AVX2: # BB#0: 5038; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 5039; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 5040; AVX2-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3 5041; AVX2-NEXT: vpor {{.*}}(%rip), %ymm2, %ymm2 5042; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] 5043; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] 5044; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] 5045; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] 5046; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 5047; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 5048; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 5049; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 5050; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5051; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 5052; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] 5053; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 5054; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] 5055; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] 5056; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 5057; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 5058; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5059; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 5060; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 5061; AVX2-NEXT: vzeroupper 5062; AVX2-NEXT: retq 5063; 5064; AVX512-LABEL: trunc_or_const_v16i64_v16i8: 5065; AVX512: # BB#0: 5066; AVX512-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 5067; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 5068; AVX512-NEXT: vpmovqd %zmm0, %ymm0 5069; AVX512-NEXT: vpmovqd %zmm1, %ymm1 5070; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 5071; AVX512-NEXT: vpmovdb %zmm0, %xmm0 5072; AVX512-NEXT: retq 5073 %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 5074 %2 = trunc <16 x i64> %1 to <16 x i8> 5075 ret <16 x i8> %2 5076} 5077 5078define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 5079; SSE-LABEL: trunc_or_const_v16i32_v16i8: 5080; SSE: # BB#0: 5081; SSE-NEXT: por {{.*}}(%rip), %xmm0 5082; SSE-NEXT: por {{.*}}(%rip), %xmm1 5083; SSE-NEXT: por {{.*}}(%rip), %xmm2 5084; SSE-NEXT: por {{.*}}(%rip), %xmm3 5085; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 5086; SSE-NEXT: pand %xmm4, %xmm3 5087; SSE-NEXT: pand %xmm4, %xmm2 5088; SSE-NEXT: packuswb %xmm3, %xmm2 5089; SSE-NEXT: pand %xmm4, %xmm1 5090; SSE-NEXT: pand %xmm4, %xmm0 5091; SSE-NEXT: packuswb %xmm1, %xmm0 5092; SSE-NEXT: packuswb %xmm2, %xmm0 5093; SSE-NEXT: retq 5094; 5095; AVX1-LABEL: trunc_or_const_v16i32_v16i8: 5096; AVX1: # BB#0: 5097; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 5098; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 5099; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 5100; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 5101; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 5102; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 5103; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 5104; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 5105; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 5106; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 5107; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 5108; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 5109; AVX1-NEXT: vzeroupper 5110; AVX1-NEXT: retq 5111; 5112; AVX2-LABEL: trunc_or_const_v16i32_v16i8: 5113; AVX2: # BB#0: 5114; AVX2-NEXT: 
vpor {{.*}}(%rip), %ymm0, %ymm0 5115; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 5116; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 5117; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 5118; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 5119; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5120; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 5121; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 5122; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5123; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 5124; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5125; AVX2-NEXT: vzeroupper 5126; AVX2-NEXT: retq 5127; 5128; AVX512-LABEL: trunc_or_const_v16i32_v16i8: 5129; AVX512: # BB#0: 5130; AVX512-NEXT: vpord {{.*}}(%rip), %zmm0, %zmm0 5131; AVX512-NEXT: vpmovdb %zmm0, %xmm0 5132; AVX512-NEXT: retq 5133 %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 5134 %2 = trunc <16 x i32> %1 to <16 x i8> 5135 ret <16 x i8> %2 5136} 5137 5138define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 5139; SSE-LABEL: trunc_or_const_v16i16_v16i8: 5140; SSE: # BB#0: 5141; SSE-NEXT: por {{.*}}(%rip), %xmm0 5142; SSE-NEXT: por {{.*}}(%rip), %xmm1 5143; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 5144; SSE-NEXT: pand %xmm2, %xmm1 5145; SSE-NEXT: pand %xmm2, %xmm0 5146; SSE-NEXT: packuswb %xmm1, %xmm0 5147; SSE-NEXT: retq 5148; 5149; AVX1-LABEL: trunc_or_const_v16i16_v16i8: 5150; AVX1: # BB#0: 5151; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 5152; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 5153; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5154; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 5155; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 5156; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5157; AVX1-NEXT: vzeroupper 5158; AVX1-NEXT: retq 5159; 5160; AVX2-LABEL: trunc_or_const_v16i16_v16i8: 5161; AVX2: # BB#0: 5162; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 5163; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 5164; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5165; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 5166; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 5167; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5168; AVX2-NEXT: vzeroupper 5169; AVX2-NEXT: retq 5170; 5171; AVX512F-LABEL: trunc_or_const_v16i16_v16i8: 5172; AVX512F: # BB#0: 5173; AVX512F-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 5174; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 5175; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 5176; AVX512F-NEXT: retq 5177; 5178; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8: 5179; AVX512BW: # BB#0: 5180; AVX512BW-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 5181; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 5182; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 5183; AVX512BW-NEXT: retq 5184 %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 5185 %2 = trunc <16 x i16> %1 to <16 x i8> 5186 ret <16 x i8> %2 5187} 5188 5189; 5190; complex patterns - often created by vectorizer 5191; 5192 5193define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { 5194; SSE-LABEL: mul_add_v4i64_v4i32: 5195; SSE: # BB#0: 5196; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 5197; SSE-NEXT: movdqa %xmm2, %xmm3 5198; SSE-NEXT: psrad $31, %xmm3 5199; SSE-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1]
5200; SSE-NEXT: movdqa %xmm0, %xmm3
5201; SSE-NEXT: psrad $31, %xmm3
5202; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
5203; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
5204; SSE-NEXT: movdqa %xmm3, %xmm4
5205; SSE-NEXT: psrad $31, %xmm4
5206; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
5207; SSE-NEXT: movdqa %xmm1, %xmm4
5208; SSE-NEXT: psrad $31, %xmm4
5209; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
5210; SSE-NEXT: movdqa %xmm0, %xmm4
5211; SSE-NEXT: pmuludq %xmm1, %xmm4
5212; SSE-NEXT: movdqa %xmm1, %xmm5
5213; SSE-NEXT: psrlq $32, %xmm5
5214; SSE-NEXT: pmuludq %xmm0, %xmm5
5215; SSE-NEXT: psllq $32, %xmm5
5216; SSE-NEXT: paddq %xmm4, %xmm5
5217; SSE-NEXT: psrlq $32, %xmm0
5218; SSE-NEXT: pmuludq %xmm1, %xmm0
5219; SSE-NEXT: psllq $32, %xmm0
5220; SSE-NEXT: paddq %xmm5, %xmm0
5221; SSE-NEXT: movdqa %xmm2, %xmm1
5222; SSE-NEXT: pmuludq %xmm3, %xmm1
5223; SSE-NEXT: movdqa %xmm3, %xmm4
5224; SSE-NEXT: psrlq $32, %xmm4
5225; SSE-NEXT: pmuludq %xmm2, %xmm4
5226; SSE-NEXT: psllq $32, %xmm4
5227; SSE-NEXT: paddq %xmm1, %xmm4
5228; SSE-NEXT: psrlq $32, %xmm2
5229; SSE-NEXT: pmuludq %xmm3, %xmm2
5230; SSE-NEXT: psllq $32, %xmm2
5231; SSE-NEXT: paddq %xmm4, %xmm2
5232; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
5233; SSE-NEXT: paddq {{.*}}(%rip), %xmm0
5234; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5235; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
5236; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5237; SSE-NEXT: retq
5238;
5239; AVX1-LABEL: mul_add_v4i64_v4i32:
5240; AVX1: # BB#0:
5241; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
5242; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
5243; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
5244; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
5245; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
5246; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
5247; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm4
5248; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
5249; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
5250; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
5251; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
5252; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
5253; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
5254; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
5255; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
5256; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm1
5257; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
5258; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
5259; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
5260; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
5261; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
5262; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
5263; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
5264; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
5265; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
5266; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
5267; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5268; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
5269; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
5270; AVX1-NEXT: retq
5271;
5272; AVX2-LABEL: mul_add_v4i64_v4i32:
5273; AVX2: # BB#0:
5274; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
5275; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
5276; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
5277; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
5278; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
5279; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
5280; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
5281; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
5282; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
5283; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
5284; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
5285; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
5286; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
5287; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
5288; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5289; AVX2-NEXT: vzeroupper
5290; AVX2-NEXT: retq
5291;
5292; AVX512-LABEL: mul_add_v4i64_v4i32:
5293; AVX512: # BB#0:
5294; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
5295; AVX512-NEXT: vpmovsxdq %xmm1, %ymm1
5296; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
5297; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
5298; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
5299; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
5300; AVX512-NEXT: vpaddq %ymm3, %ymm2, %ymm2
5301; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
5302; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
5303; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
5304; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
5305; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
5306; AVX512-NEXT: vpmovqd %zmm0, %ymm0
5307; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5308; AVX512-NEXT: retq
5309 %1 = sext <4 x i32> %a0 to <4 x i64>
5310 %2 = sext <4 x i32> %a1 to <4 x i64>
5311 %3 = mul <4 x i64> %1, %2
5312 %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
5313 %5 = trunc <4 x i64> %4 to <4 x i32>
5314 ret <4 x i32> %5
5315}
5316