1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL 12 13define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) { 14; SSE-LABEL: trunc8i64_8i32: 15; SSE: # %bb.0: # %entry 16; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 17; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 18; SSE-NEXT: movaps %xmm2, %xmm1 19; SSE-NEXT: retq 20; 21; AVX1-LABEL: trunc8i64_8i32: 22; AVX1: # %bb.0: # %entry 23; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 24; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 25; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 26; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 27; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 28; AVX1-NEXT: retq 29; 30; AVX2-SLOW-LABEL: trunc8i64_8i32: 31; AVX2-SLOW: # %bb.0: # %entry 32; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 33; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 34; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 35; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] 36; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 37; AVX2-SLOW-NEXT: retq 38; 39; AVX2-FAST-LABEL: trunc8i64_8i32: 40; AVX2-FAST: # %bb.0: # %entry 41; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 42; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 43; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 44; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 45; AVX2-FAST-NEXT: retq 46; 47; AVX512-LABEL: trunc8i64_8i32: 48; AVX512: # %bb.0: # %entry 49; AVX512-NEXT: vpmovqd %zmm0, %ymm0 50; AVX512-NEXT: retq 51entry: 52 %0 = trunc <8 x i64> %a to <8 x i32> 53 ret <8 x i32> %0 54} 55 56define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) { 57; SSE-LABEL: trunc8i64_8i32_ashr: 58; SSE: # %bb.0: # %entry 59; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 60; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] 61; SSE-NEXT: movaps %xmm2, %xmm1 62; SSE-NEXT: retq 63; 64; AVX1-LABEL: trunc8i64_8i32_ashr: 65; AVX1: # %bb.0: # %entry 66; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 67; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 68; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] 69; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] 70; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 71; AVX1-NEXT: retq 72; 73; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: 74; AVX2-SLOW: # %bb.0: # %entry 75; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] 76; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 77; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] 78; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] 79; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 80; AVX2-SLOW-NEXT: retq 81; 82; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: 83; AVX2-FAST: # %bb.0: # %entry 84; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] 85; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 86; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 87; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 88; AVX2-FAST-NEXT: retq 89; 90; AVX512-LABEL: trunc8i64_8i32_ashr: 91; AVX512: # %bb.0: # %entry 92; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 93; AVX512-NEXT: vpmovqd %zmm0, %ymm0 94; AVX512-NEXT: retq 95entry: 96 %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 97 %1 = trunc <8 x i64> %0 to <8 x i32> 98 ret <8 x i32> %1 99} 100 101define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) { 102; SSE-LABEL: trunc8i64_8i32_lshr: 103; SSE: # %bb.0: # %entry 104; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 105; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] 106; SSE-NEXT: movaps %xmm2, %xmm1 107; SSE-NEXT: retq 108; 109; AVX1-LABEL: trunc8i64_8i32_lshr: 110; AVX1: # %bb.0: # %entry 111; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 112; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 113; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] 114; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] 115; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 116; AVX1-NEXT: retq 117; 118; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr: 119; AVX2-SLOW: # %bb.0: # %entry 120; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 121; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 122; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 123; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 124; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 125; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 126; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 127; AVX2-SLOW-NEXT: retq 128; 129; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: 130; AVX2-FAST: # %bb.0: # %entry 131; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1 132; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 133; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 134; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 135; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 136; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 137; AVX2-FAST-NEXT: retq 138; 139; AVX512-LABEL: trunc8i64_8i32_lshr: 140; AVX512: # %bb.0: # %entry 141; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 142; AVX512-NEXT: vpmovqd %zmm0, %ymm0 143; AVX512-NEXT: retq 144entry: 145 %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 146 %1 = trunc <8 x i64> %0 to <8 x i32> 147 ret <8 x i32> %1 148} 149 150define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { 151; SSE2-LABEL: trunc8i64_8i16: 152; SSE2: # %bb.0: # %entry 153; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 154; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 155; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 156; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 157; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 158; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 159; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 160; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 161; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 162; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 163; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 164; SSE2-NEXT: retq 165; 166; SSSE3-LABEL: trunc8i64_8i16: 167; SSSE3: # %bb.0: # %entry 168; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 169; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 170; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 171; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 172; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 173; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 174; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 175; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 176; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 177; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 178; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 179; SSSE3-NEXT: retq 180; 181; SSE41-LABEL: trunc8i64_8i16: 182; SSE41: # %bb.0: # %entry 183; SSE41-NEXT: pxor %xmm4, %xmm4 184; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] 185; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] 186; SSE41-NEXT: packusdw %xmm3, %xmm2 187; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] 188; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] 189; SSE41-NEXT: packusdw %xmm1, %xmm0 190; SSE41-NEXT: packusdw %xmm2, %xmm0 191; SSE41-NEXT: retq 192; 193; AVX1-LABEL: trunc8i64_8i16: 194; AVX1: # %bb.0: # %entry 195; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 196; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 197; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 198; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 199; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 200; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 201; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 202; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 203; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 204; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 205; AVX1-NEXT: vzeroupper 206; AVX1-NEXT: retq 207; 208; AVX2-SLOW-LABEL: trunc8i64_8i16: 209; AVX2-SLOW: # %bb.0: # %entry 210; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 211; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 212; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 213; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 214; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 215; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 216; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 217; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 218; AVX2-SLOW-NEXT: vzeroupper 219; AVX2-SLOW-NEXT: retq 220; 221; AVX2-FAST-LABEL: trunc8i64_8i16: 222; AVX2-FAST: # %bb.0: # %entry 223; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 224; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 225; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 226; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 227; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 228; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 229; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 230; AVX2-FAST-NEXT: vzeroupper 231; AVX2-FAST-NEXT: retq 232; 233; AVX512-LABEL: trunc8i64_8i16: 234; AVX512: # %bb.0: # %entry 235; AVX512-NEXT: vpmovqw %zmm0, %xmm0 236; AVX512-NEXT: vzeroupper 237; AVX512-NEXT: retq 238entry: 239 %0 = trunc <8 x i64> %a to <8 x i16> 240 ret <8 x i16> %0 241} 242 243define void @trunc8i64_8i8(<8 x i64> %a) { 244; SSE2-LABEL: trunc8i64_8i8: 245; SSE2: # %bb.0: # %entry 246; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 247; SSE2-NEXT: pand %xmm4, %xmm3 248; SSE2-NEXT: pand %xmm4, %xmm2 249; SSE2-NEXT: packuswb %xmm3, %xmm2 250; SSE2-NEXT: pand %xmm4, %xmm1 251; SSE2-NEXT: pand %xmm4, %xmm0 252; SSE2-NEXT: packuswb %xmm1, %xmm0 253; SSE2-NEXT: packuswb %xmm2, %xmm0 254; SSE2-NEXT: packuswb %xmm0, %xmm0 255; SSE2-NEXT: movq %xmm0, (%rax) 256; SSE2-NEXT: retq 257; 258; SSSE3-LABEL: trunc8i64_8i8: 259; SSSE3: # %bb.0: # %entry 260; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 261; SSSE3-NEXT: pand %xmm4, %xmm3 262; SSSE3-NEXT: pand %xmm4, %xmm2 263; SSSE3-NEXT: packuswb %xmm3, %xmm2 264; SSSE3-NEXT: pand %xmm4, %xmm1 265; SSSE3-NEXT: pand %xmm4, %xmm0 266; SSSE3-NEXT: packuswb %xmm1, %xmm0 267; SSSE3-NEXT: packuswb %xmm2, %xmm0 268; SSSE3-NEXT: packuswb %xmm0, %xmm0 269; SSSE3-NEXT: movq %xmm0, (%rax) 270; SSSE3-NEXT: retq 271; 272; SSE41-LABEL: trunc8i64_8i8: 273; SSE41: # %bb.0: # %entry 274; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 275; SSE41-NEXT: pand %xmm4, %xmm3 276; SSE41-NEXT: pand %xmm4, %xmm2 277; SSE41-NEXT: packusdw %xmm3, %xmm2 278; SSE41-NEXT: pand %xmm4, %xmm1 279; SSE41-NEXT: pand %xmm4, %xmm0 280; SSE41-NEXT: packusdw %xmm1, %xmm0 281; SSE41-NEXT: packusdw %xmm2, %xmm0 282; SSE41-NEXT: packuswb %xmm0, %xmm0 283; SSE41-NEXT: movq %xmm0, (%rax) 284; SSE41-NEXT: retq 285; 286; AVX1-LABEL: trunc8i64_8i8: 287; AVX1: # %bb.0: # %entry 288; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 289; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 290; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 291; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 292; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 293; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 294; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 295; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 296; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 297; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 298; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 299; AVX1-NEXT: vmovq %xmm0, (%rax) 300; AVX1-NEXT: vzeroupper 301; AVX1-NEXT: retq 302; 303; AVX2-SLOW-LABEL: trunc8i64_8i8: 304; AVX2-SLOW: # %bb.0: # %entry 305; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 306; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 307; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 308; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 309; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 310; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 311; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 312; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 313; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax) 314; AVX2-SLOW-NEXT: vzeroupper 315; AVX2-SLOW-NEXT: retq 316; 317; AVX2-FAST-LABEL: trunc8i64_8i8: 318; AVX2-FAST: # %bb.0: # %entry 319; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 320; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 321; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 322; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 323; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 324; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 325; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 326; AVX2-FAST-NEXT: vmovq %xmm0, (%rax) 327; AVX2-FAST-NEXT: vzeroupper 328; AVX2-FAST-NEXT: retq 329; 330; AVX512-LABEL: trunc8i64_8i8: 331; AVX512: # %bb.0: # %entry 332; AVX512-NEXT: vpmovqb %zmm0, (%rax) 333; AVX512-NEXT: vzeroupper 334; AVX512-NEXT: retq 335entry: 336 %0 = trunc <8 x i64> %a to <8 x i8> 337 store <8 x i8> %0, <8 x i8>* undef, align 4 338 ret void 339} 340 341define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) { 342; SSE2-LABEL: trunc8i32_8i16: 343; SSE2: # %bb.0: # %entry 344; SSE2-NEXT: pslld $16, %xmm1 345; SSE2-NEXT: psrad $16, %xmm1 346; SSE2-NEXT: pslld $16, %xmm0 347; SSE2-NEXT: psrad $16, %xmm0 348; SSE2-NEXT: packssdw %xmm1, %xmm0 349; SSE2-NEXT: retq 350; 351; SSSE3-LABEL: trunc8i32_8i16: 352; SSSE3: # %bb.0: # %entry 353; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 354; SSSE3-NEXT: pshufb %xmm2, %xmm1 355; SSSE3-NEXT: pshufb %xmm2, %xmm0 356; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 357; SSSE3-NEXT: retq 358; 359; SSE41-LABEL: trunc8i32_8i16: 360; SSE41: # %bb.0: # %entry 361; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 362; SSE41-NEXT: pshufb %xmm2, %xmm1 363; SSE41-NEXT: pshufb %xmm2, %xmm0 364; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 365; SSE41-NEXT: retq 366; 367; AVX1-LABEL: trunc8i32_8i16: 368; AVX1: # %bb.0: # %entry 369; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 370; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 371; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 372; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 373; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 374; AVX1-NEXT: vzeroupper 375; AVX1-NEXT: retq 376; 377; AVX2-LABEL: trunc8i32_8i16: 378; AVX2: # %bb.0: # %entry 379; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 380; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 381; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 382; AVX2-NEXT: vzeroupper 383; AVX2-NEXT: retq 384; 385; AVX512F-LABEL: trunc8i32_8i16: 386; AVX512F: # %bb.0: # %entry 387; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 388; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 389; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 390; AVX512F-NEXT: vzeroupper 391; AVX512F-NEXT: retq 392; 393; AVX512VL-LABEL: trunc8i32_8i16: 394; AVX512VL: # %bb.0: # %entry 395; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 396; AVX512VL-NEXT: vzeroupper 397; AVX512VL-NEXT: retq 398; 399; AVX512BW-LABEL: trunc8i32_8i16: 400; AVX512BW: # %bb.0: # %entry 401; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 402; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 403; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 404; AVX512BW-NEXT: vzeroupper 405; AVX512BW-NEXT: retq 406; 407; AVX512BWVL-LABEL: trunc8i32_8i16: 408; AVX512BWVL: # %bb.0: # %entry 409; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 410; AVX512BWVL-NEXT: vzeroupper 411; AVX512BWVL-NEXT: retq 412entry: 413 %0 = trunc <8 x i32> %a to <8 x i16> 414 ret <8 x i16> %0 415} 416 417define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) { 418; SSE-LABEL: trunc8i32_8i16_ashr: 419; SSE: # %bb.0: # %entry 420; SSE-NEXT: psrad $16, %xmm1 421; SSE-NEXT: psrad $16, %xmm0 422; SSE-NEXT: packssdw %xmm1, %xmm0 423; SSE-NEXT: retq 424; 425; AVX1-LABEL: trunc8i32_8i16_ashr: 426; AVX1: # %bb.0: # %entry 427; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 428; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 429; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 430; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 431; AVX1-NEXT: vzeroupper 432; AVX1-NEXT: retq 433; 434; AVX2-LABEL: trunc8i32_8i16_ashr: 435; AVX2: # %bb.0: # %entry 436; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 437; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 438; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 439; AVX2-NEXT: vzeroupper 440; AVX2-NEXT: retq 441; 442; AVX512F-LABEL: trunc8i32_8i16_ashr: 443; AVX512F: # %bb.0: # %entry 444; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0 445; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 446; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 447; AVX512F-NEXT: vzeroupper 448; AVX512F-NEXT: retq 449; 450; AVX512VL-LABEL: trunc8i32_8i16_ashr: 451; AVX512VL: # %bb.0: # %entry 452; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0 453; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 454; AVX512VL-NEXT: vzeroupper 455; AVX512VL-NEXT: retq 456; 457; AVX512BW-LABEL: trunc8i32_8i16_ashr: 458; AVX512BW: # %bb.0: # %entry 459; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0 460; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 461; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 462; AVX512BW-NEXT: vzeroupper 463; AVX512BW-NEXT: retq 464; 465; AVX512BWVL-LABEL: trunc8i32_8i16_ashr: 466; AVX512BWVL: # %bb.0: # %entry 467; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0 468; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 469; AVX512BWVL-NEXT: vzeroupper 470; AVX512BWVL-NEXT: retq 471entry: 472 %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 473 %1 = trunc <8 x i32> %0 to <8 x i16> 474 ret <8 x i16> %1 475} 476 477define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) { 478; SSE2-LABEL: trunc8i32_8i16_lshr: 479; SSE2: # %bb.0: # %entry 480; SSE2-NEXT: psrld $16, %xmm0 481; SSE2-NEXT: psrld $16, %xmm1 482; SSE2-NEXT: pslld $16, %xmm1 483; SSE2-NEXT: psrad $16, %xmm1 484; SSE2-NEXT: pslld $16, %xmm0 485; SSE2-NEXT: psrad $16, %xmm0 486; SSE2-NEXT: packssdw %xmm1, %xmm0 487; SSE2-NEXT: retq 488; 489; SSSE3-LABEL: trunc8i32_8i16_lshr: 490; SSSE3: # %bb.0: # %entry 491; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255] 492; SSSE3-NEXT: pshufb %xmm2, %xmm1 493; SSSE3-NEXT: pshufb %xmm2, %xmm0 494; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 495; SSSE3-NEXT: retq 496; 497; SSE41-LABEL: trunc8i32_8i16_lshr: 498; SSE41: # %bb.0: # %entry 499; SSE41-NEXT: psrld $16, %xmm1 500; SSE41-NEXT: psrld $16, %xmm0 501; SSE41-NEXT: packusdw %xmm1, %xmm0 502; SSE41-NEXT: retq 503; 504; AVX1-LABEL: trunc8i32_8i16_lshr: 505; AVX1: # %bb.0: # %entry 506; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 507; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 508; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 509; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 510; AVX1-NEXT: vzeroupper 511; AVX1-NEXT: retq 512; 513; AVX2-LABEL: trunc8i32_8i16_lshr: 514; AVX2: # %bb.0: # %entry 515; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 516; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 517; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 518; AVX2-NEXT: vzeroupper 519; AVX2-NEXT: retq 520; 521; AVX512F-LABEL: trunc8i32_8i16_lshr: 522; AVX512F: # %bb.0: # %entry 523; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 524; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 525; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 526; AVX512F-NEXT: vzeroupper 527; AVX512F-NEXT: retq 528; 529; AVX512VL-LABEL: trunc8i32_8i16_lshr: 530; AVX512VL: # %bb.0: # %entry 531; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 532; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 533; AVX512VL-NEXT: vzeroupper 534; AVX512VL-NEXT: retq 535; 536; AVX512BW-LABEL: trunc8i32_8i16_lshr: 537; AVX512BW: # %bb.0: # %entry 538; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 539; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 540; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 541; AVX512BW-NEXT: vzeroupper 542; AVX512BW-NEXT: retq 543; 544; AVX512BWVL-LABEL: trunc8i32_8i16_lshr: 545; AVX512BWVL: # %bb.0: # %entry 546; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0 547; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 548; AVX512BWVL-NEXT: vzeroupper 549; AVX512BWVL-NEXT: retq 550entry: 551 %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 552 %1 = trunc <8 x i32> %0 to <8 x i16> 553 ret <8 x i16> %1 554} 555 556define void @trunc8i32_8i8(<8 x i32> %a) { 557; SSE2-LABEL: trunc8i32_8i8: 558; SSE2: # %bb.0: # %entry 559; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 560; SSE2-NEXT: pand %xmm2, %xmm1 561; SSE2-NEXT: pand %xmm2, %xmm0 562; SSE2-NEXT: packuswb %xmm1, %xmm0 563; SSE2-NEXT: packuswb %xmm0, %xmm0 564; SSE2-NEXT: movq %xmm0, (%rax) 565; SSE2-NEXT: retq 566; 567; SSSE3-LABEL: trunc8i32_8i8: 568; SSSE3: # %bb.0: # %entry 569; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 570; SSSE3-NEXT: pshufb %xmm2, %xmm1 571; SSSE3-NEXT: pshufb %xmm2, %xmm0 572; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 573; SSSE3-NEXT: movq %xmm0, (%rax) 574; SSSE3-NEXT: retq 575; 576; SSE41-LABEL: trunc8i32_8i8: 577; SSE41: # %bb.0: # %entry 578; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 579; SSE41-NEXT: pshufb %xmm2, %xmm1 580; SSE41-NEXT: pshufb %xmm2, %xmm0 581; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 582; SSE41-NEXT: movq %xmm0, (%rax) 583; SSE41-NEXT: retq 584; 585; AVX1-LABEL: trunc8i32_8i8: 586; AVX1: # %bb.0: # %entry 587; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 588; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 589; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 590; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 591; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 592; AVX1-NEXT: vmovq %xmm0, (%rax) 593; AVX1-NEXT: vzeroupper 594; AVX1-NEXT: retq 595; 596; AVX2-LABEL: trunc8i32_8i8: 597; AVX2: # %bb.0: # %entry 598; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 599; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 600; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 601; AVX2-NEXT: vmovq %xmm0, (%rax) 602; AVX2-NEXT: vzeroupper 603; AVX2-NEXT: retq 604; 605; AVX512F-LABEL: trunc8i32_8i8: 606; AVX512F: # %bb.0: # %entry 607; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 608; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 609; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 610; AVX512F-NEXT: vmovq %xmm0, (%rax) 611; AVX512F-NEXT: vzeroupper 612; AVX512F-NEXT: retq 613; 614; AVX512VL-LABEL: trunc8i32_8i8: 615; AVX512VL: # %bb.0: # %entry 616; AVX512VL-NEXT: vpmovdb %ymm0, (%rax) 617; AVX512VL-NEXT: vzeroupper 618; AVX512VL-NEXT: retq 619; 620; AVX512BW-LABEL: trunc8i32_8i8: 621; AVX512BW: # %bb.0: # %entry 622; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 623; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 624; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 625; AVX512BW-NEXT: vmovq %xmm0, (%rax) 626; AVX512BW-NEXT: vzeroupper 627; AVX512BW-NEXT: retq 628; 629; AVX512BWVL-LABEL: trunc8i32_8i8: 630; AVX512BWVL: # %bb.0: # %entry 631; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax) 632; AVX512BWVL-NEXT: vzeroupper 633; AVX512BWVL-NEXT: retq 634entry: 635 %0 = trunc <8 x i32> %a to <8 x i8> 636 store <8 x i8> %0, <8 x i8>* undef, align 4 637 ret void 638} 639 640define void @trunc16i32_16i16(<16 x i32> %a) { 641; SSE2-LABEL: trunc16i32_16i16: 642; SSE2: # %bb.0: # %entry 643; SSE2-NEXT: pslld $16, %xmm1 644; SSE2-NEXT: psrad $16, %xmm1 645; SSE2-NEXT: pslld $16, %xmm0 646; SSE2-NEXT: psrad $16, %xmm0 647; SSE2-NEXT: packssdw %xmm1, %xmm0 648; SSE2-NEXT: pslld $16, %xmm3 649; SSE2-NEXT: psrad $16, %xmm3 650; SSE2-NEXT: pslld $16, %xmm2 651; SSE2-NEXT: psrad $16, %xmm2 652; SSE2-NEXT: packssdw %xmm3, %xmm2 653; SSE2-NEXT: movdqu %xmm2, (%rax) 654; SSE2-NEXT: movdqu %xmm0, (%rax) 655; SSE2-NEXT: retq 656; 657; SSSE3-LABEL: trunc16i32_16i16: 658; SSSE3: # %bb.0: # %entry 659; SSSE3-NEXT: pslld $16, %xmm1 660; SSSE3-NEXT: psrad $16, %xmm1 661; SSSE3-NEXT: pslld $16, %xmm0 662; SSSE3-NEXT: psrad $16, %xmm0 663; SSSE3-NEXT: packssdw %xmm1, %xmm0 664; SSSE3-NEXT: pslld $16, %xmm3 665; SSSE3-NEXT: psrad $16, %xmm3 666; SSSE3-NEXT: pslld $16, %xmm2 667; SSSE3-NEXT: psrad $16, %xmm2 668; SSSE3-NEXT: packssdw %xmm3, %xmm2 669; SSSE3-NEXT: movdqu %xmm2, (%rax) 670; SSSE3-NEXT: movdqu %xmm0, (%rax) 671; SSSE3-NEXT: retq 672; 673; SSE41-LABEL: trunc16i32_16i16: 674; SSE41: # %bb.0: # %entry 675; SSE41-NEXT: pxor %xmm4, %xmm4 676; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] 677; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] 678; SSE41-NEXT: packusdw %xmm1, %xmm0 679; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] 680; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] 681; SSE41-NEXT: packusdw %xmm3, %xmm2 682; SSE41-NEXT: movdqu %xmm2, (%rax) 683; SSE41-NEXT: movdqu %xmm0, (%rax) 684; SSE41-NEXT: retq 685; 686; AVX1-LABEL: trunc16i32_16i16: 687; AVX1: # %bb.0: # %entry 688; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 689; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 690; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] 691; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] 692; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 693; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 694; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] 695; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] 696; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 697; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 698; AVX1-NEXT: vmovups %ymm0, (%rax) 699; AVX1-NEXT: vzeroupper 700; AVX1-NEXT: retq 701; 702; AVX2-LABEL: trunc16i32_16i16: 703; AVX2: # %bb.0: # %entry 704; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 705; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 706; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 707; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 708; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 709; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 710; AVX2-NEXT: vmovdqu %ymm0, (%rax) 711; AVX2-NEXT: vzeroupper 712; AVX2-NEXT: retq 713; 714; AVX512-LABEL: trunc16i32_16i16: 715; AVX512: # %bb.0: # %entry 716; AVX512-NEXT: vpmovdw %zmm0, (%rax) 717; AVX512-NEXT: vzeroupper 718; AVX512-NEXT: retq 719entry: 720 %0 = trunc <16 x i32> %a to <16 x i16> 721 store <16 x i16> %0, <16 x i16>* undef, align 4 722 ret void 723} 724 725define void @trunc16i32_16i16_ashr(<16 x i32> %a) { 726; SSE-LABEL: trunc16i32_16i16_ashr: 727; SSE: # %bb.0: # %entry 728; SSE-NEXT: psrad $16, %xmm3 729; SSE-NEXT: psrad $16, %xmm2 730; SSE-NEXT: packssdw %xmm3, %xmm2 731; SSE-NEXT: psrad $16, %xmm1 732; SSE-NEXT: psrad $16, %xmm0 733; SSE-NEXT: packssdw %xmm1, %xmm0 734; SSE-NEXT: movdqu %xmm2, (%rax) 735; SSE-NEXT: movdqu %xmm0, (%rax) 736; SSE-NEXT: retq 737; 738; AVX1-LABEL: trunc16i32_16i16_ashr: 739; AVX1: # %bb.0: # %entry 740; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 741; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 742; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 743; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 744; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 745; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 746; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 747; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 748; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 749; AVX1-NEXT: vmovups %ymm0, (%rax) 750; AVX1-NEXT: vzeroupper 751; AVX1-NEXT: retq 752; 753; AVX2-LABEL: trunc16i32_16i16_ashr: 754; AVX2: # %bb.0: # %entry 755; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1 756; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 757; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 758; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 759; AVX2-NEXT: vmovdqu %ymm0, (%rax) 760; AVX2-NEXT: vzeroupper 761; AVX2-NEXT: retq 762; 763; AVX512-LABEL: trunc16i32_16i16_ashr: 764; AVX512: # %bb.0: # %entry 765; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 766; AVX512-NEXT: vpmovdw %zmm0, (%rax) 767; AVX512-NEXT: vzeroupper 768; AVX512-NEXT: retq 769entry: 770 %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 771 %1 = trunc <16 x i32> %0 to <16 x i16> 772 store <16 x i16> %1, <16 x i16>* undef, align 4 773 ret void 774} 775 776define void @trunc16i32_16i16_lshr(<16 x i32> %a) { 777; SSE2-LABEL: trunc16i32_16i16_lshr: 778; SSE2: # %bb.0: # %entry 779; SSE2-NEXT: psrld $16, %xmm2 780; SSE2-NEXT: psrld $16, %xmm3 781; SSE2-NEXT: psrld $16, %xmm0 782; SSE2-NEXT: psrld $16, %xmm1 783; SSE2-NEXT: pslld $16, %xmm1 784; SSE2-NEXT: psrad $16, %xmm1 785; SSE2-NEXT: pslld $16, %xmm0 786; SSE2-NEXT: psrad $16, %xmm0 787; SSE2-NEXT: packssdw %xmm1, %xmm0 788; SSE2-NEXT: pslld $16, %xmm3 789; SSE2-NEXT: psrad $16, %xmm3 790; SSE2-NEXT: pslld $16, %xmm2 791; SSE2-NEXT: psrad $16, %xmm2 792; SSE2-NEXT: packssdw %xmm3, %xmm2 793; SSE2-NEXT: movdqu %xmm2, (%rax) 794; SSE2-NEXT: movdqu %xmm0, (%rax) 795; SSE2-NEXT: retq 796; 797; SSSE3-LABEL: trunc16i32_16i16_lshr: 798; SSSE3: # %bb.0: # %entry 799; SSSE3-NEXT: psrld $16, %xmm2 800; SSSE3-NEXT: psrld $16, %xmm3 801; SSSE3-NEXT: psrld $16, %xmm0 802; SSSE3-NEXT: psrld $16, %xmm1 803; SSSE3-NEXT: pslld $16, %xmm1 804; SSSE3-NEXT: psrad $16, %xmm1 805; SSSE3-NEXT: pslld $16, %xmm0 806; SSSE3-NEXT: psrad $16, %xmm0 807; SSSE3-NEXT: packssdw %xmm1, %xmm0 808; SSSE3-NEXT: pslld $16, %xmm3 809; SSSE3-NEXT: psrad $16, %xmm3 810; SSSE3-NEXT: pslld $16, %xmm2 811; SSSE3-NEXT: psrad $16, %xmm2 812; SSSE3-NEXT: packssdw %xmm3, %xmm2 813; SSSE3-NEXT: movdqu %xmm2, (%rax) 814; SSSE3-NEXT: movdqu %xmm0, (%rax) 815; SSSE3-NEXT: retq 816; 817; SSE41-LABEL: trunc16i32_16i16_lshr: 818; SSE41: # %bb.0: # %entry 819; SSE41-NEXT: psrld $16, %xmm3 820; SSE41-NEXT: psrld $16, %xmm2 821; SSE41-NEXT: packusdw %xmm3, %xmm2 822; SSE41-NEXT: psrld $16, %xmm1 823; SSE41-NEXT: psrld $16, %xmm0 824; SSE41-NEXT: packusdw %xmm1, %xmm0 825; SSE41-NEXT: movdqu %xmm2, (%rax) 826; SSE41-NEXT: movdqu %xmm0, (%rax) 827; SSE41-NEXT: retq 828; 829; AVX1-LABEL: trunc16i32_16i16_lshr: 830; AVX1: # %bb.0: # %entry 831; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 832; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 833; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 834; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 835; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 836; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 837; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 838; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 839; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 840; AVX1-NEXT: vmovups %ymm0, (%rax) 841; AVX1-NEXT: vzeroupper 842; AVX1-NEXT: retq 843; 844; AVX2-LABEL: trunc16i32_16i16_lshr: 845; AVX2: # %bb.0: # %entry 846; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 847; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 848; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 849; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 850; AVX2-NEXT: vmovdqu %ymm0, (%rax) 851; AVX2-NEXT: vzeroupper 852; AVX2-NEXT: retq 853; 854; AVX512-LABEL: trunc16i32_16i16_lshr: 855; AVX512: # %bb.0: # %entry 856; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 857; AVX512-NEXT: vpmovdw %zmm0, (%rax) 858; AVX512-NEXT: vzeroupper 859; AVX512-NEXT: retq 860entry: 861 %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 862 %1 = trunc <16 x i32> %0 to <16 x i16> 863 store <16 x i16> %1, <16 x i16>* undef, align 4 864 ret void 865} 866 867define void @trunc16i32_16i8(<16 x i32> %a) { 868; SSE2-LABEL: trunc16i32_16i8: 869; SSE2: # %bb.0: # %entry 870; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 871; SSE2-NEXT: pand %xmm4, %xmm3 872; SSE2-NEXT: pand %xmm4, %xmm2 873; SSE2-NEXT: packuswb %xmm3, %xmm2 874; SSE2-NEXT: pand %xmm4, %xmm1 875; SSE2-NEXT: pand %xmm4, %xmm0 876; SSE2-NEXT: packuswb %xmm1, %xmm0 877; SSE2-NEXT: packuswb %xmm2, %xmm0 878; SSE2-NEXT: movdqu %xmm0, (%rax) 879; SSE2-NEXT: retq 880; 881; SSSE3-LABEL: trunc16i32_16i8: 882; SSSE3: # %bb.0: # %entry 883; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 884; SSSE3-NEXT: pand %xmm4, %xmm3 885; SSSE3-NEXT: pand %xmm4, %xmm2 886; SSSE3-NEXT: packuswb %xmm3, %xmm2 887; SSSE3-NEXT: pand %xmm4, %xmm1 888; SSSE3-NEXT: pand %xmm4, %xmm0 889; SSSE3-NEXT: packuswb %xmm1, %xmm0 890; SSSE3-NEXT: packuswb %xmm2, %xmm0 891; SSSE3-NEXT: movdqu %xmm0, (%rax) 892; SSSE3-NEXT: retq 893; 894; SSE41-LABEL: trunc16i32_16i8: 895; SSE41: # %bb.0: # %entry 896; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 897; SSE41-NEXT: pand %xmm4, %xmm3 898; SSE41-NEXT: pand %xmm4, %xmm2 899; SSE41-NEXT: packusdw %xmm3, %xmm2 900; SSE41-NEXT: pand %xmm4, %xmm1 901; SSE41-NEXT: pand %xmm4, %xmm0 902; SSE41-NEXT: packusdw %xmm1, %xmm0 903; SSE41-NEXT: packuswb %xmm2, %xmm0 904; SSE41-NEXT: movdqu %xmm0, (%rax) 905; SSE41-NEXT: retq 906; 907; AVX1-LABEL: trunc16i32_16i8: 908; AVX1: # %bb.0: # %entry 909; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 910; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 911; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 912; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 913; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 914; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 915; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 916; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 917; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 918; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 919; AVX1-NEXT: vmovdqu %xmm0, (%rax) 920; AVX1-NEXT: vzeroupper 921; AVX1-NEXT: retq 922; 923; AVX2-LABEL: trunc16i32_16i8: 924; AVX2: # %bb.0: # %entry 925; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 926; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 927; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 928; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 929; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 930; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 931; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 932; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 933; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 934; AVX2-NEXT: vmovdqu %xmm0, (%rax) 935; AVX2-NEXT: vzeroupper 936; AVX2-NEXT: retq 937; 938; AVX512-LABEL: trunc16i32_16i8: 939; AVX512: # %bb.0: # %entry 940; AVX512-NEXT: vpmovdb %zmm0, (%rax) 941; AVX512-NEXT: vzeroupper 942; AVX512-NEXT: retq 943entry: 944 %0 = trunc <16 x i32> %a to <16 x i8> 945 store <16 x i8> %0, <16 x i8>* undef, align 4 946 ret void 947} 948 949define void @trunc16i32_16i8_ashr(<16 x i32> %a) { 950; SSE-LABEL: trunc16i32_16i8_ashr: 951; SSE: # %bb.0: # %entry 952; SSE-NEXT: psrad $24, %xmm1 953; SSE-NEXT: psrad $24, %xmm0 954; SSE-NEXT: packssdw %xmm1, %xmm0 955; SSE-NEXT: psrad $24, %xmm3 956; SSE-NEXT: psrad $24, %xmm2 957; SSE-NEXT: packssdw %xmm3, %xmm2 958; SSE-NEXT: packsswb %xmm2, %xmm0 959; SSE-NEXT: movdqu %xmm0, (%rax) 960; SSE-NEXT: retq 961; 962; AVX1-LABEL: trunc16i32_16i8_ashr: 963; AVX1: # %bb.0: # %entry 964; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 965; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 966; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 967; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 968; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 969; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 970; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 971; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 972; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 973; AVX1-NEXT: vmovdqu %xmm0, (%rax) 974; AVX1-NEXT: vzeroupper 975; AVX1-NEXT: retq 976; 977; AVX2-LABEL: trunc16i32_16i8_ashr: 978; AVX2: # %bb.0: # %entry 979; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1 980; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 981; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 982; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 983; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 984; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 985; AVX2-NEXT: vmovdqu %xmm0, (%rax) 986; AVX2-NEXT: vzeroupper 987; AVX2-NEXT: retq 988; 989; AVX512-LABEL: trunc16i32_16i8_ashr: 990; AVX512: # %bb.0: # %entry 991; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0 992; AVX512-NEXT: vpmovdb %zmm0, (%rax) 993; AVX512-NEXT: vzeroupper 994; AVX512-NEXT: retq 995entry: 996 %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> 997 %1 = trunc <16 x i32> %0 to <16 x i8> 998 store <16 x i8> %1, <16 x i8>* undef, align 4 999 ret void 1000} 1001 1002define void @trunc16i32_16i8_lshr(<16 x i32> %a) { 1003; SSE2-LABEL: trunc16i32_16i8_lshr: 1004; SSE2: # %bb.0: # %entry 1005; SSE2-NEXT: psrld $24, %xmm1 1006; SSE2-NEXT: psrld $24, %xmm0 1007; SSE2-NEXT: packuswb %xmm1, %xmm0 1008; SSE2-NEXT: psrld $24, %xmm3 1009; SSE2-NEXT: psrld $24, %xmm2 1010; SSE2-NEXT: packuswb %xmm3, %xmm2 1011; SSE2-NEXT: packuswb %xmm2, %xmm0 1012; SSE2-NEXT: movdqu %xmm0, (%rax) 1013; SSE2-NEXT: retq 1014; 1015; SSSE3-LABEL: trunc16i32_16i8_lshr: 1016; SSSE3: # %bb.0: # %entry 1017; SSSE3-NEXT: psrld $24, %xmm1 1018; SSSE3-NEXT: psrld $24, %xmm0 1019; SSSE3-NEXT: packuswb %xmm1, %xmm0 1020; SSSE3-NEXT: psrld $24, %xmm3 1021; SSSE3-NEXT: psrld $24, %xmm2 1022; SSSE3-NEXT: packuswb %xmm3, %xmm2 1023; SSSE3-NEXT: packuswb %xmm2, %xmm0 1024; SSSE3-NEXT: movdqu %xmm0, (%rax) 1025; SSSE3-NEXT: retq 1026; 1027; SSE41-LABEL: trunc16i32_16i8_lshr: 1028; SSE41: # %bb.0: # %entry 1029; SSE41-NEXT: psrld $24, %xmm1 1030; SSE41-NEXT: psrld $24, %xmm0 1031; SSE41-NEXT: packusdw %xmm1, %xmm0 1032; SSE41-NEXT: psrld $24, %xmm3 1033; SSE41-NEXT: psrld $24, %xmm2 1034; SSE41-NEXT: packusdw %xmm3, %xmm2 1035; SSE41-NEXT: packuswb %xmm2, %xmm0 1036; SSE41-NEXT: movdqu %xmm0, (%rax) 1037; SSE41-NEXT: retq 1038; 1039; AVX1-LABEL: trunc16i32_16i8_lshr: 1040; AVX1: # %bb.0: # %entry 1041; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1042; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 1043; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 1044; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 1045; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1046; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 1047; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 1048; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1049; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1050; AVX1-NEXT: vmovdqu %xmm0, (%rax) 1051; AVX1-NEXT: vzeroupper 1052; AVX1-NEXT: retq 1053; 1054; AVX2-LABEL: trunc16i32_16i8_lshr: 1055; AVX2: # %bb.0: # %entry 1056; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 1057; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 1058; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1059; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1060; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1061; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1062; AVX2-NEXT: vmovdqu %xmm0, (%rax) 1063; AVX2-NEXT: vzeroupper 1064; AVX2-NEXT: retq 1065; 1066; AVX512-LABEL: trunc16i32_16i8_lshr: 1067; AVX512: # %bb.0: # %entry 1068; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0 1069; AVX512-NEXT: vpmovdb %zmm0, (%rax) 1070; AVX512-NEXT: vzeroupper 1071; AVX512-NEXT: retq 1072entry: 1073 %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> 1074 %1 = trunc <16 x i32> %0 to <16 x i8> 1075 store <16 x i8> %1, <16 x i8>* undef, align 4 1076 ret void 1077} 1078 1079;PR25684 1080define void @trunc16i16_16i8(<16 x i16> %a) { 1081; SSE2-LABEL: trunc16i16_16i8: 1082; SSE2: # %bb.0: # %entry 1083; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1084; SSE2-NEXT: pand %xmm2, %xmm1 1085; SSE2-NEXT: pand %xmm2, %xmm0 1086; SSE2-NEXT: packuswb %xmm1, %xmm0 1087; SSE2-NEXT: movdqu %xmm0, (%rax) 1088; SSE2-NEXT: retq 1089; 1090; SSSE3-LABEL: trunc16i16_16i8: 1091; SSSE3: # %bb.0: # %entry 1092; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1093; SSSE3-NEXT: pshufb %xmm2, %xmm1 1094; SSSE3-NEXT: pshufb %xmm2, %xmm0 1095; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1096; SSSE3-NEXT: movdqu %xmm0, (%rax) 1097; SSSE3-NEXT: retq 1098; 1099; SSE41-LABEL: trunc16i16_16i8: 1100; SSE41: # %bb.0: # %entry 1101; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1102; SSE41-NEXT: pshufb %xmm2, %xmm1 1103; SSE41-NEXT: pshufb %xmm2, %xmm0 1104; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1105; SSE41-NEXT: movdqu %xmm0, (%rax) 1106; SSE41-NEXT: retq 1107; 1108; AVX1-LABEL: trunc16i16_16i8: 1109; AVX1: # %bb.0: # %entry 1110; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1111; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1112; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1113; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1114; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1115; AVX1-NEXT: vmovdqu %xmm0, (%rax) 1116; AVX1-NEXT: vzeroupper 1117; AVX1-NEXT: retq 1118; 1119; AVX2-LABEL: trunc16i16_16i8: 1120; AVX2: # %bb.0: # %entry 1121; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1122; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1123; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1124; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1125; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1126; AVX2-NEXT: vmovdqu %xmm0, (%rax) 1127; AVX2-NEXT: vzeroupper 1128; AVX2-NEXT: retq 1129; 1130; AVX512F-LABEL: trunc16i16_16i8: 1131; AVX512F: # %bb.0: # %entry 1132; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 1133; AVX512F-NEXT: vpmovdb %zmm0, (%rax) 1134; AVX512F-NEXT: vzeroupper 1135; AVX512F-NEXT: retq 1136; 1137; AVX512VL-LABEL: trunc16i16_16i8: 1138; AVX512VL: # %bb.0: # %entry 1139; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0 1140; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) 1141; AVX512VL-NEXT: vzeroupper 1142; AVX512VL-NEXT: retq 1143; 1144; AVX512BW-LABEL: trunc16i16_16i8: 1145; AVX512BW: # %bb.0: # %entry 1146; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1147; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1148; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) 1149; AVX512BW-NEXT: vzeroupper 1150; AVX512BW-NEXT: retq 1151; 1152; AVX512BWVL-LABEL: trunc16i16_16i8: 1153; AVX512BWVL: # %bb.0: # %entry 1154; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) 1155; AVX512BWVL-NEXT: vzeroupper 1156; AVX512BWVL-NEXT: retq 1157entry: 1158 %0 = trunc <16 x i16> %a to <16 x i8> 1159 store <16 x i8> %0, <16 x i8>* undef, align 4 1160 ret void 1161} 1162 1163define void @trunc16i16_16i8_ashr(<16 x i16> %a) { 1164; SSE-LABEL: trunc16i16_16i8_ashr: 1165; SSE: # %bb.0: # %entry 1166; SSE-NEXT: psraw $8, %xmm1 1167; SSE-NEXT: psraw $8, %xmm0 1168; SSE-NEXT: packsswb %xmm1, %xmm0 1169; SSE-NEXT: movdqu %xmm0, (%rax) 1170; SSE-NEXT: retq 1171; 1172; AVX1-LABEL: trunc16i16_16i8_ashr: 1173; AVX1: # %bb.0: # %entry 1174; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1175; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 1176; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 1177; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 1178; AVX1-NEXT: vmovdqu %xmm0, (%rax) 1179; AVX1-NEXT: vzeroupper 1180; AVX1-NEXT: retq 1181; 1182; AVX2-LABEL: trunc16i16_16i8_ashr: 1183; AVX2: # %bb.0: # %entry 1184; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0 1185; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1186; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 1187; AVX2-NEXT: vmovdqu %xmm0, (%rax) 1188; AVX2-NEXT: vzeroupper 1189; AVX2-NEXT: retq 1190; 1191; AVX512F-LABEL: trunc16i16_16i8_ashr: 1192; AVX512F: # %bb.0: # %entry 1193; AVX512F-NEXT: vpsraw $8, %ymm0, %ymm0 1194; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 1195; AVX512F-NEXT: vpmovdb %zmm0, (%rax) 1196; AVX512F-NEXT: vzeroupper 1197; AVX512F-NEXT: retq 1198; 1199; AVX512VL-LABEL: trunc16i16_16i8_ashr: 1200; AVX512VL: # %bb.0: # %entry 1201; AVX512VL-NEXT: vpsraw $8, %ymm0, %ymm0 1202; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0 1203; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) 1204; AVX512VL-NEXT: vzeroupper 1205; AVX512VL-NEXT: retq 1206; 1207; AVX512BW-LABEL: trunc16i16_16i8_ashr: 1208; AVX512BW: # %bb.0: # %entry 1209; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0 1210; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1211; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) 1212; AVX512BW-NEXT: vzeroupper 1213; AVX512BW-NEXT: retq 1214; 1215; AVX512BWVL-LABEL: trunc16i16_16i8_ashr: 1216; AVX512BWVL: # %bb.0: # %entry 1217; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 1218; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) 1219; AVX512BWVL-NEXT: vzeroupper 1220; AVX512BWVL-NEXT: retq 1221entry: 1222 %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1223 %1 = trunc <16 x i16> %0 to <16 x i8> 1224 store <16 x i8> %1, <16 x i8>* undef, align 4 1225 ret void 1226} 1227 1228define void @trunc16i16_16i8_lshr(<16 x i16> %a) { 1229; SSE-LABEL: trunc16i16_16i8_lshr: 1230; SSE: # %bb.0: # %entry 1231; SSE-NEXT: psrlw $8, %xmm1 1232; SSE-NEXT: psrlw $8, %xmm0 1233; SSE-NEXT: packuswb %xmm1, %xmm0 1234; SSE-NEXT: movdqu %xmm0, (%rax) 1235; SSE-NEXT: retq 1236; 1237; AVX1-LABEL: trunc16i16_16i8_lshr: 1238; AVX1: # %bb.0: # %entry 1239; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1240; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1241; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1242; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1243; AVX1-NEXT: vmovdqu %xmm0, (%rax) 1244; AVX1-NEXT: vzeroupper 1245; AVX1-NEXT: retq 1246; 1247; AVX2-LABEL: trunc16i16_16i8_lshr: 1248; AVX2: # %bb.0: # %entry 1249; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1250; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1251; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1252; AVX2-NEXT: vmovdqu %xmm0, (%rax) 1253; AVX2-NEXT: vzeroupper 1254; AVX2-NEXT: retq 1255; 1256; AVX512F-LABEL: trunc16i16_16i8_lshr: 1257; AVX512F: # %bb.0: # %entry 1258; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 1259; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 1260; AVX512F-NEXT: vpmovdb %zmm0, (%rax) 1261; AVX512F-NEXT: vzeroupper 1262; AVX512F-NEXT: retq 1263; 1264; AVX512VL-LABEL: trunc16i16_16i8_lshr: 1265; AVX512VL: # %bb.0: # %entry 1266; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 1267; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0 1268; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) 1269; AVX512VL-NEXT: vzeroupper 1270; AVX512VL-NEXT: retq 1271; 1272; AVX512BW-LABEL: trunc16i16_16i8_lshr: 1273; AVX512BW: # %bb.0: # %entry 1274; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 1275; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1276; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) 1277; AVX512BW-NEXT: vzeroupper 1278; AVX512BW-NEXT: retq 1279; 1280; AVX512BWVL-LABEL: trunc16i16_16i8_lshr: 1281; AVX512BWVL: # %bb.0: # %entry 1282; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 1283; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) 1284; AVX512BWVL-NEXT: vzeroupper 1285; AVX512BWVL-NEXT: retq 1286entry: 1287 %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1288 %1 = trunc <16 x i16> %0 to <16 x i8> 1289 store <16 x i8> %1, <16 x i8>* undef, align 4 1290 ret void 1291} 1292 1293define void @trunc32i16_32i8(<32 x i16> %a) { 1294; SSE2-LABEL: trunc32i16_32i8: 1295; SSE2: # %bb.0: # %entry 1296; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1297; SSE2-NEXT: pand %xmm4, %xmm1 1298; SSE2-NEXT: pand %xmm4, %xmm0 1299; SSE2-NEXT: packuswb %xmm1, %xmm0 1300; SSE2-NEXT: pand %xmm4, %xmm3 1301; SSE2-NEXT: pand %xmm4, %xmm2 1302; SSE2-NEXT: packuswb %xmm3, %xmm2 1303; SSE2-NEXT: movdqu %xmm2, (%rax) 1304; SSE2-NEXT: movdqu %xmm0, (%rax) 1305; SSE2-NEXT: retq 1306; 1307; SSSE3-LABEL: trunc32i16_32i8: 1308; SSSE3: # %bb.0: # %entry 1309; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1310; SSSE3-NEXT: pshufb %xmm4, %xmm1 1311; SSSE3-NEXT: pshufb %xmm4, %xmm0 1312; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1313; SSSE3-NEXT: pshufb %xmm4, %xmm3 1314; SSSE3-NEXT: pshufb %xmm4, %xmm2 1315; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1316; SSSE3-NEXT: movdqu %xmm2, (%rax) 1317; SSSE3-NEXT: movdqu %xmm0, (%rax) 1318; SSSE3-NEXT: retq 1319; 1320; SSE41-LABEL: trunc32i16_32i8: 1321; SSE41: # %bb.0: # %entry 1322; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1323; SSE41-NEXT: pshufb %xmm4, %xmm1 1324; SSE41-NEXT: pshufb %xmm4, %xmm0 1325; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1326; SSE41-NEXT: pshufb %xmm4, %xmm3 1327; SSE41-NEXT: pshufb %xmm4, %xmm2 1328; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1329; SSE41-NEXT: movdqu %xmm2, (%rax) 1330; SSE41-NEXT: movdqu %xmm0, (%rax) 1331; SSE41-NEXT: retq 1332; 1333; AVX1-LABEL: trunc32i16_32i8: 1334; AVX1: # %bb.0: # %entry 1335; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1336; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1337; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1338; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1339; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1340; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1341; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1342; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1343; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1344; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1345; AVX1-NEXT: vmovups %ymm0, (%rax) 1346; AVX1-NEXT: vzeroupper 1347; AVX1-NEXT: retq 1348; 1349; AVX2-LABEL: trunc32i16_32i8: 1350; AVX2: # %bb.0: # %entry 1351; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1352; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1353; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1354; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1355; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1356; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1357; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1358; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1359; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1360; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1361; AVX2-NEXT: vmovdqu %ymm0, (%rax) 1362; AVX2-NEXT: vzeroupper 1363; AVX2-NEXT: retq 1364; 1365; AVX512F-LABEL: trunc32i16_32i8: 1366; AVX512F: # %bb.0: # %entry 1367; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 1368; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1369; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 1370; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 1371; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1372; AVX512F-NEXT: vmovdqu %ymm0, (%rax) 1373; AVX512F-NEXT: vzeroupper 1374; AVX512F-NEXT: retq 1375; 1376; AVX512VL-LABEL: trunc32i16_32i8: 1377; AVX512VL: # %bb.0: # %entry 1378; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0 1379; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 1380; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 1381; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 1382; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1383; AVX512VL-NEXT: vmovdqu %ymm0, (%rax) 1384; AVX512VL-NEXT: vzeroupper 1385; AVX512VL-NEXT: retq 1386; 1387; AVX512BW-LABEL: trunc32i16_32i8: 1388; AVX512BW: # %bb.0: # %entry 1389; AVX512BW-NEXT: vpmovwb %zmm0, (%rax) 1390; AVX512BW-NEXT: vzeroupper 1391; AVX512BW-NEXT: retq 1392; 1393; AVX512BWVL-LABEL: trunc32i16_32i8: 1394; AVX512BWVL: # %bb.0: # %entry 1395; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax) 1396; AVX512BWVL-NEXT: vzeroupper 1397; AVX512BWVL-NEXT: retq 1398entry: 1399 %0 = trunc <32 x i16> %a to <32 x i8> 1400 store <32 x i8> %0, <32 x i8>* undef, align 4 1401 ret void 1402} 1403 1404define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { 1405; SSE-LABEL: trunc2x4i64_8i32: 1406; SSE: # %bb.0: # %entry 1407; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1408; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 1409; SSE-NEXT: movaps %xmm2, %xmm1 1410; SSE-NEXT: retq 1411; 1412; AVX1-LABEL: trunc2x4i64_8i32: 1413; AVX1: # %bb.0: # %entry 1414; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1415; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1416; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1417; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1418; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1419; AVX1-NEXT: retq 1420; 1421; AVX2-SLOW-LABEL: trunc2x4i64_8i32: 1422; AVX2-SLOW: # %bb.0: # %entry 1423; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 1424; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 1425; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 1426; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] 1427; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1428; AVX2-SLOW-NEXT: retq 1429; 1430; AVX2-FAST-LABEL: trunc2x4i64_8i32: 1431; AVX2-FAST: # %bb.0: # %entry 1432; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 1433; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 1434; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 1435; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1436; AVX2-FAST-NEXT: retq 1437; 1438; AVX512F-LABEL: trunc2x4i64_8i32: 1439; AVX512F: # %bb.0: # %entry 1440; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1441; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1442; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1443; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 1444; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1445; AVX512F-NEXT: retq 1446; 1447; AVX512VL-LABEL: trunc2x4i64_8i32: 1448; AVX512VL: # %bb.0: # %entry 1449; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 1450; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 1451; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1452; AVX512VL-NEXT: retq 1453; 1454; AVX512BW-LABEL: trunc2x4i64_8i32: 1455; AVX512BW: # %bb.0: # %entry 1456; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1457; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1458; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1459; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 1460; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1461; AVX512BW-NEXT: retq 1462; 1463; AVX512BWVL-LABEL: trunc2x4i64_8i32: 1464; AVX512BWVL: # %bb.0: # %entry 1465; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 1466; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 1467; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1468; AVX512BWVL-NEXT: retq 1469entry: 1470 %0 = trunc <4 x i64> %a to <4 x i32> 1471 %1 = trunc <4 x i64> %b to <4 x i32> 1472 %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1473 ret <8 x i32> %2 1474} 1475 1476define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) { 1477; SSE2-LABEL: trunc2x4i64_8i16: 1478; SSE2: # %bb.0: # %entry 1479; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1480; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1481; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1482; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 1483; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1484; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 1485; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 1486; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1487; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1488; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1489; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 1490; SSE2-NEXT: retq 1491; 1492; SSSE3-LABEL: trunc2x4i64_8i16: 1493; SSSE3: # %bb.0: # %entry 1494; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1495; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1496; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1497; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 1498; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1499; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 1500; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 1501; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1502; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1503; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1504; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 1505; SSSE3-NEXT: retq 1506; 1507; SSE41-LABEL: trunc2x4i64_8i16: 1508; SSE41: # %bb.0: # %entry 1509; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1510; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] 1511; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1512; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] 1513; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1514; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1515; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1516; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1517; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1518; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1519; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1520; SSE41-NEXT: retq 1521; 1522; AVX1-LABEL: trunc2x4i64_8i16: 1523; AVX1: # %bb.0: # %entry 1524; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1525; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1526; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1527; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1528; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1529; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1530; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1531; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1532; AVX1-NEXT: vzeroupper 1533; AVX1-NEXT: retq 1534; 1535; AVX2-SLOW-LABEL: trunc2x4i64_8i16: 1536; AVX2-SLOW: # %bb.0: # %entry 1537; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 1538; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1539; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 1540; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1541; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1542; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1543; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1544; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1545; AVX2-SLOW-NEXT: vzeroupper 1546; AVX2-SLOW-NEXT: retq 1547; 1548; AVX2-FAST-LABEL: trunc2x4i64_8i16: 1549; AVX2-FAST: # %bb.0: # %entry 1550; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 1551; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 1552; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 1553; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1554; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1555; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1556; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1557; AVX2-FAST-NEXT: vzeroupper 1558; AVX2-FAST-NEXT: retq 1559; 1560; AVX512F-LABEL: trunc2x4i64_8i16: 1561; AVX512F: # %bb.0: # %entry 1562; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1563; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1564; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1565; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 1566; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1567; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1568; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1569; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1570; AVX512F-NEXT: vzeroupper 1571; AVX512F-NEXT: retq 1572; 1573; AVX512VL-LABEL: trunc2x4i64_8i16: 1574; AVX512VL: # %bb.0: # %entry 1575; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 1576; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 1577; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1578; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1579; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1580; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1581; AVX512VL-NEXT: vzeroupper 1582; AVX512VL-NEXT: retq 1583; 1584; AVX512BW-LABEL: trunc2x4i64_8i16: 1585; AVX512BW: # %bb.0: # %entry 1586; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1587; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1588; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1589; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 1590; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1591; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1592; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1593; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1594; AVX512BW-NEXT: vzeroupper 1595; AVX512BW-NEXT: retq 1596; 1597; AVX512BWVL-LABEL: trunc2x4i64_8i16: 1598; AVX512BWVL: # %bb.0: # %entry 1599; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 1600; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 1601; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1602; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1603; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1604; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1605; AVX512BWVL-NEXT: vzeroupper 1606; AVX512BWVL-NEXT: retq 1607entry: 1608 %0 = trunc <4 x i64> %a to <4 x i16> 1609 %1 = trunc <4 x i64> %b to <4 x i16> 1610 %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1611 ret <8 x i16> %2 1612} 1613 1614define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) { 1615; SSE-LABEL: trunc2x2i64_4i32: 1616; SSE: # %bb.0: # %entry 1617; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1618; SSE-NEXT: retq 1619; 1620; AVX-LABEL: trunc2x2i64_4i32: 1621; AVX: # %bb.0: # %entry 1622; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1623; AVX-NEXT: retq 1624; 1625; AVX512-LABEL: trunc2x2i64_4i32: 1626; AVX512: # %bb.0: # %entry 1627; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1628; AVX512-NEXT: retq 1629entry: 1630 %0 = trunc <2 x i64> %a to <2 x i32> 1631 %1 = trunc <2 x i64> %b to <2 x i32> 1632 %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1633 ret <4 x i32> %2 1634} 1635 1636define i64 @trunc2i64_i64(<2 x i64> %inval) { 1637; SSE-LABEL: trunc2i64_i64: 1638; SSE: # %bb.0: # %entry 1639; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1640; SSE-NEXT: movq %xmm0, %rax 1641; SSE-NEXT: retq 1642; 1643; AVX-LABEL: trunc2i64_i64: 1644; AVX: # %bb.0: # %entry 1645; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1646; AVX-NEXT: vmovq %xmm0, %rax 1647; AVX-NEXT: retq 1648; 1649; AVX512F-LABEL: trunc2i64_i64: 1650; AVX512F: # %bb.0: # %entry 1651; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1652; AVX512F-NEXT: vmovq %xmm0, %rax 1653; AVX512F-NEXT: retq 1654; 1655; AVX512VL-LABEL: trunc2i64_i64: 1656; AVX512VL: # %bb.0: # %entry 1657; AVX512VL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) 1658; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1659; AVX512VL-NEXT: retq 1660; 1661; AVX512BW-LABEL: trunc2i64_i64: 1662; AVX512BW: # %bb.0: # %entry 1663; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1664; AVX512BW-NEXT: vmovq %xmm0, %rax 1665; AVX512BW-NEXT: retq 1666; 1667; AVX512BWVL-LABEL: trunc2i64_i64: 1668; AVX512BWVL: # %bb.0: # %entry 1669; AVX512BWVL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) 1670; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1671; AVX512BWVL-NEXT: retq 1672entry: 1673 %0 = trunc <2 x i64> %inval to <2 x i32> 1674 %1 = bitcast <2 x i32> %0 to i64 1675 ret i64 %1 1676} 1677 1678define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) { 1679; SSE2-LABEL: trunc2x4i32_8i16: 1680; SSE2: # %bb.0: # %entry 1681; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1682; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 1683; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1684; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1685; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 1686; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1687; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1688; SSE2-NEXT: retq 1689; 1690; SSSE3-LABEL: trunc2x4i32_8i16: 1691; SSSE3: # %bb.0: # %entry 1692; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1693; SSSE3-NEXT: pshufb %xmm2, %xmm1 1694; SSSE3-NEXT: pshufb %xmm2, %xmm0 1695; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1696; SSSE3-NEXT: retq 1697; 1698; SSE41-LABEL: trunc2x4i32_8i16: 1699; SSE41: # %bb.0: # %entry 1700; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1701; SSE41-NEXT: pshufb %xmm2, %xmm1 1702; SSE41-NEXT: pshufb %xmm2, %xmm0 1703; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1704; SSE41-NEXT: retq 1705; 1706; AVX-LABEL: trunc2x4i32_8i16: 1707; AVX: # %bb.0: # %entry 1708; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1709; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1710; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1711; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1712; AVX-NEXT: retq 1713; 1714; AVX512-LABEL: trunc2x4i32_8i16: 1715; AVX512: # %bb.0: # %entry 1716; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1717; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1718; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1719; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1720; AVX512-NEXT: retq 1721entry: 1722 %0 = trunc <4 x i32> %a to <4 x i16> 1723 %1 = trunc <4 x i32> %b to <4 x i16> 1724 %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1725 ret <8 x i16> %2 1726} 1727 1728; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 1729define i64 @trunc4i32_i64(<4 x i32> %inval) { 1730; SSE2-LABEL: trunc4i32_i64: 1731; SSE2: # %bb.0: # %entry 1732; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1733; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 1734; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1735; SSE2-NEXT: movq %xmm0, %rax 1736; SSE2-NEXT: retq 1737; 1738; SSSE3-LABEL: trunc4i32_i64: 1739; SSSE3: # %bb.0: # %entry 1740; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1741; SSSE3-NEXT: movq %xmm0, %rax 1742; SSSE3-NEXT: retq 1743; 1744; SSE41-LABEL: trunc4i32_i64: 1745; SSE41: # %bb.0: # %entry 1746; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1747; SSE41-NEXT: movq %xmm0, %rax 1748; SSE41-NEXT: retq 1749; 1750; AVX-LABEL: trunc4i32_i64: 1751; AVX: # %bb.0: # %entry 1752; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1753; AVX-NEXT: vmovq %xmm0, %rax 1754; AVX-NEXT: retq 1755; 1756; AVX512F-LABEL: trunc4i32_i64: 1757; AVX512F: # %bb.0: # %entry 1758; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1759; AVX512F-NEXT: vmovq %xmm0, %rax 1760; AVX512F-NEXT: retq 1761; 1762; AVX512VL-LABEL: trunc4i32_i64: 1763; AVX512VL: # %bb.0: # %entry 1764; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp) 1765; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1766; AVX512VL-NEXT: retq 1767; 1768; AVX512BW-LABEL: trunc4i32_i64: 1769; AVX512BW: # %bb.0: # %entry 1770; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1771; AVX512BW-NEXT: vmovq %xmm0, %rax 1772; AVX512BW-NEXT: retq 1773; 1774; AVX512BWVL-LABEL: trunc4i32_i64: 1775; AVX512BWVL: # %bb.0: # %entry 1776; AVX512BWVL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp) 1777; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1778; AVX512BWVL-NEXT: retq 1779entry: 1780 %0 = trunc <4 x i32> %inval to <4 x i16> 1781 %1 = bitcast <4 x i16> %0 to i64 1782 ret i64 %1 1783} 1784 1785define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { 1786; SSE2-LABEL: trunc2x8i16_16i8: 1787; SSE2: # %bb.0: # %entry 1788; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1789; SSE2-NEXT: pand %xmm2, %xmm1 1790; SSE2-NEXT: pand %xmm2, %xmm0 1791; SSE2-NEXT: packuswb %xmm1, %xmm0 1792; SSE2-NEXT: retq 1793; 1794; SSSE3-LABEL: trunc2x8i16_16i8: 1795; SSSE3: # %bb.0: # %entry 1796; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1797; SSSE3-NEXT: pshufb %xmm2, %xmm1 1798; SSSE3-NEXT: pshufb %xmm2, %xmm0 1799; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1800; SSSE3-NEXT: retq 1801; 1802; SSE41-LABEL: trunc2x8i16_16i8: 1803; SSE41: # %bb.0: # %entry 1804; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1805; SSE41-NEXT: pshufb %xmm2, %xmm1 1806; SSE41-NEXT: pshufb %xmm2, %xmm0 1807; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1808; SSE41-NEXT: retq 1809; 1810; AVX-LABEL: trunc2x8i16_16i8: 1811; AVX: # %bb.0: # %entry 1812; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1813; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1814; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1815; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1816; AVX-NEXT: retq 1817; 1818; AVX512-LABEL: trunc2x8i16_16i8: 1819; AVX512: # %bb.0: # %entry 1820; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1821; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1822; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1823; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1824; AVX512-NEXT: retq 1825entry: 1826 %0 = trunc <8 x i16> %a to <8 x i8> 1827 %1 = trunc <8 x i16> %b to <8 x i8> 1828 %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1829 ret <16 x i8> %2 1830} 1831 1832; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 1833define i64 @trunc8i16_i64(<8 x i16> %inval) { 1834; SSE2-LABEL: trunc8i16_i64: 1835; SSE2: # %bb.0: # %entry 1836; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1837; SSE2-NEXT: packuswb %xmm0, %xmm0 1838; SSE2-NEXT: movq %xmm0, %rax 1839; SSE2-NEXT: retq 1840; 1841; SSSE3-LABEL: trunc8i16_i64: 1842; SSSE3: # %bb.0: # %entry 1843; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1844; SSSE3-NEXT: movq %xmm0, %rax 1845; SSSE3-NEXT: retq 1846; 1847; SSE41-LABEL: trunc8i16_i64: 1848; SSE41: # %bb.0: # %entry 1849; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1850; SSE41-NEXT: movq %xmm0, %rax 1851; SSE41-NEXT: retq 1852; 1853; AVX-LABEL: trunc8i16_i64: 1854; AVX: # %bb.0: # %entry 1855; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1856; AVX-NEXT: vmovq %xmm0, %rax 1857; AVX-NEXT: retq 1858; 1859; AVX512F-LABEL: trunc8i16_i64: 1860; AVX512F: # %bb.0: # %entry 1861; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1862; AVX512F-NEXT: vmovq %xmm0, %rax 1863; AVX512F-NEXT: retq 1864; 1865; AVX512VL-LABEL: trunc8i16_i64: 1866; AVX512VL: # %bb.0: # %entry 1867; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1868; AVX512VL-NEXT: vmovq %xmm0, %rax 1869; AVX512VL-NEXT: retq 1870; 1871; AVX512BW-LABEL: trunc8i16_i64: 1872; AVX512BW: # %bb.0: # %entry 1873; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1874; AVX512BW-NEXT: vmovq %xmm0, %rax 1875; AVX512BW-NEXT: retq 1876; 1877; AVX512BWVL-LABEL: trunc8i16_i64: 1878; AVX512BWVL: # %bb.0: # %entry 1879; AVX512BWVL-NEXT: vpmovwb %xmm0, -{{[0-9]+}}(%rsp) 1880; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1881; AVX512BWVL-NEXT: retq 1882entry: 1883 %0 = trunc <8 x i16> %inval to <8 x i8> 1884 %1 = bitcast <8 x i8> %0 to i64 1885 ret i64 %1 1886} 1887 1888define <16 x i8> @trunc16i64_16i8_const() { 1889; SSE-LABEL: trunc16i64_16i8_const: 1890; SSE: # %bb.0: # %entry 1891; SSE-NEXT: xorps %xmm0, %xmm0 1892; SSE-NEXT: retq 1893; 1894; AVX-LABEL: trunc16i64_16i8_const: 1895; AVX: # %bb.0: # %entry 1896; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1897; AVX-NEXT: retq 1898; 1899; AVX512-LABEL: trunc16i64_16i8_const: 1900; AVX512: # %bb.0: # %entry 1901; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 1902; AVX512-NEXT: retq 1903 1904entry: 1905 %0 = trunc <16 x i64> zeroinitializer to <16 x i8> 1906 %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26> 1907 ret <16 x i8> %1 1908} 1909 1910define <8 x i16> @PR32160(<8 x i32> %x) { 1911; SSE-LABEL: PR32160: 1912; SSE: # %bb.0: 1913; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] 1914; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1915; SSE-NEXT: retq 1916; 1917; AVX1-LABEL: PR32160: 1918; AVX1: # %bb.0: 1919; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] 1920; AVX1-NEXT: vzeroupper 1921; AVX1-NEXT: retq 1922; 1923; AVX2-SLOW-LABEL: PR32160: 1924; AVX2-SLOW: # %bb.0: 1925; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1926; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1927; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] 1928; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 1929; AVX2-SLOW-NEXT: vzeroupper 1930; AVX2-SLOW-NEXT: retq 1931; 1932; AVX2-FAST-LABEL: PR32160: 1933; AVX2-FAST: # %bb.0: 1934; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1935; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1936; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1937; AVX2-FAST-NEXT: vzeroupper 1938; AVX2-FAST-NEXT: retq 1939; 1940; AVX512F-LABEL: PR32160: 1941; AVX512F: # %bb.0: 1942; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1943; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 1944; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] 1945; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 1946; AVX512F-NEXT: vzeroupper 1947; AVX512F-NEXT: retq 1948; 1949; AVX512VL-LABEL: PR32160: 1950; AVX512VL: # %bb.0: 1951; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 1952; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1953; AVX512VL-NEXT: vzeroupper 1954; AVX512VL-NEXT: retq 1955; 1956; AVX512BW-LABEL: PR32160: 1957; AVX512BW: # %bb.0: 1958; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1959; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 1960; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1961; AVX512BW-NEXT: vzeroupper 1962; AVX512BW-NEXT: retq 1963; 1964; AVX512BWVL-LABEL: PR32160: 1965; AVX512BWVL: # %bb.0: 1966; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 1967; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1968; AVX512BWVL-NEXT: vzeroupper 1969; AVX512BWVL-NEXT: retq 1970 %shuf = trunc <8 x i32> %x to <8 x i16> 1971 %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 1972 ret <8 x i16> %trunc 1973} 1974 1975define void @PR34773(i16* %a0, i8* %a1) { 1976; SSE-LABEL: PR34773: 1977; SSE: # %bb.0: 1978; SSE-NEXT: movdqu (%rdi), %xmm0 1979; SSE-NEXT: movdqu 16(%rdi), %xmm1 1980; SSE-NEXT: movdqu 32(%rdi), %xmm2 1981; SSE-NEXT: movdqu 48(%rdi), %xmm3 1982; SSE-NEXT: psrlw $8, %xmm1 1983; SSE-NEXT: psrlw $8, %xmm0 1984; SSE-NEXT: packuswb %xmm1, %xmm0 1985; SSE-NEXT: psrlw $8, %xmm3 1986; SSE-NEXT: psrlw $8, %xmm2 1987; SSE-NEXT: packuswb %xmm3, %xmm2 1988; SSE-NEXT: movdqu %xmm0, (%rsi) 1989; SSE-NEXT: movdqu %xmm2, 16(%rsi) 1990; SSE-NEXT: retq 1991; 1992; AVX1-LABEL: PR34773: 1993; AVX1: # %bb.0: 1994; AVX1-NEXT: vmovdqu (%rdi), %ymm0 1995; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1 1996; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1997; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1998; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1999; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2000; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2001; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2002; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2003; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2004; AVX1-NEXT: vmovdqu %xmm0, (%rsi) 2005; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) 2006; AVX1-NEXT: vzeroupper 2007; AVX1-NEXT: retq 2008; 2009; AVX2-LABEL: PR34773: 2010; AVX2: # %bb.0: 2011; AVX2-NEXT: vmovdqu (%rdi), %ymm0 2012; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 2013; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 2014; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2015; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2016; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2017; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2018; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2019; AVX2-NEXT: vmovdqu %xmm0, (%rsi) 2020; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi) 2021; AVX2-NEXT: vzeroupper 2022; AVX2-NEXT: retq 2023; 2024; AVX512F-LABEL: PR34773: 2025; AVX512F: # %bb.0: 2026; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 2027; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 2028; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 2029; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 2030; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 2031; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 2032; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) 2033; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi) 2034; AVX512F-NEXT: vzeroupper 2035; AVX512F-NEXT: retq 2036; 2037; AVX512VL-LABEL: PR34773: 2038; AVX512VL: # %bb.0: 2039; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 2040; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1 2041; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 2042; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 2043; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0 2044; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 2045; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) 2046; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi) 2047; AVX512VL-NEXT: vzeroupper 2048; AVX512VL-NEXT: retq 2049; 2050; AVX512BW-LABEL: PR34773: 2051; AVX512BW: # %bb.0: 2052; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 2053; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1 2054; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 2055; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 2056; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2057; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 2058; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) 2059; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi) 2060; AVX512BW-NEXT: vzeroupper 2061; AVX512BW-NEXT: retq 2062; 2063; AVX512BWVL-LABEL: PR34773: 2064; AVX512BWVL: # %bb.0: 2065; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0 2066; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1 2067; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) 2068; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi) 2069; AVX512BWVL-NEXT: vzeroupper 2070; AVX512BWVL-NEXT: retq 2071 %1 = getelementptr i16, i16* %a0, i64 16 2072 %2 = getelementptr i8, i8* %a1, i64 16 2073 %3 = bitcast i16* %a0 to <16 x i16>* 2074 %4 = bitcast i16* %1 to <16 x i16>* 2075 %5 = bitcast i8* %a1 to <16 x i8>* 2076 %6 = bitcast i8* %2 to <16 x i8>* 2077 %7 = load <16 x i16>, <16 x i16>* %3, align 2 2078 %8 = load <16 x i16>, <16 x i16>* %4, align 2 2079 %9 = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2080 %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2081 %11 = trunc <16 x i16> %9 to <16 x i8> 2082 %12 = trunc <16 x i16> %10 to <16 x i8> 2083 store <16 x i8> %11, <16 x i8>* %5, align 1 2084 store <16 x i8> %12, <16 x i8>* %6, align 1 2085 ret void 2086} 2087