; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VBMIVL

; PR31551
; Pairs of shufflevector and trunc functions that are functionally equivalent.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
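;
; For example (an informal sketch of the first pair below, not itself a checked
; function): taking the even bytes of a <32 x i8> value
;   %even = shufflevector <32 x i8> %vec, <32 x i8> undef,
;                         <16 x i32> <i32 0, i32 2, i32 4, ..., i32 30>
; produces the same <16 x i8> as truncating its <16 x i16> reinterpretation
;   %bc   = bitcast <32 x i8> %vec to <16 x i16>
;   %even = trunc <16 x i16> %bc to <16 x i8>
; because on this little-endian target the low byte of each i16 lane is the
; even-indexed byte.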

define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX-NEXT: vpand 16(%rdi), %xmm0, %xmm1
; AVX-NEXT: vpand (%rdi), %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand 16(%rdi), %xmm0, %xmm1
; AVX512F-NEXT: vpand (%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand 16(%rdi), %xmm0, %xmm1
; AVX512VL-NEXT: vpand (%rdi), %xmm0, %xmm0
; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v16i16_to_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v16i16_to_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v16i16_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <16 x i16>
  %strided.vec = trunc <16 x i16> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
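
; Note that on AVX512BW+VL both functions of the pair above already collapse to
; the ideal single truncating store (vpmovwb %ymm0, (%rsi)), while the plain
; AVX/AVX2 targets still mask the odd bytes and re-pack (vpand + vpackuswb).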

define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}
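
; The same pairing at i16 granularity: with AVX512VL both functions above become
; a single vpmovdw store, while plain AVX lowers the even-element extract as a
; blend-with-zero feeding vpackusdw (shuffle form) or a vpshufb pair (trunc form).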

define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i32_to_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX512F-NEXT: vmovaps %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX1-NEXT: vmovaps %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %bc = bitcast <8 x i32> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i32>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}
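
; For i64->i32 the even-element extract is already cheap everywhere (one vshufps
; against memory, or vpermps on AVX2-FAST), so the interesting check above is
; that the VL targets turn both forms into a single truncating vpmovqd store.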

define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: trunc_v8i32_to_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; IR generated from:
; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
  %bc = bitcast <8 x i8> %truncated.vec to i64
  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
  ret <2 x i64> %result
}
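
; Informal mapping from the C snippet above to the IR: the
; __builtin_convertvector call is the trunc to <8 x i8>, the (long long) cast is
; the bitcast of those eight bytes to i64, and the compound literal with a zero
; second lane is the insertelement into a zero <2 x i64>.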

define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
  %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %result
}
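
; In the function above, zext-ing the truncated elements and then interleaving
; with zero lanes is just a zero-extended truncate, so the VL targets can emit a
; single vpmovdb, which already zeroes the upper lanes of the xmm destination.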

define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <8 x i32> %vec to <8 x i16>
  %bc = bitcast <8 x i16> %truncated to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
  ret <16 x i8> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}

define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
; IR generated from:
; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %bc = bitcast <4 x i16> %truncated to i64
  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
  ret <2 x i64> %result
}
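
; Same construction as trunc_v8i32_to_v8i8_return_v2i64 earlier, with wider
; elements: the __builtin_convertvector is the trunc to <4 x i16>, the
; (long long) cast is the bitcast to i64, and the zeroed second lane is the
; insertelement into a zero <2 x i64>.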

define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
  %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i32>
  %bc = bitcast <4 x i32> %truncated to <8 x i16>
  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
  ret <8 x i16> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %result
}

define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i8>
  %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
  ret <16 x i8> %result
}
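
; The shuffle mask above only takes lanes 0-3 from the truncated value; every
; other lane is zero or undef, so a lone vpmovqb (which zeroes the rest of the
; xmm register) is a valid lowering on the VL targets.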

define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX-LABEL: trunc_v4i64_to_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i16>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: trunc_v4i64_to_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i8>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; In this case not all elements are collected from the same source vector, so
; the resulting BUILD_VECTOR should not be combined to a truncate.
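; Concretely, lane 0 of the returned vector comes from %w (element 0), while
; lanes 1-15 are elements 2,4,...,30 of %v, so the combined pattern is not a
; stride-2 extract of a single source and must not be folded into a trunc.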
define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-LABEL: negative:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: negative:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: negative:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: negative:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpternlogq $206, %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,3,2,3]
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: negative:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: negative:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001
; AVX512BWVL-NEXT: kmovd %eax, %k1
; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: negative:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = <32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %w0 = extractelement <32 x i8> %w, i32 0
  %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
  ret <16 x i8> %merged
}