; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.

define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v8i16_to_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v8i16_to_v8i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v8i16_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i16_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i16_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i16_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <8 x i16>
  %strided.vec = trunc <8 x i16> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v4i32_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %bc = bitcast <8 x i16> %vec to <4 x i32>
  %strided.vec = trunc <4 x i32> %bc to <4 x i16>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v4i32_to_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_v2i64_to_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %bc = bitcast <4 x i32> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i32>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v4i32_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <4 x i32>
  %strided.vec = trunc <4 x i32> %bc to <4 x i8>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_v2i64_to_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %bc = bitcast <8 x i16> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i16>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v2i64_to_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v2i64_to_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v2i64_to_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i8>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}