; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.

define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMIVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT:    vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VBMIVL-NEXT:    vpermt2b %ymm2, %ymm1, %ymm0
; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}

define <4 x double> @PR34175(<32 x i16>* %p) {
; AVX512F-LABEL: PR34175:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm1
; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT:    vmovdqu (%rdi), %ymm1
; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %ymm2
; AVX512VBMI-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT:    retq
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT:    retq
  %v = load <32 x i16>, <32 x i16>* %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}

define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}