; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW

; trunc(concat(x,y)) -> pack
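; The tests in this group exercise whether truncating an interleaved
; concatenation of two shifted/masked vectors can be matched to the x86
; pack idiom (vpackssdw/vpackusdw/vpacksswb/vpackuswb); the autogenerated
; assertions simply record the current lowering for each feature set.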

define <32 x i16> @trunc_concat_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512-LABEL: trunc_concat_packssdw_512:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %zmm0, %zmm0
; AVX512-NEXT:    vpsrad $23, %zmm1, %zmm1
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vpmovdw %zmm3, %ymm0
; AVX512-NEXT:    vpmovdw %zmm2, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i32> %3 to <32 x i16>
  ret <32 x i16> %4
}

define <32 x i16> @trunc_concat_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512-LABEL: trunc_concat_packusdw_512:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %zmm0, %zmm0
; AVX512-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vpmovdw %zmm3, %ymm0
; AVX512-NEXT:    vpmovdw %zmm2, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = and <16 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i32> %3 to <32 x i16>
  ret <32 x i16> %4
}

define <64 x i8> @trunc_concat_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: trunc_concat_packsswb_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packsswb_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmovwb %zmm3, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %4 = trunc <64 x i16> %3 to <64 x i8>
  ret <64 x i8> %4
}

define <64 x i8> @trunc_concat_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: trunc_concat_packuswb_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packuswb_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmovwb %zmm3, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %4 = trunc <64 x i16> %3 to <64 x i8>
  ret <64 x i8> %4
}

; concat(trunc(x),trunc(y)) -> pack
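; The same shift/mask patterns as above, but here each source is truncated
; first and the narrowed halves are then concatenated by the shuffle; the
; autogenerated assertions again record the current codegen for each target.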

define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512-LABEL: concat_trunc_packssdw_512:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %zmm0, %zmm0
; AVX512-NEXT:    vpsrad $23, %zmm1, %zmm1
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512-NEXT:    retq
  %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = trunc <16 x i32> %1 to <16 x i16>
  %4 = trunc <16 x i32> %2 to <16 x i16>
  %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i16> %5
}

define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512-LABEL: concat_trunc_packusdw_512:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %zmm0, %zmm0
; AVX512-NEXT:    vpsrld $23, %zmm1, %zmm1
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512-NEXT:    retq
  %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = lshr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = trunc <16 x i32> %1 to <16 x i16>
  %4 = trunc <16 x i32> %2 to <16 x i16>
  %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i16> %5
}

define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: concat_trunc_packsswb_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packsswb_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
  %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = trunc <32 x i16> %1 to <32 x i8>
  %4 = trunc <32 x i16> %2 to <32 x i8>
  %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <64 x i8> %5
}

define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: concat_trunc_packuswb_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packuswb_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
  %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = trunc <32 x i16> %1 to <32 x i8>
  %4 = trunc <32 x i16> %2 to <32 x i8>
  %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <64 x i8> %5
}