; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512VL,AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,INT256,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,INT256,AVX512VL,VLVBMI

define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpaddq %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x i64> %v, i64 %index0
  %v1 = extractelement <4 x i64> %v, i64 %index1
  %v2 = extractelement <4 x i64> %v, i64 %index2
  %v3 = extractelement <4 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

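; Variable <8 x i32> shuffle: AVX2/AVX512 (INT256) lower this to a single
; vpermps, while AVX1 emulates it with two in-lane vpermilps shuffles plus a
; compare/blend to pick between the low and high 128-bit halves.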
define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm3, %ymm3
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v8i32:
; INT256:       # %bb.0:
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x i32> %v, i32 %index0
  %v1 = extractelement <8 x i32> %v, i32 %index1
  %v2 = extractelement <8 x i32> %v, i32 %index2
  %v3 = extractelement <8 x i32> %v, i32 %index3
  %v4 = extractelement <8 x i32> %v, i32 %index4
  %v5 = extractelement <8 x i32> %v, i32 %index5
  %v6 = extractelement <8 x i32> %v, i32 %index6
  %v7 = extractelement <8 x i32> %v, i32 %index7
  %ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
  %ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
  %ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
  %ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
  ret <8 x i32> %ret7
}

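; Variable <16 x i16> shuffle: without AVX512BW+VL the word indices are scaled
; to byte-pair indices (x*514+256 yields bytes 2x and 2x+1) and emulated with
; vpshufb; AVX512VLBW and VLVBMI lower it to a single vpermw.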
define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT:    vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT:    vpperm %xmm1, %xmm2, %xmm0, %xmm1
; XOP-NEXT:    vpperm %xmm4, %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm6
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm5, %xmm4
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VLDQ-LABEL: var_shuffle_v16i16:
; AVX512VLDQ:       # %bb.0:
; AVX512VLDQ-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v16i16:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    retq
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <16 x i16> %v, i16 %index0
  %v1 = extractelement <16 x i16> %v, i16 %index1
  %v2 = extractelement <16 x i16> %v, i16 %index2
  %v3 = extractelement <16 x i16> %v, i16 %index3
  %v4 = extractelement <16 x i16> %v, i16 %index4
  %v5 = extractelement <16 x i16> %v, i16 %index5
  %v6 = extractelement <16 x i16> %v, i16 %index6
  %v7 = extractelement <16 x i16> %v, i16 %index7
  %v8 = extractelement <16 x i16> %v, i16 %index8
  %v9 = extractelement <16 x i16> %v, i16 %index9
  %v10 = extractelement <16 x i16> %v, i16 %index10
  %v11 = extractelement <16 x i16> %v, i16 %index11
  %v12 = extractelement <16 x i16> %v, i16 %index12
  %v13 = extractelement <16 x i16> %v, i16 %index13
  %v14 = extractelement <16 x i16> %v, i16 %index14
  %v15 = extractelement <16 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

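; Variable <32 x i8> shuffle: vpshufb only shuffles within 128-bit lanes, so
; the emulation shuffles both lane orderings and blends on index > 15; only
; AVX512VBMI+VL (VLVBMI) can select the whole vector with a single vpermb.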
define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpperm %xmm2, %xmm3, %xmm0, %xmm2
; XOP-NEXT:    vpperm %xmm1, %xmm3, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm6
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm4
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VLDQ-LABEL: var_shuffle_v32i8:
; AVX512VLDQ:       # %bb.0:
; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm2, %ymm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v32i8:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %v16 = extractelement <32 x i8> %v, i8 %index16
  %v17 = extractelement <32 x i8> %v, i8 %index17
  %v18 = extractelement <32 x i8> %v, i8 %index18
  %v19 = extractelement <32 x i8> %v, i8 %index19
  %v20 = extractelement <32 x i8> %v, i8 %index20
  %v21 = extractelement <32 x i8> %v, i8 %index21
  %v22 = extractelement <32 x i8> %v, i8 %index22
  %v23 = extractelement <32 x i8> %v, i8 %index23
  %v24 = extractelement <32 x i8> %v, i8 %index24
  %v25 = extractelement <32 x i8> %v, i8 %index25
  %v26 = extractelement <32 x i8> %v, i8 %index26
  %v27 = extractelement <32 x i8> %v, i8 %index27
  %v28 = extractelement <32 x i8> %v, i8 %index28
  %v29 = extractelement <32 x i8> %v, i8 %index29
  %v30 = extractelement <32 x i8> %v, i8 %index30
  %v31 = extractelement <32 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

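; Same lowering as var_shuffle_v4i64, just with a floating-point result type.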
define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64:
; XOP:       # %bb.0:
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpaddq %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v4f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x double> %v, i64 %index0
  %v1 = extractelement <4 x double> %v, i64 %index1
  %v2 = extractelement <4 x double> %v, i64 %index2
  %v3 = extractelement <4 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

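; Float counterpart of var_shuffle_v8i32; INT256 again uses a single vpermps.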
define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8f32:
; XOP:       # %bb.0:
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm3, %ymm3
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v8f32:
; INT256:       # %bb.0:
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x float> %v, i32 %index0
  %v1 = extractelement <8 x float> %v, i32 %index1
  %v2 = extractelement <8 x float> %v, i32 %index2
  %v3 = extractelement <8 x float> %v, i32 %index3
  %v4 = extractelement <8 x float> %v, i32 %index4
  %v5 = extractelement <8 x float> %v, i32 %index5
  %v6 = extractelement <8 x float> %v, i32 %index6
  %v7 = extractelement <8 x float> %v, i32 %index7
  %ret0 = insertelement <8 x float> undef, float %v0, i32 0
  %ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
  %ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
  %ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
  %ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
  %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
  ret <8 x float> %ret7
}

;
; PR35820 - Unequal source/destination vector sizes
;

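; The <2 x i64> source is implicitly widened to 256 bits (upper half undef, see
; the kill comments) and then handled exactly like var_shuffle_v4i64.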
define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64_from_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpaddq %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT:    vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %v2 = extractelement <2 x i64> %v, i64 %index2
  %v3 = extractelement <2 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

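; <4 x i32> source widened to <8 x i32>; INT256 still lowers to one vpermps.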
define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm3, %ymm3
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
; INT256:       # %bb.0: # %entry
; INT256-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    retq
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
  %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
  %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
  %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
  %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
  %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
  ret <8 x i32> %tmp16
}

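; <8 x i16> source: the vpshufb emulation can shuffle the widened register
; directly, and AVX512BW+VL still lowers to a single vpermw.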
define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16_from_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT:    vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vpperm %xmm1, %xmm0, %xmm0, %xmm1
; XOP-NEXT:    vpperm %xmm4, %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLDQ:       # %bb.0:
; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v16i16_from_v8i16:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    retq
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %v8 = extractelement <8 x i16> %v, i16 %index8
  %v9 = extractelement <8 x i16> %v, i16 %index9
  %v10 = extractelement <8 x i16> %v, i16 %index10
  %v11 = extractelement <8 x i16> %v, i16 %index11
  %v12 = extractelement <8 x i16> %v, i16 %index12
  %v13 = extractelement <8 x i16> %v, i16 %index13
  %v14 = extractelement <8 x i16> %v, i16 %index14
  %v15 = extractelement <8 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

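; <16 x i8> source counterpart of var_shuffle_v32i8; VLVBMI still lowers it to
; a single vpermb.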
define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8_from_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm2
; XOP-NEXT:    vpperm %xmm1, %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLDQ:       # %bb.0:
; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLBW-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512VLBW-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v32i8_from_v16i8:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %v16 = extractelement <16 x i8> %v, i8 %index16
  %v17 = extractelement <16 x i8> %v, i8 %index17
  %v18 = extractelement <16 x i8> %v, i8 %index18
  %v19 = extractelement <16 x i8> %v, i8 %index19
  %v20 = extractelement <16 x i8> %v, i8 %index20
  %v21 = extractelement <16 x i8> %v, i8 %index21
  %v22 = extractelement <16 x i8> %v, i8 %index22
  %v23 = extractelement <16 x i8> %v, i8 %index23
  %v24 = extractelement <16 x i8> %v, i8 %index24
  %v25 = extractelement <16 x i8> %v, i8 %index25
  %v26 = extractelement <16 x i8> %v, i8 %index26
  %v27 = extractelement <16 x i8> %v, i8 %index27
  %v28 = extractelement <16 x i8> %v, i8 %index28
  %v29 = extractelement <16 x i8> %v, i8 %index29
  %v30 = extractelement <16 x i8> %v, i8 %index30
  %v31 = extractelement <16 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

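; Float counterpart of var_shuffle_v4i64_from_v2i64: the same widening plus the
; vpermpd/vpermilpd lowering.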
define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64_from_v2f64:
; XOP:       # %bb.0:
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpaddq %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
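; Float counterpart of var_shuffle_v8i32_from_v4i32; INT256 uses one vpermps.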
; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT:    vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %v2 = extractelement <2 x double> %v, i64 %index2
  %v3 = extractelement <2 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8f32_from_v4f32:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm3, %ymm3
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v8f32_from_v4f32:
; INT256:       # %bb.0: # %entry
; INT256-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    retq
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x float> %v, i32 %tmp7
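; Narrowing case: a <4 x i32> result gathered from a <8 x i32> source. The
; shuffle is performed at 256 bits and the low 128 bits are returned (hence
; the vzeroupper before ret).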
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8
  %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0
  %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1
  %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2
  %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3
  %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4
  %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5
  %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6
  %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7
  ret <8 x float> %tmp16
}

define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v4i32_from_v8i32:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4i32_from_v8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v4i32_from_v8i32:
; INT256:       # %bb.0: # %entry
; INT256-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; INT256-NEXT:    vzeroupper
; INT256-NEXT:    retq
entry:
  %tmp1 = extractelement <4 x i32> %indices, i32 0
  %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <4 x i32> %indices, i32 1
  %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <4 x i32> %indices, i32 2
  %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <4 x i32> %indices, i32 3
  %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4
  %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3
  ret <4 x i32> %tmp12
}