; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=INT256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512VL,VLVBMI

define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x i64> %v, i64 %index0
  %v1 = extractelement <4 x i64> %v, i64 %index1
  %v2 = extractelement <4 x i64> %v, i64 %index2
  %v3 = extractelement <4 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8i32:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8i32:
; INT256: # %bb.0:
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x i32> %v, i32 %index0
  %v1 = extractelement <8 x i32> %v, i32 %index1
  %v2 = extractelement <8 x i32> %v, i32 %index2
  %v3 = extractelement <8 x i32> %v, i32 %index3
  %v4 = extractelement <8 x i32> %v, i32 %index4
  %v5 = extractelement <8 x i32> %v, i32 %index5
  %v6 = extractelement <8 x i32> %v, i32 %index6
  %v7 = extractelement <8 x i32> %v, i32 %index7
  %ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
  %ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
  %ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
  %ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
  ret <8 x i32> %ret7
}

define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm2, %xmm0, %xmm1
; XOP-NEXT: vpperm %xmm4, %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v16i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v16i16:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <16 x i16> %v, i16 %index0
  %v1 = extractelement <16 x i16> %v, i16 %index1
  %v2 = extractelement <16 x i16> %v, i16 %index2
  %v3 = extractelement <16 x i16> %v, i16 %index3
  %v4 = extractelement <16 x i16> %v, i16 %index4
  %v5 = extractelement <16 x i16> %v, i16 %index5
  %v6 = extractelement <16 x i16> %v, i16 %index6
  %v7 = extractelement <16 x i16> %v, i16 %index7
  %v8 = extractelement <16 x i16> %v, i16 %index8
  %v9 = extractelement <16 x i16> %v, i16 %index9
  %v10 = extractelement <16 x i16> %v, i16 %index10
  %v11 = extractelement <16 x i16> %v, i16 %index11
  %v12 = extractelement <16 x i16> %v, i16 %index12
  %v13 = extractelement <16 x i16> %v, i16 %index13
  %v14 = extractelement <16 x i16> %v, i16 %index14
  %v15 = extractelement <16 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpperm %xmm2, %xmm3, %xmm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm3, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm6
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v32i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX512VLBW-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v32i8:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %v16 = extractelement <32 x i8> %v, i8 %index16
  %v17 = extractelement <32 x i8> %v, i8 %index17
  %v18 = extractelement <32 x i8> %v, i8 %index18
  %v19 = extractelement <32 x i8> %v, i8 %index19
  %v20 = extractelement <32 x i8> %v, i8 %index20
  %v21 = extractelement <32 x i8> %v, i8 %index21
  %v22 = extractelement <32 x i8> %v, i8 %index22
  %v23 = extractelement <32 x i8> %v, i8 %index23
  %v24 = extractelement <32 x i8> %v, i8 %index24
  %v25 = extractelement <32 x i8> %v, i8 %index25
  %v26 = extractelement <32 x i8> %v, i8 %index26
  %v27 = extractelement <32 x i8> %v, i8 %index27
  %v28 = extractelement <32 x i8> %v, i8 %index28
  %v29 = extractelement <32 x i8> %v, i8 %index29
  %v30 = extractelement <32 x i8> %v, i8 %index30
  %v31 = extractelement <32 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x double> %v, i64 %index0
  %v1 = extractelement <4 x double> %v, i64 %index1
  %v2 = extractelement <4 x double> %v, i64 %index2
  %v3 = extractelement <4 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8f32:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8f32:
; INT256: # %bb.0:
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x float> %v, i32 %index0
  %v1 = extractelement <8 x float> %v, i32 %index1
  %v2 = extractelement <8 x float> %v, i32 %index2
  %v3 = extractelement <8 x float> %v, i32 %index3
  %v4 = extractelement <8 x float> %v, i32 %index4
  %v5 = extractelement <8 x float> %v, i32 %index5
  %v6 = extractelement <8 x float> %v, i32 %index6
  %v7 = extractelement <8 x float> %v, i32 %index7
  %ret0 = insertelement <8 x float> undef, float %v0, i32 0
  %ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
  %ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
  %ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
  %ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
  %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
  ret <8 x float> %ret7
}

;
; PR35820 - Unequal source/destination vector sizes
;

define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64_from_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %v2 = extractelement <2 x i64> %v, i64 %index2
  %v3 = extractelement <2 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
  %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
  %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
  %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
  %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
  %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
  ret <8 x i32> %tmp16
}

define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16_from_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm1
; XOP-NEXT: vpperm %xmm4, %xmm0, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v16i16_from_v8i16:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %v8 = extractelement <8 x i16> %v, i16 %index8
  %v9 = extractelement <8 x i16> %v, i16 %index9
  %v10 = extractelement <8 x i16> %v, i16 %index10
  %v11 = extractelement <8 x i16> %v, i16 %index11
  %v12 = extractelement <8 x i16> %v, i16 %index12
  %v13 = extractelement <8 x i16> %v, i16 %index13
  %v14 = extractelement <8 x i16> %v, i16 %index14
  %v15 = extractelement <8 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8_from_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v32i8_from_v16i8:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %v16 = extractelement <16 x i8> %v, i8 %index16
  %v17 = extractelement <16 x i8> %v, i8 %index17
  %v18 = extractelement <16 x i8> %v, i8 %index18
  %v19 = extractelement <16 x i8> %v, i8 %index19
  %v20 = extractelement <16 x i8> %v, i8 %index20
  %v21 = extractelement <16 x i8> %v, i8 %index21
  %v22 = extractelement <16 x i8> %v, i8 %index22
  %v23 = extractelement <16 x i8> %v, i8 %index23
  %v24 = extractelement <16 x i8> %v, i8 %index24
  %v25 = extractelement <16 x i8> %v, i8 %index25
  %v26 = extractelement <16 x i8> %v, i8 %index26
  %v27 = extractelement <16 x i8> %v, i8 %index27
  %v28 = extractelement <16 x i8> %v, i8 %index28
  %v29 = extractelement <16 x i8> %v, i8 %index29
  %v30 = extractelement <16 x i8> %v, i8 %index30
  %v31 = extractelement <16 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64_from_v2f64:
; XOP: # %bb.0:
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %v2 = extractelement <2 x double> %v, i64 %index2
  %v3 = extractelement <2 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8f32_from_v4f32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8f32_from_v4f32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x float> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8
  %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0
  %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1
  %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2
  %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3
  %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4
  %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5
  %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6
  %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7
  ret <8 x float> %tmp16
}

define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v4i32_from_v8i32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT: vpermil2ps $0, %xmm1, %xmm2, %xmm0, %xmm0
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4i32_from_v8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpermilps %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v4i32_from_v8i32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; INT256-NEXT: vzeroupper
; INT256-NEXT: retq
entry:
  %tmp1 = extractelement <4 x i32> %indices, i32 0
  %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <4 x i32> %indices, i32 1
  %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <4 x i32> %indices, i32 2
  %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <4 x i32> %indices, i32 3
  %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4
  %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3
  ret <4 x i32> %tmp12
}