1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 14 15; Just one 32-bit run to make sure we do reasonable things for i64 cases. 16; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2 17 18declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) 19declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) 20declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) 21declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) 22 23; 24; Variable Shifts 25; 26 27define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { 28; SSE2-LABEL: var_funnnel_v2i64: 29; SSE2: # %bb.0: 30; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,63] 31; SSE2-NEXT: pxor %xmm3, %xmm3 32; SSE2-NEXT: psubq %xmm1, %xmm3 33; SSE2-NEXT: pand %xmm2, %xmm1 34; SSE2-NEXT: movdqa %xmm0, %xmm4 35; SSE2-NEXT: psllq %xmm1, %xmm4 36; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 37; SSE2-NEXT: movdqa %xmm0, %xmm5 38; SSE2-NEXT: psllq %xmm1, %xmm5 39; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] 40; SSE2-NEXT: pand %xmm2, %xmm3 41; SSE2-NEXT: movdqa %xmm0, %xmm1 42; SSE2-NEXT: psrlq %xmm3, %xmm1 43; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 44; SSE2-NEXT: psrlq %xmm2, %xmm0 45; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 46; SSE2-NEXT: orpd %xmm5, %xmm0 47; SSE2-NEXT: retq 48; 49; SSE41-LABEL: var_funnnel_v2i64: 50; SSE41: # %bb.0: 51; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [63,63] 52; SSE41-NEXT: pxor %xmm3, %xmm3 53; SSE41-NEXT: psubq %xmm1, %xmm3 54; SSE41-NEXT: pand %xmm2, %xmm1 55; SSE41-NEXT: movdqa %xmm0, %xmm4 56; SSE41-NEXT: psllq %xmm1, %xmm4 57; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 58; SSE41-NEXT: movdqa %xmm0, %xmm5 59; SSE41-NEXT: psllq %xmm1, %xmm5 60; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] 61; SSE41-NEXT: pand %xmm2, %xmm3 62; SSE41-NEXT: movdqa %xmm0, %xmm1 63; SSE41-NEXT: psrlq %xmm3, %xmm1 64; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 65; SSE41-NEXT: psrlq %xmm2, %xmm0 66; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 67; SSE41-NEXT: por %xmm5, %xmm0 68; SSE41-NEXT: retq 69; 70; AVX1-LABEL: var_funnnel_v2i64: 71; AVX1: # %bb.0: 72; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] 73; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 74; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm4 75; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 76; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 77; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 78; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 79; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 80; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 81; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 82; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 83; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 84; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 85; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 86; AVX1-NEXT: retq 87; 88; AVX2-LABEL: var_funnnel_v2i64: 89; AVX2: # %bb.0: 90; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] 91; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 92; AVX2-NEXT: vpsllvq %xmm3, %xmm0, %xmm3 93; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 94; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 95; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 96; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 97; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 98; AVX2-NEXT: retq 99; 100; AVX512F-LABEL: var_funnnel_v2i64: 101; AVX512F: # %bb.0: 102; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 103; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 104; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 105; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 106; AVX512F-NEXT: vzeroupper 107; AVX512F-NEXT: retq 108; 109; AVX512VL-LABEL: var_funnnel_v2i64: 110; AVX512VL: # %bb.0: 111; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0 112; AVX512VL-NEXT: retq 113; 114; AVX512BW-LABEL: var_funnnel_v2i64: 115; AVX512BW: # %bb.0: 116; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 117; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 118; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 119; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 120; AVX512BW-NEXT: vzeroupper 121; AVX512BW-NEXT: retq 122; 123; AVX512VLBW-LABEL: var_funnnel_v2i64: 124; AVX512VLBW: # %bb.0: 125; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0 126; AVX512VLBW-NEXT: retq 127; 128; AVX512VBMI2-LABEL: var_funnnel_v2i64: 129; AVX512VBMI2: # %bb.0: 130; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 131; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 132; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 133; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 134; AVX512VBMI2-NEXT: vzeroupper 135; AVX512VBMI2-NEXT: retq 136; 137; AVX512VLVBMI2-LABEL: var_funnnel_v2i64: 138; AVX512VLVBMI2: # %bb.0: 139; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0 140; AVX512VLVBMI2-NEXT: retq 141; 142; XOP-LABEL: var_funnnel_v2i64: 143; XOP: # %bb.0: 144; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0 145; XOP-NEXT: retq 146; 147; X86-SSE2-LABEL: var_funnnel_v2i64: 148; X86-SSE2: # %bb.0: 149; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0] 150; X86-SSE2-NEXT: pxor %xmm3, %xmm3 151; X86-SSE2-NEXT: psubq %xmm1, %xmm3 152; X86-SSE2-NEXT: pand %xmm2, %xmm1 153; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 154; X86-SSE2-NEXT: psllq %xmm1, %xmm4 155; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 156; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 157; X86-SSE2-NEXT: psllq %xmm1, %xmm5 158; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] 159; X86-SSE2-NEXT: pand %xmm2, %xmm3 160; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 161; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 162; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 163; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 164; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 165; X86-SSE2-NEXT: orpd %xmm5, %xmm0 166; X86-SSE2-NEXT: retl 167 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt) 168 ret <2 x i64> %res 169} 170 171define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind { 172; SSE2-LABEL: var_funnnel_v4i32: 173; SSE2: # %bb.0: 174; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 175; SSE2-NEXT: pslld $23, %xmm1 176; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 177; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 178; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 179; SSE2-NEXT: pmuludq %xmm1, %xmm0 180; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 181; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 182; SSE2-NEXT: pmuludq %xmm2, %xmm1 183; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 184; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 185; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 186; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 187; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 188; SSE2-NEXT: por %xmm3, %xmm0 189; SSE2-NEXT: retq 190; 191; SSE41-LABEL: var_funnnel_v4i32: 192; SSE41: # %bb.0: 193; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 194; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 195; SSE41-NEXT: pslld $23, %xmm1 196; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 197; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 198; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 199; SSE41-NEXT: pmuludq %xmm2, %xmm3 200; SSE41-NEXT: pmuludq %xmm1, %xmm0 201; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 202; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 203; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 204; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 205; SSE41-NEXT: por %xmm1, %xmm0 206; SSE41-NEXT: retq 207; 208; AVX1-LABEL: var_funnnel_v4i32: 209; AVX1: # %bb.0: 210; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 211; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 212; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 213; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 214; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 215; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 216; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 217; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 218; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 219; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 220; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 221; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 222; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 223; AVX1-NEXT: retq 224; 225; AVX2-LABEL: var_funnnel_v4i32: 226; AVX2: # %bb.0: 227; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] 228; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 229; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 230; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 231; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 232; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 233; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 234; AVX2-NEXT: retq 235; 236; AVX512F-LABEL: var_funnnel_v4i32: 237; AVX512F: # %bb.0: 238; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 239; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 240; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 241; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 242; AVX512F-NEXT: vzeroupper 243; AVX512F-NEXT: retq 244; 245; AVX512VL-LABEL: var_funnnel_v4i32: 246; AVX512VL: # %bb.0: 247; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 248; AVX512VL-NEXT: retq 249; 250; AVX512BW-LABEL: var_funnnel_v4i32: 251; AVX512BW: # %bb.0: 252; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 253; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 254; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 255; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 256; AVX512BW-NEXT: vzeroupper 257; AVX512BW-NEXT: retq 258; 259; AVX512VLBW-LABEL: var_funnnel_v4i32: 260; AVX512VLBW: # %bb.0: 261; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 262; AVX512VLBW-NEXT: retq 263; 264; AVX512VBMI2-LABEL: var_funnnel_v4i32: 265; AVX512VBMI2: # %bb.0: 266; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 267; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 268; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 269; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 270; AVX512VBMI2-NEXT: vzeroupper 271; AVX512VBMI2-NEXT: retq 272; 273; AVX512VLVBMI2-LABEL: var_funnnel_v4i32: 274; AVX512VLVBMI2: # %bb.0: 275; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0 276; AVX512VLVBMI2-NEXT: retq 277; 278; XOP-LABEL: var_funnnel_v4i32: 279; XOP: # %bb.0: 280; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 281; XOP-NEXT: retq 282; 283; X86-SSE2-LABEL: var_funnnel_v4i32: 284; X86-SSE2: # %bb.0: 285; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 286; X86-SSE2-NEXT: pslld $23, %xmm1 287; X86-SSE2-NEXT: paddd {{\.LCPI.*}}, %xmm1 288; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 289; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 290; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 291; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 292; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 293; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 294; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 295; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 296; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 297; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 298; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 299; X86-SSE2-NEXT: por %xmm3, %xmm0 300; X86-SSE2-NEXT: retl 301 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %amt) 302 ret <4 x i32> %res 303} 304 305define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { 306; SSE2-LABEL: var_funnnel_v8i16: 307; SSE2: # %bb.0: 308; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 309; SSE2-NEXT: movdqa %xmm1, %xmm2 310; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 311; SSE2-NEXT: pslld $23, %xmm2 312; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 313; SSE2-NEXT: paddd %xmm3, %xmm2 314; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 315; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 316; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 317; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 318; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 319; SSE2-NEXT: pslld $23, %xmm1 320; SSE2-NEXT: paddd %xmm3, %xmm1 321; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 322; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 323; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 324; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 325; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 326; SSE2-NEXT: movdqa %xmm0, %xmm2 327; SSE2-NEXT: pmulhuw %xmm1, %xmm2 328; SSE2-NEXT: pmullw %xmm1, %xmm0 329; SSE2-NEXT: por %xmm2, %xmm0 330; SSE2-NEXT: retq 331; 332; SSE41-LABEL: var_funnnel_v8i16: 333; SSE41: # %bb.0: 334; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 335; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 336; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 337; SSE41-NEXT: pslld $23, %xmm1 338; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 339; SSE41-NEXT: paddd %xmm3, %xmm1 340; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 341; SSE41-NEXT: pslld $23, %xmm2 342; SSE41-NEXT: paddd %xmm3, %xmm2 343; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 344; SSE41-NEXT: packusdw %xmm1, %xmm2 345; SSE41-NEXT: movdqa %xmm0, %xmm1 346; SSE41-NEXT: pmulhuw %xmm2, %xmm1 347; SSE41-NEXT: pmullw %xmm2, %xmm0 348; SSE41-NEXT: por %xmm1, %xmm0 349; SSE41-NEXT: retq 350; 351; AVX1-LABEL: var_funnnel_v8i16: 352; AVX1: # %bb.0: 353; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 354; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] 355; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 356; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 357; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 358; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 359; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 360; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 361; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 362; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 363; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 364; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 365; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 366; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 367; AVX1-NEXT: retq 368; 369; AVX2-LABEL: var_funnnel_v8i16: 370; AVX2: # %bb.0: 371; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 372; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 373; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2 374; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 375; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 376; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2 377; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 378; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 379; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 380; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 381; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 382; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 383; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 384; AVX2-NEXT: vzeroupper 385; AVX2-NEXT: retq 386; 387; AVX512F-LABEL: var_funnnel_v8i16: 388; AVX512F: # %bb.0: 389; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 390; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 391; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 392; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 393; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 394; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 395; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 396; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 397; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 398; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 399; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 400; AVX512F-NEXT: vzeroupper 401; AVX512F-NEXT: retq 402; 403; AVX512VL-LABEL: var_funnnel_v8i16: 404; AVX512VL: # %bb.0: 405; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 406; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 407; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 408; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 409; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 410; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 411; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 412; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 413; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 414; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 415; AVX512VL-NEXT: vzeroupper 416; AVX512VL-NEXT: retq 417; 418; AVX512BW-LABEL: var_funnnel_v8i16: 419; AVX512BW: # %bb.0: 420; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 421; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 422; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 423; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 424; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 425; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 426; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 427; AVX512BW-NEXT: vzeroupper 428; AVX512BW-NEXT: retq 429; 430; AVX512VLBW-LABEL: var_funnnel_v8i16: 431; AVX512VLBW: # %bb.0: 432; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 433; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 434; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 435; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 436; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 437; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 438; AVX512VLBW-NEXT: retq 439; 440; AVX512VBMI2-LABEL: var_funnnel_v8i16: 441; AVX512VBMI2: # %bb.0: 442; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 443; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 444; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 445; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 446; AVX512VBMI2-NEXT: vzeroupper 447; AVX512VBMI2-NEXT: retq 448; 449; AVX512VLVBMI2-LABEL: var_funnnel_v8i16: 450; AVX512VLVBMI2: # %bb.0: 451; AVX512VLVBMI2-NEXT: vpshldvw %xmm1, %xmm0, %xmm0 452; AVX512VLVBMI2-NEXT: retq 453; 454; XOP-LABEL: var_funnnel_v8i16: 455; XOP: # %bb.0: 456; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0 457; XOP-NEXT: retq 458; 459; X86-SSE2-LABEL: var_funnnel_v8i16: 460; X86-SSE2: # %bb.0: 461; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 462; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 463; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 464; X86-SSE2-NEXT: pslld $23, %xmm2 465; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 466; X86-SSE2-NEXT: paddd %xmm3, %xmm2 467; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 468; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 469; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 470; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 471; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 472; X86-SSE2-NEXT: pslld $23, %xmm1 473; X86-SSE2-NEXT: paddd %xmm3, %xmm1 474; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 475; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 476; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 477; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 478; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 479; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 480; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2 481; X86-SSE2-NEXT: pmullw %xmm1, %xmm0 482; X86-SSE2-NEXT: por %xmm2, %xmm0 483; X86-SSE2-NEXT: retl 484 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %amt) 485 ret <8 x i16> %res 486} 487 488define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { 489; SSE2-LABEL: var_funnnel_v16i8: 490; SSE2: # %bb.0: 491; SSE2-NEXT: movdqa %xmm0, %xmm2 492; SSE2-NEXT: psllw $5, %xmm1 493; SSE2-NEXT: pxor %xmm0, %xmm0 494; SSE2-NEXT: pxor %xmm3, %xmm3 495; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 496; SSE2-NEXT: movdqa %xmm2, %xmm4 497; SSE2-NEXT: psrlw $4, %xmm4 498; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 499; SSE2-NEXT: movdqa %xmm2, %xmm5 500; SSE2-NEXT: psllw $4, %xmm5 501; SSE2-NEXT: pand {{.*}}(%rip), %xmm5 502; SSE2-NEXT: por %xmm4, %xmm5 503; SSE2-NEXT: pand %xmm3, %xmm5 504; SSE2-NEXT: pandn %xmm2, %xmm3 505; SSE2-NEXT: por %xmm5, %xmm3 506; SSE2-NEXT: movdqa %xmm3, %xmm2 507; SSE2-NEXT: psrlw $6, %xmm2 508; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 509; SSE2-NEXT: movdqa %xmm3, %xmm4 510; SSE2-NEXT: psllw $2, %xmm4 511; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 512; SSE2-NEXT: por %xmm2, %xmm4 513; SSE2-NEXT: paddb %xmm1, %xmm1 514; SSE2-NEXT: pxor %xmm2, %xmm2 515; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 516; SSE2-NEXT: pand %xmm2, %xmm4 517; SSE2-NEXT: pandn %xmm3, %xmm2 518; SSE2-NEXT: por %xmm4, %xmm2 519; SSE2-NEXT: movdqa %xmm2, %xmm3 520; SSE2-NEXT: paddb %xmm2, %xmm3 521; SSE2-NEXT: movdqa %xmm2, %xmm4 522; SSE2-NEXT: psrlw $7, %xmm4 523; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 524; SSE2-NEXT: por %xmm3, %xmm4 525; SSE2-NEXT: paddb %xmm1, %xmm1 526; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 527; SSE2-NEXT: pand %xmm0, %xmm4 528; SSE2-NEXT: pandn %xmm2, %xmm0 529; SSE2-NEXT: por %xmm4, %xmm0 530; SSE2-NEXT: retq 531; 532; SSE41-LABEL: var_funnnel_v16i8: 533; SSE41: # %bb.0: 534; SSE41-NEXT: movdqa %xmm1, %xmm2 535; SSE41-NEXT: movdqa %xmm0, %xmm1 536; SSE41-NEXT: psrlw $4, %xmm0 537; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 538; SSE41-NEXT: movdqa %xmm1, %xmm3 539; SSE41-NEXT: psllw $4, %xmm3 540; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 541; SSE41-NEXT: por %xmm0, %xmm3 542; SSE41-NEXT: psllw $5, %xmm2 543; SSE41-NEXT: movdqa %xmm2, %xmm0 544; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 545; SSE41-NEXT: movdqa %xmm1, %xmm0 546; SSE41-NEXT: psrlw $6, %xmm0 547; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 548; SSE41-NEXT: movdqa %xmm1, %xmm3 549; SSE41-NEXT: psllw $2, %xmm3 550; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 551; SSE41-NEXT: por %xmm0, %xmm3 552; SSE41-NEXT: paddb %xmm2, %xmm2 553; SSE41-NEXT: movdqa %xmm2, %xmm0 554; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 555; SSE41-NEXT: movdqa %xmm1, %xmm0 556; SSE41-NEXT: paddb %xmm1, %xmm0 557; SSE41-NEXT: movdqa %xmm1, %xmm3 558; SSE41-NEXT: psrlw $7, %xmm3 559; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 560; SSE41-NEXT: por %xmm0, %xmm3 561; SSE41-NEXT: paddb %xmm2, %xmm2 562; SSE41-NEXT: movdqa %xmm2, %xmm0 563; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 564; SSE41-NEXT: movdqa %xmm1, %xmm0 565; SSE41-NEXT: retq 566; 567; AVX-LABEL: var_funnnel_v16i8: 568; AVX: # %bb.0: 569; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 570; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 571; AVX-NEXT: vpsllw $4, %xmm0, %xmm3 572; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 573; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 574; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 575; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 576; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2 577; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 578; AVX-NEXT: vpsllw $2, %xmm0, %xmm3 579; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 580; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 581; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 582; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 583; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 584; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 585; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 586; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 587; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 588; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 589; AVX-NEXT: retq 590; 591; AVX512F-LABEL: var_funnnel_v16i8: 592; AVX512F: # %bb.0: 593; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 594; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 595; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero 596; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 597; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 598; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 599; AVX512F-NEXT: vpsubb %xmm1, %xmm4, %xmm1 600; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 601; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 602; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 603; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0 604; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 605; AVX512F-NEXT: vzeroupper 606; AVX512F-NEXT: retq 607; 608; AVX512VL-LABEL: var_funnnel_v16i8: 609; AVX512VL: # %bb.0: 610; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 611; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 612; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero 613; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 614; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 615; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 616; AVX512VL-NEXT: vpsubb %xmm1, %xmm4, %xmm1 617; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 618; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 619; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 620; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0 621; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 622; AVX512VL-NEXT: vzeroupper 623; AVX512VL-NEXT: retq 624; 625; AVX512BW-LABEL: var_funnnel_v16i8: 626; AVX512BW: # %bb.0: 627; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 628; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 629; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 630; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 631; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 632; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 633; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 634; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 635; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 636; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 637; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm0 638; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 639; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 640; AVX512BW-NEXT: vzeroupper 641; AVX512BW-NEXT: retq 642; 643; AVX512VLBW-LABEL: var_funnnel_v16i8: 644; AVX512VLBW: # %bb.0: 645; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 646; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 647; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 648; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 649; AVX512VLBW-NEXT: vpsllvw %ymm3, %ymm0, %ymm3 650; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 651; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 652; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 653; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 654; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 655; AVX512VLBW-NEXT: vpor %ymm0, %ymm3, %ymm0 656; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 657; AVX512VLBW-NEXT: vzeroupper 658; AVX512VLBW-NEXT: retq 659; 660; AVX512VBMI2-LABEL: var_funnnel_v16i8: 661; AVX512VBMI2: # %bb.0: 662; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 663; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 664; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 665; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 666; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 667; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 668; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 669; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 670; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 671; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 672; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 673; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 674; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 675; AVX512VBMI2-NEXT: vzeroupper 676; AVX512VBMI2-NEXT: retq 677; 678; AVX512VLVBMI2-LABEL: var_funnnel_v16i8: 679; AVX512VLVBMI2: # %bb.0: 680; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 681; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 682; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 683; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 684; AVX512VLVBMI2-NEXT: vpsllvw %ymm3, %ymm0, %ymm3 685; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 686; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 687; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 688; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 689; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 690; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 691; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 692; AVX512VLVBMI2-NEXT: vzeroupper 693; AVX512VLVBMI2-NEXT: retq 694; 695; XOP-LABEL: var_funnnel_v16i8: 696; XOP: # %bb.0: 697; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0 698; XOP-NEXT: retq 699; 700; X86-SSE2-LABEL: var_funnnel_v16i8: 701; X86-SSE2: # %bb.0: 702; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 703; X86-SSE2-NEXT: psllw $5, %xmm1 704; X86-SSE2-NEXT: pxor %xmm0, %xmm0 705; X86-SSE2-NEXT: pxor %xmm3, %xmm3 706; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm3 707; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 708; X86-SSE2-NEXT: psrlw $4, %xmm4 709; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm4 710; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 711; X86-SSE2-NEXT: psllw $4, %xmm5 712; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm5 713; X86-SSE2-NEXT: por %xmm4, %xmm5 714; X86-SSE2-NEXT: pand %xmm3, %xmm5 715; X86-SSE2-NEXT: pandn %xmm2, %xmm3 716; X86-SSE2-NEXT: por %xmm5, %xmm3 717; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 718; X86-SSE2-NEXT: psrlw $6, %xmm2 719; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 720; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 721; X86-SSE2-NEXT: psllw $2, %xmm4 722; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm4 723; X86-SSE2-NEXT: por %xmm2, %xmm4 724; X86-SSE2-NEXT: paddb %xmm1, %xmm1 725; X86-SSE2-NEXT: pxor %xmm2, %xmm2 726; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 727; X86-SSE2-NEXT: pand %xmm2, %xmm4 728; X86-SSE2-NEXT: pandn %xmm3, %xmm2 729; X86-SSE2-NEXT: por %xmm4, %xmm2 730; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 731; X86-SSE2-NEXT: paddb %xmm2, %xmm3 732; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 733; X86-SSE2-NEXT: psrlw $7, %xmm4 734; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm4 735; X86-SSE2-NEXT: por %xmm3, %xmm4 736; X86-SSE2-NEXT: paddb %xmm1, %xmm1 737; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 738; X86-SSE2-NEXT: pand %xmm0, %xmm4 739; X86-SSE2-NEXT: pandn %xmm2, %xmm0 740; X86-SSE2-NEXT: por %xmm4, %xmm0 741; X86-SSE2-NEXT: retl 742 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %amt) 743 ret <16 x i8> %res 744} 745 746; 747; Uniform Variable Shifts 748; 749 750define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { 751; SSE-LABEL: splatvar_funnnel_v2i64: 752; SSE: # %bb.0: 753; SSE-NEXT: movdqa {{.*#+}} xmm2 = [63,63] 754; SSE-NEXT: pxor %xmm3, %xmm3 755; SSE-NEXT: psubq %xmm1, %xmm3 756; SSE-NEXT: pand %xmm2, %xmm1 757; SSE-NEXT: movdqa %xmm0, %xmm4 758; SSE-NEXT: psllq %xmm1, %xmm4 759; SSE-NEXT: pand %xmm2, %xmm3 760; SSE-NEXT: psrlq %xmm3, %xmm0 761; SSE-NEXT: por %xmm4, %xmm0 762; SSE-NEXT: retq 763; 764; AVX-LABEL: splatvar_funnnel_v2i64: 765; AVX: # %bb.0: 766; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] 767; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 768; AVX-NEXT: vpsllq %xmm3, %xmm0, %xmm3 769; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 770; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 771; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 772; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 773; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 774; AVX-NEXT: retq 775; 776; AVX512F-LABEL: splatvar_funnnel_v2i64: 777; AVX512F: # %bb.0: 778; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 779; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 780; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 781; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 782; AVX512F-NEXT: vzeroupper 783; AVX512F-NEXT: retq 784; 785; AVX512VL-LABEL: splatvar_funnnel_v2i64: 786; AVX512VL: # %bb.0: 787; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 788; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0 789; AVX512VL-NEXT: retq 790; 791; AVX512BW-LABEL: splatvar_funnnel_v2i64: 792; AVX512BW: # %bb.0: 793; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 794; AVX512BW-NEXT: vpbroadcastq %xmm1, %xmm1 795; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 796; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 797; AVX512BW-NEXT: vzeroupper 798; AVX512BW-NEXT: retq 799; 800; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: 801; AVX512VLBW: # %bb.0: 802; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %xmm1 803; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0 804; AVX512VLBW-NEXT: retq 805; 806; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: 807; AVX512VBMI2: # %bb.0: 808; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 809; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 810; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 811; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 812; AVX512VBMI2-NEXT: vzeroupper 813; AVX512VBMI2-NEXT: retq 814; 815; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64: 816; AVX512VLVBMI2: # %bb.0: 817; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 818; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0 819; AVX512VLVBMI2-NEXT: retq 820; 821; XOPAVX1-LABEL: splatvar_funnnel_v2i64: 822; XOPAVX1: # %bb.0: 823; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 824; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 825; XOPAVX1-NEXT: retq 826; 827; XOPAVX2-LABEL: splatvar_funnnel_v2i64: 828; XOPAVX2: # %bb.0: 829; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 830; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 831; XOPAVX2-NEXT: retq 832; 833; X86-SSE2-LABEL: splatvar_funnnel_v2i64: 834; X86-SSE2: # %bb.0: 835; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 836; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0] 837; X86-SSE2-NEXT: pxor %xmm3, %xmm3 838; X86-SSE2-NEXT: psubq %xmm1, %xmm3 839; X86-SSE2-NEXT: pand %xmm2, %xmm1 840; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 841; X86-SSE2-NEXT: psllq %xmm1, %xmm4 842; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 843; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 844; X86-SSE2-NEXT: psllq %xmm1, %xmm5 845; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] 846; X86-SSE2-NEXT: pand %xmm2, %xmm3 847; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 848; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 849; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 850; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 851; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 852; X86-SSE2-NEXT: orpd %xmm5, %xmm0 853; X86-SSE2-NEXT: retl 854 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer 855 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat) 856 ret <2 x i64> %res 857} 858 859define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind { 860; SSE2-LABEL: splatvar_funnnel_v4i32: 861; SSE2: # %bb.0: 862; SSE2-NEXT: movd %xmm1, %eax 863; SSE2-NEXT: andl $31, %eax 864; SSE2-NEXT: movd %eax, %xmm1 865; SSE2-NEXT: movdqa %xmm0, %xmm2 866; SSE2-NEXT: pslld %xmm1, %xmm2 867; SSE2-NEXT: movl $32, %ecx 868; SSE2-NEXT: subl %eax, %ecx 869; SSE2-NEXT: movd %ecx, %xmm1 870; SSE2-NEXT: psrld %xmm1, %xmm0 871; SSE2-NEXT: por %xmm2, %xmm0 872; SSE2-NEXT: retq 873; 874; SSE41-LABEL: splatvar_funnnel_v4i32: 875; SSE41: # %bb.0: 876; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 877; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 878; SSE41-NEXT: movdqa %xmm0, %xmm3 879; SSE41-NEXT: pslld %xmm2, %xmm3 880; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32] 881; SSE41-NEXT: psubd %xmm1, %xmm2 882; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero 883; SSE41-NEXT: psrld %xmm1, %xmm0 884; SSE41-NEXT: por %xmm3, %xmm0 885; SSE41-NEXT: retq 886; 887; AVX1-LABEL: splatvar_funnnel_v4i32: 888; AVX1: # %bb.0: 889; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 890; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 891; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 892; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] 893; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 894; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 895; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 896; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 897; AVX1-NEXT: retq 898; 899; AVX2-LABEL: splatvar_funnnel_v4i32: 900; AVX2: # %bb.0: 901; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] 902; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 903; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 904; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 905; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 906; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 907; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 908; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 909; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 910; AVX2-NEXT: retq 911; 912; AVX512F-LABEL: splatvar_funnnel_v4i32: 913; AVX512F: # %bb.0: 914; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 915; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 916; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 917; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 918; AVX512F-NEXT: vzeroupper 919; AVX512F-NEXT: retq 920; 921; AVX512VL-LABEL: splatvar_funnnel_v4i32: 922; AVX512VL: # %bb.0: 923; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 924; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 925; AVX512VL-NEXT: retq 926; 927; AVX512BW-LABEL: splatvar_funnnel_v4i32: 928; AVX512BW: # %bb.0: 929; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 930; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 931; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 932; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 933; AVX512BW-NEXT: vzeroupper 934; AVX512BW-NEXT: retq 935; 936; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: 937; AVX512VLBW: # %bb.0: 938; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %xmm1 939; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 940; AVX512VLBW-NEXT: retq 941; 942; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: 943; AVX512VBMI2: # %bb.0: 944; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 945; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 946; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 947; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 948; AVX512VBMI2-NEXT: vzeroupper 949; AVX512VBMI2-NEXT: retq 950; 951; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32: 952; AVX512VLVBMI2: # %bb.0: 953; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 954; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0 955; AVX512VLVBMI2-NEXT: retq 956; 957; XOPAVX1-LABEL: splatvar_funnnel_v4i32: 958; XOPAVX1: # %bb.0: 959; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 960; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 961; XOPAVX1-NEXT: retq 962; 963; XOPAVX2-LABEL: splatvar_funnnel_v4i32: 964; XOPAVX2: # %bb.0: 965; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 966; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 967; XOPAVX2-NEXT: retq 968; 969; X86-SSE2-LABEL: splatvar_funnnel_v4i32: 970; X86-SSE2: # %bb.0: 971; X86-SSE2-NEXT: movd %xmm1, %eax 972; X86-SSE2-NEXT: andl $31, %eax 973; X86-SSE2-NEXT: movd %eax, %xmm1 974; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 975; X86-SSE2-NEXT: pslld %xmm1, %xmm2 976; X86-SSE2-NEXT: movl $32, %ecx 977; X86-SSE2-NEXT: subl %eax, %ecx 978; X86-SSE2-NEXT: movd %ecx, %xmm1 979; X86-SSE2-NEXT: psrld %xmm1, %xmm0 980; X86-SSE2-NEXT: por %xmm2, %xmm0 981; X86-SSE2-NEXT: retl 982 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer 983 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat) 984 ret <4 x i32> %res 985} 986 987define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { 988; SSE2-LABEL: splatvar_funnnel_v8i16: 989; SSE2: # %bb.0: 990; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 991; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0] 992; SSE2-NEXT: pand %xmm1, %xmm2 993; SSE2-NEXT: movdqa %xmm0, %xmm3 994; SSE2-NEXT: psllw %xmm2, %xmm3 995; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 996; SSE2-NEXT: psubw %xmm1, %xmm2 997; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] 998; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 999; SSE2-NEXT: psrlw %xmm2, %xmm0 1000; SSE2-NEXT: por %xmm3, %xmm0 1001; SSE2-NEXT: retq 1002; 1003; SSE41-LABEL: splatvar_funnnel_v8i16: 1004; SSE41: # %bb.0: 1005; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 1006; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1007; SSE41-NEXT: movdqa %xmm0, %xmm3 1008; SSE41-NEXT: psllw %xmm2, %xmm3 1009; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 1010; SSE41-NEXT: psubw %xmm1, %xmm2 1011; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1012; SSE41-NEXT: psrlw %xmm1, %xmm0 1013; SSE41-NEXT: por %xmm3, %xmm0 1014; SSE41-NEXT: retq 1015; 1016; AVX-LABEL: splatvar_funnnel_v8i16: 1017; AVX: # %bb.0: 1018; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1019; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1020; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1021; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1022; AVX-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1023; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1024; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1025; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 1026; AVX-NEXT: retq 1027; 1028; AVX512F-LABEL: splatvar_funnnel_v8i16: 1029; AVX512F: # %bb.0: 1030; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1031; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1032; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1033; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1034; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1035; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1036; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1037; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 1038; AVX512F-NEXT: retq 1039; 1040; AVX512VL-LABEL: splatvar_funnnel_v8i16: 1041; AVX512VL: # %bb.0: 1042; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1043; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1044; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1045; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1046; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1047; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1048; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1049; AVX512VL-NEXT: vpor %xmm0, %xmm2, %xmm0 1050; AVX512VL-NEXT: retq 1051; 1052; AVX512BW-LABEL: splatvar_funnnel_v8i16: 1053; AVX512BW: # %bb.0: 1054; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1055; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1056; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1057; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1058; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1059; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1060; AVX512BW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1061; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 1062; AVX512BW-NEXT: retq 1063; 1064; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: 1065; AVX512VLBW: # %bb.0: 1066; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1067; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1068; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1069; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1070; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1071; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1072; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1073; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 1074; AVX512VLBW-NEXT: retq 1075; 1076; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16: 1077; AVX512VBMI2: # %bb.0: 1078; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1079; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 1080; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 1081; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1082; AVX512VBMI2-NEXT: vzeroupper 1083; AVX512VBMI2-NEXT: retq 1084; 1085; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16: 1086; AVX512VLVBMI2: # %bb.0: 1087; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 1088; AVX512VLVBMI2-NEXT: vpshldvw %xmm1, %xmm0, %xmm0 1089; AVX512VLVBMI2-NEXT: retq 1090; 1091; XOPAVX1-LABEL: splatvar_funnnel_v8i16: 1092; XOPAVX1: # %bb.0: 1093; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1094; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 1095; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 1096; XOPAVX1-NEXT: retq 1097; 1098; XOPAVX2-LABEL: splatvar_funnnel_v8i16: 1099; XOPAVX2: # %bb.0: 1100; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 1101; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 1102; XOPAVX2-NEXT: retq 1103; 1104; X86-SSE2-LABEL: splatvar_funnnel_v8i16: 1105; X86-SSE2: # %bb.0: 1106; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 1107; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0] 1108; X86-SSE2-NEXT: pand %xmm1, %xmm2 1109; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 1110; X86-SSE2-NEXT: psllw %xmm2, %xmm3 1111; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 1112; X86-SSE2-NEXT: psubw %xmm1, %xmm2 1113; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] 1114; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1115; X86-SSE2-NEXT: psrlw %xmm2, %xmm0 1116; X86-SSE2-NEXT: por %xmm3, %xmm0 1117; X86-SSE2-NEXT: retl 1118 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer 1119 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %splat) 1120 ret <8 x i16> %res 1121} 1122 1123define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { 1124; SSE2-LABEL: splatvar_funnnel_v16i8: 1125; SSE2: # %bb.0: 1126; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1127; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1128; SSE2-NEXT: psubb %xmm1, %xmm2 1129; SSE2-NEXT: movdqa %xmm1, %xmm3 1130; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] 1131; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1132; SSE2-NEXT: movdqa %xmm0, %xmm1 1133; SSE2-NEXT: psllw %xmm3, %xmm1 1134; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1135; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 1136; SSE2-NEXT: psllw %xmm3, %xmm5 1137; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1138; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7] 1139; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 1140; SSE2-NEXT: pand %xmm3, %xmm1 1141; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] 1142; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1143; SSE2-NEXT: psrlw %xmm2, %xmm0 1144; SSE2-NEXT: psrlw %xmm2, %xmm4 1145; SSE2-NEXT: psrlw $8, %xmm4 1146; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1147; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] 1148; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1149; SSE2-NEXT: pand %xmm0, %xmm2 1150; SSE2-NEXT: por %xmm2, %xmm1 1151; SSE2-NEXT: movdqa %xmm1, %xmm0 1152; SSE2-NEXT: retq 1153; 1154; SSE41-LABEL: splatvar_funnnel_v16i8: 1155; SSE41: # %bb.0: 1156; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 1157; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1158; SSE41-NEXT: movdqa %xmm0, %xmm2 1159; SSE41-NEXT: psllw %xmm3, %xmm2 1160; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 1161; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 1162; SSE41-NEXT: psllw %xmm3, %xmm5 1163; SSE41-NEXT: pxor %xmm3, %xmm3 1164; SSE41-NEXT: pshufb %xmm3, %xmm5 1165; SSE41-NEXT: pand %xmm5, %xmm2 1166; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1167; SSE41-NEXT: psubb %xmm1, %xmm3 1168; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1169; SSE41-NEXT: psrlw %xmm1, %xmm0 1170; SSE41-NEXT: psrlw %xmm1, %xmm4 1171; SSE41-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1172; SSE41-NEXT: pand %xmm0, %xmm4 1173; SSE41-NEXT: por %xmm4, %xmm2 1174; SSE41-NEXT: movdqa %xmm2, %xmm0 1175; SSE41-NEXT: retq 1176; 1177; AVX1-LABEL: splatvar_funnnel_v16i8: 1178; AVX1: # %bb.0: 1179; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1180; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1181; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm3 1182; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 1183; AVX1-NEXT: vpsllw %xmm2, %xmm4, %xmm2 1184; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 1185; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 1186; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 1187; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1188; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1189; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1190; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1191; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 1192; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1193; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1194; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1195; AVX1-NEXT: retq 1196; 1197; AVX2-LABEL: splatvar_funnnel_v16i8: 1198; AVX2: # %bb.0: 1199; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1200; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1201; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3 1202; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 1203; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2 1204; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1205; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm2 1206; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1207; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1208; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1209; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1210; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 1211; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 1212; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1213; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 1214; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 1215; AVX2-NEXT: retq 1216; 1217; AVX512F-LABEL: splatvar_funnnel_v16i8: 1218; AVX512F: # %bb.0: 1219; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1220; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 1221; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1222; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1223; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3 1224; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 1225; AVX512F-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1226; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 1227; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1228; AVX512F-NEXT: vpsrld %xmm1, %zmm0, %zmm0 1229; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0 1230; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1231; AVX512F-NEXT: vzeroupper 1232; AVX512F-NEXT: retq 1233; 1234; AVX512VL-LABEL: splatvar_funnnel_v16i8: 1235; AVX512VL: # %bb.0: 1236; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1237; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 1238; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1239; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1240; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3 1241; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 1242; AVX512VL-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1243; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 1244; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1245; AVX512VL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 1246; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0 1247; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 1248; AVX512VL-NEXT: vzeroupper 1249; AVX512VL-NEXT: retq 1250; 1251; AVX512BW-LABEL: splatvar_funnnel_v16i8: 1252; AVX512BW: # %bb.0: 1253; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1254; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 1255; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1256; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1257; AVX512BW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 1258; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 1259; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1260; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 1261; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1262; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1263; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm0 1264; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1265; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1266; AVX512BW-NEXT: vzeroupper 1267; AVX512BW-NEXT: retq 1268; 1269; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: 1270; AVX512VLBW: # %bb.0: 1271; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1272; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 1273; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1274; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1275; AVX512VLBW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 1276; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 1277; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1278; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 1279; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1280; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1281; AVX512VLBW-NEXT: vpor %ymm0, %ymm3, %ymm0 1282; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1283; AVX512VLBW-NEXT: vzeroupper 1284; AVX512VLBW-NEXT: retq 1285; 1286; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: 1287; AVX512VBMI2: # %bb.0: 1288; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1289; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 1290; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1291; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1292; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 1293; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 1294; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1295; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 1296; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1297; AVX512VBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1298; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 1299; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 1300; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1301; AVX512VBMI2-NEXT: vzeroupper 1302; AVX512VBMI2-NEXT: retq 1303; 1304; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: 1305; AVX512VLVBMI2: # %bb.0: 1306; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1307; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 1308; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1309; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1310; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 1311; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 1312; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1313; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 1314; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1315; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1316; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 1317; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 1318; AVX512VLVBMI2-NEXT: vzeroupper 1319; AVX512VLVBMI2-NEXT: retq 1320; 1321; XOPAVX1-LABEL: splatvar_funnnel_v16i8: 1322; XOPAVX1: # %bb.0: 1323; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1324; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1325; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 1326; XOPAVX1-NEXT: retq 1327; 1328; XOPAVX2-LABEL: splatvar_funnnel_v16i8: 1329; XOPAVX2: # %bb.0: 1330; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1331; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 1332; XOPAVX2-NEXT: retq 1333; 1334; X86-SSE2-LABEL: splatvar_funnnel_v16i8: 1335; X86-SSE2: # %bb.0: 1336; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 1337; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1338; X86-SSE2-NEXT: psubb %xmm1, %xmm2 1339; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 1340; X86-SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] 1341; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1342; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1343; X86-SSE2-NEXT: psllw %xmm3, %xmm1 1344; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1345; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5 1346; X86-SSE2-NEXT: psllw %xmm3, %xmm5 1347; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1348; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7] 1349; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 1350; X86-SSE2-NEXT: pand %xmm3, %xmm1 1351; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] 1352; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1353; X86-SSE2-NEXT: psrlw %xmm2, %xmm0 1354; X86-SSE2-NEXT: psrlw %xmm2, %xmm4 1355; X86-SSE2-NEXT: psrlw $8, %xmm4 1356; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1357; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] 1358; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1359; X86-SSE2-NEXT: pand %xmm0, %xmm2 1360; X86-SSE2-NEXT: por %xmm2, %xmm1 1361; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 1362; X86-SSE2-NEXT: retl 1363 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer 1364 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat) 1365 ret <16 x i8> %res 1366} 1367 1368; 1369; Constant Shifts 1370; 1371 1372define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { 1373; SSE2-LABEL: constant_funnnel_v2i64: 1374; SSE2: # %bb.0: 1375; SSE2-NEXT: movdqa %xmm0, %xmm1 1376; SSE2-NEXT: psrlq $60, %xmm1 1377; SSE2-NEXT: movdqa %xmm0, %xmm2 1378; SSE2-NEXT: psrlq $50, %xmm2 1379; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1380; SSE2-NEXT: movdqa %xmm0, %xmm1 1381; SSE2-NEXT: psllq $4, %xmm1 1382; SSE2-NEXT: psllq $14, %xmm0 1383; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1384; SSE2-NEXT: orpd %xmm2, %xmm0 1385; SSE2-NEXT: retq 1386; 1387; SSE41-LABEL: constant_funnnel_v2i64: 1388; SSE41: # %bb.0: 1389; SSE41-NEXT: movdqa %xmm0, %xmm1 1390; SSE41-NEXT: psrlq $50, %xmm1 1391; SSE41-NEXT: movdqa %xmm0, %xmm2 1392; SSE41-NEXT: psrlq $60, %xmm2 1393; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1394; SSE41-NEXT: movdqa %xmm0, %xmm1 1395; SSE41-NEXT: psllq $14, %xmm1 1396; SSE41-NEXT: psllq $4, %xmm0 1397; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1398; SSE41-NEXT: por %xmm2, %xmm0 1399; SSE41-NEXT: retq 1400; 1401; AVX1-LABEL: constant_funnnel_v2i64: 1402; AVX1: # %bb.0: 1403; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm1 1404; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm2 1405; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1406; AVX1-NEXT: vpsllq $14, %xmm0, %xmm2 1407; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0 1408; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1409; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1410; AVX1-NEXT: retq 1411; 1412; AVX2-LABEL: constant_funnnel_v2i64: 1413; AVX2: # %bb.0: 1414; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm1 1415; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 1416; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1417; AVX2-NEXT: retq 1418; 1419; AVX512F-LABEL: constant_funnnel_v2i64: 1420; AVX512F: # %bb.0: 1421; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1422; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1423; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1424; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1425; AVX512F-NEXT: vzeroupper 1426; AVX512F-NEXT: retq 1427; 1428; AVX512VL-LABEL: constant_funnnel_v2i64: 1429; AVX512VL: # %bb.0: 1430; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 1431; AVX512VL-NEXT: retq 1432; 1433; AVX512BW-LABEL: constant_funnnel_v2i64: 1434; AVX512BW: # %bb.0: 1435; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1436; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1437; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1438; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1439; AVX512BW-NEXT: vzeroupper 1440; AVX512BW-NEXT: retq 1441; 1442; AVX512VLBW-LABEL: constant_funnnel_v2i64: 1443; AVX512VLBW: # %bb.0: 1444; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 1445; AVX512VLBW-NEXT: retq 1446; 1447; AVX512VBMI2-LABEL: constant_funnnel_v2i64: 1448; AVX512VBMI2: # %bb.0: 1449; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1450; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1451; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1452; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1453; AVX512VBMI2-NEXT: vzeroupper 1454; AVX512VBMI2-NEXT: retq 1455; 1456; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64: 1457; AVX512VLVBMI2: # %bb.0: 1458; AVX512VLVBMI2-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 1459; AVX512VLVBMI2-NEXT: retq 1460; 1461; XOP-LABEL: constant_funnnel_v2i64: 1462; XOP: # %bb.0: 1463; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 1464; XOP-NEXT: retq 1465; 1466; X86-SSE2-LABEL: constant_funnnel_v2i64: 1467; X86-SSE2: # %bb.0: 1468; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,0,63,0] 1469; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <4,u,14,u> 1470; X86-SSE2-NEXT: pxor %xmm3, %xmm3 1471; X86-SSE2-NEXT: psubq %xmm2, %xmm3 1472; X86-SSE2-NEXT: pand %xmm1, %xmm2 1473; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 1474; X86-SSE2-NEXT: psllq %xmm2, %xmm4 1475; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 1476; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 1477; X86-SSE2-NEXT: psllq %xmm2, %xmm5 1478; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] 1479; X86-SSE2-NEXT: pand %xmm1, %xmm3 1480; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1481; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 1482; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 1483; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 1484; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1485; X86-SSE2-NEXT: orpd %xmm5, %xmm0 1486; X86-SSE2-NEXT: retl 1487 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>) 1488 ret <2 x i64> %res 1489} 1490 1491define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { 1492; SSE2-LABEL: constant_funnnel_v4i32: 1493; SSE2: # %bb.0: 1494; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1495; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1496; SSE2-NEXT: pmuludq %xmm1, %xmm0 1497; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 1498; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1499; SSE2-NEXT: pmuludq %xmm2, %xmm1 1500; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1501; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1502; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1503; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1504; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1505; SSE2-NEXT: por %xmm3, %xmm0 1506; SSE2-NEXT: retq 1507; 1508; SSE41-LABEL: constant_funnnel_v4i32: 1509; SSE41: # %bb.0: 1510; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1511; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1512; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1513; SSE41-NEXT: pmuludq %xmm2, %xmm3 1514; SSE41-NEXT: pmuludq %xmm1, %xmm0 1515; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1516; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1517; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 1518; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1519; SSE41-NEXT: por %xmm1, %xmm0 1520; SSE41-NEXT: retq 1521; 1522; AVX1-LABEL: constant_funnnel_v4i32: 1523; AVX1: # %bb.0: 1524; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,32,64,128] 1525; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1526; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1527; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 1528; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1529; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1530; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1531; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 1532; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1533; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1534; AVX1-NEXT: retq 1535; 1536; AVX2-LABEL: constant_funnnel_v4i32: 1537; AVX2: # %bb.0: 1538; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 1539; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 1540; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1541; AVX2-NEXT: retq 1542; 1543; AVX512F-LABEL: constant_funnnel_v4i32: 1544; AVX512F: # %bb.0: 1545; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1546; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1547; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1548; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1549; AVX512F-NEXT: vzeroupper 1550; AVX512F-NEXT: retq 1551; 1552; AVX512VL-LABEL: constant_funnnel_v4i32: 1553; AVX512VL: # %bb.0: 1554; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 1555; AVX512VL-NEXT: retq 1556; 1557; AVX512BW-LABEL: constant_funnnel_v4i32: 1558; AVX512BW: # %bb.0: 1559; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1560; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1561; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1562; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1563; AVX512BW-NEXT: vzeroupper 1564; AVX512BW-NEXT: retq 1565; 1566; AVX512VLBW-LABEL: constant_funnnel_v4i32: 1567; AVX512VLBW: # %bb.0: 1568; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 1569; AVX512VLBW-NEXT: retq 1570; 1571; AVX512VBMI2-LABEL: constant_funnnel_v4i32: 1572; AVX512VBMI2: # %bb.0: 1573; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1574; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1575; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1576; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1577; AVX512VBMI2-NEXT: vzeroupper 1578; AVX512VBMI2-NEXT: retq 1579; 1580; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32: 1581; AVX512VLVBMI2: # %bb.0: 1582; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 1583; AVX512VLVBMI2-NEXT: retq 1584; 1585; XOP-LABEL: constant_funnnel_v4i32: 1586; XOP: # %bb.0: 1587; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 1588; XOP-NEXT: retq 1589; 1590; X86-SSE2-LABEL: constant_funnnel_v4i32: 1591; X86-SSE2: # %bb.0: 1592; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1593; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1594; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 1595; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 1596; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1597; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 1598; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1599; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1600; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1601; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1602; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1603; X86-SSE2-NEXT: por %xmm3, %xmm0 1604; X86-SSE2-NEXT: retl 1605 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 5, i32 6, i32 7>) 1606 ret <4 x i32> %res 1607} 1608 1609define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { 1610; SSE-LABEL: constant_funnnel_v8i16: 1611; SSE: # %bb.0: 1612; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1613; SSE-NEXT: movdqa %xmm0, %xmm2 1614; SSE-NEXT: pmulhuw %xmm1, %xmm2 1615; SSE-NEXT: pmullw %xmm1, %xmm0 1616; SSE-NEXT: por %xmm2, %xmm0 1617; SSE-NEXT: retq 1618; 1619; AVX-LABEL: constant_funnnel_v8i16: 1620; AVX: # %bb.0: 1621; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1622; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1623; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1624; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 1625; AVX-NEXT: retq 1626; 1627; AVX512F-LABEL: constant_funnnel_v8i16: 1628; AVX512F: # %bb.0: 1629; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1630; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1631; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1632; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 1633; AVX512F-NEXT: retq 1634; 1635; AVX512VL-LABEL: constant_funnnel_v8i16: 1636; AVX512VL: # %bb.0: 1637; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1638; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1639; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1640; AVX512VL-NEXT: vpor %xmm2, %xmm0, %xmm0 1641; AVX512VL-NEXT: retq 1642; 1643; AVX512BW-LABEL: constant_funnnel_v8i16: 1644; AVX512BW: # %bb.0: 1645; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1646; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [16,15,14,13,12,11,10,9] 1647; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 1648; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 1649; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1650; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1651; AVX512BW-NEXT: vzeroupper 1652; AVX512BW-NEXT: retq 1653; 1654; AVX512VLBW-LABEL: constant_funnnel_v8i16: 1655; AVX512VLBW: # %bb.0: 1656; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1 1657; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 1658; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1659; AVX512VLBW-NEXT: retq 1660; 1661; AVX512VBMI2-LABEL: constant_funnnel_v8i16: 1662; AVX512VBMI2: # %bb.0: 1663; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1664; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1665; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 1666; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1667; AVX512VBMI2-NEXT: vzeroupper 1668; AVX512VBMI2-NEXT: retq 1669; 1670; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16: 1671; AVX512VLVBMI2: # %bb.0: 1672; AVX512VLVBMI2-NEXT: vpshldvw {{.*}}(%rip), %xmm0, %xmm0 1673; AVX512VLVBMI2-NEXT: retq 1674; 1675; XOP-LABEL: constant_funnnel_v8i16: 1676; XOP: # %bb.0: 1677; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 1678; XOP-NEXT: retq 1679; 1680; X86-SSE2-LABEL: constant_funnnel_v8i16: 1681; X86-SSE2: # %bb.0: 1682; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1683; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1684; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2 1685; X86-SSE2-NEXT: pmullw %xmm1, %xmm0 1686; X86-SSE2-NEXT: por %xmm2, %xmm0 1687; X86-SSE2-NEXT: retl 1688 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>) 1689 ret <8 x i16> %res 1690} 1691 1692define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { 1693; SSE2-LABEL: constant_funnnel_v16i8: 1694; SSE2: # %bb.0: 1695; SSE2-NEXT: pxor %xmm1, %xmm1 1696; SSE2-NEXT: movdqa %xmm0, %xmm2 1697; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1698; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2 1699; SSE2-NEXT: psrlw $8, %xmm2 1700; SSE2-NEXT: movdqa %xmm0, %xmm3 1701; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1702; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3 1703; SSE2-NEXT: psrlw $8, %xmm3 1704; SSE2-NEXT: packuswb %xmm2, %xmm3 1705; SSE2-NEXT: movdqa %xmm0, %xmm1 1706; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1707; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1 1708; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1709; SSE2-NEXT: pand %xmm2, %xmm1 1710; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1711; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 1712; SSE2-NEXT: pand %xmm2, %xmm0 1713; SSE2-NEXT: packuswb %xmm1, %xmm0 1714; SSE2-NEXT: por %xmm3, %xmm0 1715; SSE2-NEXT: retq 1716; 1717; SSE41-LABEL: constant_funnnel_v16i8: 1718; SSE41: # %bb.0: 1719; SSE41-NEXT: movdqa %xmm0, %xmm2 1720; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1721; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2 1722; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1723; SSE41-NEXT: pand %xmm3, %xmm2 1724; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1725; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] 1726; SSE41-NEXT: pmullw %xmm1, %xmm4 1727; SSE41-NEXT: pand %xmm3, %xmm4 1728; SSE41-NEXT: packuswb %xmm2, %xmm4 1729; SSE41-NEXT: pxor %xmm2, %xmm2 1730; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1731; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 1732; SSE41-NEXT: psrlw $8, %xmm0 1733; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 1734; SSE41-NEXT: psrlw $8, %xmm1 1735; SSE41-NEXT: packuswb %xmm0, %xmm1 1736; SSE41-NEXT: por %xmm4, %xmm1 1737; SSE41-NEXT: movdqa %xmm1, %xmm0 1738; SSE41-NEXT: retq 1739; 1740; AVX1-LABEL: constant_funnnel_v16i8: 1741; AVX1: # %bb.0: 1742; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1743; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 1744; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1745; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1746; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1747; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm4 1748; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 1749; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 1750; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1751; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1752; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 1753; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1754; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm2 1755; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1756; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 1757; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 1758; AVX1-NEXT: retq 1759; 1760; AVX2-LABEL: constant_funnnel_v16i8: 1761; AVX2: # %bb.0: 1762; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1763; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1 1764; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 1765; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1766; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 1767; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 1768; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1769; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1770; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1771; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1772; AVX2-NEXT: vzeroupper 1773; AVX2-NEXT: retq 1774; 1775; AVX512F-LABEL: constant_funnnel_v16i8: 1776; AVX512F: # %bb.0: 1777; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1778; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm1 1779; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 1780; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 1781; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1782; AVX512F-NEXT: vzeroupper 1783; AVX512F-NEXT: retq 1784; 1785; AVX512VL-LABEL: constant_funnnel_v16i8: 1786; AVX512VL: # %bb.0: 1787; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1788; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm1 1789; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 1790; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 1791; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 1792; AVX512VL-NEXT: vzeroupper 1793; AVX512VL-NEXT: retq 1794; 1795; AVX512BW-LABEL: constant_funnnel_v16i8: 1796; AVX512BW: # %bb.0: 1797; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1798; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1799; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 1800; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 1801; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1802; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 1803; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1804; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1805; AVX512BW-NEXT: vzeroupper 1806; AVX512BW-NEXT: retq 1807; 1808; AVX512VLBW-LABEL: constant_funnnel_v16i8: 1809; AVX512VLBW: # %bb.0: 1810; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1811; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1 1812; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 1813; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 1814; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1815; AVX512VLBW-NEXT: vzeroupper 1816; AVX512VLBW-NEXT: retq 1817; 1818; AVX512VBMI2-LABEL: constant_funnnel_v16i8: 1819; AVX512VBMI2: # %bb.0: 1820; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1821; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1822; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 1823; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 1824; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1825; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 1826; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 1827; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1828; AVX512VBMI2-NEXT: vzeroupper 1829; AVX512VBMI2-NEXT: retq 1830; 1831; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8: 1832; AVX512VLVBMI2: # %bb.0: 1833; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1834; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1 1835; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 1836; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 1837; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 1838; AVX512VLVBMI2-NEXT: vzeroupper 1839; AVX512VLVBMI2-NEXT: retq 1840; 1841; XOP-LABEL: constant_funnnel_v16i8: 1842; XOP: # %bb.0: 1843; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 1844; XOP-NEXT: retq 1845; 1846; X86-SSE2-LABEL: constant_funnnel_v16i8: 1847; X86-SSE2: # %bb.0: 1848; X86-SSE2-NEXT: pxor %xmm1, %xmm1 1849; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1850; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1851; X86-SSE2-NEXT: pmullw {{\.LCPI.*}}, %xmm2 1852; X86-SSE2-NEXT: psrlw $8, %xmm2 1853; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 1854; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1855; X86-SSE2-NEXT: pmullw {{\.LCPI.*}}, %xmm3 1856; X86-SSE2-NEXT: psrlw $8, %xmm3 1857; X86-SSE2-NEXT: packuswb %xmm2, %xmm3 1858; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1859; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1860; X86-SSE2-NEXT: pmullw {{\.LCPI.*}}, %xmm1 1861; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1862; X86-SSE2-NEXT: pand %xmm2, %xmm1 1863; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1864; X86-SSE2-NEXT: pmullw {{\.LCPI.*}}, %xmm0 1865; X86-SSE2-NEXT: pand %xmm2, %xmm0 1866; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 1867; X86-SSE2-NEXT: por %xmm3, %xmm0 1868; X86-SSE2-NEXT: retl 1869 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>) 1870 ret <16 x i8> %res 1871} 1872 1873; 1874; Uniform Constant Shifts 1875; 1876 1877define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind { 1878; SSE-LABEL: splatconstant_funnnel_v2i64: 1879; SSE: # %bb.0: 1880; SSE-NEXT: movdqa %xmm0, %xmm1 1881; SSE-NEXT: psrlq $50, %xmm1 1882; SSE-NEXT: psllq $14, %xmm0 1883; SSE-NEXT: por %xmm1, %xmm0 1884; SSE-NEXT: retq 1885; 1886; AVX-LABEL: splatconstant_funnnel_v2i64: 1887; AVX: # %bb.0: 1888; AVX-NEXT: vpsrlq $50, %xmm0, %xmm1 1889; AVX-NEXT: vpsllq $14, %xmm0, %xmm0 1890; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1891; AVX-NEXT: retq 1892; 1893; AVX512F-LABEL: splatconstant_funnnel_v2i64: 1894; AVX512F: # %bb.0: 1895; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1896; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0 1897; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1898; AVX512F-NEXT: vzeroupper 1899; AVX512F-NEXT: retq 1900; 1901; AVX512VL-LABEL: splatconstant_funnnel_v2i64: 1902; AVX512VL: # %bb.0: 1903; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0 1904; AVX512VL-NEXT: retq 1905; 1906; AVX512BW-LABEL: splatconstant_funnnel_v2i64: 1907; AVX512BW: # %bb.0: 1908; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1909; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 1910; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1911; AVX512BW-NEXT: vzeroupper 1912; AVX512BW-NEXT: retq 1913; 1914; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64: 1915; AVX512VLBW: # %bb.0: 1916; AVX512VLBW-NEXT: vprolq $14, %xmm0, %xmm0 1917; AVX512VLBW-NEXT: retq 1918; 1919; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64: 1920; AVX512VBMI2: # %bb.0: 1921; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1922; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0 1923; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1924; AVX512VBMI2-NEXT: vzeroupper 1925; AVX512VBMI2-NEXT: retq 1926; 1927; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64: 1928; AVX512VLVBMI2: # %bb.0: 1929; AVX512VLVBMI2-NEXT: vprolq $14, %xmm0, %xmm0 1930; AVX512VLVBMI2-NEXT: retq 1931; 1932; XOP-LABEL: splatconstant_funnnel_v2i64: 1933; XOP: # %bb.0: 1934; XOP-NEXT: vprotq $14, %xmm0, %xmm0 1935; XOP-NEXT: retq 1936; 1937; X86-SSE2-LABEL: splatconstant_funnnel_v2i64: 1938; X86-SSE2: # %bb.0: 1939; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1940; X86-SSE2-NEXT: psrlq $50, %xmm1 1941; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1] 1942; X86-SSE2-NEXT: psllq $14, %xmm0 1943; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1] 1944; X86-SSE2-NEXT: orpd %xmm1, %xmm0 1945; X86-SSE2-NEXT: retl 1946 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>) 1947 ret <2 x i64> %res 1948} 1949 1950define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x) nounwind { 1951; SSE-LABEL: splatconstant_funnnel_v4i32: 1952; SSE: # %bb.0: 1953; SSE-NEXT: movdqa %xmm0, %xmm1 1954; SSE-NEXT: psrld $28, %xmm1 1955; SSE-NEXT: pslld $4, %xmm0 1956; SSE-NEXT: por %xmm1, %xmm0 1957; SSE-NEXT: retq 1958; 1959; AVX-LABEL: splatconstant_funnnel_v4i32: 1960; AVX: # %bb.0: 1961; AVX-NEXT: vpsrld $28, %xmm0, %xmm1 1962; AVX-NEXT: vpslld $4, %xmm0, %xmm0 1963; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1964; AVX-NEXT: retq 1965; 1966; AVX512F-LABEL: splatconstant_funnnel_v4i32: 1967; AVX512F: # %bb.0: 1968; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1969; AVX512F-NEXT: vprold $4, %zmm0, %zmm0 1970; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1971; AVX512F-NEXT: vzeroupper 1972; AVX512F-NEXT: retq 1973; 1974; AVX512VL-LABEL: splatconstant_funnnel_v4i32: 1975; AVX512VL: # %bb.0: 1976; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 1977; AVX512VL-NEXT: retq 1978; 1979; AVX512BW-LABEL: splatconstant_funnnel_v4i32: 1980; AVX512BW: # %bb.0: 1981; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1982; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 1983; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1984; AVX512BW-NEXT: vzeroupper 1985; AVX512BW-NEXT: retq 1986; 1987; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32: 1988; AVX512VLBW: # %bb.0: 1989; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0 1990; AVX512VLBW-NEXT: retq 1991; 1992; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32: 1993; AVX512VBMI2: # %bb.0: 1994; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1995; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0 1996; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1997; AVX512VBMI2-NEXT: vzeroupper 1998; AVX512VBMI2-NEXT: retq 1999; 2000; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32: 2001; AVX512VLVBMI2: # %bb.0: 2002; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0 2003; AVX512VLVBMI2-NEXT: retq 2004; 2005; XOP-LABEL: splatconstant_funnnel_v4i32: 2006; XOP: # %bb.0: 2007; XOP-NEXT: vprotd $4, %xmm0, %xmm0 2008; XOP-NEXT: retq 2009; 2010; X86-SSE2-LABEL: splatconstant_funnnel_v4i32: 2011; X86-SSE2: # %bb.0: 2012; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 2013; X86-SSE2-NEXT: psrld $28, %xmm1 2014; X86-SSE2-NEXT: pslld $4, %xmm0 2015; X86-SSE2-NEXT: por %xmm1, %xmm0 2016; X86-SSE2-NEXT: retl 2017 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 4, i32 4, i32 4>) 2018 ret <4 x i32> %res 2019} 2020 2021define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x) nounwind { 2022; SSE-LABEL: splatconstant_funnnel_v8i16: 2023; SSE: # %bb.0: 2024; SSE-NEXT: movdqa %xmm0, %xmm1 2025; SSE-NEXT: psrlw $9, %xmm1 2026; SSE-NEXT: psllw $7, %xmm0 2027; SSE-NEXT: por %xmm1, %xmm0 2028; SSE-NEXT: retq 2029; 2030; AVX-LABEL: splatconstant_funnnel_v8i16: 2031; AVX: # %bb.0: 2032; AVX-NEXT: vpsrlw $9, %xmm0, %xmm1 2033; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 2034; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2035; AVX-NEXT: retq 2036; 2037; AVX512F-LABEL: splatconstant_funnnel_v8i16: 2038; AVX512F: # %bb.0: 2039; AVX512F-NEXT: vpsrlw $9, %xmm0, %xmm1 2040; AVX512F-NEXT: vpsllw $7, %xmm0, %xmm0 2041; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 2042; AVX512F-NEXT: retq 2043; 2044; AVX512VL-LABEL: splatconstant_funnnel_v8i16: 2045; AVX512VL: # %bb.0: 2046; AVX512VL-NEXT: vpsrlw $9, %xmm0, %xmm1 2047; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 2048; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 2049; AVX512VL-NEXT: retq 2050; 2051; AVX512BW-LABEL: splatconstant_funnnel_v8i16: 2052; AVX512BW: # %bb.0: 2053; AVX512BW-NEXT: vpsrlw $9, %xmm0, %xmm1 2054; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 2055; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2056; AVX512BW-NEXT: retq 2057; 2058; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16: 2059; AVX512VLBW: # %bb.0: 2060; AVX512VLBW-NEXT: vpsrlw $9, %xmm0, %xmm1 2061; AVX512VLBW-NEXT: vpsllw $7, %xmm0, %xmm0 2062; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 2063; AVX512VLBW-NEXT: retq 2064; 2065; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16: 2066; AVX512VBMI2: # %bb.0: 2067; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2068; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0 2069; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2070; AVX512VBMI2-NEXT: vzeroupper 2071; AVX512VBMI2-NEXT: retq 2072; 2073; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16: 2074; AVX512VLVBMI2: # %bb.0: 2075; AVX512VLVBMI2-NEXT: vpshldw $7, %xmm0, %xmm0, %xmm0 2076; AVX512VLVBMI2-NEXT: retq 2077; 2078; XOP-LABEL: splatconstant_funnnel_v8i16: 2079; XOP: # %bb.0: 2080; XOP-NEXT: vprotw $7, %xmm0, %xmm0 2081; XOP-NEXT: retq 2082; 2083; X86-SSE2-LABEL: splatconstant_funnnel_v8i16: 2084; X86-SSE2: # %bb.0: 2085; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 2086; X86-SSE2-NEXT: psrlw $9, %xmm1 2087; X86-SSE2-NEXT: psllw $7, %xmm0 2088; X86-SSE2-NEXT: por %xmm1, %xmm0 2089; X86-SSE2-NEXT: retl 2090 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>) 2091 ret <8 x i16> %res 2092} 2093 2094define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { 2095; SSE-LABEL: splatconstant_funnnel_v16i8: 2096; SSE: # %bb.0: 2097; SSE-NEXT: movdqa %xmm0, %xmm1 2098; SSE-NEXT: psrlw $4, %xmm1 2099; SSE-NEXT: pand {{.*}}(%rip), %xmm1 2100; SSE-NEXT: psllw $4, %xmm0 2101; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2102; SSE-NEXT: por %xmm1, %xmm0 2103; SSE-NEXT: retq 2104; 2105; AVX-LABEL: splatconstant_funnnel_v16i8: 2106; AVX: # %bb.0: 2107; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 2108; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 2109; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 2110; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2111; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2112; AVX-NEXT: retq 2113; 2114; AVX512F-LABEL: splatconstant_funnnel_v16i8: 2115; AVX512F: # %bb.0: 2116; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm1 2117; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 2118; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0 2119; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2120; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 2121; AVX512F-NEXT: retq 2122; 2123; AVX512VL-LABEL: splatconstant_funnnel_v16i8: 2124; AVX512VL: # %bb.0: 2125; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1 2126; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 2127; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0 2128; AVX512VL-NEXT: retq 2129; 2130; AVX512BW-LABEL: splatconstant_funnnel_v16i8: 2131; AVX512BW: # %bb.0: 2132; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm1 2133; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 2134; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0 2135; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2136; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2137; AVX512BW-NEXT: retq 2138; 2139; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8: 2140; AVX512VLBW: # %bb.0: 2141; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1 2142; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0 2143; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0 2144; AVX512VLBW-NEXT: retq 2145; 2146; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8: 2147; AVX512VBMI2: # %bb.0: 2148; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm1 2149; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 2150; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm0 2151; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2152; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0 2153; AVX512VBMI2-NEXT: retq 2154; 2155; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8: 2156; AVX512VLVBMI2: # %bb.0: 2157; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 2158; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 2159; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0 2160; AVX512VLVBMI2-NEXT: retq 2161; 2162; XOP-LABEL: splatconstant_funnnel_v16i8: 2163; XOP: # %bb.0: 2164; XOP-NEXT: vprotb $4, %xmm0, %xmm0 2165; XOP-NEXT: retq 2166; 2167; X86-SSE2-LABEL: splatconstant_funnnel_v16i8: 2168; X86-SSE2: # %bb.0: 2169; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 2170; X86-SSE2-NEXT: psrlw $4, %xmm1 2171; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 2172; X86-SSE2-NEXT: psllw $4, %xmm0 2173; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0 2174; X86-SSE2-NEXT: por %xmm1, %xmm0 2175; X86-SSE2-NEXT: retl 2176 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) 2177 ret <16 x i8> %res 2178} 2179