1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL 12; 13; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 15 16; 17; Variable Shifts 18; 19 20define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 21; SSE2-LABEL: var_shift_v2i64: 22; SSE2: # %bb.0: 23; SSE2-NEXT: movdqa %xmm0, %xmm2 24; SSE2-NEXT: psrlq %xmm1, %xmm2 25; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 26; SSE2-NEXT: psrlq %xmm1, %xmm0 27; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 28; SSE2-NEXT: retq 29; 30; SSE41-LABEL: var_shift_v2i64: 31; SSE41: # %bb.0: 32; SSE41-NEXT: movdqa %xmm0, %xmm2 33; SSE41-NEXT: psrlq %xmm1, %xmm2 34; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 35; SSE41-NEXT: psrlq %xmm1, %xmm0 36; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 37; SSE41-NEXT: retq 38; 39; AVX1-LABEL: var_shift_v2i64: 40; AVX1: # %bb.0: 41; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 42; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 43; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 44; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 45; AVX1-NEXT: retq 46; 47; AVX2-LABEL: var_shift_v2i64: 48; AVX2: # %bb.0: 49; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 50; AVX2-NEXT: retq 51; 52; XOPAVX1-LABEL: var_shift_v2i64: 53; XOPAVX1: # %bb.0: 54; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 55; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 56; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 57; XOPAVX1-NEXT: retq 58; 59; XOPAVX2-LABEL: var_shift_v2i64: 60; XOPAVX2: # %bb.0: 61; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 62; XOPAVX2-NEXT: retq 63; 64; AVX512-LABEL: var_shift_v2i64: 65; AVX512: # %bb.0: 66; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 67; AVX512-NEXT: retq 68; 69; AVX512VL-LABEL: var_shift_v2i64: 70; AVX512VL: # %bb.0: 71; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 72; AVX512VL-NEXT: retq 73; 74; X32-SSE-LABEL: var_shift_v2i64: 75; X32-SSE: # %bb.0: 76; X32-SSE-NEXT: movdqa %xmm0, %xmm2 77; X32-SSE-NEXT: psrlq %xmm1, %xmm2 78; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 79; X32-SSE-NEXT: psrlq %xmm1, %xmm0 80; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 81; X32-SSE-NEXT: retl 82 %shift = lshr <2 x i64> %a, %b 83 ret <2 x i64> %shift 84} 85 86define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 87; SSE2-LABEL: var_shift_v4i32: 88; SSE2: # %bb.0: 89; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 90; SSE2-NEXT: movdqa %xmm0, %xmm3 91; SSE2-NEXT: psrld %xmm2, %xmm3 92; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 93; SSE2-NEXT: movdqa %xmm0, %xmm2 94; SSE2-NEXT: psrld %xmm4, %xmm2 95; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 96; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 97; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 98; SSE2-NEXT: movdqa %xmm0, %xmm4 99; SSE2-NEXT: psrld %xmm3, %xmm4 100; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 101; SSE2-NEXT: psrld %xmm1, %xmm0 102; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 103; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 104; SSE2-NEXT: movaps %xmm2, %xmm0 105; SSE2-NEXT: retq 106; 107; SSE41-LABEL: var_shift_v4i32: 108; SSE41: # %bb.0: 109; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 110; SSE41-NEXT: movdqa %xmm0, %xmm3 111; SSE41-NEXT: psrld %xmm2, %xmm3 112; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] 113; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] 114; SSE41-NEXT: movdqa %xmm0, %xmm5 115; SSE41-NEXT: psrld %xmm4, %xmm5 116; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] 117; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 118; SSE41-NEXT: movdqa %xmm0, %xmm3 119; SSE41-NEXT: psrld %xmm1, %xmm3 120; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] 121; SSE41-NEXT: psrld %xmm1, %xmm0 122; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 123; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] 124; SSE41-NEXT: retq 125; 126; AVX1-LABEL: var_shift_v4i32: 127; AVX1: # %bb.0: 128; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 129; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 130; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 131; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 132; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 133; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 134; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 135; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 136; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 137; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 138; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 139; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 140; AVX1-NEXT: retq 141; 142; AVX2-LABEL: var_shift_v4i32: 143; AVX2: # %bb.0: 144; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 145; AVX2-NEXT: retq 146; 147; XOPAVX1-LABEL: var_shift_v4i32: 148; XOPAVX1: # %bb.0: 149; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 150; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 151; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 152; XOPAVX1-NEXT: retq 153; 154; XOPAVX2-LABEL: var_shift_v4i32: 155; XOPAVX2: # %bb.0: 156; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 157; XOPAVX2-NEXT: retq 158; 159; AVX512-LABEL: var_shift_v4i32: 160; AVX512: # %bb.0: 161; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 162; AVX512-NEXT: retq 163; 164; AVX512VL-LABEL: var_shift_v4i32: 165; AVX512VL: # %bb.0: 166; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 167; AVX512VL-NEXT: retq 168; 169; X32-SSE-LABEL: var_shift_v4i32: 170; X32-SSE: # %bb.0: 171; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 172; X32-SSE-NEXT: movdqa %xmm0, %xmm3 173; X32-SSE-NEXT: psrld %xmm2, %xmm3 174; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 175; X32-SSE-NEXT: movdqa %xmm0, %xmm2 176; X32-SSE-NEXT: psrld %xmm4, %xmm2 177; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 178; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 179; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 180; X32-SSE-NEXT: movdqa %xmm0, %xmm4 181; X32-SSE-NEXT: psrld %xmm3, %xmm4 182; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 183; X32-SSE-NEXT: psrld %xmm1, %xmm0 184; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 185; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 186; X32-SSE-NEXT: movaps %xmm2, %xmm0 187; X32-SSE-NEXT: retl 188 %shift = lshr <4 x i32> %a, %b 189 ret <4 x i32> %shift 190} 191 192define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 193; SSE2-LABEL: var_shift_v8i16: 194; SSE2: # %bb.0: 195; SSE2-NEXT: psllw $12, %xmm1 196; SSE2-NEXT: movdqa %xmm1, %xmm2 197; SSE2-NEXT: psraw $15, %xmm2 198; SSE2-NEXT: movdqa %xmm2, %xmm3 199; SSE2-NEXT: pandn %xmm0, %xmm3 200; SSE2-NEXT: psrlw $8, %xmm0 201; SSE2-NEXT: pand %xmm2, %xmm0 202; SSE2-NEXT: por %xmm3, %xmm0 203; SSE2-NEXT: paddw %xmm1, %xmm1 204; SSE2-NEXT: movdqa %xmm1, %xmm2 205; SSE2-NEXT: psraw $15, %xmm2 206; SSE2-NEXT: movdqa %xmm2, %xmm3 207; SSE2-NEXT: pandn %xmm0, %xmm3 208; SSE2-NEXT: psrlw $4, %xmm0 209; SSE2-NEXT: pand %xmm2, %xmm0 210; SSE2-NEXT: por %xmm3, %xmm0 211; SSE2-NEXT: paddw %xmm1, %xmm1 212; SSE2-NEXT: movdqa %xmm1, %xmm2 213; SSE2-NEXT: psraw $15, %xmm2 214; SSE2-NEXT: movdqa %xmm2, %xmm3 215; SSE2-NEXT: pandn %xmm0, %xmm3 216; SSE2-NEXT: psrlw $2, %xmm0 217; SSE2-NEXT: pand %xmm2, %xmm0 218; SSE2-NEXT: por %xmm3, %xmm0 219; SSE2-NEXT: paddw %xmm1, %xmm1 220; SSE2-NEXT: psraw $15, %xmm1 221; SSE2-NEXT: movdqa %xmm1, %xmm2 222; SSE2-NEXT: pandn %xmm0, %xmm2 223; SSE2-NEXT: psrlw $1, %xmm0 224; SSE2-NEXT: pand %xmm1, %xmm0 225; SSE2-NEXT: por %xmm2, %xmm0 226; SSE2-NEXT: retq 227; 228; SSE41-LABEL: var_shift_v8i16: 229; SSE41: # %bb.0: 230; SSE41-NEXT: movdqa %xmm0, %xmm2 231; SSE41-NEXT: movdqa %xmm1, %xmm0 232; SSE41-NEXT: psllw $12, %xmm0 233; SSE41-NEXT: psllw $4, %xmm1 234; SSE41-NEXT: por %xmm0, %xmm1 235; SSE41-NEXT: movdqa %xmm1, %xmm3 236; SSE41-NEXT: paddw %xmm1, %xmm3 237; SSE41-NEXT: movdqa %xmm2, %xmm4 238; SSE41-NEXT: psrlw $8, %xmm4 239; SSE41-NEXT: movdqa %xmm1, %xmm0 240; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2 241; SSE41-NEXT: movdqa %xmm2, %xmm1 242; SSE41-NEXT: psrlw $4, %xmm1 243; SSE41-NEXT: movdqa %xmm3, %xmm0 244; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 245; SSE41-NEXT: movdqa %xmm2, %xmm1 246; SSE41-NEXT: psrlw $2, %xmm1 247; SSE41-NEXT: paddw %xmm3, %xmm3 248; SSE41-NEXT: movdqa %xmm3, %xmm0 249; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 250; SSE41-NEXT: movdqa %xmm2, %xmm1 251; SSE41-NEXT: psrlw $1, %xmm1 252; SSE41-NEXT: paddw %xmm3, %xmm3 253; SSE41-NEXT: movdqa %xmm3, %xmm0 254; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 255; SSE41-NEXT: movdqa %xmm2, %xmm0 256; SSE41-NEXT: retq 257; 258; AVX1-LABEL: var_shift_v8i16: 259; AVX1: # %bb.0: 260; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 261; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 262; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 263; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 264; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 265; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 266; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 267; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 268; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 269; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 270; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 271; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 272; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 273; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 274; AVX1-NEXT: retq 275; 276; AVX2-LABEL: var_shift_v8i16: 277; AVX2: # %bb.0: 278; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 279; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 280; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 281; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 282; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 283; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 284; AVX2-NEXT: vzeroupper 285; AVX2-NEXT: retq 286; 287; XOP-LABEL: var_shift_v8i16: 288; XOP: # %bb.0: 289; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 290; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 291; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 292; XOP-NEXT: retq 293; 294; AVX512DQ-LABEL: var_shift_v8i16: 295; AVX512DQ: # %bb.0: 296; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 297; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 298; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 299; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 300; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 301; AVX512DQ-NEXT: vzeroupper 302; AVX512DQ-NEXT: retq 303; 304; AVX512BW-LABEL: var_shift_v8i16: 305; AVX512BW: # %bb.0: 306; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 307; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 308; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 309; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 310; AVX512BW-NEXT: vzeroupper 311; AVX512BW-NEXT: retq 312; 313; AVX512DQVL-LABEL: var_shift_v8i16: 314; AVX512DQVL: # %bb.0: 315; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 316; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 317; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 318; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 319; AVX512DQVL-NEXT: vzeroupper 320; AVX512DQVL-NEXT: retq 321; 322; AVX512BWVL-LABEL: var_shift_v8i16: 323; AVX512BWVL: # %bb.0: 324; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 325; AVX512BWVL-NEXT: retq 326; 327; X32-SSE-LABEL: var_shift_v8i16: 328; X32-SSE: # %bb.0: 329; X32-SSE-NEXT: psllw $12, %xmm1 330; X32-SSE-NEXT: movdqa %xmm1, %xmm2 331; X32-SSE-NEXT: psraw $15, %xmm2 332; X32-SSE-NEXT: movdqa %xmm2, %xmm3 333; X32-SSE-NEXT: pandn %xmm0, %xmm3 334; X32-SSE-NEXT: psrlw $8, %xmm0 335; X32-SSE-NEXT: pand %xmm2, %xmm0 336; X32-SSE-NEXT: por %xmm3, %xmm0 337; X32-SSE-NEXT: paddw %xmm1, %xmm1 338; X32-SSE-NEXT: movdqa %xmm1, %xmm2 339; X32-SSE-NEXT: psraw $15, %xmm2 340; X32-SSE-NEXT: movdqa %xmm2, %xmm3 341; X32-SSE-NEXT: pandn %xmm0, %xmm3 342; X32-SSE-NEXT: psrlw $4, %xmm0 343; X32-SSE-NEXT: pand %xmm2, %xmm0 344; X32-SSE-NEXT: por %xmm3, %xmm0 345; X32-SSE-NEXT: paddw %xmm1, %xmm1 346; X32-SSE-NEXT: movdqa %xmm1, %xmm2 347; X32-SSE-NEXT: psraw $15, %xmm2 348; X32-SSE-NEXT: movdqa %xmm2, %xmm3 349; X32-SSE-NEXT: pandn %xmm0, %xmm3 350; X32-SSE-NEXT: psrlw $2, %xmm0 351; X32-SSE-NEXT: pand %xmm2, %xmm0 352; X32-SSE-NEXT: por %xmm3, %xmm0 353; X32-SSE-NEXT: paddw %xmm1, %xmm1 354; X32-SSE-NEXT: psraw $15, %xmm1 355; X32-SSE-NEXT: movdqa %xmm1, %xmm2 356; X32-SSE-NEXT: pandn %xmm0, %xmm2 357; X32-SSE-NEXT: psrlw $1, %xmm0 358; X32-SSE-NEXT: pand %xmm1, %xmm0 359; X32-SSE-NEXT: por %xmm2, %xmm0 360; X32-SSE-NEXT: retl 361 %shift = lshr <8 x i16> %a, %b 362 ret <8 x i16> %shift 363} 364 365define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 366; SSE2-LABEL: var_shift_v16i8: 367; SSE2: # %bb.0: 368; SSE2-NEXT: psllw $5, %xmm1 369; SSE2-NEXT: pxor %xmm2, %xmm2 370; SSE2-NEXT: pxor %xmm3, %xmm3 371; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 372; SSE2-NEXT: movdqa %xmm3, %xmm4 373; SSE2-NEXT: pandn %xmm0, %xmm4 374; SSE2-NEXT: psrlw $4, %xmm0 375; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 376; SSE2-NEXT: pand %xmm3, %xmm0 377; SSE2-NEXT: por %xmm4, %xmm0 378; SSE2-NEXT: paddb %xmm1, %xmm1 379; SSE2-NEXT: pxor %xmm3, %xmm3 380; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 381; SSE2-NEXT: movdqa %xmm3, %xmm4 382; SSE2-NEXT: pandn %xmm0, %xmm4 383; SSE2-NEXT: psrlw $2, %xmm0 384; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 385; SSE2-NEXT: pand %xmm3, %xmm0 386; SSE2-NEXT: por %xmm4, %xmm0 387; SSE2-NEXT: paddb %xmm1, %xmm1 388; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 389; SSE2-NEXT: movdqa %xmm2, %xmm1 390; SSE2-NEXT: pandn %xmm0, %xmm1 391; SSE2-NEXT: psrlw $1, %xmm0 392; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 393; SSE2-NEXT: pand %xmm2, %xmm0 394; SSE2-NEXT: por %xmm1, %xmm0 395; SSE2-NEXT: retq 396; 397; SSE41-LABEL: var_shift_v16i8: 398; SSE41: # %bb.0: 399; SSE41-NEXT: movdqa %xmm0, %xmm2 400; SSE41-NEXT: psllw $5, %xmm1 401; SSE41-NEXT: movdqa %xmm0, %xmm3 402; SSE41-NEXT: psrlw $4, %xmm3 403; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 404; SSE41-NEXT: movdqa %xmm1, %xmm0 405; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 406; SSE41-NEXT: movdqa %xmm2, %xmm3 407; SSE41-NEXT: psrlw $2, %xmm3 408; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 409; SSE41-NEXT: paddb %xmm1, %xmm1 410; SSE41-NEXT: movdqa %xmm1, %xmm0 411; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 412; SSE41-NEXT: movdqa %xmm2, %xmm3 413; SSE41-NEXT: psrlw $1, %xmm3 414; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 415; SSE41-NEXT: paddb %xmm1, %xmm1 416; SSE41-NEXT: movdqa %xmm1, %xmm0 417; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 418; SSE41-NEXT: movdqa %xmm2, %xmm0 419; SSE41-NEXT: retq 420; 421; AVX-LABEL: var_shift_v16i8: 422; AVX: # %bb.0: 423; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 424; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 425; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 426; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 427; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 428; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 429; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 430; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 431; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 432; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 433; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 434; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 435; AVX-NEXT: retq 436; 437; XOP-LABEL: var_shift_v16i8: 438; XOP: # %bb.0: 439; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 440; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 441; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 442; XOP-NEXT: retq 443; 444; AVX512DQ-LABEL: var_shift_v16i8: 445; AVX512DQ: # %bb.0: 446; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 447; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 448; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 449; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 450; AVX512DQ-NEXT: vzeroupper 451; AVX512DQ-NEXT: retq 452; 453; AVX512BW-LABEL: var_shift_v16i8: 454; AVX512BW: # %bb.0: 455; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 456; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 457; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 458; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 459; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 460; AVX512BW-NEXT: vzeroupper 461; AVX512BW-NEXT: retq 462; 463; AVX512DQVL-LABEL: var_shift_v16i8: 464; AVX512DQVL: # %bb.0: 465; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 466; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 467; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 468; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 469; AVX512DQVL-NEXT: vzeroupper 470; AVX512DQVL-NEXT: retq 471; 472; AVX512BWVL-LABEL: var_shift_v16i8: 473; AVX512BWVL: # %bb.0: 474; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 475; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 476; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 477; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 478; AVX512BWVL-NEXT: vzeroupper 479; AVX512BWVL-NEXT: retq 480; 481; X32-SSE-LABEL: var_shift_v16i8: 482; X32-SSE: # %bb.0: 483; X32-SSE-NEXT: psllw $5, %xmm1 484; X32-SSE-NEXT: pxor %xmm2, %xmm2 485; X32-SSE-NEXT: pxor %xmm3, %xmm3 486; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 487; X32-SSE-NEXT: movdqa %xmm3, %xmm4 488; X32-SSE-NEXT: pandn %xmm0, %xmm4 489; X32-SSE-NEXT: psrlw $4, %xmm0 490; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 491; X32-SSE-NEXT: pand %xmm3, %xmm0 492; X32-SSE-NEXT: por %xmm4, %xmm0 493; X32-SSE-NEXT: paddb %xmm1, %xmm1 494; X32-SSE-NEXT: pxor %xmm3, %xmm3 495; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 496; X32-SSE-NEXT: movdqa %xmm3, %xmm4 497; X32-SSE-NEXT: pandn %xmm0, %xmm4 498; X32-SSE-NEXT: psrlw $2, %xmm0 499; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 500; X32-SSE-NEXT: pand %xmm3, %xmm0 501; X32-SSE-NEXT: por %xmm4, %xmm0 502; X32-SSE-NEXT: paddb %xmm1, %xmm1 503; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 504; X32-SSE-NEXT: movdqa %xmm2, %xmm1 505; X32-SSE-NEXT: pandn %xmm0, %xmm1 506; X32-SSE-NEXT: psrlw $1, %xmm0 507; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 508; X32-SSE-NEXT: pand %xmm2, %xmm0 509; X32-SSE-NEXT: por %xmm1, %xmm0 510; X32-SSE-NEXT: retl 511 %shift = lshr <16 x i8> %a, %b 512 ret <16 x i8> %shift 513} 514 515; 516; Uniform Variable Shifts 517; 518 519define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 520; SSE-LABEL: splatvar_shift_v2i64: 521; SSE: # %bb.0: 522; SSE-NEXT: psrlq %xmm1, %xmm0 523; SSE-NEXT: retq 524; 525; AVX-LABEL: splatvar_shift_v2i64: 526; AVX: # %bb.0: 527; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 528; AVX-NEXT: retq 529; 530; XOP-LABEL: splatvar_shift_v2i64: 531; XOP: # %bb.0: 532; XOP-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 533; XOP-NEXT: retq 534; 535; AVX512-LABEL: splatvar_shift_v2i64: 536; AVX512: # %bb.0: 537; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 538; AVX512-NEXT: retq 539; 540; AVX512VL-LABEL: splatvar_shift_v2i64: 541; AVX512VL: # %bb.0: 542; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 543; AVX512VL-NEXT: retq 544; 545; X32-SSE-LABEL: splatvar_shift_v2i64: 546; X32-SSE: # %bb.0: 547; X32-SSE-NEXT: psrlq %xmm1, %xmm0 548; X32-SSE-NEXT: retl 549 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer 550 %shift = lshr <2 x i64> %a, %splat 551 ret <2 x i64> %shift 552} 553 554define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 555; SSE2-LABEL: splatvar_shift_v4i32: 556; SSE2: # %bb.0: 557; SSE2-NEXT: xorps %xmm2, %xmm2 558; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 559; SSE2-NEXT: psrld %xmm2, %xmm0 560; SSE2-NEXT: retq 561; 562; SSE41-LABEL: splatvar_shift_v4i32: 563; SSE41: # %bb.0: 564; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 565; SSE41-NEXT: psrld %xmm1, %xmm0 566; SSE41-NEXT: retq 567; 568; AVX-LABEL: splatvar_shift_v4i32: 569; AVX: # %bb.0: 570; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 571; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 572; AVX-NEXT: retq 573; 574; XOP-LABEL: splatvar_shift_v4i32: 575; XOP: # %bb.0: 576; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 577; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0 578; XOP-NEXT: retq 579; 580; AVX512-LABEL: splatvar_shift_v4i32: 581; AVX512: # %bb.0: 582; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 583; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 584; AVX512-NEXT: retq 585; 586; AVX512VL-LABEL: splatvar_shift_v4i32: 587; AVX512VL: # %bb.0: 588; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 589; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 590; AVX512VL-NEXT: retq 591; 592; X32-SSE-LABEL: splatvar_shift_v4i32: 593; X32-SSE: # %bb.0: 594; X32-SSE-NEXT: xorps %xmm2, %xmm2 595; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 596; X32-SSE-NEXT: psrld %xmm2, %xmm0 597; X32-SSE-NEXT: retl 598 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer 599 %shift = lshr <4 x i32> %a, %splat 600 ret <4 x i32> %shift 601} 602 603define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 604; SSE2-LABEL: splatvar_shift_v8i16: 605; SSE2: # %bb.0: 606; SSE2-NEXT: pextrw $0, %xmm1, %eax 607; SSE2-NEXT: movd %eax, %xmm1 608; SSE2-NEXT: psrlw %xmm1, %xmm0 609; SSE2-NEXT: retq 610; 611; SSE41-LABEL: splatvar_shift_v8i16: 612; SSE41: # %bb.0: 613; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 614; SSE41-NEXT: psrlw %xmm1, %xmm0 615; SSE41-NEXT: retq 616; 617; AVX-LABEL: splatvar_shift_v8i16: 618; AVX: # %bb.0: 619; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 620; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 621; AVX-NEXT: retq 622; 623; XOP-LABEL: splatvar_shift_v8i16: 624; XOP: # %bb.0: 625; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 626; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 627; XOP-NEXT: retq 628; 629; AVX512-LABEL: splatvar_shift_v8i16: 630; AVX512: # %bb.0: 631; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 632; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 633; AVX512-NEXT: retq 634; 635; AVX512VL-LABEL: splatvar_shift_v8i16: 636; AVX512VL: # %bb.0: 637; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 638; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 639; AVX512VL-NEXT: retq 640; 641; X32-SSE-LABEL: splatvar_shift_v8i16: 642; X32-SSE: # %bb.0: 643; X32-SSE-NEXT: pextrw $0, %xmm1, %eax 644; X32-SSE-NEXT: movd %eax, %xmm1 645; X32-SSE-NEXT: psrlw %xmm1, %xmm0 646; X32-SSE-NEXT: retl 647 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer 648 %shift = lshr <8 x i16> %a, %splat 649 ret <8 x i16> %shift 650} 651 652define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 653; SSE2-LABEL: splatvar_shift_v16i8: 654; SSE2: # %bb.0: 655; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 656; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 657; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] 658; SSE2-NEXT: psllw $5, %xmm2 659; SSE2-NEXT: pxor %xmm1, %xmm1 660; SSE2-NEXT: pxor %xmm3, %xmm3 661; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 662; SSE2-NEXT: movdqa %xmm3, %xmm4 663; SSE2-NEXT: pandn %xmm0, %xmm4 664; SSE2-NEXT: psrlw $4, %xmm0 665; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 666; SSE2-NEXT: pand %xmm3, %xmm0 667; SSE2-NEXT: por %xmm4, %xmm0 668; SSE2-NEXT: paddb %xmm2, %xmm2 669; SSE2-NEXT: pxor %xmm3, %xmm3 670; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 671; SSE2-NEXT: movdqa %xmm3, %xmm4 672; SSE2-NEXT: pandn %xmm0, %xmm4 673; SSE2-NEXT: psrlw $2, %xmm0 674; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 675; SSE2-NEXT: pand %xmm3, %xmm0 676; SSE2-NEXT: por %xmm4, %xmm0 677; SSE2-NEXT: paddb %xmm2, %xmm2 678; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 679; SSE2-NEXT: movdqa %xmm1, %xmm2 680; SSE2-NEXT: pandn %xmm0, %xmm2 681; SSE2-NEXT: psrlw $1, %xmm0 682; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 683; SSE2-NEXT: pand %xmm1, %xmm0 684; SSE2-NEXT: por %xmm2, %xmm0 685; SSE2-NEXT: retq 686; 687; SSE41-LABEL: splatvar_shift_v16i8: 688; SSE41: # %bb.0: 689; SSE41-NEXT: movdqa %xmm0, %xmm2 690; SSE41-NEXT: pxor %xmm0, %xmm0 691; SSE41-NEXT: pshufb %xmm0, %xmm1 692; SSE41-NEXT: psllw $5, %xmm1 693; SSE41-NEXT: movdqa %xmm1, %xmm3 694; SSE41-NEXT: paddb %xmm1, %xmm3 695; SSE41-NEXT: movdqa %xmm2, %xmm4 696; SSE41-NEXT: psrlw $4, %xmm4 697; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 698; SSE41-NEXT: movdqa %xmm1, %xmm0 699; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2 700; SSE41-NEXT: movdqa %xmm2, %xmm1 701; SSE41-NEXT: psrlw $2, %xmm1 702; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 703; SSE41-NEXT: movdqa %xmm3, %xmm0 704; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 705; SSE41-NEXT: movdqa %xmm2, %xmm1 706; SSE41-NEXT: psrlw $1, %xmm1 707; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 708; SSE41-NEXT: paddb %xmm3, %xmm3 709; SSE41-NEXT: movdqa %xmm3, %xmm0 710; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 711; SSE41-NEXT: movdqa %xmm2, %xmm0 712; SSE41-NEXT: retq 713; 714; AVX1-LABEL: splatvar_shift_v16i8: 715; AVX1: # %bb.0: 716; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 717; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 718; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 719; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 720; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 721; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 722; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 723; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 724; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 725; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 726; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 727; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 728; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 729; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 730; AVX1-NEXT: retq 731; 732; AVX2-LABEL: splatvar_shift_v16i8: 733; AVX2: # %bb.0: 734; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 735; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 736; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2 737; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 738; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 739; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2 740; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 741; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 742; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 743; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2 744; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 745; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 746; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 747; AVX2-NEXT: retq 748; 749; XOPAVX1-LABEL: splatvar_shift_v16i8: 750; XOPAVX1: # %bb.0: 751; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 752; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 753; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 754; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 755; XOPAVX1-NEXT: retq 756; 757; XOPAVX2-LABEL: splatvar_shift_v16i8: 758; XOPAVX2: # %bb.0: 759; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 760; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 761; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 762; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 763; XOPAVX2-NEXT: retq 764; 765; AVX512DQ-LABEL: splatvar_shift_v16i8: 766; AVX512DQ: # %bb.0: 767; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 768; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 769; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 770; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 771; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 772; AVX512DQ-NEXT: vzeroupper 773; AVX512DQ-NEXT: retq 774; 775; AVX512BW-LABEL: splatvar_shift_v16i8: 776; AVX512BW: # %bb.0: 777; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 778; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 779; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 780; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 781; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 782; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 783; AVX512BW-NEXT: vzeroupper 784; AVX512BW-NEXT: retq 785; 786; AVX512DQVL-LABEL: splatvar_shift_v16i8: 787; AVX512DQVL: # %bb.0: 788; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 789; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 790; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 791; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 792; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 793; AVX512DQVL-NEXT: vzeroupper 794; AVX512DQVL-NEXT: retq 795; 796; AVX512BWVL-LABEL: splatvar_shift_v16i8: 797; AVX512BWVL: # %bb.0: 798; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 799; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 800; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 801; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 802; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 803; AVX512BWVL-NEXT: vzeroupper 804; AVX512BWVL-NEXT: retq 805; 806; X32-SSE-LABEL: splatvar_shift_v16i8: 807; X32-SSE: # %bb.0: 808; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 809; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 810; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] 811; X32-SSE-NEXT: psllw $5, %xmm2 812; X32-SSE-NEXT: pxor %xmm1, %xmm1 813; X32-SSE-NEXT: pxor %xmm3, %xmm3 814; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3 815; X32-SSE-NEXT: movdqa %xmm3, %xmm4 816; X32-SSE-NEXT: pandn %xmm0, %xmm4 817; X32-SSE-NEXT: psrlw $4, %xmm0 818; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 819; X32-SSE-NEXT: pand %xmm3, %xmm0 820; X32-SSE-NEXT: por %xmm4, %xmm0 821; X32-SSE-NEXT: paddb %xmm2, %xmm2 822; X32-SSE-NEXT: pxor %xmm3, %xmm3 823; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3 824; X32-SSE-NEXT: movdqa %xmm3, %xmm4 825; X32-SSE-NEXT: pandn %xmm0, %xmm4 826; X32-SSE-NEXT: psrlw $2, %xmm0 827; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 828; X32-SSE-NEXT: pand %xmm3, %xmm0 829; X32-SSE-NEXT: por %xmm4, %xmm0 830; X32-SSE-NEXT: paddb %xmm2, %xmm2 831; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1 832; X32-SSE-NEXT: movdqa %xmm1, %xmm2 833; X32-SSE-NEXT: pandn %xmm0, %xmm2 834; X32-SSE-NEXT: psrlw $1, %xmm0 835; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 836; X32-SSE-NEXT: pand %xmm1, %xmm0 837; X32-SSE-NEXT: por %xmm2, %xmm0 838; X32-SSE-NEXT: retl 839 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer 840 %shift = lshr <16 x i8> %a, %splat 841 ret <16 x i8> %shift 842} 843 844; 845; Constant Shifts 846; 847 848define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { 849; SSE2-LABEL: constant_shift_v2i64: 850; SSE2: # %bb.0: 851; SSE2-NEXT: movdqa %xmm0, %xmm1 852; SSE2-NEXT: psrlq $1, %xmm1 853; SSE2-NEXT: psrlq $7, %xmm0 854; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 855; SSE2-NEXT: retq 856; 857; SSE41-LABEL: constant_shift_v2i64: 858; SSE41: # %bb.0: 859; SSE41-NEXT: movdqa %xmm0, %xmm1 860; SSE41-NEXT: psrlq $7, %xmm1 861; SSE41-NEXT: psrlq $1, %xmm0 862; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 863; SSE41-NEXT: retq 864; 865; AVX1-LABEL: constant_shift_v2i64: 866; AVX1: # %bb.0: 867; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 868; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 869; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 870; AVX1-NEXT: retq 871; 872; AVX2-LABEL: constant_shift_v2i64: 873; AVX2: # %bb.0: 874; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 875; AVX2-NEXT: retq 876; 877; XOPAVX1-LABEL: constant_shift_v2i64: 878; XOPAVX1: # %bb.0: 879; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0 880; XOPAVX1-NEXT: retq 881; 882; XOPAVX2-LABEL: constant_shift_v2i64: 883; XOPAVX2: # %bb.0: 884; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 885; XOPAVX2-NEXT: retq 886; 887; AVX512-LABEL: constant_shift_v2i64: 888; AVX512: # %bb.0: 889; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 890; AVX512-NEXT: retq 891; 892; AVX512VL-LABEL: constant_shift_v2i64: 893; AVX512VL: # %bb.0: 894; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 895; AVX512VL-NEXT: retq 896; 897; X32-SSE-LABEL: constant_shift_v2i64: 898; X32-SSE: # %bb.0: 899; X32-SSE-NEXT: movdqa %xmm0, %xmm1 900; X32-SSE-NEXT: psrlq $1, %xmm1 901; X32-SSE-NEXT: psrlq $7, %xmm0 902; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 903; X32-SSE-NEXT: retl 904 %shift = lshr <2 x i64> %a, <i64 1, i64 7> 905 ret <2 x i64> %shift 906} 907 908define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { 909; SSE2-LABEL: constant_shift_v4i32: 910; SSE2: # %bb.0: 911; SSE2-NEXT: movdqa %xmm0, %xmm1 912; SSE2-NEXT: psrld $7, %xmm1 913; SSE2-NEXT: movdqa %xmm0, %xmm2 914; SSE2-NEXT: psrld $6, %xmm2 915; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] 916; SSE2-NEXT: movdqa %xmm0, %xmm1 917; SSE2-NEXT: psrld $5, %xmm1 918; SSE2-NEXT: psrld $4, %xmm0 919; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 920; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] 921; SSE2-NEXT: retq 922; 923; SSE41-LABEL: constant_shift_v4i32: 924; SSE41: # %bb.0: 925; SSE41-NEXT: movdqa %xmm0, %xmm1 926; SSE41-NEXT: psrld $7, %xmm1 927; SSE41-NEXT: movdqa %xmm0, %xmm2 928; SSE41-NEXT: psrld $5, %xmm2 929; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 930; SSE41-NEXT: movdqa %xmm0, %xmm1 931; SSE41-NEXT: psrld $6, %xmm1 932; SSE41-NEXT: psrld $4, %xmm0 933; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 934; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 935; SSE41-NEXT: retq 936; 937; AVX1-LABEL: constant_shift_v4i32: 938; AVX1: # %bb.0: 939; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 940; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2 941; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 942; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2 943; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0 944; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 945; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 946; AVX1-NEXT: retq 947; 948; AVX2-LABEL: constant_shift_v4i32: 949; AVX2: # %bb.0: 950; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 951; AVX2-NEXT: retq 952; 953; XOPAVX1-LABEL: constant_shift_v4i32: 954; XOPAVX1: # %bb.0: 955; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 956; XOPAVX1-NEXT: retq 957; 958; XOPAVX2-LABEL: constant_shift_v4i32: 959; XOPAVX2: # %bb.0: 960; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 961; XOPAVX2-NEXT: retq 962; 963; AVX512-LABEL: constant_shift_v4i32: 964; AVX512: # %bb.0: 965; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 966; AVX512-NEXT: retq 967; 968; AVX512VL-LABEL: constant_shift_v4i32: 969; AVX512VL: # %bb.0: 970; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 971; AVX512VL-NEXT: retq 972; 973; X32-SSE-LABEL: constant_shift_v4i32: 974; X32-SSE: # %bb.0: 975; X32-SSE-NEXT: movdqa %xmm0, %xmm1 976; X32-SSE-NEXT: psrld $7, %xmm1 977; X32-SSE-NEXT: movdqa %xmm0, %xmm2 978; X32-SSE-NEXT: psrld $6, %xmm2 979; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] 980; X32-SSE-NEXT: movdqa %xmm0, %xmm1 981; X32-SSE-NEXT: psrld $5, %xmm1 982; X32-SSE-NEXT: psrld $4, %xmm0 983; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 984; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] 985; X32-SSE-NEXT: retl 986 %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 987 ret <4 x i32> %shift 988} 989 990define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { 991; SSE2-LABEL: constant_shift_v8i16: 992; SSE2: # %bb.0: 993; SSE2-NEXT: movdqa %xmm0, %xmm1 994; SSE2-NEXT: psrlw $4, %xmm1 995; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 996; SSE2-NEXT: movapd %xmm1, %xmm2 997; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] 998; SSE2-NEXT: psrlw $2, %xmm1 999; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 1000; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1001; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] 1002; SSE2-NEXT: movaps %xmm2, %xmm0 1003; SSE2-NEXT: andps %xmm1, %xmm0 1004; SSE2-NEXT: psrlw $1, %xmm2 1005; SSE2-NEXT: andnps %xmm2, %xmm1 1006; SSE2-NEXT: orps %xmm1, %xmm0 1007; SSE2-NEXT: retq 1008; 1009; SSE41-LABEL: constant_shift_v8i16: 1010; SSE41: # %bb.0: 1011; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512> 1012; SSE41-NEXT: pmulhuw %xmm0, %xmm1 1013; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1014; SSE41-NEXT: retq 1015; 1016; AVX-LABEL: constant_shift_v8i16: 1017; AVX: # %bb.0: 1018; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 1019; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1020; AVX-NEXT: retq 1021; 1022; XOP-LABEL: constant_shift_v8i16: 1023; XOP: # %bb.0: 1024; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 1025; XOP-NEXT: retq 1026; 1027; AVX512DQ-LABEL: constant_shift_v8i16: 1028; AVX512DQ: # %bb.0: 1029; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 1030; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1031; AVX512DQ-NEXT: retq 1032; 1033; AVX512BW-LABEL: constant_shift_v8i16: 1034; AVX512BW: # %bb.0: 1035; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1036; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1037; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1038; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1039; AVX512BW-NEXT: vzeroupper 1040; AVX512BW-NEXT: retq 1041; 1042; AVX512DQVL-LABEL: constant_shift_v8i16: 1043; AVX512DQVL: # %bb.0: 1044; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 1045; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 1046; AVX512DQVL-NEXT: retq 1047; 1048; AVX512BWVL-LABEL: constant_shift_v8i16: 1049; AVX512BWVL: # %bb.0: 1050; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0 1051; AVX512BWVL-NEXT: retq 1052; 1053; X32-SSE-LABEL: constant_shift_v8i16: 1054; X32-SSE: # %bb.0: 1055; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1056; X32-SSE-NEXT: psrlw $4, %xmm1 1057; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1058; X32-SSE-NEXT: movapd %xmm1, %xmm2 1059; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] 1060; X32-SSE-NEXT: psrlw $2, %xmm1 1061; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 1062; X32-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1063; X32-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] 1064; X32-SSE-NEXT: movaps %xmm2, %xmm0 1065; X32-SSE-NEXT: andps %xmm1, %xmm0 1066; X32-SSE-NEXT: psrlw $1, %xmm2 1067; X32-SSE-NEXT: andnps %xmm2, %xmm1 1068; X32-SSE-NEXT: orps %xmm1, %xmm0 1069; X32-SSE-NEXT: retl 1070 %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1071 ret <8 x i16> %shift 1072} 1073 1074define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { 1075; SSE2-LABEL: constant_shift_v16i8: 1076; SSE2: # %bb.0: 1077; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32] 1078; SSE2-NEXT: pxor %xmm1, %xmm1 1079; SSE2-NEXT: pxor %xmm3, %xmm3 1080; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 1081; SSE2-NEXT: movdqa %xmm3, %xmm4 1082; SSE2-NEXT: pandn %xmm0, %xmm4 1083; SSE2-NEXT: psrlw $4, %xmm0 1084; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1085; SSE2-NEXT: pand %xmm3, %xmm0 1086; SSE2-NEXT: por %xmm4, %xmm0 1087; SSE2-NEXT: paddb %xmm2, %xmm2 1088; SSE2-NEXT: pxor %xmm3, %xmm3 1089; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 1090; SSE2-NEXT: movdqa %xmm3, %xmm4 1091; SSE2-NEXT: pandn %xmm0, %xmm4 1092; SSE2-NEXT: psrlw $2, %xmm0 1093; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1094; SSE2-NEXT: pand %xmm3, %xmm0 1095; SSE2-NEXT: por %xmm4, %xmm0 1096; SSE2-NEXT: paddb %xmm2, %xmm2 1097; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 1098; SSE2-NEXT: movdqa %xmm1, %xmm2 1099; SSE2-NEXT: pandn %xmm0, %xmm2 1100; SSE2-NEXT: psrlw $1, %xmm0 1101; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1102; SSE2-NEXT: pand %xmm1, %xmm0 1103; SSE2-NEXT: por %xmm2, %xmm0 1104; SSE2-NEXT: retq 1105; 1106; SSE41-LABEL: constant_shift_v16i8: 1107; SSE41: # %bb.0: 1108; SSE41-NEXT: movdqa %xmm0, %xmm1 1109; SSE41-NEXT: movdqa %xmm0, %xmm2 1110; SSE41-NEXT: psrlw $4, %xmm2 1111; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 1112; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,49376,32928,16480,32] 1113; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1114; SSE41-NEXT: movdqa %xmm1, %xmm2 1115; SSE41-NEXT: psrlw $2, %xmm2 1116; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 1117; SSE41-NEXT: paddb %xmm0, %xmm0 1118; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1119; SSE41-NEXT: movdqa %xmm1, %xmm2 1120; SSE41-NEXT: psrlw $1, %xmm2 1121; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 1122; SSE41-NEXT: paddb %xmm0, %xmm0 1123; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1124; SSE41-NEXT: movdqa %xmm1, %xmm0 1125; SSE41-NEXT: retq 1126; 1127; AVX-LABEL: constant_shift_v16i8: 1128; AVX: # %bb.0: 1129; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 1130; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1131; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32] 1132; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1133; AVX-NEXT: vpsrlw $2, %xmm0, %xmm1 1134; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1135; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1136; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1137; AVX-NEXT: vpsrlw $1, %xmm0, %xmm1 1138; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1139; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1140; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1141; AVX-NEXT: retq 1142; 1143; XOP-LABEL: constant_shift_v16i8: 1144; XOP: # %bb.0: 1145; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 1146; XOP-NEXT: retq 1147; 1148; AVX512DQ-LABEL: constant_shift_v16i8: 1149; AVX512DQ: # %bb.0: 1150; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1151; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 1152; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1153; AVX512DQ-NEXT: vzeroupper 1154; AVX512DQ-NEXT: retq 1155; 1156; AVX512BW-LABEL: constant_shift_v16i8: 1157; AVX512BW: # %bb.0: 1158; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] 1159; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1160; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1161; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1162; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1163; AVX512BW-NEXT: vzeroupper 1164; AVX512BW-NEXT: retq 1165; 1166; AVX512DQVL-LABEL: constant_shift_v16i8: 1167; AVX512DQVL: # %bb.0: 1168; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1169; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 1170; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1171; AVX512DQVL-NEXT: vzeroupper 1172; AVX512DQVL-NEXT: retq 1173; 1174; AVX512BWVL-LABEL: constant_shift_v16i8: 1175; AVX512BWVL: # %bb.0: 1176; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1177; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 1178; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1179; AVX512BWVL-NEXT: vzeroupper 1180; AVX512BWVL-NEXT: retq 1181; 1182; X32-SSE-LABEL: constant_shift_v16i8: 1183; X32-SSE: # %bb.0: 1184; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32] 1185; X32-SSE-NEXT: pxor %xmm1, %xmm1 1186; X32-SSE-NEXT: pxor %xmm3, %xmm3 1187; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3 1188; X32-SSE-NEXT: movdqa %xmm3, %xmm4 1189; X32-SSE-NEXT: pandn %xmm0, %xmm4 1190; X32-SSE-NEXT: psrlw $4, %xmm0 1191; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 1192; X32-SSE-NEXT: pand %xmm3, %xmm0 1193; X32-SSE-NEXT: por %xmm4, %xmm0 1194; X32-SSE-NEXT: paddb %xmm2, %xmm2 1195; X32-SSE-NEXT: pxor %xmm3, %xmm3 1196; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3 1197; X32-SSE-NEXT: movdqa %xmm3, %xmm4 1198; X32-SSE-NEXT: pandn %xmm0, %xmm4 1199; X32-SSE-NEXT: psrlw $2, %xmm0 1200; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 1201; X32-SSE-NEXT: pand %xmm3, %xmm0 1202; X32-SSE-NEXT: por %xmm4, %xmm0 1203; X32-SSE-NEXT: paddb %xmm2, %xmm2 1204; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1 1205; X32-SSE-NEXT: movdqa %xmm1, %xmm2 1206; X32-SSE-NEXT: pandn %xmm0, %xmm2 1207; X32-SSE-NEXT: psrlw $1, %xmm0 1208; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 1209; X32-SSE-NEXT: pand %xmm1, %xmm0 1210; X32-SSE-NEXT: por %xmm2, %xmm0 1211; X32-SSE-NEXT: retl 1212 %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> 1213 ret <16 x i8> %shift 1214} 1215 1216; 1217; Uniform Constant Shifts 1218; 1219 1220define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { 1221; SSE-LABEL: splatconstant_shift_v2i64: 1222; SSE: # %bb.0: 1223; SSE-NEXT: psrlq $7, %xmm0 1224; SSE-NEXT: retq 1225; 1226; AVX-LABEL: splatconstant_shift_v2i64: 1227; AVX: # %bb.0: 1228; AVX-NEXT: vpsrlq $7, %xmm0, %xmm0 1229; AVX-NEXT: retq 1230; 1231; XOP-LABEL: splatconstant_shift_v2i64: 1232; XOP: # %bb.0: 1233; XOP-NEXT: vpsrlq $7, %xmm0, %xmm0 1234; XOP-NEXT: retq 1235; 1236; AVX512-LABEL: splatconstant_shift_v2i64: 1237; AVX512: # %bb.0: 1238; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0 1239; AVX512-NEXT: retq 1240; 1241; AVX512VL-LABEL: splatconstant_shift_v2i64: 1242; AVX512VL: # %bb.0: 1243; AVX512VL-NEXT: vpsrlq $7, %xmm0, %xmm0 1244; AVX512VL-NEXT: retq 1245; 1246; X32-SSE-LABEL: splatconstant_shift_v2i64: 1247; X32-SSE: # %bb.0: 1248; X32-SSE-NEXT: psrlq $7, %xmm0 1249; X32-SSE-NEXT: retl 1250 %shift = lshr <2 x i64> %a, <i64 7, i64 7> 1251 ret <2 x i64> %shift 1252} 1253 1254define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { 1255; SSE-LABEL: splatconstant_shift_v4i32: 1256; SSE: # %bb.0: 1257; SSE-NEXT: psrld $5, %xmm0 1258; SSE-NEXT: retq 1259; 1260; AVX-LABEL: splatconstant_shift_v4i32: 1261; AVX: # %bb.0: 1262; AVX-NEXT: vpsrld $5, %xmm0, %xmm0 1263; AVX-NEXT: retq 1264; 1265; XOP-LABEL: splatconstant_shift_v4i32: 1266; XOP: # %bb.0: 1267; XOP-NEXT: vpsrld $5, %xmm0, %xmm0 1268; XOP-NEXT: retq 1269; 1270; AVX512-LABEL: splatconstant_shift_v4i32: 1271; AVX512: # %bb.0: 1272; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0 1273; AVX512-NEXT: retq 1274; 1275; AVX512VL-LABEL: splatconstant_shift_v4i32: 1276; AVX512VL: # %bb.0: 1277; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0 1278; AVX512VL-NEXT: retq 1279; 1280; X32-SSE-LABEL: splatconstant_shift_v4i32: 1281; X32-SSE: # %bb.0: 1282; X32-SSE-NEXT: psrld $5, %xmm0 1283; X32-SSE-NEXT: retl 1284 %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5> 1285 ret <4 x i32> %shift 1286} 1287 1288define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { 1289; SSE-LABEL: splatconstant_shift_v8i16: 1290; SSE: # %bb.0: 1291; SSE-NEXT: psrlw $3, %xmm0 1292; SSE-NEXT: retq 1293; 1294; AVX-LABEL: splatconstant_shift_v8i16: 1295; AVX: # %bb.0: 1296; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 1297; AVX-NEXT: retq 1298; 1299; XOP-LABEL: splatconstant_shift_v8i16: 1300; XOP: # %bb.0: 1301; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0 1302; XOP-NEXT: retq 1303; 1304; AVX512-LABEL: splatconstant_shift_v8i16: 1305; AVX512: # %bb.0: 1306; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 1307; AVX512-NEXT: retq 1308; 1309; AVX512VL-LABEL: splatconstant_shift_v8i16: 1310; AVX512VL: # %bb.0: 1311; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 1312; AVX512VL-NEXT: retq 1313; 1314; X32-SSE-LABEL: splatconstant_shift_v8i16: 1315; X32-SSE: # %bb.0: 1316; X32-SSE-NEXT: psrlw $3, %xmm0 1317; X32-SSE-NEXT: retl 1318 %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> 1319 ret <8 x i16> %shift 1320} 1321 1322define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { 1323; SSE-LABEL: splatconstant_shift_v16i8: 1324; SSE: # %bb.0: 1325; SSE-NEXT: psrlw $3, %xmm0 1326; SSE-NEXT: pand {{.*}}(%rip), %xmm0 1327; SSE-NEXT: retq 1328; 1329; AVX-LABEL: splatconstant_shift_v16i8: 1330; AVX: # %bb.0: 1331; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 1332; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1333; AVX-NEXT: retq 1334; 1335; XOP-LABEL: splatconstant_shift_v16i8: 1336; XOP: # %bb.0: 1337; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 1338; XOP-NEXT: retq 1339; 1340; AVX512-LABEL: splatconstant_shift_v16i8: 1341; AVX512: # %bb.0: 1342; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 1343; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1344; AVX512-NEXT: retq 1345; 1346; AVX512VL-LABEL: splatconstant_shift_v16i8: 1347; AVX512VL: # %bb.0: 1348; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 1349; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1350; AVX512VL-NEXT: retq 1351; 1352; X32-SSE-LABEL: splatconstant_shift_v16i8: 1353; X32-SSE: # %bb.0: 1354; X32-SSE-NEXT: psrlw $3, %xmm0 1355; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 1356; X32-SSE-NEXT: retl 1357 %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> 1358 ret <16 x i8> %shift 1359} 1360