; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;

define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq %xmm3, %xmm2
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq %xmm3, %xmm2
; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; X32-SSE-NEXT: movapd %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i64> %a, %b
  ret <2 x i64> %shift
}
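; Before AVX2 there is no per-element PSRLD, so the v4i32 shift is split four
; ways: each amount is isolated in the bottom dword of a register, the whole
; input is shifted by it, and the lanes are recombined with shuffles (SSE2) or
; blends (SSE41/AVX1). AVX2 and XOPAVX2 use VPSRLVD directly; XOPAVX1 negates
; the amounts and uses VPSHLD, which shifts left for positive counts and right
; for negative ones.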
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm2, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrld %xmm4, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: psrld %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlq $32, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrld %xmm2, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld %xmm1, %xmm2
; SSE41-NEXT: psrld %xmm3, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psrlq $32, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm2, %xmm4
; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psrld %xmm4, %xmm5
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-SSE-NEXT: psrld %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i32> %a, %b
  ret <4 x i32> %shift
}
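; v8i16 has no variable-per-element shift on any of these subtargets, so it is
; lowered bit-serially: PSLLW $12 moves the amount's low bits up to the sign
; bit, and four rounds conditionally shift by 8/4/2/1 using PSRAW $15 masks
; (SSE2) or PBLENDVB (SSE41/AVX1), doubling the selector between rounds. AVX2
; zero-extends to v8i32 for VPSRLVD and narrows back; XOP negates the amounts
; for VPSHLW.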
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $2, %xmm1
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: var_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i16> %a, %b
  ret <8 x i16> %shift
}
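; v16i8 uses the same bit-serial select at byte granularity: PSLLW $5 puts the
; top amount bit in each byte's sign bit, PCMPGTB (SSE2) or PBLENDVB
; (SSE41/AVX) picks between x and x >> 4/2/1, and every PSRLW result is masked
; because x86 has no 8-bit vector shift. XOP has a genuine per-byte shift
; (VPSHLB) and only needs to negate the amounts.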
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlw $4, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlw $2, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: paddb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlw $1, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: paddb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: var_shift_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand .LCPI3_0, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand .LCPI3_1, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand .LCPI3_2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <16 x i8> %a, %b
  ret <16 x i8> %shift
}

;
; Uniform Variable Shifts
;
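; When the shift amount is a splat, v2i64/v4i32/v8i16 collapse to a single
; hardware shift: PSRLQ/PSRLD/PSRLW take one scalar count from the low 64 bits
; of a register, so all that varies per subtarget is how the bottom element of
; the amount vector gets zero-extended.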
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
; SSE: # BB#0:
; SSE-NEXT: psrlq %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i64> %a, %splat
  ret <2 x i64> %shift
}

define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT: psrld %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT: psrld %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i32> %a, %splat
  ret <4 x i32> %shift
}
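; For a splatted v8i16 amount only the low 16 bits are defined, so SSE2
; zero-extends through a GPR (movd/movzwl/movd) while SSE41 and later blend
; the low word against zero with a single PBLENDW.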
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: psrlw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movd %xmm1, %eax
; X32-SSE-NEXT: movzwl %ax, %eax
; X32-SSE-NEXT: movd %eax, %xmm1
; X32-SSE-NEXT: psrlw %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i16> %a, %splat
  ret <8 x i16> %shift
}
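; A splatted amount does not help v16i8: there is still no vector byte shift,
; so the amount is re-broadcast (shuffles on SSE2, PSHUFB on SSE41/AVX1,
; VPBROADCASTB on AVX2) and the generic blend ladder from var_shift_v16i8
; runs as before. Only XOP stays short via VPSHLB.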
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm4
; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $2, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand .LCPI7_0, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand .LCPI7_1, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand .LCPI7_2, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i8> %a, %splat
  ret <16 x i8> %shift
}

;
; Constant Shifts
;
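; Constant amounts <1,7> still cost two immediate PSRLQs plus a blend before
; AVX2, because PSRLQ-by-immediate shifts both lanes by the same count; AVX2
; and XOPAVX2 load the amount vector from the constant pool for VPSRLVQ.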
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $7, %xmm1
; SSE2-NEXT: psrlq $1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq $7, %xmm1
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq $7, %xmm1
; X32-SSE-NEXT: psrlq $1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT: movapd %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i64> %a, <i64 1, i64 7>
  ret <2 x i64> %shift
}
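; <4,5,6,7> likewise expands to four immediate PSRLDs recombined with
; movsd/pshufd (SSE2) or PBLENDW (SSE41/AVX1). XOPAVX1 folds everything into
; one VPSHLD with a constant-pool vector of negated amounts.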
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $7, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $5, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $6, %xmm2
; SSE2-NEXT: psrld $4, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $7, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld $5, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $6, %xmm1
; SSE41-NEXT: psrld $4, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $7, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld $5, %xmm2
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrld $6, %xmm2
; X32-SSE-NEXT: psrld $4, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %shift
}
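; With constant i16 amounts the blend selectors are known in advance, so
; SSE41/AVX1 run the PBLENDVB ladder with precomputed mask vectors instead of
; building them from the amounts; SSE2 shifts halves and reassembles them with
; shuffles.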
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; X32-SSE-NEXT: psrlw $2, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT: movdqa %xmm2, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $1, %xmm2
; X32-SSE-NEXT: pandn %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}
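; Constant i8 amounts simply materialize the amount vector from the constant
; pool and then reuse the variable v16i8 ladder.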
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE41-NEXT: psllw $5, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand .LCPI11_1, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand .LCPI11_2, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand .LCPI11_3, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}

;
; Uniform Constant Shifts
;
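; Uniform constant shifts are the cheap case: one immediate PSRLQ/PSRLD/PSRLW
; suffices, and v16i8 pairs PSRLW with a mask that clears the bits dragged in
; across byte boundaries. Only XOP's v16i8 path still detours through VPSHLB.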
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE: # BB#0:
; SSE-NEXT: psrlq $7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vpsrlq $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlq $7, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE: # BB#0:
; SSE-NEXT: psrld $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vpsrld $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrld $5, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE: # BB#0:
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand .LCPI15_0, %xmm0
; X32-SSE-NEXT: retl
  %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}