1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 8; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW 9; 10; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
11; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 12 13; 14; Variable Shifts 15; 16 17define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 18; SSE2-LABEL: var_shift_v2i64: 19; SSE2: # BB#0: 20; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] 21; SSE2-NEXT: movdqa %xmm0, %xmm2 22; SSE2-NEXT: psllq %xmm3, %xmm2 23; SSE2-NEXT: psllq %xmm1, %xmm0 24; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 25; SSE2-NEXT: movapd %xmm2, %xmm0 26; SSE2-NEXT: retq 27; 28; SSE41-LABEL: var_shift_v2i64: 29; SSE41: # BB#0: 30; SSE41-NEXT: movdqa %xmm0, %xmm2 31; SSE41-NEXT: psllq %xmm1, %xmm2 32; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 33; SSE41-NEXT: psllq %xmm1, %xmm0 34; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 35; SSE41-NEXT: retq 36; 37; AVX1-LABEL: var_shift_v2i64: 38; AVX1: # BB#0: 39; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 40; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 41; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 42; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 43; AVX1-NEXT: retq 44; 45; AVX2-LABEL: var_shift_v2i64: 46; AVX2: # BB#0: 47; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 48; AVX2-NEXT: retq 49; 50; XOPAVX1-LABEL: var_shift_v2i64: 51; XOPAVX1: # BB#0: 52; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 53; XOPAVX1-NEXT: retq 54; 55; XOPAVX2-LABEL: var_shift_v2i64: 56; XOPAVX2: # BB#0: 57; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 58; XOPAVX2-NEXT: retq 59; 60; AVX512-LABEL: var_shift_v2i64: 61; AVX512: ## BB#0: 62; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 63; AVX512-NEXT: retq 64; 65; X32-SSE-LABEL: var_shift_v2i64: 66; X32-SSE: # BB#0: 67; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] 68; X32-SSE-NEXT: movdqa %xmm0, %xmm2 69; X32-SSE-NEXT: psllq %xmm3, %xmm2 70; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero 71; X32-SSE-NEXT: psllq %xmm1, %xmm0 72; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 73; 
X32-SSE-NEXT: movapd %xmm2, %xmm0 74; X32-SSE-NEXT: retl 75 %shift = shl <2 x i64> %a, %b 76 ret <2 x i64> %shift 77} 78 79define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 80; SSE2-LABEL: var_shift_v4i32: 81; SSE2: # BB#0: 82; SSE2-NEXT: pslld $23, %xmm1 83; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 84; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 85; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 86; SSE2-NEXT: pmuludq %xmm0, %xmm1 87; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 88; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 89; SSE2-NEXT: pmuludq %xmm2, %xmm0 90; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 91; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 92; SSE2-NEXT: movdqa %xmm1, %xmm0 93; SSE2-NEXT: retq 94; 95; SSE41-LABEL: var_shift_v4i32: 96; SSE41: # BB#0: 97; SSE41-NEXT: pslld $23, %xmm1 98; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 99; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 100; SSE41-NEXT: pmulld %xmm1, %xmm0 101; SSE41-NEXT: retq 102; 103; AVX1-LABEL: var_shift_v4i32: 104; AVX1: # BB#0: 105; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 106; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 107; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 108; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 109; AVX1-NEXT: retq 110; 111; AVX2-LABEL: var_shift_v4i32: 112; AVX2: # BB#0: 113; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 114; AVX2-NEXT: retq 115; 116; XOPAVX1-LABEL: var_shift_v4i32: 117; XOPAVX1: # BB#0: 118; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 119; XOPAVX1-NEXT: retq 120; 121; XOPAVX2-LABEL: var_shift_v4i32: 122; XOPAVX2: # BB#0: 123; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 124; XOPAVX2-NEXT: retq 125; 126; AVX512-LABEL: var_shift_v4i32: 127; AVX512: ## BB#0: 128; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 129; AVX512-NEXT: retq 130; 131; X32-SSE-LABEL: var_shift_v4i32: 132; X32-SSE: # BB#0: 133; X32-SSE-NEXT: pslld $23, %xmm1 134; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 135; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 136; X32-SSE-NEXT: pshufd {{.*#+}} 
xmm2 = xmm1[1,1,3,3] 137; X32-SSE-NEXT: pmuludq %xmm0, %xmm1 138; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 139; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 140; X32-SSE-NEXT: pmuludq %xmm2, %xmm0 141; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 142; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 143; X32-SSE-NEXT: movdqa %xmm1, %xmm0 144; X32-SSE-NEXT: retl 145 %shift = shl <4 x i32> %a, %b 146 ret <4 x i32> %shift 147} 148 149define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 150; SSE2-LABEL: var_shift_v8i16: 151; SSE2: # BB#0: 152; SSE2-NEXT: psllw $12, %xmm1 153; SSE2-NEXT: movdqa %xmm1, %xmm2 154; SSE2-NEXT: psraw $15, %xmm2 155; SSE2-NEXT: movdqa %xmm2, %xmm3 156; SSE2-NEXT: pandn %xmm0, %xmm3 157; SSE2-NEXT: psllw $8, %xmm0 158; SSE2-NEXT: pand %xmm2, %xmm0 159; SSE2-NEXT: por %xmm3, %xmm0 160; SSE2-NEXT: paddw %xmm1, %xmm1 161; SSE2-NEXT: movdqa %xmm1, %xmm2 162; SSE2-NEXT: psraw $15, %xmm2 163; SSE2-NEXT: movdqa %xmm2, %xmm3 164; SSE2-NEXT: pandn %xmm0, %xmm3 165; SSE2-NEXT: psllw $4, %xmm0 166; SSE2-NEXT: pand %xmm2, %xmm0 167; SSE2-NEXT: por %xmm3, %xmm0 168; SSE2-NEXT: paddw %xmm1, %xmm1 169; SSE2-NEXT: movdqa %xmm1, %xmm2 170; SSE2-NEXT: psraw $15, %xmm2 171; SSE2-NEXT: movdqa %xmm2, %xmm3 172; SSE2-NEXT: pandn %xmm0, %xmm3 173; SSE2-NEXT: psllw $2, %xmm0 174; SSE2-NEXT: pand %xmm2, %xmm0 175; SSE2-NEXT: por %xmm3, %xmm0 176; SSE2-NEXT: paddw %xmm1, %xmm1 177; SSE2-NEXT: psraw $15, %xmm1 178; SSE2-NEXT: movdqa %xmm1, %xmm2 179; SSE2-NEXT: pandn %xmm0, %xmm2 180; SSE2-NEXT: psllw $1, %xmm0 181; SSE2-NEXT: pand %xmm1, %xmm0 182; SSE2-NEXT: por %xmm2, %xmm0 183; SSE2-NEXT: retq 184; 185; SSE41-LABEL: var_shift_v8i16: 186; SSE41: # BB#0: 187; SSE41-NEXT: movdqa %xmm0, %xmm2 188; SSE41-NEXT: movdqa %xmm1, %xmm0 189; SSE41-NEXT: psllw $12, %xmm0 190; SSE41-NEXT: psllw $4, %xmm1 191; SSE41-NEXT: por %xmm0, %xmm1 192; SSE41-NEXT: movdqa %xmm1, %xmm3 193; SSE41-NEXT: paddw %xmm3, %xmm3 194; 
SSE41-NEXT: movdqa %xmm2, %xmm4 195; SSE41-NEXT: psllw $8, %xmm4 196; SSE41-NEXT: movdqa %xmm1, %xmm0 197; SSE41-NEXT: pblendvb %xmm4, %xmm2 198; SSE41-NEXT: movdqa %xmm2, %xmm1 199; SSE41-NEXT: psllw $4, %xmm1 200; SSE41-NEXT: movdqa %xmm3, %xmm0 201; SSE41-NEXT: pblendvb %xmm1, %xmm2 202; SSE41-NEXT: movdqa %xmm2, %xmm1 203; SSE41-NEXT: psllw $2, %xmm1 204; SSE41-NEXT: paddw %xmm3, %xmm3 205; SSE41-NEXT: movdqa %xmm3, %xmm0 206; SSE41-NEXT: pblendvb %xmm1, %xmm2 207; SSE41-NEXT: movdqa %xmm2, %xmm1 208; SSE41-NEXT: psllw $1, %xmm1 209; SSE41-NEXT: paddw %xmm3, %xmm3 210; SSE41-NEXT: movdqa %xmm3, %xmm0 211; SSE41-NEXT: pblendvb %xmm1, %xmm2 212; SSE41-NEXT: movdqa %xmm2, %xmm0 213; SSE41-NEXT: retq 214; 215; AVX1-LABEL: var_shift_v8i16: 216; AVX1: # BB#0: 217; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 218; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 219; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 220; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 221; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3 222; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 223; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1 224; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 225; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 226; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 227; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 228; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1 229; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 230; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 231; AVX1-NEXT: retq 232; 233; AVX2-LABEL: var_shift_v8i16: 234; AVX2: # BB#0: 235; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 236; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 237; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 238; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 239; AVX2-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 240; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 241; AVX2-NEXT: vzeroupper 242; AVX2-NEXT: retq 243; 244; XOP-LABEL: var_shift_v8i16: 245; XOP: # BB#0: 246; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 247; XOP-NEXT: retq 248; 249; AVX512-LABEL: var_shift_v8i16: 250; AVX512: ## BB#0: 251; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> 252; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> 253; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 254; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 255; AVX512-NEXT: retq 256; 257; X32-SSE-LABEL: var_shift_v8i16: 258; X32-SSE: # BB#0: 259; X32-SSE-NEXT: psllw $12, %xmm1 260; X32-SSE-NEXT: movdqa %xmm1, %xmm2 261; X32-SSE-NEXT: psraw $15, %xmm2 262; X32-SSE-NEXT: movdqa %xmm2, %xmm3 263; X32-SSE-NEXT: pandn %xmm0, %xmm3 264; X32-SSE-NEXT: psllw $8, %xmm0 265; X32-SSE-NEXT: pand %xmm2, %xmm0 266; X32-SSE-NEXT: por %xmm3, %xmm0 267; X32-SSE-NEXT: paddw %xmm1, %xmm1 268; X32-SSE-NEXT: movdqa %xmm1, %xmm2 269; X32-SSE-NEXT: psraw $15, %xmm2 270; X32-SSE-NEXT: movdqa %xmm2, %xmm3 271; X32-SSE-NEXT: pandn %xmm0, %xmm3 272; X32-SSE-NEXT: psllw $4, %xmm0 273; X32-SSE-NEXT: pand %xmm2, %xmm0 274; X32-SSE-NEXT: por %xmm3, %xmm0 275; X32-SSE-NEXT: paddw %xmm1, %xmm1 276; X32-SSE-NEXT: movdqa %xmm1, %xmm2 277; X32-SSE-NEXT: psraw $15, %xmm2 278; X32-SSE-NEXT: movdqa %xmm2, %xmm3 279; X32-SSE-NEXT: pandn %xmm0, %xmm3 280; X32-SSE-NEXT: psllw $2, %xmm0 281; X32-SSE-NEXT: pand %xmm2, %xmm0 282; X32-SSE-NEXT: por %xmm3, %xmm0 283; X32-SSE-NEXT: paddw %xmm1, %xmm1 284; X32-SSE-NEXT: psraw $15, %xmm1 285; X32-SSE-NEXT: movdqa %xmm1, %xmm2 286; X32-SSE-NEXT: pandn %xmm0, %xmm2 287; X32-SSE-NEXT: psllw $1, %xmm0 288; X32-SSE-NEXT: pand %xmm1, %xmm0 289; X32-SSE-NEXT: por %xmm2, %xmm0 290; X32-SSE-NEXT: retl 291 %shift = shl <8 x i16> %a, %b 292 ret <8 x i16> %shift 293} 294 295define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 296; SSE2-LABEL: var_shift_v16i8: 297; 
SSE2: # BB#0: 298; SSE2-NEXT: psllw $5, %xmm1 299; SSE2-NEXT: pxor %xmm2, %xmm2 300; SSE2-NEXT: pxor %xmm3, %xmm3 301; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 302; SSE2-NEXT: movdqa %xmm3, %xmm4 303; SSE2-NEXT: pandn %xmm0, %xmm4 304; SSE2-NEXT: psllw $4, %xmm0 305; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 306; SSE2-NEXT: pand %xmm3, %xmm0 307; SSE2-NEXT: por %xmm4, %xmm0 308; SSE2-NEXT: paddb %xmm1, %xmm1 309; SSE2-NEXT: pxor %xmm3, %xmm3 310; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 311; SSE2-NEXT: movdqa %xmm3, %xmm4 312; SSE2-NEXT: pandn %xmm0, %xmm4 313; SSE2-NEXT: psllw $2, %xmm0 314; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 315; SSE2-NEXT: pand %xmm3, %xmm0 316; SSE2-NEXT: por %xmm4, %xmm0 317; SSE2-NEXT: paddb %xmm1, %xmm1 318; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 319; SSE2-NEXT: movdqa %xmm2, %xmm1 320; SSE2-NEXT: pandn %xmm0, %xmm1 321; SSE2-NEXT: paddb %xmm0, %xmm0 322; SSE2-NEXT: pand %xmm2, %xmm0 323; SSE2-NEXT: por %xmm1, %xmm0 324; SSE2-NEXT: retq 325; 326; SSE41-LABEL: var_shift_v16i8: 327; SSE41: # BB#0: 328; SSE41-NEXT: movdqa %xmm0, %xmm2 329; SSE41-NEXT: psllw $5, %xmm1 330; SSE41-NEXT: movdqa %xmm2, %xmm3 331; SSE41-NEXT: psllw $4, %xmm3 332; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 333; SSE41-NEXT: movdqa %xmm1, %xmm0 334; SSE41-NEXT: pblendvb %xmm3, %xmm2 335; SSE41-NEXT: movdqa %xmm2, %xmm3 336; SSE41-NEXT: psllw $2, %xmm3 337; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 338; SSE41-NEXT: paddb %xmm1, %xmm1 339; SSE41-NEXT: movdqa %xmm1, %xmm0 340; SSE41-NEXT: pblendvb %xmm3, %xmm2 341; SSE41-NEXT: movdqa %xmm2, %xmm3 342; SSE41-NEXT: paddb %xmm3, %xmm3 343; SSE41-NEXT: paddb %xmm1, %xmm1 344; SSE41-NEXT: movdqa %xmm1, %xmm0 345; SSE41-NEXT: pblendvb %xmm3, %xmm2 346; SSE41-NEXT: movdqa %xmm2, %xmm0 347; SSE41-NEXT: retq 348; 349; AVX-LABEL: var_shift_v16i8: 350; AVX: # BB#0: 351; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 352; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 353; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 354; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 355; AVX-NEXT: vpsllw $2, %xmm0, 
%xmm2 356; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 357; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 358; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 359; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 360; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 361; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 362; AVX-NEXT: retq 363; 364; XOP-LABEL: var_shift_v16i8: 365; XOP: # BB#0: 366; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 367; XOP-NEXT: retq 368; 369; AVX512-LABEL: var_shift_v16i8: 370; AVX512: ## BB#0: 371; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 372; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 373; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 374; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 375; AVX512-NEXT: vpsllw $2, %xmm0, %xmm2 376; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 377; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 378; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 379; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm2 380; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 381; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 382; AVX512-NEXT: retq 383; 384; X32-SSE-LABEL: var_shift_v16i8: 385; X32-SSE: # BB#0: 386; X32-SSE-NEXT: psllw $5, %xmm1 387; X32-SSE-NEXT: pxor %xmm2, %xmm2 388; X32-SSE-NEXT: pxor %xmm3, %xmm3 389; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 390; X32-SSE-NEXT: movdqa %xmm3, %xmm4 391; X32-SSE-NEXT: pandn %xmm0, %xmm4 392; X32-SSE-NEXT: psllw $4, %xmm0 393; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 394; X32-SSE-NEXT: pand %xmm3, %xmm0 395; X32-SSE-NEXT: por %xmm4, %xmm0 396; X32-SSE-NEXT: paddb %xmm1, %xmm1 397; X32-SSE-NEXT: pxor %xmm3, %xmm3 398; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 399; X32-SSE-NEXT: movdqa %xmm3, %xmm4 400; X32-SSE-NEXT: pandn %xmm0, %xmm4 401; X32-SSE-NEXT: psllw $2, %xmm0 402; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 403; X32-SSE-NEXT: pand %xmm3, %xmm0 404; X32-SSE-NEXT: por %xmm4, %xmm0 405; X32-SSE-NEXT: paddb %xmm1, %xmm1 406; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 407; X32-SSE-NEXT: movdqa %xmm2, %xmm1 408; X32-SSE-NEXT: pandn %xmm0, %xmm1 409; X32-SSE-NEXT: 
paddb %xmm0, %xmm0 410; X32-SSE-NEXT: pand %xmm2, %xmm0 411; X32-SSE-NEXT: por %xmm1, %xmm0 412; X32-SSE-NEXT: retl 413 %shift = shl <16 x i8> %a, %b 414 ret <16 x i8> %shift 415} 416 417; 418; Uniform Variable Shifts 419; 420 421define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 422; SSE-LABEL: splatvar_shift_v2i64: 423; SSE: # BB#0: 424; SSE-NEXT: psllq %xmm1, %xmm0 425; SSE-NEXT: retq 426; 427; AVX-LABEL: splatvar_shift_v2i64: 428; AVX: # BB#0: 429; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 430; AVX-NEXT: retq 431; 432; XOP-LABEL: splatvar_shift_v2i64: 433; XOP: # BB#0: 434; XOP-NEXT: vpsllq %xmm1, %xmm0, %xmm0 435; XOP-NEXT: retq 436; 437; AVX512-LABEL: splatvar_shift_v2i64: 438; AVX512: ## BB#0: 439; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0 440; AVX512-NEXT: retq 441; 442; X32-SSE-LABEL: splatvar_shift_v2i64: 443; X32-SSE: # BB#0: 444; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero 445; X32-SSE-NEXT: psllq %xmm1, %xmm0 446; X32-SSE-NEXT: retl 447 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer 448 %shift = shl <2 x i64> %a, %splat 449 ret <2 x i64> %shift 450} 451 452define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 453; SSE2-LABEL: splatvar_shift_v4i32: 454; SSE2: # BB#0: 455; SSE2-NEXT: xorps %xmm2, %xmm2 456; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 457; SSE2-NEXT: pslld %xmm2, %xmm0 458; SSE2-NEXT: retq 459; 460; SSE41-LABEL: splatvar_shift_v4i32: 461; SSE41: # BB#0: 462; SSE41-NEXT: pxor %xmm2, %xmm2 463; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] 464; SSE41-NEXT: pslld %xmm2, %xmm0 465; SSE41-NEXT: retq 466; 467; AVX-LABEL: splatvar_shift_v4i32: 468; AVX: # BB#0: 469; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 470; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] 471; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 472; AVX-NEXT: retq 473; 474; XOP-LABEL: splatvar_shift_v4i32: 475; XOP: # BB#0: 476; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 477; 
XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] 478; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0 479; XOP-NEXT: retq 480; 481; AVX512-LABEL: splatvar_shift_v4i32: 482; AVX512: ## BB#0: 483; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 484; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] 485; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 486; AVX512-NEXT: retq 487; 488; X32-SSE-LABEL: splatvar_shift_v4i32: 489; X32-SSE: # BB#0: 490; X32-SSE-NEXT: xorps %xmm2, %xmm2 491; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 492; X32-SSE-NEXT: pslld %xmm2, %xmm0 493; X32-SSE-NEXT: retl 494 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer 495 %shift = shl <4 x i32> %a, %splat 496 ret <4 x i32> %shift 497} 498 499define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 500; SSE2-LABEL: splatvar_shift_v8i16: 501; SSE2: # BB#0: 502; SSE2-NEXT: movd %xmm1, %eax 503; SSE2-NEXT: movzwl %ax, %eax 504; SSE2-NEXT: movd %eax, %xmm1 505; SSE2-NEXT: psllw %xmm1, %xmm0 506; SSE2-NEXT: retq 507; 508; SSE41-LABEL: splatvar_shift_v8i16: 509; SSE41: # BB#0: 510; SSE41-NEXT: pxor %xmm2, %xmm2 511; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] 512; SSE41-NEXT: psllw %xmm2, %xmm0 513; SSE41-NEXT: retq 514; 515; AVX-LABEL: splatvar_shift_v8i16: 516; AVX: # BB#0: 517; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 518; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 519; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 520; AVX-NEXT: retq 521; 522; XOP-LABEL: splatvar_shift_v8i16: 523; XOP: # BB#0: 524; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 525; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 526; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 527; XOP-NEXT: retq 528; 529; AVX512-LABEL: splatvar_shift_v8i16: 530; AVX512: ## BB#0: 531; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 532; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 533; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 534; AVX512-NEXT: retq 535; 536; 
X32-SSE-LABEL: splatvar_shift_v8i16: 537; X32-SSE: # BB#0: 538; X32-SSE-NEXT: movd %xmm1, %eax 539; X32-SSE-NEXT: movzwl %ax, %eax 540; X32-SSE-NEXT: movd %eax, %xmm1 541; X32-SSE-NEXT: psllw %xmm1, %xmm0 542; X32-SSE-NEXT: retl 543 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer 544 %shift = shl <8 x i16> %a, %splat 545 ret <8 x i16> %shift 546} 547 548define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 549; SSE2-LABEL: splatvar_shift_v16i8: 550; SSE2: # BB#0: 551; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 552; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 553; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] 554; SSE2-NEXT: psllw $5, %xmm2 555; SSE2-NEXT: pxor %xmm1, %xmm1 556; SSE2-NEXT: pxor %xmm3, %xmm3 557; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 558; SSE2-NEXT: movdqa %xmm3, %xmm4 559; SSE2-NEXT: pandn %xmm0, %xmm4 560; SSE2-NEXT: psllw $4, %xmm0 561; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 562; SSE2-NEXT: pand %xmm3, %xmm0 563; SSE2-NEXT: por %xmm4, %xmm0 564; SSE2-NEXT: paddb %xmm2, %xmm2 565; SSE2-NEXT: pxor %xmm3, %xmm3 566; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 567; SSE2-NEXT: movdqa %xmm3, %xmm4 568; SSE2-NEXT: pandn %xmm0, %xmm4 569; SSE2-NEXT: psllw $2, %xmm0 570; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 571; SSE2-NEXT: pand %xmm3, %xmm0 572; SSE2-NEXT: por %xmm4, %xmm0 573; SSE2-NEXT: paddb %xmm2, %xmm2 574; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 575; SSE2-NEXT: movdqa %xmm1, %xmm2 576; SSE2-NEXT: pandn %xmm0, %xmm2 577; SSE2-NEXT: paddb %xmm0, %xmm0 578; SSE2-NEXT: pand %xmm1, %xmm0 579; SSE2-NEXT: por %xmm2, %xmm0 580; SSE2-NEXT: retq 581; 582; SSE41-LABEL: splatvar_shift_v16i8: 583; SSE41: # BB#0: 584; SSE41-NEXT: movdqa %xmm0, %xmm2 585; SSE41-NEXT: pxor %xmm0, %xmm0 586; SSE41-NEXT: pshufb %xmm0, %xmm1 587; SSE41-NEXT: psllw $5, %xmm1 588; SSE41-NEXT: movdqa %xmm1, %xmm3 589; SSE41-NEXT: paddb %xmm3, %xmm3 590; SSE41-NEXT: movdqa %xmm2, %xmm4 591; SSE41-NEXT: psllw $4, %xmm4 
592; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 593; SSE41-NEXT: movdqa %xmm1, %xmm0 594; SSE41-NEXT: pblendvb %xmm4, %xmm2 595; SSE41-NEXT: movdqa %xmm2, %xmm1 596; SSE41-NEXT: psllw $2, %xmm1 597; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 598; SSE41-NEXT: movdqa %xmm3, %xmm0 599; SSE41-NEXT: pblendvb %xmm1, %xmm2 600; SSE41-NEXT: movdqa %xmm2, %xmm1 601; SSE41-NEXT: paddb %xmm1, %xmm1 602; SSE41-NEXT: paddb %xmm3, %xmm3 603; SSE41-NEXT: movdqa %xmm3, %xmm0 604; SSE41-NEXT: pblendvb %xmm1, %xmm2 605; SSE41-NEXT: movdqa %xmm2, %xmm0 606; SSE41-NEXT: retq 607; 608; AVX1-LABEL: splatvar_shift_v16i8: 609; AVX1: # BB#0: 610; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 611; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 612; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 613; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 614; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 615; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 616; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 617; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 618; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 619; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 620; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1 621; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 622; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 623; AVX1-NEXT: retq 624; 625; AVX2-LABEL: splatvar_shift_v16i8: 626; AVX2: # BB#0: 627; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 628; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 629; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 630; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 631; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 632; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 633; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 634; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 635; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 636; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 637; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 638; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 639; AVX2-NEXT: retq 640; 641; XOPAVX1-LABEL: splatvar_shift_v16i8: 642; XOPAVX1: # BB#0: 643; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 644; XOPAVX1-NEXT: vpshufb 
%xmm2, %xmm1, %xmm1 645; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 646; XOPAVX1-NEXT: retq 647; 648; XOPAVX2-LABEL: splatvar_shift_v16i8: 649; XOPAVX2: # BB#0: 650; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 651; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 652; XOPAVX2-NEXT: retq 653; 654; AVX512-LABEL: splatvar_shift_v16i8: 655; AVX512: ## BB#0: 656; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 657; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 658; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 659; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 660; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 661; AVX512-NEXT: vpsllw $2, %xmm0, %xmm2 662; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 663; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 664; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 665; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm2 666; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1 667; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 668; AVX512-NEXT: retq 669; 670; X32-SSE-LABEL: splatvar_shift_v16i8: 671; X32-SSE: # BB#0: 672; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 673; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 674; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] 675; X32-SSE-NEXT: psllw $5, %xmm2 676; X32-SSE-NEXT: pxor %xmm1, %xmm1 677; X32-SSE-NEXT: pxor %xmm3, %xmm3 678; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3 679; X32-SSE-NEXT: movdqa %xmm3, %xmm4 680; X32-SSE-NEXT: pandn %xmm0, %xmm4 681; X32-SSE-NEXT: psllw $4, %xmm0 682; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 683; X32-SSE-NEXT: pand %xmm3, %xmm0 684; X32-SSE-NEXT: por %xmm4, %xmm0 685; X32-SSE-NEXT: paddb %xmm2, %xmm2 686; X32-SSE-NEXT: pxor %xmm3, %xmm3 687; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3 688; X32-SSE-NEXT: movdqa %xmm3, %xmm4 689; X32-SSE-NEXT: pandn %xmm0, %xmm4 690; X32-SSE-NEXT: psllw $2, %xmm0 691; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 692; X32-SSE-NEXT: pand %xmm3, %xmm0 693; X32-SSE-NEXT: por %xmm4, %xmm0 694; X32-SSE-NEXT: paddb %xmm2, %xmm2 695; X32-SSE-NEXT: pcmpgtb 
%xmm2, %xmm1 696; X32-SSE-NEXT: movdqa %xmm1, %xmm2 697; X32-SSE-NEXT: pandn %xmm0, %xmm2 698; X32-SSE-NEXT: paddb %xmm0, %xmm0 699; X32-SSE-NEXT: pand %xmm1, %xmm0 700; X32-SSE-NEXT: por %xmm2, %xmm0 701; X32-SSE-NEXT: retl 702 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer 703 %shift = shl <16 x i8> %a, %splat 704 ret <16 x i8> %shift 705} 706 707; 708; Constant Shifts 709; 710 711define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { 712; SSE2-LABEL: constant_shift_v2i64: 713; SSE2: # BB#0: 714; SSE2-NEXT: movdqa %xmm0, %xmm1 715; SSE2-NEXT: psllq $7, %xmm1 716; SSE2-NEXT: psllq $1, %xmm0 717; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 718; SSE2-NEXT: movapd %xmm1, %xmm0 719; SSE2-NEXT: retq 720; 721; SSE41-LABEL: constant_shift_v2i64: 722; SSE41: # BB#0: 723; SSE41-NEXT: movdqa %xmm0, %xmm1 724; SSE41-NEXT: psllq $7, %xmm1 725; SSE41-NEXT: psllq $1, %xmm0 726; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 727; SSE41-NEXT: retq 728; 729; AVX1-LABEL: constant_shift_v2i64: 730; AVX1: # BB#0: 731; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 732; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 733; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 734; AVX1-NEXT: retq 735; 736; AVX2-LABEL: constant_shift_v2i64: 737; AVX2: # BB#0: 738; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 739; AVX2-NEXT: retq 740; 741; XOPAVX1-LABEL: constant_shift_v2i64: 742; XOPAVX1: # BB#0: 743; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0 744; XOPAVX1-NEXT: retq 745; 746; XOPAVX2-LABEL: constant_shift_v2i64: 747; XOPAVX2: # BB#0: 748; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 749; XOPAVX2-NEXT: retq 750; 751; AVX512-LABEL: constant_shift_v2i64: 752; AVX512: ## BB#0: 753; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 754; AVX512-NEXT: retq 755; 756; X32-SSE-LABEL: constant_shift_v2i64: 757; X32-SSE: # BB#0: 758; X32-SSE-NEXT: movdqa %xmm0, %xmm1 759; X32-SSE-NEXT: psllq $7, %xmm1 760; X32-SSE-NEXT: psllq $1, %xmm0 761; 
X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 762; X32-SSE-NEXT: movapd %xmm1, %xmm0 763; X32-SSE-NEXT: retl 764 %shift = shl <2 x i64> %a, <i64 1, i64 7> 765 ret <2 x i64> %shift 766} 767 768define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { 769; SSE2-LABEL: constant_shift_v4i32: 770; SSE2: # BB#0: 771; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 772; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 773; SSE2-NEXT: pmuludq %xmm1, %xmm0 774; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 775; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 776; SSE2-NEXT: pmuludq %xmm2, %xmm1 777; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 778; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 779; SSE2-NEXT: retq 780; 781; SSE41-LABEL: constant_shift_v4i32: 782; SSE41: # BB#0: 783; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 784; SSE41-NEXT: retq 785; 786; AVX1-LABEL: constant_shift_v4i32: 787; AVX1: # BB#0: 788; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 789; AVX1-NEXT: retq 790; 791; AVX2-LABEL: constant_shift_v4i32: 792; AVX2: # BB#0: 793; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 794; AVX2-NEXT: retq 795; 796; XOPAVX1-LABEL: constant_shift_v4i32: 797; XOPAVX1: # BB#0: 798; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 799; XOPAVX1-NEXT: retq 800; 801; XOPAVX2-LABEL: constant_shift_v4i32: 802; XOPAVX2: # BB#0: 803; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 804; XOPAVX2-NEXT: retq 805; 806; AVX512-LABEL: constant_shift_v4i32: 807; AVX512: ## BB#0: 808; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 809; AVX512-NEXT: retq 810; 811; X32-SSE-LABEL: constant_shift_v4i32: 812; X32-SSE: # BB#0: 813; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 814; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 815; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 816; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 817; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 818; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 819; X32-SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT:    retl
  %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %shift
}

; Per-lane shl of <8 x i16> by the constants <0..7>.  A shift-left by a
; constant is a multiply by a power of two, so SSE/AVX lower this to a single
; pmullw/vpmullw against a constant pool; XOP has a native per-lane shift
; (vpshlw) and AVX512BW a variable word shift (vpsllvw).
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: constant_shift_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: constant_shift_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v8i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}

; Per-lane shl of <16 x i8> by mixed constants.  x86 has no per-byte shift, so
; SSE2/X32-SSE expand to a bit-serial sequence (psllw by 4/2/1 with
; pcmpgtb/pandn/por selects keyed off the shift-amount sign bits), SSE41/AVX
; use psllw + pblendvb/vpblendvb selects, and only XOP gets a single vpshlb.
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE2-NEXT:    psllw $5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE41-NEXT:    psllw $5, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psllw $4, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psllw $2, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    paddb %xmm2, %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsllw $2, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v16i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpsllw $2, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; X32-SSE-NEXT:    psllw $5, %xmm2
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psllw $4, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psllw $2, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    paddb %xmm0, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}

;
; Uniform Constant Shifts
;

; Splat shl of <2 x i64> by 7: a single immediate psllq/vpsllq on every target.
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    psllq $7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllq $7, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vpsllq $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsllq $7, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllq $7, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

; Splat shl of <4 x i32> by 5: a single immediate pslld/vpslld on every target.
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    pslld $5, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpslld $5, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vpslld $5, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpslld $5, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pslld $5, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

; Splat shl of <8 x i16> by 3: a single immediate psllw/vpsllw on every target.
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    psllw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpsllw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $3, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

; Splat shl of <16 x i8> by 3: no byte shift exists, so SSE/AVX/AVX512 use a
; word shift (psllw $3) plus a pand mask to clear the bits shifted across byte
; boundaries; XOP uses a native vpshlb against a constant-pool splat.
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    psllw $3, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $3, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}