; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;

define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psllq %xmm3, %xmm2
; SSE2-NEXT: psllq %xmm1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: psllq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psllq %xmm3, %xmm2
; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT: psllq %xmm1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; X32-SSE-NEXT: movapd %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <2 x i64> %a, %b
  ret <2 x i64> %shift
}

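; Without a per-element 32-bit variable shift (AVX2's vpsllvd or XOP's
; vpshld), shl is lowered to a multiply by 2^amt: pslld $23 moves each
; amount into the float exponent field, paddd adds the 1.0f exponent bias,
; and cvttps2dq converts the resulting 2^amt back to an integer multiplier.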
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pslld $23, %xmm1
; X32-SSE-NEXT: paddd .LCPI1_0, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm0, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <4 x i32> %a, %b
  ret <4 x i32> %shift
}

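; Neither SSE nor AVX2 has a per-element 16-bit shift (only XOP's vpshlw),
; so v8i16 is shifted bit-serially: each bit of the 4-bit amount, tested
; MSB-first through the word's sign bit, selects between x and x shifted by
; 8/4/2/1, with paddw doubling the amounts to expose the next bit. AVX2
; instead widens to <8 x i32>, uses vpsllvd, and repacks the result.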
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psllw $1, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psllw $8, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psllw $4, %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psllw $2, %xmm1
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psllw $1, %xmm1
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: var_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psllw $8, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psllw $2, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psllw $1, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <8 x i16> %a, %b
  ret <8 x i16> %shift
}

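; x86 has no 8-bit vector shifts at all, so v16i8 reuses the blend ladder:
; psllw $5 moves bit 2 of each byte amount into that byte's sign bit for
; pcmpgtb/pblendvb to test, selecting x shifted by 4/2/1 in turn; the pand
; after each psllw clears bits dragged across byte boundaries (psllw shifts
; whole words), and the final shift-by-1 is simply paddb.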
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psllw $4, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psllw $2, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: paddb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: paddb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: var_shift_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand .LCPI3_0, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
; X32-SSE-NEXT: pand .LCPI3_1, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm1
; X32-SSE-NEXT: paddb %xmm0, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <16 x i8> %a, %b
  ret <16 x i8> %shift
}

;
; Uniform Variable Shifts
;

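; When all lanes use the same shift amount, the legacy shift-by-register
; forms (psllq/pslld/psllw) apply directly: they take one count from the
; low 64 bits of the operand, so the splatted amount only needs to be
; zero-extended into the bottom element.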
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
; SSE: # BB#0:
; SSE-NEXT: psllq %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT: psllq %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %shift = shl <2 x i64> %a, %splat
  ret <2 x i64> %shift
}

define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: pslld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT: pslld %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT: pslld %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shift = shl <4 x i32> %a, %splat
  ret <4 x i32> %shift
}

define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: psllw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movd %xmm1, %eax
; X32-SSE-NEXT: movzwl %ax, %eax
; X32-SSE-NEXT: movd %eax, %xmm1
; X32-SSE-NEXT: psllw %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %shift = shl <8 x i16> %a, %splat
  ret <8 x i16> %shift
}

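; There is no shift-by-register form for bytes, so the splatted v16i8
; amount is broadcast (punpcklbw/pshufd shuffles, pshufb, or vpbroadcastb)
; and then fed through the same psllw $5 + blend ladder as the general
; variable case.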
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psllw $4, %xmm4
; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psllw $2, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm1, %xmm1
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand .LCPI7_0, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
; X32-SSE-NEXT: pand .LCPI7_1, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: paddb %xmm0, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %shift = shl <16 x i8> %a, %splat
  ret <16 x i8> %shift
}

;
; Constant Shifts
;

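; A constant v2i64 shift with two different amounts becomes two immediate
; psllq ops (by 1 and by 7 here) blended back together; AVX2 and XOP load
; the amounts from the constant pool for vpsllvq/vpshlq.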
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllq $7, %xmm1
; SSE2-NEXT: psllq $1, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psllq $7, %xmm1
; SSE41-NEXT: psllq $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq $7, %xmm1
; X32-SSE-NEXT: psllq $1, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT: movapd %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <2 x i64> %a, <i64 1, i64 7>
  ret <2 x i64> %shift
}

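; A constant vector shift is a multiply by a constant vector: the amounts
; <4,5,6,7> become the multiplier [16,32,64,128], a single pmulld on SSE4.1
; but a pmuludq/pshufd sequence on SSE2, where pmuludq only multiplies the
; even lanes.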
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: retl
  %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %shift
}

define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: constant_shift_v8i16:
; SSE: # BB#0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: constant_shift_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pmullw .LCPI10_0, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}

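; Constant byte amounts still take the generic v16i8 ladder; the only
; difference from the variable case is that the shift-amount vector is a
; constant-pool load.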
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE41-NEXT: psllw $5, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $2, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: paddb %xmm2, %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand .LCPI11_1, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
; X32-SSE-NEXT: pand .LCPI11_2, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: paddb %xmm0, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}

;
; Uniform Constant Shifts
;

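; Uniform constant shifts map onto the immediate shift forms. v16i8 is
; again the exception: psllw $3 shifts whole words, so a pand with 0xf8
; bytes clears the bits carried in from the lower byte of each word.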
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE: # BB#0:
; SSE-NEXT: psllq $7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vpsllq $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllq $7, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE: # BB#0:
; SSE-NEXT: pslld $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpslld $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vpslld $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pslld $5, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE: # BB#0:
; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # BB#0:
; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: pand .LCPI15_0, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}