1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 8; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW 9 10; 11; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 12; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 13 14; 15; Variable Shifts 16; 17 18define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 19; SSE2-LABEL: var_shift_v2i64: 20; SSE2: # BB#0: 21; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] 22; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 23; SSE2-NEXT: movdqa %xmm2, %xmm4 24; SSE2-NEXT: psrlq %xmm3, %xmm4 25; SSE2-NEXT: psrlq %xmm1, %xmm2 26; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] 27; SSE2-NEXT: movdqa %xmm0, %xmm2 28; SSE2-NEXT: psrlq %xmm3, %xmm2 29; SSE2-NEXT: psrlq %xmm1, %xmm0 30; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 31; SSE2-NEXT: xorpd %xmm4, %xmm2 32; SSE2-NEXT: psubq %xmm4, %xmm2 33; SSE2-NEXT: movdqa %xmm2, %xmm0 34; SSE2-NEXT: retq 35; 36; SSE41-LABEL: var_shift_v2i64: 37; SSE41: # BB#0: 38; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 39; SSE41-NEXT: movdqa %xmm2, %xmm3 40; SSE41-NEXT: psrlq %xmm1, %xmm3 41; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] 42; SSE41-NEXT: psrlq %xmm4, %xmm2 43; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 44; SSE41-NEXT: movdqa %xmm0, %xmm3 45; SSE41-NEXT: psrlq %xmm1, %xmm3 46; SSE41-NEXT: psrlq %xmm4, %xmm0 47; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 48; SSE41-NEXT: pxor %xmm2, %xmm0 49; SSE41-NEXT: psubq %xmm2, %xmm0 50; SSE41-NEXT: retq 51; 52; AVX1-LABEL: var_shift_v2i64: 53; AVX1: # BB#0: 54; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 55; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3 56; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] 57; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2 58; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 59; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 60; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm0 61; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 62; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 63; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 64; AVX1-NEXT: retq 65; 66; AVX2-LABEL: var_shift_v2i64: 67; AVX2: # BB#0: 68; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 69; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm3 70; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 71; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 72; AVX2-NEXT: vpsubq %xmm3, %xmm0, %xmm0 73; AVX2-NEXT: retq 74; 75; XOP-LABEL: var_shift_v2i64: 76; XOP: # BB#0: 77; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 78; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 79; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0 80; XOP-NEXT: retq 81; 82; AVX512-LABEL: var_shift_v2i64: 83; AVX512: ## BB#0: 84; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 85; AVX512-NEXT: vpsrlvq %xmm1, %xmm2, %xmm3 86; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 87; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 88; AVX512-NEXT: vpsubq %xmm3, %xmm0, %xmm0 89; AVX512-NEXT: retq 90; 91; X32-SSE-LABEL: var_shift_v2i64: 92; X32-SSE: # BB#0: 93; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] 94; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] 95; X32-SSE-NEXT: movdqa %xmm3, %xmm4 96; X32-SSE-NEXT: psrlq %xmm2, %xmm4 97; X32-SSE-NEXT: movq {{.*#+}} xmm5 = xmm1[0],zero 98; X32-SSE-NEXT: psrlq %xmm5, %xmm3 99; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] 100; X32-SSE-NEXT: movdqa %xmm0, %xmm1 101; X32-SSE-NEXT: psrlq %xmm2, %xmm1 102; X32-SSE-NEXT: psrlq %xmm5, %xmm0 103; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 104; X32-SSE-NEXT: xorpd %xmm4, %xmm1 105; X32-SSE-NEXT: psubq %xmm4, %xmm1 106; X32-SSE-NEXT: movdqa %xmm1, %xmm0 107; X32-SSE-NEXT: retl 108 %shift = ashr <2 x i64> %a, %b 109 ret <2 x i64> %shift 110} 111 112define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 113; SSE2-LABEL: var_shift_v4i32: 114; SSE2: # BB#0: 115; SSE2-NEXT: movdqa %xmm1, %xmm2 116; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 117; SSE2-NEXT: movdqa %xmm0, %xmm3 118; SSE2-NEXT: psrad %xmm2, %xmm3 119; SSE2-NEXT: movdqa %xmm1, %xmm2 120; SSE2-NEXT: psrlq $32, %xmm2 121; SSE2-NEXT: movdqa %xmm0, %xmm4 122; SSE2-NEXT: psrad %xmm2, %xmm4 123; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] 124; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] 125; SSE2-NEXT: pxor %xmm3, %xmm3 126; SSE2-NEXT: movdqa %xmm1, %xmm4 127; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] 128; SSE2-NEXT: movdqa %xmm0, %xmm5 129; SSE2-NEXT: psrad %xmm4, %xmm5 130; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 131; SSE2-NEXT: psrad %xmm1, %xmm0 132; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] 133; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] 134; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 135; SSE2-NEXT: retq 136; 137; SSE41-LABEL: var_shift_v4i32: 138; SSE41: # BB#0: 139; SSE41-NEXT: movdqa %xmm1, %xmm2 140; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 141; SSE41-NEXT: movdqa %xmm0, %xmm3 142; SSE41-NEXT: psrad %xmm2, %xmm3 143; SSE41-NEXT: movdqa %xmm1, %xmm2 144; SSE41-NEXT: psrlq $32, %xmm2 145; SSE41-NEXT: movdqa %xmm0, %xmm4 146; SSE41-NEXT: psrad %xmm2, %xmm4 147; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 148; SSE41-NEXT: pxor %xmm2, %xmm2 149; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero 150; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] 151; SSE41-NEXT: movdqa %xmm0, %xmm2 152; SSE41-NEXT: psrad %xmm1, %xmm2 153; SSE41-NEXT: psrad %xmm3, %xmm0 154; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 155; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] 156; SSE41-NEXT: retq 157; 158; AVX1-LABEL: var_shift_v4i32: 159; AVX1: # BB#0: 160; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 161; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 162; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 163; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 164; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 165; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 166; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 167; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 168; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 169; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 170; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 171; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 172; AVX1-NEXT: retq 173; 174; AVX2-LABEL: var_shift_v4i32: 175; AVX2: # BB#0: 176; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 177; AVX2-NEXT: retq 178; 179; XOPAVX1-LABEL: var_shift_v4i32: 180; XOPAVX1: # BB#0: 181; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 182; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 183; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 184; XOPAVX1-NEXT: retq 185; 186; XOPAVX2-LABEL: var_shift_v4i32: 187; XOPAVX2: # BB#0: 188; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 189; XOPAVX2-NEXT: retq 190; 191; AVX512-LABEL: var_shift_v4i32: 192; AVX512: ## BB#0: 193; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 194; AVX512-NEXT: retq 195; 196; X32-SSE-LABEL: var_shift_v4i32: 197; X32-SSE: # BB#0: 198; X32-SSE-NEXT: movdqa %xmm1, %xmm2 199; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 200; X32-SSE-NEXT: movdqa %xmm0, %xmm3 201; X32-SSE-NEXT: psrad %xmm2, %xmm3 202; X32-SSE-NEXT: movdqa %xmm1, %xmm2 203; X32-SSE-NEXT: psrlq $32, %xmm2 204; X32-SSE-NEXT: movdqa %xmm0, %xmm4 205; X32-SSE-NEXT: psrad %xmm2, %xmm4 206; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] 207; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] 208; X32-SSE-NEXT: pxor %xmm3, %xmm3 209; X32-SSE-NEXT: movdqa %xmm1, %xmm4 210; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] 211; X32-SSE-NEXT: movdqa %xmm0, %xmm5 212; X32-SSE-NEXT: psrad %xmm4, %xmm5 213; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 214; X32-SSE-NEXT: psrad %xmm1, %xmm0 215; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] 216; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] 217; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 218; X32-SSE-NEXT: retl 219 %shift = ashr <4 x i32> %a, %b 220 ret <4 x i32> %shift 221} 222 223define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 224; SSE2-LABEL: var_shift_v8i16: 225; SSE2: # BB#0: 226; SSE2-NEXT: psllw $12, %xmm1 227; SSE2-NEXT: movdqa %xmm1, %xmm2 228; SSE2-NEXT: psraw $15, %xmm2 229; SSE2-NEXT: movdqa %xmm2, %xmm3 230; SSE2-NEXT: pandn %xmm0, %xmm3 231; SSE2-NEXT: psraw $8, %xmm0 232; SSE2-NEXT: pand %xmm2, %xmm0 233; SSE2-NEXT: por %xmm3, %xmm0 234; SSE2-NEXT: paddw %xmm1, %xmm1 235; SSE2-NEXT: movdqa %xmm1, %xmm2 236; SSE2-NEXT: psraw $15, %xmm2 237; SSE2-NEXT: movdqa %xmm2, %xmm3 238; SSE2-NEXT: pandn %xmm0, %xmm3 239; SSE2-NEXT: psraw $4, %xmm0 240; SSE2-NEXT: pand %xmm2, %xmm0 241; SSE2-NEXT: por %xmm3, %xmm0 242; SSE2-NEXT: paddw %xmm1, %xmm1 243; SSE2-NEXT: movdqa %xmm1, %xmm2 244; SSE2-NEXT: psraw $15, %xmm2 245; SSE2-NEXT: movdqa %xmm2, %xmm3 246; SSE2-NEXT: pandn %xmm0, %xmm3 247; SSE2-NEXT: psraw $2, %xmm0 248; SSE2-NEXT: pand %xmm2, %xmm0 249; SSE2-NEXT: por %xmm3, %xmm0 250; SSE2-NEXT: paddw %xmm1, %xmm1 251; SSE2-NEXT: psraw $15, %xmm1 252; SSE2-NEXT: movdqa %xmm1, %xmm2 253; SSE2-NEXT: pandn %xmm0, %xmm2 254; SSE2-NEXT: psraw $1, %xmm0 255; SSE2-NEXT: pand %xmm1, %xmm0 256; SSE2-NEXT: por %xmm2, %xmm0 257; SSE2-NEXT: retq 258; 259; SSE41-LABEL: var_shift_v8i16: 260; SSE41: # BB#0: 261; SSE41-NEXT: movdqa %xmm0, %xmm2 262; SSE41-NEXT: movdqa %xmm1, %xmm0 263; SSE41-NEXT: psllw $12, %xmm0 264; SSE41-NEXT: psllw $4, %xmm1 265; SSE41-NEXT: por %xmm0, %xmm1 266; SSE41-NEXT: movdqa %xmm1, %xmm3 267; SSE41-NEXT: paddw %xmm3, %xmm3 268; SSE41-NEXT: movdqa %xmm2, %xmm4 269; SSE41-NEXT: psraw $8, %xmm4 270; SSE41-NEXT: movdqa %xmm1, %xmm0 271; SSE41-NEXT: pblendvb %xmm4, %xmm2 272; SSE41-NEXT: movdqa %xmm2, %xmm1 273; SSE41-NEXT: psraw $4, %xmm1 274; SSE41-NEXT: movdqa %xmm3, %xmm0 275; SSE41-NEXT: pblendvb %xmm1, %xmm2 276; SSE41-NEXT: movdqa %xmm2, %xmm1 277; SSE41-NEXT: psraw $2, %xmm1 278; SSE41-NEXT: paddw %xmm3, %xmm3 279; SSE41-NEXT: movdqa %xmm3, %xmm0 280; SSE41-NEXT: pblendvb %xmm1, %xmm2 281; SSE41-NEXT: movdqa %xmm2, %xmm1 282; SSE41-NEXT: psraw $1, %xmm1 283; SSE41-NEXT: paddw %xmm3, %xmm3 284; SSE41-NEXT: movdqa %xmm3, %xmm0 285; SSE41-NEXT: pblendvb %xmm1, %xmm2 286; SSE41-NEXT: movdqa %xmm2, %xmm0 287; SSE41-NEXT: retq 288; 289; AVX1-LABEL: var_shift_v8i16: 290; AVX1: # BB#0: 291; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 292; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 293; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 294; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 295; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 296; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 297; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 298; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 299; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 300; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 301; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 302; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 303; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 304; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 305; AVX1-NEXT: retq 306; 307; AVX2-LABEL: var_shift_v8i16: 308; AVX2: # BB#0: 309; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 310; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 311; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 312; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 313; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 314; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 315; AVX2-NEXT: vzeroupper 316; AVX2-NEXT: retq 317; 318; XOP-LABEL: var_shift_v8i16: 319; XOP: # BB#0: 320; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 321; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 322; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 323; XOP-NEXT: retq 324; 325; AVX512-LABEL: var_shift_v8i16: 326; AVX512: ## BB#0: 327; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> 328; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> 329; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 330; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 331; AVX512-NEXT: retq 332; 333; X32-SSE-LABEL: var_shift_v8i16: 334; X32-SSE: # BB#0: 335; X32-SSE-NEXT: psllw $12, %xmm1 336; X32-SSE-NEXT: movdqa %xmm1, %xmm2 337; X32-SSE-NEXT: psraw $15, %xmm2 338; X32-SSE-NEXT: movdqa %xmm2, %xmm3 339; X32-SSE-NEXT: pandn %xmm0, %xmm3 340; X32-SSE-NEXT: psraw $8, %xmm0 341; X32-SSE-NEXT: pand %xmm2, %xmm0 342; X32-SSE-NEXT: por %xmm3, %xmm0 343; X32-SSE-NEXT: paddw %xmm1, %xmm1 344; X32-SSE-NEXT: movdqa %xmm1, %xmm2 345; X32-SSE-NEXT: psraw $15, %xmm2 346; X32-SSE-NEXT: movdqa %xmm2, %xmm3 347; X32-SSE-NEXT: pandn %xmm0, %xmm3 348; X32-SSE-NEXT: psraw $4, %xmm0 349; X32-SSE-NEXT: pand %xmm2, %xmm0 350; X32-SSE-NEXT: por %xmm3, %xmm0 351; X32-SSE-NEXT: paddw %xmm1, %xmm1 352; X32-SSE-NEXT: movdqa %xmm1, %xmm2 353; X32-SSE-NEXT: psraw $15, %xmm2 354; X32-SSE-NEXT: movdqa %xmm2, %xmm3 355; X32-SSE-NEXT: pandn %xmm0, %xmm3 356; X32-SSE-NEXT: psraw $2, %xmm0 357; X32-SSE-NEXT: pand %xmm2, %xmm0 358; X32-SSE-NEXT: por %xmm3, %xmm0 359; X32-SSE-NEXT: paddw %xmm1, %xmm1 360; X32-SSE-NEXT: psraw $15, %xmm1 361; X32-SSE-NEXT: movdqa %xmm1, %xmm2 362; X32-SSE-NEXT: pandn %xmm0, %xmm2 363; X32-SSE-NEXT: psraw $1, %xmm0 364; X32-SSE-NEXT: pand %xmm1, %xmm0 365; X32-SSE-NEXT: por %xmm2, %xmm0 366; X32-SSE-NEXT: retl 367 %shift = ashr <8 x i16> %a, %b 368 ret <8 x i16> %shift 369} 370 371define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 372; SSE2-LABEL: var_shift_v16i8: 373; SSE2: # BB#0: 374; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 375; SSE2-NEXT: psllw $5, %xmm1 376; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 377; SSE2-NEXT: pxor %xmm3, %xmm3 378; SSE2-NEXT: pxor %xmm5, %xmm5 379; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 380; SSE2-NEXT: movdqa %xmm5, %xmm6 381; SSE2-NEXT: pandn %xmm2, %xmm6 382; SSE2-NEXT: psraw $4, %xmm2 383; SSE2-NEXT: pand %xmm5, %xmm2 384; SSE2-NEXT: por %xmm6, %xmm2 385; SSE2-NEXT: paddw %xmm4, %xmm4 386; SSE2-NEXT: pxor %xmm5, %xmm5 387; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 388; SSE2-NEXT: movdqa %xmm5, %xmm6 389; SSE2-NEXT: pandn %xmm2, %xmm6 390; SSE2-NEXT: psraw $2, %xmm2 391; SSE2-NEXT: pand %xmm5, %xmm2 392; SSE2-NEXT: por %xmm6, %xmm2 393; SSE2-NEXT: paddw %xmm4, %xmm4 394; SSE2-NEXT: pxor %xmm5, %xmm5 395; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 396; SSE2-NEXT: movdqa %xmm5, %xmm4 397; SSE2-NEXT: pandn %xmm2, %xmm4 398; SSE2-NEXT: psraw $1, %xmm2 399; SSE2-NEXT: pand %xmm5, %xmm2 400; SSE2-NEXT: por %xmm4, %xmm2 401; SSE2-NEXT: psrlw $8, %xmm2 402; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 403; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 404; SSE2-NEXT: pxor %xmm4, %xmm4 405; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 406; SSE2-NEXT: movdqa %xmm4, %xmm5 407; SSE2-NEXT: pandn %xmm0, %xmm5 408; SSE2-NEXT: psraw $4, %xmm0 409; SSE2-NEXT: pand %xmm4, %xmm0 410; SSE2-NEXT: por %xmm5, %xmm0 411; SSE2-NEXT: paddw %xmm1, %xmm1 412; SSE2-NEXT: pxor %xmm4, %xmm4 413; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 414; SSE2-NEXT: movdqa %xmm4, %xmm5 415; SSE2-NEXT: pandn %xmm0, %xmm5 416; SSE2-NEXT: psraw $2, %xmm0 417; SSE2-NEXT: pand %xmm4, %xmm0 418; SSE2-NEXT: por %xmm5, %xmm0 419; SSE2-NEXT: paddw %xmm1, %xmm1 420; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 421; SSE2-NEXT: movdqa %xmm3, %xmm1 422; SSE2-NEXT: pandn %xmm0, %xmm1 423; SSE2-NEXT: psraw $1, %xmm0 424; SSE2-NEXT: pand %xmm3, %xmm0 425; SSE2-NEXT: por %xmm1, %xmm0 426; SSE2-NEXT: psrlw $8, %xmm0 427; SSE2-NEXT: packuswb %xmm2, %xmm0 428; SSE2-NEXT: retq 429; 430; SSE41-LABEL: var_shift_v16i8: 431; SSE41: # BB#0: 432; SSE41-NEXT: movdqa %xmm0, %xmm2 433; SSE41-NEXT: psllw $5, %xmm1 434; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 435; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 436; SSE41-NEXT: movdqa %xmm3, %xmm4 437; SSE41-NEXT: psraw $4, %xmm4 438; SSE41-NEXT: pblendvb %xmm4, %xmm3 439; SSE41-NEXT: movdqa %xmm3, %xmm4 440; SSE41-NEXT: psraw $2, %xmm4 441; SSE41-NEXT: paddw %xmm0, %xmm0 442; SSE41-NEXT: pblendvb %xmm4, %xmm3 443; SSE41-NEXT: movdqa %xmm3, %xmm4 444; SSE41-NEXT: psraw $1, %xmm4 445; SSE41-NEXT: paddw %xmm0, %xmm0 446; SSE41-NEXT: pblendvb %xmm4, %xmm3 447; SSE41-NEXT: psrlw $8, %xmm3 448; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 449; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 450; SSE41-NEXT: movdqa %xmm1, %xmm2 451; SSE41-NEXT: psraw $4, %xmm2 452; SSE41-NEXT: pblendvb %xmm2, %xmm1 453; SSE41-NEXT: movdqa %xmm1, %xmm2 454; SSE41-NEXT: psraw $2, %xmm2 455; SSE41-NEXT: paddw %xmm0, %xmm0 456; SSE41-NEXT: pblendvb %xmm2, %xmm1 457; SSE41-NEXT: movdqa %xmm1, %xmm2 458; SSE41-NEXT: psraw $1, %xmm2 459; SSE41-NEXT: paddw %xmm0, %xmm0 460; SSE41-NEXT: pblendvb %xmm2, %xmm1 461; SSE41-NEXT: psrlw $8, %xmm1 462; SSE41-NEXT: packuswb %xmm3, %xmm1 463; SSE41-NEXT: movdqa %xmm1, %xmm0 464; SSE41-NEXT: retq 465; 466; AVX-LABEL: var_shift_v16i8: 467; AVX: # BB#0: 468; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 469; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 470; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 471; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 472; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 473; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 474; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 475; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 476; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 477; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 478; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 479; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 480; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 481; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 482; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 483; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 484; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 485; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 486; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 487; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 488; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 489; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 490; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 491; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 492; AVX-NEXT: retq 493; 494; XOP-LABEL: var_shift_v16i8: 495; XOP: # BB#0: 496; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 497; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 498; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 499; XOP-NEXT: retq 500; 501; AVX512-LABEL: var_shift_v16i8: 502; AVX512: ## BB#0: 503; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 504; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 505; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 506; AVX512-NEXT: vpsraw $4, %xmm3, %xmm4 507; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 508; AVX512-NEXT: vpsraw $2, %xmm3, %xmm4 509; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 510; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 511; AVX512-NEXT: vpsraw $1, %xmm3, %xmm4 512; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 513; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 514; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 515; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 516; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 517; AVX512-NEXT: vpsraw $4, %xmm0, %xmm3 518; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 519; AVX512-NEXT: vpsraw $2, %xmm0, %xmm3 520; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 521; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 522; AVX512-NEXT: vpsraw $1, %xmm0, %xmm3 523; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 524; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 525; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 526; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 527; AVX512-NEXT: retq 528; 529; X32-SSE-LABEL: var_shift_v16i8: 530; X32-SSE: # BB#0: 531; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 532; X32-SSE-NEXT: psllw $5, %xmm1 533; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 534; X32-SSE-NEXT: pxor %xmm3, %xmm3 535; X32-SSE-NEXT: pxor %xmm5, %xmm5 536; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 537; X32-SSE-NEXT: movdqa %xmm5, %xmm6 538; X32-SSE-NEXT: pandn %xmm2, %xmm6 539; X32-SSE-NEXT: psraw $4, %xmm2 540; X32-SSE-NEXT: pand %xmm5, %xmm2 541; X32-SSE-NEXT: por %xmm6, %xmm2 542; X32-SSE-NEXT: paddw %xmm4, %xmm4 543; X32-SSE-NEXT: pxor %xmm5, %xmm5 544; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 545; X32-SSE-NEXT: movdqa %xmm5, %xmm6 546; X32-SSE-NEXT: pandn %xmm2, %xmm6 547; X32-SSE-NEXT: psraw $2, %xmm2 548; X32-SSE-NEXT: pand %xmm5, %xmm2 549; X32-SSE-NEXT: por %xmm6, %xmm2 550; X32-SSE-NEXT: paddw %xmm4, %xmm4 551; X32-SSE-NEXT: pxor %xmm5, %xmm5 552; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 553; X32-SSE-NEXT: movdqa %xmm5, %xmm4 554; X32-SSE-NEXT: pandn %xmm2, %xmm4 555; X32-SSE-NEXT: psraw $1, %xmm2 556; X32-SSE-NEXT: pand %xmm5, %xmm2 557; X32-SSE-NEXT: por %xmm4, %xmm2 558; X32-SSE-NEXT: psrlw $8, %xmm2 559; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 560; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 561; X32-SSE-NEXT: pxor %xmm4, %xmm4 562; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 563; X32-SSE-NEXT: movdqa %xmm4, %xmm5 564; X32-SSE-NEXT: pandn %xmm0, %xmm5 565; X32-SSE-NEXT: psraw $4, %xmm0 566; X32-SSE-NEXT: pand %xmm4, %xmm0 567; X32-SSE-NEXT: por %xmm5, %xmm0 568; X32-SSE-NEXT: paddw %xmm1, %xmm1 569; X32-SSE-NEXT: pxor %xmm4, %xmm4 570; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 571; X32-SSE-NEXT: movdqa %xmm4, %xmm5 572; X32-SSE-NEXT: pandn %xmm0, %xmm5 573; X32-SSE-NEXT: psraw $2, %xmm0 574; X32-SSE-NEXT: pand %xmm4, %xmm0 575; X32-SSE-NEXT: por %xmm5, %xmm0 576; X32-SSE-NEXT: paddw %xmm1, %xmm1 577; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 578; X32-SSE-NEXT: movdqa %xmm3, %xmm1 579; X32-SSE-NEXT: pandn %xmm0, %xmm1 580; X32-SSE-NEXT: psraw $1, %xmm0 581; X32-SSE-NEXT: pand %xmm3, %xmm0 582; X32-SSE-NEXT: por %xmm1, %xmm0 583; X32-SSE-NEXT: psrlw $8, %xmm0 584; X32-SSE-NEXT: packuswb %xmm2, %xmm0 585; X32-SSE-NEXT: retl 586 %shift = ashr <16 x i8> %a, %b 587 ret <16 x i8> %shift 588} 589 590; 591; Uniform Variable Shifts 592; 593 594define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 595; SSE-LABEL: splatvar_shift_v2i64: 596; SSE: # BB#0: 597; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 598; SSE-NEXT: psrlq %xmm1, %xmm2 599; SSE-NEXT: psrlq %xmm1, %xmm0 600; SSE-NEXT: pxor %xmm2, %xmm0 601; SSE-NEXT: psubq %xmm2, %xmm0 602; SSE-NEXT: retq 603; 604; AVX-LABEL: splatvar_shift_v2i64: 605; AVX: # BB#0: 606; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 607; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 608; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 609; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 610; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 611; AVX-NEXT: retq 612; 613; XOPAVX1-LABEL: splatvar_shift_v2i64: 614; XOPAVX1: # BB#0: 615; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 616; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 617; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 618; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 619; XOPAVX1-NEXT: retq 620; 621; XOPAVX2-LABEL: splatvar_shift_v2i64: 622; XOPAVX2: # BB#0: 623; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 624; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 625; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 626; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 627; XOPAVX2-NEXT: retq 628; 629; AVX512-LABEL: splatvar_shift_v2i64: 630; AVX512: ## BB#0: 631; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 632; AVX512-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 633; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 634; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 635; AVX512-NEXT: vpsubq %xmm2, %xmm0, %xmm0 636; AVX512-NEXT: retq 637; 638; X32-SSE-LABEL: splatvar_shift_v2i64: 639; X32-SSE: # BB#0: 640; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero 641; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] 642; X32-SSE-NEXT: psrlq %xmm1, %xmm2 643; X32-SSE-NEXT: psrlq %xmm1, %xmm0 644; X32-SSE-NEXT: pxor %xmm2, %xmm0 645; X32-SSE-NEXT: psubq %xmm2, %xmm0 646; X32-SSE-NEXT: retl 647 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer 648 %shift = ashr <2 x i64> %a, %splat 649 ret <2 x i64> %shift 650} 651 652define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 653; SSE2-LABEL: splatvar_shift_v4i32: 654; SSE2: # BB#0: 655; SSE2-NEXT: xorps %xmm2, %xmm2 656; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 657; SSE2-NEXT: psrad %xmm2, %xmm0 658; SSE2-NEXT: retq 659; 660; SSE41-LABEL: splatvar_shift_v4i32: 661; SSE41: # BB#0: 662; SSE41-NEXT: pxor %xmm2, %xmm2 663; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] 664; SSE41-NEXT: psrad %xmm2, %xmm0 665; SSE41-NEXT: retq 666; 667; AVX-LABEL: splatvar_shift_v4i32: 668; AVX: # BB#0: 669; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 670; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] 671; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 672; AVX-NEXT: retq 673; 674; XOP-LABEL: splatvar_shift_v4i32: 675; XOP: # BB#0: 676; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 677; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] 678; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 679; XOP-NEXT: retq 680; 681; AVX512-LABEL: splatvar_shift_v4i32: 682; AVX512: ## BB#0: 683; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 684; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] 685; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 686; AVX512-NEXT: retq 687; 688; X32-SSE-LABEL: splatvar_shift_v4i32: 689; X32-SSE: # BB#0: 690; X32-SSE-NEXT: xorps %xmm2, %xmm2 691; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 692; X32-SSE-NEXT: psrad %xmm2, %xmm0 693; X32-SSE-NEXT: retl 694 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer 695 %shift = ashr <4 x i32> %a, %splat 696 ret <4 x i32> %shift 697} 698 699define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 700; SSE2-LABEL: splatvar_shift_v8i16: 701; SSE2: # BB#0: 702; SSE2-NEXT: movd %xmm1, %eax 703; SSE2-NEXT: movzwl %ax, %eax 704; SSE2-NEXT: movd %eax, %xmm1 705; SSE2-NEXT: psraw %xmm1, %xmm0 706; SSE2-NEXT: retq 707; 708; SSE41-LABEL: splatvar_shift_v8i16: 709; SSE41: # BB#0: 710; SSE41-NEXT: pxor %xmm2, %xmm2 711; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] 712; SSE41-NEXT: psraw %xmm2, %xmm0 713; SSE41-NEXT: retq 714; 715; AVX-LABEL: splatvar_shift_v8i16: 716; AVX: # BB#0: 717; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 718; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 719; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 720; AVX-NEXT: retq 721; 722; XOP-LABEL: splatvar_shift_v8i16: 723; XOP: # BB#0: 724; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 725; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 726; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 727; XOP-NEXT: retq 728; 729; AVX512-LABEL: splatvar_shift_v8i16: 730; AVX512: ## BB#0: 731; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 732; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 733; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 734; AVX512-NEXT: retq 735; 736; X32-SSE-LABEL: splatvar_shift_v8i16: 737; X32-SSE: # BB#0: 738; X32-SSE-NEXT: movd %xmm1, %eax 739; X32-SSE-NEXT: movzwl %ax, %eax 740; X32-SSE-NEXT: movd %eax, %xmm1 741; X32-SSE-NEXT: psraw %xmm1, %xmm0 742; X32-SSE-NEXT: retl 743 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer 744 %shift = ashr <8 x i16> %a, %splat 745 ret <8 x i16> %shift 746} 747 748define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 749; SSE2-LABEL: splatvar_shift_v16i8: 750; SSE2: # BB#0: 751; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 752; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 753; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] 754; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 755; SSE2-NEXT: psllw $5, %xmm3 756; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 757; SSE2-NEXT: pxor %xmm2, %xmm2 758; SSE2-NEXT: pxor %xmm5, %xmm5 759; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 760; SSE2-NEXT: movdqa %xmm5, %xmm6 761; SSE2-NEXT: pandn %xmm1, %xmm6 762; SSE2-NEXT: psraw $4, %xmm1 763; SSE2-NEXT: pand %xmm5, %xmm1 764; SSE2-NEXT: por %xmm6, %xmm1 765; SSE2-NEXT: paddw %xmm4, %xmm4 766; SSE2-NEXT: pxor %xmm5, %xmm5 767; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 768; SSE2-NEXT: movdqa %xmm5, %xmm6 769; SSE2-NEXT: pandn %xmm1, %xmm6 770; SSE2-NEXT: psraw $2, %xmm1 771; SSE2-NEXT: pand %xmm5, %xmm1 772; SSE2-NEXT: por %xmm6, %xmm1 773; SSE2-NEXT: paddw %xmm4, %xmm4 774; SSE2-NEXT: pxor %xmm5, %xmm5 775; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 776; SSE2-NEXT: movdqa %xmm5, %xmm4 777; SSE2-NEXT: pandn %xmm1, %xmm4 778; SSE2-NEXT: psraw $1, %xmm1 779; SSE2-NEXT: pand %xmm5, %xmm1 780; SSE2-NEXT: por %xmm4, %xmm1 781; SSE2-NEXT: psrlw $8, %xmm1 782; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 783; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 784; SSE2-NEXT: pxor %xmm4, %xmm4 785; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 786; SSE2-NEXT: movdqa %xmm4, %xmm5 787; SSE2-NEXT: pandn %xmm0, %xmm5 788; SSE2-NEXT: psraw $4, %xmm0 789; SSE2-NEXT: pand %xmm4, %xmm0 790; SSE2-NEXT: por %xmm5, %xmm0 791; SSE2-NEXT: paddw %xmm3, %xmm3 792; SSE2-NEXT: pxor %xmm4, %xmm4 793; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 794; SSE2-NEXT: movdqa %xmm4, %xmm5 795; SSE2-NEXT: pandn %xmm0, %xmm5 796; SSE2-NEXT: psraw $2, %xmm0 797; SSE2-NEXT: pand %xmm4, %xmm0 798; SSE2-NEXT: por %xmm5, %xmm0 799; SSE2-NEXT: paddw %xmm3, %xmm3 800; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 801; SSE2-NEXT: movdqa %xmm2, %xmm3 802; SSE2-NEXT: pandn %xmm0, %xmm3 803; SSE2-NEXT: psraw $1, %xmm0 804; SSE2-NEXT: pand %xmm2, %xmm0 805; SSE2-NEXT: por %xmm3, %xmm0 806; SSE2-NEXT: psrlw $8, %xmm0 807; SSE2-NEXT: packuswb %xmm1, %xmm0 808; SSE2-NEXT: retq 809; 810; SSE41-LABEL: splatvar_shift_v16i8: 811; SSE41: # BB#0: 812; SSE41-NEXT: movdqa %xmm0, %xmm2 813; SSE41-NEXT: pxor %xmm0, %xmm0 814; SSE41-NEXT: pshufb %xmm0, %xmm1 815; SSE41-NEXT: psllw $5, %xmm1 816; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 817; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 818; SSE41-NEXT: movdqa %xmm3, %xmm4 819; SSE41-NEXT: psraw $4, %xmm4 820; SSE41-NEXT: pblendvb %xmm4, %xmm3 821; SSE41-NEXT: movdqa %xmm3, %xmm4 822; SSE41-NEXT: psraw $2, %xmm4 823; SSE41-NEXT: paddw %xmm0, %xmm0 824; SSE41-NEXT: pblendvb %xmm4, %xmm3 825; SSE41-NEXT: movdqa %xmm3, %xmm4 826; SSE41-NEXT: psraw $1, %xmm4 827; SSE41-NEXT: paddw %xmm0, %xmm0 828; SSE41-NEXT: pblendvb %xmm4, %xmm3 829; SSE41-NEXT: psrlw $8, %xmm3 830; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 831; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 832; SSE41-NEXT: movdqa %xmm1, %xmm2 833; SSE41-NEXT: psraw $4, %xmm2 834; SSE41-NEXT: pblendvb %xmm2, %xmm1 835; SSE41-NEXT: movdqa %xmm1, %xmm2 836; SSE41-NEXT: psraw $2, %xmm2 837; SSE41-NEXT: paddw %xmm0, %xmm0 838; SSE41-NEXT: pblendvb %xmm2, %xmm1 839; SSE41-NEXT: movdqa %xmm1, %xmm2 840; SSE41-NEXT: psraw $1, %xmm2 841; SSE41-NEXT: paddw %xmm0, %xmm0 842; SSE41-NEXT: pblendvb %xmm2, %xmm1 843; SSE41-NEXT: psrlw $8, %xmm1 844; SSE41-NEXT: packuswb %xmm3, %xmm1 845; SSE41-NEXT: movdqa %xmm1, %xmm0 846; SSE41-NEXT: retq 847; 848; AVX1-LABEL: splatvar_shift_v16i8: 849; AVX1: # BB#0: 850; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 851; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 852; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 853; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 854; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 855; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 856; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 857; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4 858; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 859; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 860; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4 861; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 862; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 863; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 864; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 865; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 866; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3 867; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 868; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3 869; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 870; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 871; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3 872; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 873; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 874; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 875; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 876; AVX1-NEXT: retq 877; 878; AVX2-LABEL: splatvar_shift_v16i8: 879; AVX2: # BB#0: 880; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 881; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 882; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 883; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 884; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4 885; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 886; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4 887; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 888; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 889; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4 890; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 891; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 892; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 893; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 894; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 895; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3 896; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 897; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3 898; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 899; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 900; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3 901; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 902; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 903; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 904; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 905; AVX2-NEXT: retq 906; 907; XOPAVX1-LABEL: splatvar_shift_v16i8: 908; XOPAVX1: # BB#0: 909; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 910; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 911; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 912; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 913; XOPAVX1-NEXT: retq 914; 915; XOPAVX2-LABEL: splatvar_shift_v16i8: 916; XOPAVX2: # BB#0: 917; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 918; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 919; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 920; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 921; XOPAVX2-NEXT: retq 922; 923; AVX512-LABEL: splatvar_shift_v16i8: 924; AVX512: ## BB#0: 925; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 926; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 927; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 928; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 929; AVX512-NEXT: vpsraw $4, %xmm3, %xmm4 930; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 931; AVX512-NEXT: vpsraw $2, %xmm3, %xmm4 932; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 933; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 934; AVX512-NEXT: vpsraw $1, %xmm3, %xmm4 935; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 936; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 937; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 938; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 939; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 940; AVX512-NEXT: vpsraw $4, %xmm0, %xmm3 941; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 942; AVX512-NEXT: vpsraw $2, %xmm0, %xmm3 943; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 944; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 945; AVX512-NEXT: vpsraw $1, %xmm0, %xmm3 946; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 947; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 948; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 949; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 950; AVX512-NEXT: retq 951; 952; X32-SSE-LABEL: splatvar_shift_v16i8: 953; X32-SSE: # BB#0: 954; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 955; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 956; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] 957; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 958; X32-SSE-NEXT: psllw $5, %xmm3 959; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 960; X32-SSE-NEXT: pxor %xmm2, %xmm2 961; X32-SSE-NEXT: pxor %xmm5, %xmm5 962; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 963; X32-SSE-NEXT: movdqa %xmm5, %xmm6 964; X32-SSE-NEXT: pandn %xmm1, %xmm6 965; X32-SSE-NEXT: psraw $4, %xmm1 966; X32-SSE-NEXT: pand %xmm5, %xmm1 967; X32-SSE-NEXT: por %xmm6, %xmm1 968; X32-SSE-NEXT: paddw %xmm4, %xmm4 969; X32-SSE-NEXT: pxor %xmm5, %xmm5 970; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 971; X32-SSE-NEXT: movdqa %xmm5, %xmm6 972; X32-SSE-NEXT: pandn %xmm1, %xmm6 973; X32-SSE-NEXT: psraw $2, %xmm1 974; X32-SSE-NEXT: pand %xmm5, %xmm1 975; X32-SSE-NEXT: por %xmm6, %xmm1 976; X32-SSE-NEXT: paddw %xmm4, %xmm4 977; X32-SSE-NEXT: pxor %xmm5, %xmm5 978; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 979; X32-SSE-NEXT: movdqa %xmm5, %xmm4 980; X32-SSE-NEXT: pandn %xmm1, %xmm4 981; X32-SSE-NEXT: psraw $1, %xmm1 982; X32-SSE-NEXT: pand %xmm5, %xmm1 983; X32-SSE-NEXT: por %xmm4, %xmm1 984; X32-SSE-NEXT: psrlw $8, %xmm1 985; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 986; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 987; X32-SSE-NEXT: pxor %xmm4, %xmm4 988; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm4 989; X32-SSE-NEXT: movdqa %xmm4, %xmm5 990; X32-SSE-NEXT: pandn %xmm0, %xmm5 991; X32-SSE-NEXT: psraw $4, %xmm0 992; X32-SSE-NEXT: pand %xmm4, %xmm0 993; X32-SSE-NEXT: por %xmm5, %xmm0 994; X32-SSE-NEXT: paddw %xmm3, %xmm3 995; X32-SSE-NEXT: pxor %xmm4, %xmm4 996; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm4 997; X32-SSE-NEXT: movdqa %xmm4, %xmm5 998; X32-SSE-NEXT: pandn %xmm0, %xmm5 999; X32-SSE-NEXT: psraw $2, %xmm0 1000; X32-SSE-NEXT: pand %xmm4, %xmm0 1001; X32-SSE-NEXT: por %xmm5, %xmm0 1002; X32-SSE-NEXT: paddw %xmm3, %xmm3 1003; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm2 1004; X32-SSE-NEXT: movdqa %xmm2, %xmm3 1005; X32-SSE-NEXT: pandn %xmm0, %xmm3 1006; X32-SSE-NEXT: psraw $1, %xmm0 1007; X32-SSE-NEXT: pand %xmm2, %xmm0 1008; X32-SSE-NEXT: por %xmm3, %xmm0 1009; X32-SSE-NEXT: psrlw $8, %xmm0 1010; X32-SSE-NEXT: packuswb %xmm1, %xmm0 1011; X32-SSE-NEXT: retl 1012 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer 1013 %shift = ashr <16 x i8> %a, %splat 1014 ret <16 x i8> %shift 1015} 1016 1017; 1018; Constant Shifts 1019; 1020 1021define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { 1022; SSE2-LABEL: constant_shift_v2i64: 1023; SSE2: # BB#0: 1024; SSE2-NEXT: movdqa %xmm0, %xmm1 1025; SSE2-NEXT: psrlq $7, %xmm1 1026; SSE2-NEXT: psrlq $1, %xmm0 1027; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1028; SSE2-NEXT: movapd {{.*#+}} xmm0 = [4611686018427387904,72057594037927936] 1029; SSE2-NEXT: xorpd %xmm0, %xmm1 1030; SSE2-NEXT: psubq %xmm0, %xmm1 1031; SSE2-NEXT: movdqa %xmm1, %xmm0 1032; SSE2-NEXT: retq 1033; 1034; SSE41-LABEL: constant_shift_v2i64: 1035; SSE41: # BB#0: 1036; SSE41-NEXT: movdqa %xmm0, %xmm1 1037; SSE41-NEXT: psrlq $7, %xmm1 1038; SSE41-NEXT: psrlq $1, %xmm0 1039; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1040; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1041; SSE41-NEXT: pxor %xmm1, %xmm0 1042; SSE41-NEXT: psubq %xmm1, %xmm0 1043; SSE41-NEXT: retq 1044; 1045; AVX1-LABEL: constant_shift_v2i64: 1046; AVX1: # BB#0: 1047; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 1048; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 1049; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1050; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1051; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1052; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 1053; AVX1-NEXT: retq 1054; 1055; AVX2-LABEL: constant_shift_v2i64: 1056; AVX2: # BB#0: 1057; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 1058; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1059; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1060; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 1061; AVX2-NEXT: retq 1062; 1063; XOP-LABEL: constant_shift_v2i64: 1064; XOP: # BB#0: 1065; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 1066; XOP-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1 1067; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0 1068; XOP-NEXT: retq 1069; 1070; AVX512-LABEL: constant_shift_v2i64: 1071; AVX512: ## BB#0: 1072; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 1073; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1074; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 1075; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 1076; AVX512-NEXT: retq 1077; 1078; X32-SSE-LABEL: constant_shift_v2i64: 1079; X32-SSE: # BB#0: 1080; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648] 1081; X32-SSE-NEXT: movdqa %xmm1, %xmm2 1082; X32-SSE-NEXT: psrlq $7, %xmm2 1083; X32-SSE-NEXT: psrlq $1, %xmm1 1084; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1085; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1086; X32-SSE-NEXT: psrlq $7, %xmm1 1087; X32-SSE-NEXT: psrlq $1, %xmm0 1088; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1089; X32-SSE-NEXT: xorpd %xmm2, %xmm1 1090; X32-SSE-NEXT: psubq %xmm2, %xmm1 1091; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1092; X32-SSE-NEXT: retl 1093 %shift = ashr <2 x i64> %a, <i64 1, i64 7> 1094 ret <2 x i64> %shift 1095} 1096 1097define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { 1098; SSE2-LABEL: constant_shift_v4i32: 1099; SSE2: # BB#0: 1100; SSE2-NEXT: movdqa %xmm0, %xmm1 1101; SSE2-NEXT: psrad $7, %xmm1 1102; SSE2-NEXT: movdqa %xmm0, %xmm2 1103; SSE2-NEXT: psrad $5, %xmm2 1104; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] 1105; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1106; SSE2-NEXT: movdqa %xmm0, %xmm2 1107; SSE2-NEXT: psrad $6, %xmm2 1108; SSE2-NEXT: psrad $4, %xmm0 1109; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 1110; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1111; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1112; SSE2-NEXT: retq 1113; 1114; SSE41-LABEL: constant_shift_v4i32: 1115; SSE41: # BB#0: 1116; SSE41-NEXT: movdqa %xmm0, %xmm1 1117; SSE41-NEXT: psrad $7, %xmm1 1118; SSE41-NEXT: movdqa %xmm0, %xmm2 1119; SSE41-NEXT: psrad $5, %xmm2 1120; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1121; SSE41-NEXT: movdqa %xmm0, %xmm1 1122; SSE41-NEXT: psrad $6, %xmm1 1123; SSE41-NEXT: psrad $4, %xmm0 1124; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1125; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1126; SSE41-NEXT: retq 1127; 1128; AVX1-LABEL: constant_shift_v4i32: 1129; AVX1: # BB#0: 1130; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 1131; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 1132; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1133; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 1134; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0 1135; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1136; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1137; AVX1-NEXT: retq 1138; 1139; AVX2-LABEL: constant_shift_v4i32: 1140; AVX2: # BB#0: 1141; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1142; AVX2-NEXT: retq 1143; 1144; XOPAVX1-LABEL: constant_shift_v4i32: 1145; XOPAVX1: # BB#0: 1146; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 1147; XOPAVX1-NEXT: retq 1148; 1149; XOPAVX2-LABEL: constant_shift_v4i32: 1150; XOPAVX2: # BB#0: 1151; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1152; XOPAVX2-NEXT: retq 1153; 1154; AVX512-LABEL: constant_shift_v4i32: 1155; AVX512: ## BB#0: 1156; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1157; AVX512-NEXT: retq 1158; 1159; X32-SSE-LABEL: constant_shift_v4i32: 1160; X32-SSE: # BB#0: 1161; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1162; X32-SSE-NEXT: psrad $7, %xmm1 1163; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1164; X32-SSE-NEXT: psrad $5, %xmm2 1165; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] 1166; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1167; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1168; X32-SSE-NEXT: psrad $6, %xmm2 1169; X32-SSE-NEXT: psrad $4, %xmm0 1170; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 1171; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1172; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1173; X32-SSE-NEXT: retl 1174 %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 1175 ret <4 x i32> %shift 1176} 1177 1178define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { 1179; SSE2-LABEL: constant_shift_v8i16: 1180; SSE2: # BB#0: 1181; SSE2-NEXT: movdqa %xmm0, %xmm1 1182; SSE2-NEXT: psraw $4, %xmm1 1183; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1184; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] 1185; SSE2-NEXT: psraw $2, %xmm1 1186; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 1187; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1188; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] 1189; SSE2-NEXT: movdqa %xmm2, %xmm1 1190; SSE2-NEXT: pand %xmm0, %xmm1 1191; SSE2-NEXT: psraw $1, %xmm2 1192; SSE2-NEXT: pandn %xmm2, %xmm0 1193; SSE2-NEXT: por %xmm1, %xmm0 1194; SSE2-NEXT: retq 1195; 1196; SSE41-LABEL: constant_shift_v8i16: 1197; SSE41: # BB#0: 1198; SSE41-NEXT: movdqa %xmm0, %xmm1 1199; SSE41-NEXT: psraw $4, %xmm1 1200; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1201; SSE41-NEXT: movdqa %xmm1, %xmm2 1202; SSE41-NEXT: psraw $2, %xmm2 1203; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1204; SSE41-NEXT: movdqa %xmm2, %xmm0 1205; SSE41-NEXT: psraw $1, %xmm0 1206; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] 1207; SSE41-NEXT: retq 1208; 1209; AVX1-LABEL: constant_shift_v8i16: 1210; AVX1: # BB#0: 1211; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 1212; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1213; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 1214; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1215; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 1216; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 1217; AVX1-NEXT: retq 1218; 1219; AVX2-LABEL: constant_shift_v8i16: 1220; AVX2: # BB#0: 1221; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 1222; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1223; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 1224; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1225; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 1226; AVX2-NEXT: vzeroupper 1227; AVX2-NEXT: retq 1228; 1229; XOP-LABEL: constant_shift_v8i16: 1230; XOP: # BB#0: 1231; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 1232; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm1 1233; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 1234; XOP-NEXT: retq 1235; 1236; AVX512-LABEL: constant_shift_v8i16: 1237; AVX512: ## BB#0: 1238; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> 1239; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1240; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1241; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 1242; AVX512-NEXT: retq 1243; 1244; X32-SSE-LABEL: constant_shift_v8i16: 1245; X32-SSE: # BB#0: 1246; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1247; X32-SSE-NEXT: psraw $4, %xmm1 1248; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1249; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] 1250; X32-SSE-NEXT: psraw $2, %xmm1 1251; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 1252; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1253; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] 1254; X32-SSE-NEXT: movdqa %xmm2, %xmm1 1255; X32-SSE-NEXT: pand %xmm0, %xmm1 1256; X32-SSE-NEXT: psraw $1, %xmm2 1257; X32-SSE-NEXT: pandn %xmm2, %xmm0 1258; X32-SSE-NEXT: por %xmm1, %xmm0 1259; X32-SSE-NEXT: retl 1260 %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1261 ret <8 x i16> %shift 1262} 1263 1264define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { 1265; SSE2-LABEL: constant_shift_v16i8: 1266; SSE2: # BB#0: 1267; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1268; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] 1269; SSE2-NEXT: psllw $5, %xmm3 1270; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 1271; SSE2-NEXT: pxor %xmm2, %xmm2 1272; SSE2-NEXT: pxor %xmm5, %xmm5 1273; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 1274; SSE2-NEXT: movdqa %xmm5, %xmm6 1275; SSE2-NEXT: pandn %xmm1, %xmm6 1276; SSE2-NEXT: psraw $4, %xmm1 1277; SSE2-NEXT: pand %xmm5, %xmm1 1278; SSE2-NEXT: por %xmm6, %xmm1 1279; SSE2-NEXT: paddw %xmm4, %xmm4 1280; SSE2-NEXT: pxor %xmm5, %xmm5 1281; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 1282; SSE2-NEXT: movdqa %xmm5, %xmm6 1283; SSE2-NEXT: pandn %xmm1, %xmm6 1284; SSE2-NEXT: psraw $2, %xmm1 1285; SSE2-NEXT: pand %xmm5, %xmm1 1286; SSE2-NEXT: por %xmm6, %xmm1 1287; SSE2-NEXT: paddw %xmm4, %xmm4 1288; SSE2-NEXT: pxor %xmm5, %xmm5 1289; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 1290; SSE2-NEXT: movdqa %xmm5, %xmm4 1291; SSE2-NEXT: pandn %xmm1, %xmm4 1292; SSE2-NEXT: psraw $1, %xmm1 1293; SSE2-NEXT: pand %xmm5, %xmm1 1294; SSE2-NEXT: por %xmm4, %xmm1 1295; SSE2-NEXT: psrlw $8, %xmm1 1296; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1297; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1298; SSE2-NEXT: pxor %xmm4, %xmm4 1299; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 1300; SSE2-NEXT: movdqa %xmm4, %xmm5 1301; SSE2-NEXT: pandn %xmm0, %xmm5 1302; SSE2-NEXT: psraw $4, %xmm0 1303; SSE2-NEXT: pand %xmm4, %xmm0 1304; SSE2-NEXT: por %xmm5, %xmm0 1305; SSE2-NEXT: paddw %xmm3, %xmm3 1306; SSE2-NEXT: pxor %xmm4, %xmm4 1307; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 1308; SSE2-NEXT: movdqa %xmm4, %xmm5 1309; SSE2-NEXT: pandn %xmm0, %xmm5 1310; SSE2-NEXT: psraw $2, %xmm0 1311; SSE2-NEXT: pand %xmm4, %xmm0 1312; SSE2-NEXT: por %xmm5, %xmm0 1313; SSE2-NEXT: paddw %xmm3, %xmm3 1314; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 1315; SSE2-NEXT: movdqa %xmm2, %xmm3 1316; SSE2-NEXT: pandn %xmm0, %xmm3 1317; SSE2-NEXT: psraw $1, %xmm0 1318; SSE2-NEXT: pand %xmm2, %xmm0 1319; SSE2-NEXT: por %xmm3, %xmm0 1320; SSE2-NEXT: psrlw $8, %xmm0 1321; SSE2-NEXT: packuswb %xmm1, %xmm0 1322; SSE2-NEXT: retq 1323; 1324; SSE41-LABEL: constant_shift_v16i8: 1325; SSE41: # BB#0: 1326; SSE41-NEXT: movdqa %xmm0, %xmm1 1327; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] 1328; SSE41-NEXT: psllw $5, %xmm3 1329; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 1330; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1331; SSE41-NEXT: movdqa %xmm2, %xmm4 1332; SSE41-NEXT: psraw $4, %xmm4 1333; SSE41-NEXT: pblendvb %xmm4, %xmm2 1334; SSE41-NEXT: movdqa %xmm2, %xmm4 1335; SSE41-NEXT: psraw $2, %xmm4 1336; SSE41-NEXT: paddw %xmm0, %xmm0 1337; SSE41-NEXT: pblendvb %xmm4, %xmm2 1338; SSE41-NEXT: movdqa %xmm2, %xmm4 1339; SSE41-NEXT: psraw $1, %xmm4 1340; SSE41-NEXT: paddw %xmm0, %xmm0 1341; SSE41-NEXT: pblendvb %xmm4, %xmm2 1342; SSE41-NEXT: psrlw $8, %xmm2 1343; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1344; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1345; SSE41-NEXT: movdqa %xmm1, %xmm3 1346; SSE41-NEXT: psraw $4, %xmm3 1347; SSE41-NEXT: pblendvb %xmm3, %xmm1 1348; SSE41-NEXT: movdqa %xmm1, %xmm3 1349; SSE41-NEXT: psraw $2, %xmm3 1350; SSE41-NEXT: paddw %xmm0, %xmm0 1351; SSE41-NEXT: pblendvb %xmm3, %xmm1 1352; SSE41-NEXT: movdqa %xmm1, %xmm3 1353; SSE41-NEXT: psraw $1, %xmm3 1354; SSE41-NEXT: paddw %xmm0, %xmm0 1355; SSE41-NEXT: pblendvb %xmm3, %xmm1 1356; SSE41-NEXT: psrlw $8, %xmm1 1357; SSE41-NEXT: packuswb %xmm2, %xmm1 1358; SSE41-NEXT: movdqa %xmm1, %xmm0 1359; SSE41-NEXT: retq 1360; 1361; AVX-LABEL: constant_shift_v16i8: 1362; AVX: # BB#0: 1363; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] 1364; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 1365; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1366; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1367; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 1368; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1369; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 1370; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1371; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1372; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 1373; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1374; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 1375; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 1376; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1377; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1378; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 1379; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1380; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 1381; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1382; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1383; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 1384; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1385; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1386; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1387; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1388; AVX-NEXT: retq 1389; 1390; XOP-LABEL: constant_shift_v16i8: 1391; XOP: # BB#0: 1392; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 1393; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1 1394; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 1395; XOP-NEXT: retq 1396; 1397; AVX512-LABEL: constant_shift_v16i8: 1398; AVX512: ## BB#0: 1399; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] 1400; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 1401; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1402; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1403; AVX512-NEXT: vpsraw $4, %xmm3, %xmm4 1404; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1405; AVX512-NEXT: vpsraw $2, %xmm3, %xmm4 1406; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1407; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1408; AVX512-NEXT: vpsraw $1, %xmm3, %xmm4 1409; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1410; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 1411; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 1412; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1413; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1414; AVX512-NEXT: vpsraw $4, %xmm0, %xmm3 1415; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1416; AVX512-NEXT: vpsraw $2, %xmm0, %xmm3 1417; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1418; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1419; AVX512-NEXT: vpsraw $1, %xmm0, %xmm3 1420; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1421; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1422; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 1423; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1424; AVX512-NEXT: retq 1425; 1426; X32-SSE-LABEL: constant_shift_v16i8: 1427; X32-SSE: # BB#0: 1428; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1429; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] 1430; X32-SSE-NEXT: psllw $5, %xmm3 1431; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 1432; X32-SSE-NEXT: pxor %xmm2, %xmm2 1433; X32-SSE-NEXT: pxor %xmm5, %xmm5 1434; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1435; X32-SSE-NEXT: movdqa %xmm5, %xmm6 1436; X32-SSE-NEXT: pandn %xmm1, %xmm6 1437; X32-SSE-NEXT: psraw $4, %xmm1 1438; X32-SSE-NEXT: pand %xmm5, %xmm1 1439; X32-SSE-NEXT: por %xmm6, %xmm1 1440; X32-SSE-NEXT: paddw %xmm4, %xmm4 1441; X32-SSE-NEXT: pxor %xmm5, %xmm5 1442; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1443; X32-SSE-NEXT: movdqa %xmm5, %xmm6 1444; X32-SSE-NEXT: pandn %xmm1, %xmm6 1445; X32-SSE-NEXT: psraw $2, %xmm1 1446; X32-SSE-NEXT: pand %xmm5, %xmm1 1447; X32-SSE-NEXT: por %xmm6, %xmm1 1448; X32-SSE-NEXT: paddw %xmm4, %xmm4 1449; X32-SSE-NEXT: pxor %xmm5, %xmm5 1450; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1451; X32-SSE-NEXT: movdqa %xmm5, %xmm4 1452; X32-SSE-NEXT: pandn %xmm1, %xmm4 1453; X32-SSE-NEXT: psraw $1, %xmm1 1454; X32-SSE-NEXT: pand %xmm5, %xmm1 1455; X32-SSE-NEXT: por %xmm4, %xmm1 1456; X32-SSE-NEXT: psrlw $8, %xmm1 1457; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1458; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1459; X32-SSE-NEXT: pxor %xmm4, %xmm4 1460; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm4 1461; X32-SSE-NEXT: movdqa %xmm4, %xmm5 1462; X32-SSE-NEXT: pandn %xmm0, %xmm5 1463; X32-SSE-NEXT: psraw $4, %xmm0 1464; X32-SSE-NEXT: pand %xmm4, %xmm0 1465; X32-SSE-NEXT: por %xmm5, %xmm0 1466; X32-SSE-NEXT: paddw %xmm3, %xmm3 1467; X32-SSE-NEXT: pxor %xmm4, %xmm4 1468; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm4 1469; X32-SSE-NEXT: movdqa %xmm4, %xmm5 1470; X32-SSE-NEXT: pandn %xmm0, %xmm5 1471; X32-SSE-NEXT: psraw $2, %xmm0 1472; X32-SSE-NEXT: pand %xmm4, %xmm0 1473; X32-SSE-NEXT: por %xmm5, %xmm0 1474; X32-SSE-NEXT: paddw %xmm3, %xmm3 1475; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm2 1476; X32-SSE-NEXT: movdqa %xmm2, %xmm3 1477; X32-SSE-NEXT: pandn %xmm0, %xmm3 1478; X32-SSE-NEXT: psraw $1, %xmm0 1479; X32-SSE-NEXT: pand %xmm2, %xmm0 1480; X32-SSE-NEXT: por %xmm3, %xmm0 1481; X32-SSE-NEXT: psrlw $8, %xmm0 1482; X32-SSE-NEXT: packuswb %xmm1, %xmm0 1483; X32-SSE-NEXT: retl 1484 %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> 1485 ret <16 x i8> %shift 1486} 1487 1488; 1489; Uniform Constant Shifts 1490; 1491 1492define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { 1493; SSE2-LABEL: splatconstant_shift_v2i64: 1494; SSE2: # BB#0: 1495; SSE2-NEXT: movdqa %xmm0, %xmm1 1496; SSE2-NEXT: psrad $7, %xmm1 1497; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1498; SSE2-NEXT: psrlq $7, %xmm0 1499; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1500; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1501; SSE2-NEXT: retq 1502; 1503; SSE41-LABEL: splatconstant_shift_v2i64: 1504; SSE41: # BB#0: 1505; SSE41-NEXT: movdqa %xmm0, %xmm1 1506; SSE41-NEXT: psrad $7, %xmm1 1507; SSE41-NEXT: psrlq $7, %xmm0 1508; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1509; SSE41-NEXT: retq 1510; 1511; AVX1-LABEL: splatconstant_shift_v2i64: 1512; AVX1: # BB#0: 1513; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 1514; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 1515; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1516; AVX1-NEXT: retq 1517; 1518; AVX2-LABEL: splatconstant_shift_v2i64: 1519; AVX2: # BB#0: 1520; AVX2-NEXT: vpsrad $7, %xmm0, %xmm1 1521; AVX2-NEXT: vpsrlq $7, %xmm0, %xmm0 1522; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 1523; AVX2-NEXT: retq 1524; 1525; XOP-LABEL: splatconstant_shift_v2i64: 1526; XOP: # BB#0: 1527; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 1528; XOP-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1 1529; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0 1530; XOP-NEXT: retq 1531; 1532; AVX512-LABEL: splatconstant_shift_v2i64: 1533; AVX512: ## BB#0: 1534; AVX512-NEXT: vpsrad $7, %xmm0, %xmm1 1535; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0 1536; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 1537; AVX512-NEXT: retq 1538; 1539; X32-SSE-LABEL: splatconstant_shift_v2i64: 1540; X32-SSE: # BB#0: 1541; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1542; X32-SSE-NEXT: psrad $7, %xmm1 1543; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1544; X32-SSE-NEXT: psrlq $7, %xmm0 1545; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1546; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1547; X32-SSE-NEXT: retl 1548 %shift = ashr <2 x i64> %a, <i64 7, i64 7> 1549 ret <2 x i64> %shift 1550} 1551 1552define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { 1553; SSE-LABEL: splatconstant_shift_v4i32: 1554; SSE: # BB#0: 1555; SSE-NEXT: psrad $5, %xmm0 1556; SSE-NEXT: retq 1557; 1558; AVX-LABEL: splatconstant_shift_v4i32: 1559; AVX: # BB#0: 1560; AVX-NEXT: vpsrad $5, %xmm0, %xmm0 1561; AVX-NEXT: retq 1562; 1563; XOP-LABEL: splatconstant_shift_v4i32: 1564; XOP: # BB#0: 1565; XOP-NEXT: vpsrad $5, %xmm0, %xmm0 1566; XOP-NEXT: retq 1567; 1568; AVX512-LABEL: splatconstant_shift_v4i32: 1569; AVX512: ## BB#0: 1570; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 1571; AVX512-NEXT: retq 1572; 1573; X32-SSE-LABEL: splatconstant_shift_v4i32: 1574; X32-SSE: # BB#0: 1575; X32-SSE-NEXT: psrad $5, %xmm0 1576; X32-SSE-NEXT: retl 1577 %shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5> 1578 ret <4 x i32> %shift 1579} 1580 1581define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { 1582; SSE-LABEL: splatconstant_shift_v8i16: 1583; SSE: # BB#0: 1584; SSE-NEXT: psraw $3, %xmm0 1585; SSE-NEXT: retq 1586; 1587; AVX-LABEL: splatconstant_shift_v8i16: 1588; AVX: # BB#0: 1589; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 1590; AVX-NEXT: retq 1591; 1592; XOP-LABEL: splatconstant_shift_v8i16: 1593; XOP: # BB#0: 1594; XOP-NEXT: vpsraw $3, %xmm0, %xmm0 1595; XOP-NEXT: retq 1596; 1597; AVX512-LABEL: splatconstant_shift_v8i16: 1598; AVX512: ## BB#0: 1599; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 1600; AVX512-NEXT: retq 1601; 1602; X32-SSE-LABEL: splatconstant_shift_v8i16: 1603; X32-SSE: # BB#0: 1604; X32-SSE-NEXT: psraw $3, %xmm0 1605; X32-SSE-NEXT: retl 1606 %shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> 1607 ret <8 x i16> %shift 1608} 1609 1610define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { 1611; SSE-LABEL: splatconstant_shift_v16i8: 1612; SSE: # BB#0: 1613; SSE-NEXT: psrlw $3, %xmm0 1614; SSE-NEXT: pand {{.*}}(%rip), %xmm0 1615; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1616; SSE-NEXT: pxor %xmm1, %xmm0 1617; SSE-NEXT: psubb %xmm1, %xmm0 1618; SSE-NEXT: retq 1619; 1620; AVX-LABEL: splatconstant_shift_v16i8: 1621; AVX: # BB#0: 1622; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 1623; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1624; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1625; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 1626; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1627; AVX-NEXT: retq 1628; 1629; XOP-LABEL: splatconstant_shift_v16i8: 1630; XOP: # BB#0: 1631; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 1632; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1 1633; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 1634; XOP-NEXT: retq 1635; 1636; AVX512-LABEL: splatconstant_shift_v16i8: 1637; AVX512: ## BB#0: 1638; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 1639; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1640; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1641; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 1642; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1643; AVX512-NEXT: retq 1644; 1645; X32-SSE-LABEL: splatconstant_shift_v16i8: 1646; X32-SSE: # BB#0: 1647; X32-SSE-NEXT: psrlw $3, %xmm0 1648; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 1649; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1650; X32-SSE-NEXT: pxor %xmm1, %xmm0 1651; X32-SSE-NEXT: psubb %xmm1, %xmm0 1652; X32-SSE-NEXT: retl 1653 %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> 1654 ret <16 x i8> %shift 1655} 1656