1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL 12; 13; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 15 16; 17; Variable Shifts 18; 19 20define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 21; SSE2-LABEL: var_shift_v2i64: 22; SSE2: # %bb.0: 23; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 24; SSE2-NEXT: movdqa %xmm2, %xmm3 25; SSE2-NEXT: psrlq %xmm1, %xmm3 26; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] 27; SSE2-NEXT: psrlq %xmm4, %xmm2 28; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] 29; SSE2-NEXT: movdqa %xmm0, %xmm3 30; SSE2-NEXT: psrlq %xmm1, %xmm3 31; SSE2-NEXT: psrlq %xmm4, %xmm0 32; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 33; SSE2-NEXT: xorpd %xmm2, %xmm0 34; SSE2-NEXT: psubq %xmm2, %xmm0 35; SSE2-NEXT: retq 36; 37; SSE41-LABEL: var_shift_v2i64: 38; SSE41: # %bb.0: 39; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 40; SSE41-NEXT: movdqa %xmm2, %xmm3 41; SSE41-NEXT: psrlq %xmm1, %xmm3 42; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] 43; SSE41-NEXT: psrlq %xmm4, %xmm2 44; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 45; SSE41-NEXT: movdqa %xmm0, %xmm3 46; SSE41-NEXT: psrlq %xmm1, %xmm3 47; SSE41-NEXT: psrlq %xmm4, %xmm0 48; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 49; SSE41-NEXT: pxor %xmm2, %xmm0 50; SSE41-NEXT: psubq %xmm2, %xmm0 51; SSE41-NEXT: retq 52; 53; AVX1-LABEL: var_shift_v2i64: 54; AVX1: # %bb.0: 55; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 56; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3 57; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] 58; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2 59; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 60; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 61; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm0 62; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 63; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 
64; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 65; AVX1-NEXT: retq 66; 67; AVX2-LABEL: var_shift_v2i64: 68; AVX2: # %bb.0: 69; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 70; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm3 71; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 72; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 73; AVX2-NEXT: vpsubq %xmm3, %xmm0, %xmm0 74; AVX2-NEXT: retq 75; 76; XOP-LABEL: var_shift_v2i64: 77; XOP: # %bb.0: 78; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 79; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 80; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0 81; XOP-NEXT: retq 82; 83; AVX512-LABEL: var_shift_v2i64: 84; AVX512: # %bb.0: 85; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 86; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 87; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 88; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 89; AVX512-NEXT: vzeroupper 90; AVX512-NEXT: retq 91; 92; AVX512VL-LABEL: var_shift_v2i64: 93; AVX512VL: # %bb.0: 94; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 95; AVX512VL-NEXT: retq 96; 97; X32-SSE-LABEL: var_shift_v2i64: 98; X32-SSE: # %bb.0: 99; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] 100; X32-SSE-NEXT: movdqa %xmm2, %xmm3 101; X32-SSE-NEXT: psrlq %xmm1, %xmm3 102; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] 103; X32-SSE-NEXT: psrlq %xmm4, %xmm2 104; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] 105; X32-SSE-NEXT: movdqa %xmm0, %xmm3 106; X32-SSE-NEXT: psrlq %xmm1, %xmm3 107; X32-SSE-NEXT: psrlq %xmm4, %xmm0 108; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 109; X32-SSE-NEXT: xorpd %xmm2, %xmm0 110; X32-SSE-NEXT: psubq %xmm2, %xmm0 111; X32-SSE-NEXT: retl 112 %shift = ashr <2 x i64> %a, %b 113 ret <2 x i64> %shift 114} 115 116define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 117; SSE2-LABEL: var_shift_v4i32: 118; SSE2: # %bb.0: 119; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 120; SSE2-NEXT: movdqa %xmm0, %xmm3 121; SSE2-NEXT: psrad 
%xmm2, %xmm3 122; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 123; SSE2-NEXT: movdqa %xmm0, %xmm2 124; SSE2-NEXT: psrad %xmm4, %xmm2 125; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 126; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 127; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 128; SSE2-NEXT: movdqa %xmm0, %xmm4 129; SSE2-NEXT: psrad %xmm3, %xmm4 130; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 131; SSE2-NEXT: psrad %xmm1, %xmm0 132; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 133; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 134; SSE2-NEXT: movaps %xmm2, %xmm0 135; SSE2-NEXT: retq 136; 137; SSE41-LABEL: var_shift_v4i32: 138; SSE41: # %bb.0: 139; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 140; SSE41-NEXT: movdqa %xmm0, %xmm3 141; SSE41-NEXT: psrad %xmm2, %xmm3 142; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] 143; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] 144; SSE41-NEXT: movdqa %xmm0, %xmm5 145; SSE41-NEXT: psrad %xmm4, %xmm5 146; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] 147; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 148; SSE41-NEXT: movdqa %xmm0, %xmm3 149; SSE41-NEXT: psrad %xmm1, %xmm3 150; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] 151; SSE41-NEXT: psrad %xmm1, %xmm0 152; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 153; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] 154; SSE41-NEXT: retq 155; 156; AVX1-LABEL: var_shift_v4i32: 157; AVX1: # %bb.0: 158; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 159; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 160; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 161; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 162; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 163; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 164; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = 
xmm1[2],xmm3[2],xmm1[3],xmm3[3] 165; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 166; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 167; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 168; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 169; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 170; AVX1-NEXT: retq 171; 172; AVX2-LABEL: var_shift_v4i32: 173; AVX2: # %bb.0: 174; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 175; AVX2-NEXT: retq 176; 177; XOPAVX1-LABEL: var_shift_v4i32: 178; XOPAVX1: # %bb.0: 179; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 180; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 181; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 182; XOPAVX1-NEXT: retq 183; 184; XOPAVX2-LABEL: var_shift_v4i32: 185; XOPAVX2: # %bb.0: 186; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 187; XOPAVX2-NEXT: retq 188; 189; AVX512-LABEL: var_shift_v4i32: 190; AVX512: # %bb.0: 191; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 192; AVX512-NEXT: retq 193; 194; AVX512VL-LABEL: var_shift_v4i32: 195; AVX512VL: # %bb.0: 196; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 197; AVX512VL-NEXT: retq 198; 199; X32-SSE-LABEL: var_shift_v4i32: 200; X32-SSE: # %bb.0: 201; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] 202; X32-SSE-NEXT: movdqa %xmm0, %xmm3 203; X32-SSE-NEXT: psrad %xmm2, %xmm3 204; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] 205; X32-SSE-NEXT: movdqa %xmm0, %xmm2 206; X32-SSE-NEXT: psrad %xmm4, %xmm2 207; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 208; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 209; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 210; X32-SSE-NEXT: movdqa %xmm0, %xmm4 211; X32-SSE-NEXT: psrad %xmm3, %xmm4 212; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 213; X32-SSE-NEXT: psrad %xmm1, %xmm0 214; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] 215; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 216; X32-SSE-NEXT: movaps %xmm2, %xmm0 217; 
X32-SSE-NEXT: retl 218 %shift = ashr <4 x i32> %a, %b 219 ret <4 x i32> %shift 220} 221 222define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 223; SSE2-LABEL: var_shift_v8i16: 224; SSE2: # %bb.0: 225; SSE2-NEXT: psllw $12, %xmm1 226; SSE2-NEXT: movdqa %xmm1, %xmm2 227; SSE2-NEXT: psraw $15, %xmm2 228; SSE2-NEXT: movdqa %xmm2, %xmm3 229; SSE2-NEXT: pandn %xmm0, %xmm3 230; SSE2-NEXT: psraw $8, %xmm0 231; SSE2-NEXT: pand %xmm2, %xmm0 232; SSE2-NEXT: por %xmm3, %xmm0 233; SSE2-NEXT: paddw %xmm1, %xmm1 234; SSE2-NEXT: movdqa %xmm1, %xmm2 235; SSE2-NEXT: psraw $15, %xmm2 236; SSE2-NEXT: movdqa %xmm2, %xmm3 237; SSE2-NEXT: pandn %xmm0, %xmm3 238; SSE2-NEXT: psraw $4, %xmm0 239; SSE2-NEXT: pand %xmm2, %xmm0 240; SSE2-NEXT: por %xmm3, %xmm0 241; SSE2-NEXT: paddw %xmm1, %xmm1 242; SSE2-NEXT: movdqa %xmm1, %xmm2 243; SSE2-NEXT: psraw $15, %xmm2 244; SSE2-NEXT: movdqa %xmm2, %xmm3 245; SSE2-NEXT: pandn %xmm0, %xmm3 246; SSE2-NEXT: psraw $2, %xmm0 247; SSE2-NEXT: pand %xmm2, %xmm0 248; SSE2-NEXT: por %xmm3, %xmm0 249; SSE2-NEXT: paddw %xmm1, %xmm1 250; SSE2-NEXT: psraw $15, %xmm1 251; SSE2-NEXT: movdqa %xmm1, %xmm2 252; SSE2-NEXT: pandn %xmm0, %xmm2 253; SSE2-NEXT: psraw $1, %xmm0 254; SSE2-NEXT: pand %xmm1, %xmm0 255; SSE2-NEXT: por %xmm2, %xmm0 256; SSE2-NEXT: retq 257; 258; SSE41-LABEL: var_shift_v8i16: 259; SSE41: # %bb.0: 260; SSE41-NEXT: movdqa %xmm0, %xmm2 261; SSE41-NEXT: movdqa %xmm1, %xmm0 262; SSE41-NEXT: psllw $12, %xmm0 263; SSE41-NEXT: psllw $4, %xmm1 264; SSE41-NEXT: por %xmm0, %xmm1 265; SSE41-NEXT: movdqa %xmm1, %xmm3 266; SSE41-NEXT: paddw %xmm1, %xmm3 267; SSE41-NEXT: movdqa %xmm2, %xmm4 268; SSE41-NEXT: psraw $8, %xmm4 269; SSE41-NEXT: movdqa %xmm1, %xmm0 270; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2 271; SSE41-NEXT: movdqa %xmm2, %xmm1 272; SSE41-NEXT: psraw $4, %xmm1 273; SSE41-NEXT: movdqa %xmm3, %xmm0 274; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 275; SSE41-NEXT: movdqa %xmm2, %xmm1 276; SSE41-NEXT: psraw $2, %xmm1 277; SSE41-NEXT: 
paddw %xmm3, %xmm3 278; SSE41-NEXT: movdqa %xmm3, %xmm0 279; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 280; SSE41-NEXT: movdqa %xmm2, %xmm1 281; SSE41-NEXT: psraw $1, %xmm1 282; SSE41-NEXT: paddw %xmm3, %xmm3 283; SSE41-NEXT: movdqa %xmm3, %xmm0 284; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 285; SSE41-NEXT: movdqa %xmm2, %xmm0 286; SSE41-NEXT: retq 287; 288; AVX1-LABEL: var_shift_v8i16: 289; AVX1: # %bb.0: 290; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 291; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 292; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 293; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 294; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 295; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 296; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 297; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 298; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 299; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 300; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 301; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 302; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 303; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 304; AVX1-NEXT: retq 305; 306; AVX2-LABEL: var_shift_v8i16: 307; AVX2: # %bb.0: 308; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 309; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 310; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 311; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 312; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 313; AVX2-NEXT: vzeroupper 314; AVX2-NEXT: retq 315; 316; XOP-LABEL: var_shift_v8i16: 317; XOP: # %bb.0: 318; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 319; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 320; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 321; XOP-NEXT: retq 322; 323; AVX512DQ-LABEL: var_shift_v8i16: 324; AVX512DQ: # %bb.0: 325; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 326; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 327; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 328; 
AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 329; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 330; AVX512DQ-NEXT: vzeroupper 331; AVX512DQ-NEXT: retq 332; 333; AVX512BW-LABEL: var_shift_v8i16: 334; AVX512BW: # %bb.0: 335; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 336; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 337; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 338; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 339; AVX512BW-NEXT: vzeroupper 340; AVX512BW-NEXT: retq 341; 342; AVX512DQVL-LABEL: var_shift_v8i16: 343; AVX512DQVL: # %bb.0: 344; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 345; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 346; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 347; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 348; AVX512DQVL-NEXT: vzeroupper 349; AVX512DQVL-NEXT: retq 350; 351; AVX512BWVL-LABEL: var_shift_v8i16: 352; AVX512BWVL: # %bb.0: 353; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 354; AVX512BWVL-NEXT: retq 355; 356; X32-SSE-LABEL: var_shift_v8i16: 357; X32-SSE: # %bb.0: 358; X32-SSE-NEXT: psllw $12, %xmm1 359; X32-SSE-NEXT: movdqa %xmm1, %xmm2 360; X32-SSE-NEXT: psraw $15, %xmm2 361; X32-SSE-NEXT: movdqa %xmm2, %xmm3 362; X32-SSE-NEXT: pandn %xmm0, %xmm3 363; X32-SSE-NEXT: psraw $8, %xmm0 364; X32-SSE-NEXT: pand %xmm2, %xmm0 365; X32-SSE-NEXT: por %xmm3, %xmm0 366; X32-SSE-NEXT: paddw %xmm1, %xmm1 367; X32-SSE-NEXT: movdqa %xmm1, %xmm2 368; X32-SSE-NEXT: psraw $15, %xmm2 369; X32-SSE-NEXT: movdqa %xmm2, %xmm3 370; X32-SSE-NEXT: pandn %xmm0, %xmm3 371; X32-SSE-NEXT: psraw $4, %xmm0 372; X32-SSE-NEXT: pand %xmm2, %xmm0 373; X32-SSE-NEXT: por %xmm3, %xmm0 374; X32-SSE-NEXT: paddw %xmm1, %xmm1 375; X32-SSE-NEXT: movdqa %xmm1, %xmm2 376; X32-SSE-NEXT: psraw $15, %xmm2 377; X32-SSE-NEXT: movdqa %xmm2, %xmm3 378; X32-SSE-NEXT: pandn %xmm0, %xmm3 379; X32-SSE-NEXT: psraw $2, %xmm0 380; X32-SSE-NEXT: pand %xmm2, 
%xmm0 381; X32-SSE-NEXT: por %xmm3, %xmm0 382; X32-SSE-NEXT: paddw %xmm1, %xmm1 383; X32-SSE-NEXT: psraw $15, %xmm1 384; X32-SSE-NEXT: movdqa %xmm1, %xmm2 385; X32-SSE-NEXT: pandn %xmm0, %xmm2 386; X32-SSE-NEXT: psraw $1, %xmm0 387; X32-SSE-NEXT: pand %xmm1, %xmm0 388; X32-SSE-NEXT: por %xmm2, %xmm0 389; X32-SSE-NEXT: retl 390 %shift = ashr <8 x i16> %a, %b 391 ret <8 x i16> %shift 392} 393 394define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 395; SSE2-LABEL: var_shift_v16i8: 396; SSE2: # %bb.0: 397; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 398; SSE2-NEXT: psllw $5, %xmm1 399; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 400; SSE2-NEXT: pxor %xmm3, %xmm3 401; SSE2-NEXT: pxor %xmm5, %xmm5 402; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 403; SSE2-NEXT: movdqa %xmm5, %xmm6 404; SSE2-NEXT: pandn %xmm2, %xmm6 405; SSE2-NEXT: psraw $4, %xmm2 406; SSE2-NEXT: pand %xmm5, %xmm2 407; SSE2-NEXT: por %xmm6, %xmm2 408; SSE2-NEXT: paddw %xmm4, %xmm4 409; SSE2-NEXT: pxor %xmm5, %xmm5 410; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 411; SSE2-NEXT: movdqa %xmm5, %xmm6 412; SSE2-NEXT: pandn %xmm2, %xmm6 413; SSE2-NEXT: psraw $2, %xmm2 414; SSE2-NEXT: pand %xmm5, %xmm2 415; SSE2-NEXT: por %xmm6, %xmm2 416; SSE2-NEXT: paddw %xmm4, %xmm4 417; SSE2-NEXT: pxor %xmm5, %xmm5 418; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 419; SSE2-NEXT: movdqa %xmm5, %xmm4 420; SSE2-NEXT: pandn %xmm2, %xmm4 421; SSE2-NEXT: psraw $1, %xmm2 422; SSE2-NEXT: pand %xmm5, %xmm2 423; SSE2-NEXT: por %xmm4, %xmm2 424; SSE2-NEXT: psrlw $8, %xmm2 425; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 426; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 427; SSE2-NEXT: pxor %xmm4, %xmm4 428; SSE2-NEXT: pcmpgtw 
%xmm1, %xmm4 429; SSE2-NEXT: movdqa %xmm4, %xmm5 430; SSE2-NEXT: pandn %xmm0, %xmm5 431; SSE2-NEXT: psraw $4, %xmm0 432; SSE2-NEXT: pand %xmm4, %xmm0 433; SSE2-NEXT: por %xmm5, %xmm0 434; SSE2-NEXT: paddw %xmm1, %xmm1 435; SSE2-NEXT: pxor %xmm4, %xmm4 436; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 437; SSE2-NEXT: movdqa %xmm4, %xmm5 438; SSE2-NEXT: pandn %xmm0, %xmm5 439; SSE2-NEXT: psraw $2, %xmm0 440; SSE2-NEXT: pand %xmm4, %xmm0 441; SSE2-NEXT: por %xmm5, %xmm0 442; SSE2-NEXT: paddw %xmm1, %xmm1 443; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 444; SSE2-NEXT: movdqa %xmm3, %xmm1 445; SSE2-NEXT: pandn %xmm0, %xmm1 446; SSE2-NEXT: psraw $1, %xmm0 447; SSE2-NEXT: pand %xmm3, %xmm0 448; SSE2-NEXT: por %xmm1, %xmm0 449; SSE2-NEXT: psrlw $8, %xmm0 450; SSE2-NEXT: packuswb %xmm2, %xmm0 451; SSE2-NEXT: retq 452; 453; SSE41-LABEL: var_shift_v16i8: 454; SSE41: # %bb.0: 455; SSE41-NEXT: movdqa %xmm0, %xmm2 456; SSE41-NEXT: psllw $5, %xmm1 457; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 458; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 459; SSE41-NEXT: movdqa %xmm3, %xmm4 460; SSE41-NEXT: psraw $4, %xmm4 461; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 462; SSE41-NEXT: movdqa %xmm3, %xmm4 463; SSE41-NEXT: psraw $2, %xmm4 464; SSE41-NEXT: paddw %xmm0, %xmm0 465; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 466; SSE41-NEXT: movdqa %xmm3, %xmm4 467; SSE41-NEXT: psraw $1, %xmm4 468; SSE41-NEXT: paddw %xmm0, %xmm0 469; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 470; SSE41-NEXT: psrlw $8, %xmm3 471; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 472; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 473; SSE41-NEXT: movdqa %xmm1, %xmm2 474; SSE41-NEXT: psraw $4, %xmm2 475; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 476; SSE41-NEXT: movdqa %xmm1, %xmm2 477; SSE41-NEXT: psraw $2, %xmm2 478; SSE41-NEXT: paddw %xmm0, %xmm0 479; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 480; SSE41-NEXT: movdqa %xmm1, %xmm2 481; SSE41-NEXT: psraw $1, %xmm2 482; SSE41-NEXT: paddw %xmm0, %xmm0 483; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 484; SSE41-NEXT: psrlw $8, %xmm1 485; SSE41-NEXT: packuswb %xmm3, %xmm1 486; SSE41-NEXT: movdqa %xmm1, %xmm0 487; SSE41-NEXT: retq 488; 489; AVX-LABEL: var_shift_v16i8: 490; AVX: # %bb.0: 491; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 492; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 493; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 494; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 495; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 496; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 497; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 498; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 499; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 500; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 501; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 502; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 503; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 504; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 505; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 506; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 507; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 508; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 509; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 510; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 511; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 
512; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 513; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 514; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 515; AVX-NEXT: retq 516; 517; XOP-LABEL: var_shift_v16i8: 518; XOP: # %bb.0: 519; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 520; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 521; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 522; XOP-NEXT: retq 523; 524; AVX512DQ-LABEL: var_shift_v16i8: 525; AVX512DQ: # %bb.0: 526; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 527; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 528; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 529; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 530; AVX512DQ-NEXT: vzeroupper 531; AVX512DQ-NEXT: retq 532; 533; AVX512BW-LABEL: var_shift_v16i8: 534; AVX512BW: # %bb.0: 535; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 536; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 537; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 538; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 539; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 540; AVX512BW-NEXT: vzeroupper 541; AVX512BW-NEXT: retq 542; 543; AVX512DQVL-LABEL: var_shift_v16i8: 544; AVX512DQVL: # %bb.0: 545; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 546; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 547; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 548; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 549; AVX512DQVL-NEXT: vzeroupper 550; AVX512DQVL-NEXT: retq 551; 552; AVX512BWVL-LABEL: var_shift_v16i8: 553; AVX512BWVL: # %bb.0: 554; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 555; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 556; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 557; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 558; AVX512BWVL-NEXT: vzeroupper 559; AVX512BWVL-NEXT: retq 560; 561; X32-SSE-LABEL: var_shift_v16i8: 562; X32-SSE: # %bb.0: 563; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 564; X32-SSE-NEXT: psllw $5, %xmm1 565; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 566; X32-SSE-NEXT: pxor %xmm3, %xmm3 567; X32-SSE-NEXT: pxor %xmm5, %xmm5 568; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 569; X32-SSE-NEXT: movdqa %xmm5, %xmm6 570; X32-SSE-NEXT: pandn %xmm2, %xmm6 571; X32-SSE-NEXT: psraw $4, %xmm2 572; X32-SSE-NEXT: pand %xmm5, %xmm2 573; X32-SSE-NEXT: por %xmm6, %xmm2 574; X32-SSE-NEXT: paddw %xmm4, %xmm4 575; X32-SSE-NEXT: pxor %xmm5, %xmm5 576; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 577; 
X32-SSE-NEXT: movdqa %xmm5, %xmm6 578; X32-SSE-NEXT: pandn %xmm2, %xmm6 579; X32-SSE-NEXT: psraw $2, %xmm2 580; X32-SSE-NEXT: pand %xmm5, %xmm2 581; X32-SSE-NEXT: por %xmm6, %xmm2 582; X32-SSE-NEXT: paddw %xmm4, %xmm4 583; X32-SSE-NEXT: pxor %xmm5, %xmm5 584; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 585; X32-SSE-NEXT: movdqa %xmm5, %xmm4 586; X32-SSE-NEXT: pandn %xmm2, %xmm4 587; X32-SSE-NEXT: psraw $1, %xmm2 588; X32-SSE-NEXT: pand %xmm5, %xmm2 589; X32-SSE-NEXT: por %xmm4, %xmm2 590; X32-SSE-NEXT: psrlw $8, %xmm2 591; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 592; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 593; X32-SSE-NEXT: pxor %xmm4, %xmm4 594; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 595; X32-SSE-NEXT: movdqa %xmm4, %xmm5 596; X32-SSE-NEXT: pandn %xmm0, %xmm5 597; X32-SSE-NEXT: psraw $4, %xmm0 598; X32-SSE-NEXT: pand %xmm4, %xmm0 599; X32-SSE-NEXT: por %xmm5, %xmm0 600; X32-SSE-NEXT: paddw %xmm1, %xmm1 601; X32-SSE-NEXT: pxor %xmm4, %xmm4 602; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 603; X32-SSE-NEXT: movdqa %xmm4, %xmm5 604; X32-SSE-NEXT: pandn %xmm0, %xmm5 605; X32-SSE-NEXT: psraw $2, %xmm0 606; X32-SSE-NEXT: pand %xmm4, %xmm0 607; X32-SSE-NEXT: por %xmm5, %xmm0 608; X32-SSE-NEXT: paddw %xmm1, %xmm1 609; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 610; X32-SSE-NEXT: movdqa %xmm3, %xmm1 611; X32-SSE-NEXT: pandn %xmm0, %xmm1 612; X32-SSE-NEXT: psraw $1, %xmm0 613; X32-SSE-NEXT: pand %xmm3, %xmm0 614; X32-SSE-NEXT: por %xmm1, %xmm0 615; X32-SSE-NEXT: psrlw $8, %xmm0 616; X32-SSE-NEXT: packuswb %xmm2, %xmm0 617; X32-SSE-NEXT: retl 618 %shift = ashr <16 x i8> %a, %b 619 ret <16 x i8> %shift 620} 621 622; 623; Uniform Variable Shifts 624; 625 626define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 627; SSE-LABEL: splatvar_shift_v2i64: 628; SSE: # %bb.0: 629; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 630; SSE-NEXT: psrlq %xmm1, %xmm2 631; SSE-NEXT: psrlq %xmm1, 
%xmm0 632; SSE-NEXT: pxor %xmm2, %xmm0 633; SSE-NEXT: psubq %xmm2, %xmm0 634; SSE-NEXT: retq 635; 636; AVX-LABEL: splatvar_shift_v2i64: 637; AVX: # %bb.0: 638; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] 639; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 640; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 641; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 642; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 643; AVX-NEXT: retq 644; 645; XOPAVX1-LABEL: splatvar_shift_v2i64: 646; XOPAVX1: # %bb.0: 647; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 648; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 649; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 650; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 651; XOPAVX1-NEXT: retq 652; 653; XOPAVX2-LABEL: splatvar_shift_v2i64: 654; XOPAVX2: # %bb.0: 655; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 656; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 657; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 658; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 659; XOPAVX2-NEXT: retq 660; 661; AVX512-LABEL: splatvar_shift_v2i64: 662; AVX512: # %bb.0: 663; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 664; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 665; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 666; AVX512-NEXT: vzeroupper 667; AVX512-NEXT: retq 668; 669; AVX512VL-LABEL: splatvar_shift_v2i64: 670; AVX512VL: # %bb.0: 671; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0 672; AVX512VL-NEXT: retq 673; 674; X32-SSE-LABEL: splatvar_shift_v2i64: 675; X32-SSE: # %bb.0: 676; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] 677; X32-SSE-NEXT: psrlq %xmm1, %xmm2 678; X32-SSE-NEXT: psrlq %xmm1, %xmm0 679; X32-SSE-NEXT: pxor %xmm2, %xmm0 680; X32-SSE-NEXT: psubq %xmm2, %xmm0 681; X32-SSE-NEXT: retl 682 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer 683 %shift = ashr <2 x i64> %a, %splat 684 ret <2 x i64> %shift 685} 686 687define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 688; SSE2-LABEL: 
splatvar_shift_v4i32: 689; SSE2: # %bb.0: 690; SSE2-NEXT: xorps %xmm2, %xmm2 691; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 692; SSE2-NEXT: psrad %xmm2, %xmm0 693; SSE2-NEXT: retq 694; 695; SSE41-LABEL: splatvar_shift_v4i32: 696; SSE41: # %bb.0: 697; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 698; SSE41-NEXT: psrad %xmm1, %xmm0 699; SSE41-NEXT: retq 700; 701; AVX-LABEL: splatvar_shift_v4i32: 702; AVX: # %bb.0: 703; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 704; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 705; AVX-NEXT: retq 706; 707; XOP-LABEL: splatvar_shift_v4i32: 708; XOP: # %bb.0: 709; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 710; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 711; XOP-NEXT: retq 712; 713; AVX512-LABEL: splatvar_shift_v4i32: 714; AVX512: # %bb.0: 715; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 716; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 717; AVX512-NEXT: retq 718; 719; AVX512VL-LABEL: splatvar_shift_v4i32: 720; AVX512VL: # %bb.0: 721; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 722; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 723; AVX512VL-NEXT: retq 724; 725; X32-SSE-LABEL: splatvar_shift_v4i32: 726; X32-SSE: # %bb.0: 727; X32-SSE-NEXT: xorps %xmm2, %xmm2 728; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 729; X32-SSE-NEXT: psrad %xmm2, %xmm0 730; X32-SSE-NEXT: retl 731 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer 732 %shift = ashr <4 x i32> %a, %splat 733 ret <4 x i32> %shift 734} 735 736define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 737; SSE2-LABEL: splatvar_shift_v8i16: 738; SSE2: # %bb.0: 739; SSE2-NEXT: pextrw $0, %xmm1, %eax 740; SSE2-NEXT: movd %eax, %xmm1 741; SSE2-NEXT: psraw %xmm1, %xmm0 742; SSE2-NEXT: retq 743; 744; SSE41-LABEL: splatvar_shift_v8i16: 745; SSE41: # %bb.0: 746; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 
747; SSE41-NEXT: psraw %xmm1, %xmm0 748; SSE41-NEXT: retq 749; 750; AVX-LABEL: splatvar_shift_v8i16: 751; AVX: # %bb.0: 752; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 753; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 754; AVX-NEXT: retq 755; 756; XOP-LABEL: splatvar_shift_v8i16: 757; XOP: # %bb.0: 758; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 759; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 760; XOP-NEXT: retq 761; 762; AVX512-LABEL: splatvar_shift_v8i16: 763; AVX512: # %bb.0: 764; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 765; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 766; AVX512-NEXT: retq 767; 768; AVX512VL-LABEL: splatvar_shift_v8i16: 769; AVX512VL: # %bb.0: 770; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 771; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 772; AVX512VL-NEXT: retq 773; 774; X32-SSE-LABEL: splatvar_shift_v8i16: 775; X32-SSE: # %bb.0: 776; X32-SSE-NEXT: pextrw $0, %xmm1, %eax 777; X32-SSE-NEXT: movd %eax, %xmm1 778; X32-SSE-NEXT: psraw %xmm1, %xmm0 779; X32-SSE-NEXT: retl 780 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer 781 %shift = ashr <8 x i16> %a, %splat 782 ret <8 x i16> %shift 783} 784 785define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 786; SSE2-LABEL: splatvar_shift_v16i8: 787; SSE2: # %bb.0: 788; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 789; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 790; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] 791; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 792; SSE2-NEXT: psllw $5, %xmm3 793; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 794; SSE2-NEXT: pxor %xmm2, %xmm2 795; SSE2-NEXT: pxor %xmm5, %xmm5 796; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 797; SSE2-NEXT: movdqa %xmm5, %xmm6 798; SSE2-NEXT: pandn %xmm1, %xmm6 799; SSE2-NEXT: psraw $4, %xmm1 800; SSE2-NEXT: pand %xmm5, %xmm1 801; SSE2-NEXT: por %xmm6, %xmm1 802; SSE2-NEXT: paddw %xmm4, %xmm4 803; SSE2-NEXT: pxor %xmm5, %xmm5 804; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 805; SSE2-NEXT: movdqa %xmm5, %xmm6 806; SSE2-NEXT: pandn %xmm1, %xmm6 807; SSE2-NEXT: psraw $2, %xmm1 808; SSE2-NEXT: pand %xmm5, %xmm1 809; SSE2-NEXT: por %xmm6, %xmm1 810; SSE2-NEXT: paddw %xmm4, %xmm4 811; SSE2-NEXT: pxor %xmm5, %xmm5 812; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 813; SSE2-NEXT: movdqa %xmm5, %xmm4 814; SSE2-NEXT: pandn %xmm1, %xmm4 815; SSE2-NEXT: psraw $1, %xmm1 816; SSE2-NEXT: pand %xmm5, %xmm1 817; SSE2-NEXT: por %xmm4, %xmm1 818; SSE2-NEXT: psrlw $8, %xmm1 819; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 820; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 821; SSE2-NEXT: pxor %xmm4, %xmm4 822; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 823; SSE2-NEXT: movdqa %xmm4, %xmm5 824; SSE2-NEXT: pandn %xmm0, %xmm5 825; SSE2-NEXT: psraw $4, %xmm0 826; SSE2-NEXT: pand %xmm4, %xmm0 827; SSE2-NEXT: por %xmm5, %xmm0 828; SSE2-NEXT: paddw %xmm3, %xmm3 829; SSE2-NEXT: pxor %xmm4, %xmm4 830; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 831; SSE2-NEXT: movdqa %xmm4, %xmm5 832; SSE2-NEXT: pandn %xmm0, %xmm5 833; SSE2-NEXT: psraw $2, %xmm0 834; SSE2-NEXT: pand %xmm4, %xmm0 835; SSE2-NEXT: por %xmm5, %xmm0 836; SSE2-NEXT: paddw %xmm3, %xmm3 837; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 838; SSE2-NEXT: movdqa %xmm2, %xmm3 839; SSE2-NEXT: pandn %xmm0, %xmm3 840; SSE2-NEXT: psraw $1, %xmm0 841; SSE2-NEXT: pand %xmm2, %xmm0 842; SSE2-NEXT: por %xmm3, %xmm0 843; SSE2-NEXT: psrlw $8, %xmm0 844; SSE2-NEXT: packuswb %xmm1, %xmm0 845; 
SSE2-NEXT: retq 846; 847; SSE41-LABEL: splatvar_shift_v16i8: 848; SSE41: # %bb.0: 849; SSE41-NEXT: movdqa %xmm0, %xmm2 850; SSE41-NEXT: pxor %xmm0, %xmm0 851; SSE41-NEXT: pshufb %xmm0, %xmm1 852; SSE41-NEXT: psllw $5, %xmm1 853; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 854; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 855; SSE41-NEXT: movdqa %xmm3, %xmm4 856; SSE41-NEXT: psraw $4, %xmm4 857; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 858; SSE41-NEXT: movdqa %xmm3, %xmm4 859; SSE41-NEXT: psraw $2, %xmm4 860; SSE41-NEXT: paddw %xmm0, %xmm0 861; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 862; SSE41-NEXT: movdqa %xmm3, %xmm4 863; SSE41-NEXT: psraw $1, %xmm4 864; SSE41-NEXT: paddw %xmm0, %xmm0 865; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 866; SSE41-NEXT: psrlw $8, %xmm3 867; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 868; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 869; SSE41-NEXT: movdqa %xmm1, %xmm2 870; SSE41-NEXT: psraw $4, %xmm2 871; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 872; SSE41-NEXT: movdqa %xmm1, %xmm2 873; SSE41-NEXT: psraw $2, %xmm2 874; SSE41-NEXT: paddw %xmm0, %xmm0 875; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 876; SSE41-NEXT: movdqa %xmm1, %xmm2 877; SSE41-NEXT: psraw $1, %xmm2 878; SSE41-NEXT: paddw %xmm0, %xmm0 879; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 880; SSE41-NEXT: psrlw $8, %xmm1 881; SSE41-NEXT: packuswb %xmm3, %xmm1 882; SSE41-NEXT: movdqa %xmm1, %xmm0 883; SSE41-NEXT: retq 884; 885; AVX1-LABEL: splatvar_shift_v16i8: 886; AVX1: # %bb.0: 
887; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 888; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 889; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 890; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 891; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 892; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 893; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 894; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4 895; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 896; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 897; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4 898; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 899; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 900; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 901; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 902; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 903; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3 904; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 905; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3 906; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 907; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 908; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3 909; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 910; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 911; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 912; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 913; AVX1-NEXT: retq 914; 915; AVX2-LABEL: splatvar_shift_v16i8: 916; AVX2: # %bb.0: 917; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 918; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 919; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 920; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 921; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4 922; 
AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 923; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4 924; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 925; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 926; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4 927; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 928; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 929; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 930; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 931; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 932; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3 933; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 934; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3 935; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 936; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 937; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3 938; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 939; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 940; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 941; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 942; AVX2-NEXT: retq 943; 944; XOPAVX1-LABEL: splatvar_shift_v16i8: 945; XOPAVX1: # %bb.0: 946; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 947; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 948; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 949; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 950; XOPAVX1-NEXT: retq 951; 952; XOPAVX2-LABEL: splatvar_shift_v16i8: 953; XOPAVX2: # %bb.0: 954; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 955; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 956; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 957; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 958; XOPAVX2-NEXT: retq 959; 960; AVX512DQ-LABEL: splatvar_shift_v16i8: 961; AVX512DQ: # %bb.0: 962; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 963; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 964; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 965; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 966; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 967; AVX512DQ-NEXT: vzeroupper 968; AVX512DQ-NEXT: retq 969; 970; AVX512BW-LABEL: splatvar_shift_v16i8: 971; AVX512BW: # %bb.0: 972; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 973; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 974; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 975; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 976; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 977; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 978; AVX512BW-NEXT: vzeroupper 979; AVX512BW-NEXT: retq 980; 981; AVX512DQVL-LABEL: splatvar_shift_v16i8: 982; AVX512DQVL: # %bb.0: 983; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 984; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 985; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 986; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 987; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 988; AVX512DQVL-NEXT: vzeroupper 989; AVX512DQVL-NEXT: retq 990; 991; AVX512BWVL-LABEL: splatvar_shift_v16i8: 992; AVX512BWVL: # %bb.0: 993; AVX512BWVL-NEXT: 
vpbroadcastb %xmm1, %xmm1 994; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 995; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 996; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 997; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 998; AVX512BWVL-NEXT: vzeroupper 999; AVX512BWVL-NEXT: retq 1000; 1001; X32-SSE-LABEL: splatvar_shift_v16i8: 1002; X32-SSE: # %bb.0: 1003; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1004; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 1005; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] 1006; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1007; X32-SSE-NEXT: psllw $5, %xmm3 1008; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 1009; X32-SSE-NEXT: pxor %xmm2, %xmm2 1010; X32-SSE-NEXT: pxor %xmm5, %xmm5 1011; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1012; X32-SSE-NEXT: movdqa %xmm5, %xmm6 1013; X32-SSE-NEXT: pandn %xmm1, %xmm6 1014; X32-SSE-NEXT: psraw $4, %xmm1 1015; X32-SSE-NEXT: pand %xmm5, %xmm1 1016; X32-SSE-NEXT: por %xmm6, %xmm1 1017; X32-SSE-NEXT: paddw %xmm4, %xmm4 1018; X32-SSE-NEXT: pxor %xmm5, %xmm5 1019; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1020; X32-SSE-NEXT: movdqa %xmm5, %xmm6 1021; X32-SSE-NEXT: pandn %xmm1, %xmm6 1022; X32-SSE-NEXT: psraw $2, %xmm1 1023; X32-SSE-NEXT: pand %xmm5, %xmm1 1024; X32-SSE-NEXT: por %xmm6, %xmm1 1025; X32-SSE-NEXT: paddw %xmm4, %xmm4 1026; X32-SSE-NEXT: pxor %xmm5, %xmm5 1027; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1028; X32-SSE-NEXT: movdqa %xmm5, %xmm4 1029; X32-SSE-NEXT: pandn %xmm1, %xmm4 1030; 
X32-SSE-NEXT: psraw $1, %xmm1 1031; X32-SSE-NEXT: pand %xmm5, %xmm1 1032; X32-SSE-NEXT: por %xmm4, %xmm1 1033; X32-SSE-NEXT: psrlw $8, %xmm1 1034; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1035; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1036; X32-SSE-NEXT: pxor %xmm4, %xmm4 1037; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm4 1038; X32-SSE-NEXT: movdqa %xmm4, %xmm5 1039; X32-SSE-NEXT: pandn %xmm0, %xmm5 1040; X32-SSE-NEXT: psraw $4, %xmm0 1041; X32-SSE-NEXT: pand %xmm4, %xmm0 1042; X32-SSE-NEXT: por %xmm5, %xmm0 1043; X32-SSE-NEXT: paddw %xmm3, %xmm3 1044; X32-SSE-NEXT: pxor %xmm4, %xmm4 1045; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm4 1046; X32-SSE-NEXT: movdqa %xmm4, %xmm5 1047; X32-SSE-NEXT: pandn %xmm0, %xmm5 1048; X32-SSE-NEXT: psraw $2, %xmm0 1049; X32-SSE-NEXT: pand %xmm4, %xmm0 1050; X32-SSE-NEXT: por %xmm5, %xmm0 1051; X32-SSE-NEXT: paddw %xmm3, %xmm3 1052; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm2 1053; X32-SSE-NEXT: movdqa %xmm2, %xmm3 1054; X32-SSE-NEXT: pandn %xmm0, %xmm3 1055; X32-SSE-NEXT: psraw $1, %xmm0 1056; X32-SSE-NEXT: pand %xmm2, %xmm0 1057; X32-SSE-NEXT: por %xmm3, %xmm0 1058; X32-SSE-NEXT: psrlw $8, %xmm0 1059; X32-SSE-NEXT: packuswb %xmm1, %xmm0 1060; X32-SSE-NEXT: retl 1061 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer 1062 %shift = ashr <16 x i8> %a, %splat 1063 ret <16 x i8> %shift 1064} 1065 1066; 1067; Constant Shifts 1068; 1069 1070define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { 1071; SSE2-LABEL: constant_shift_v2i64: 1072; SSE2: # %bb.0: 1073; SSE2-NEXT: movdqa %xmm0, %xmm1 1074; SSE2-NEXT: psrlq $1, %xmm1 1075; SSE2-NEXT: psrlq $7, %xmm0 1076; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1077; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1078; SSE2-NEXT: xorpd %xmm1, %xmm0 1079; SSE2-NEXT: psubq %xmm1, %xmm0 1080; SSE2-NEXT: retq 1081; 1082; SSE41-LABEL: constant_shift_v2i64: 1083; SSE41: # %bb.0: 1084; 
SSE41-NEXT: movdqa %xmm0, %xmm1 1085; SSE41-NEXT: psrlq $7, %xmm1 1086; SSE41-NEXT: psrlq $1, %xmm0 1087; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1088; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1089; SSE41-NEXT: pxor %xmm1, %xmm0 1090; SSE41-NEXT: psubq %xmm1, %xmm0 1091; SSE41-NEXT: retq 1092; 1093; AVX1-LABEL: constant_shift_v2i64: 1094; AVX1: # %bb.0: 1095; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 1096; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 1097; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1098; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1099; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 1100; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 1101; AVX1-NEXT: retq 1102; 1103; AVX2-LABEL: constant_shift_v2i64: 1104; AVX2: # %bb.0: 1105; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 1106; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] 1107; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 1108; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 1109; AVX2-NEXT: retq 1110; 1111; XOP-LABEL: constant_shift_v2i64: 1112; XOP: # %bb.0: 1113; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 1114; XOP-NEXT: retq 1115; 1116; AVX512-LABEL: constant_shift_v2i64: 1117; AVX512: # %bb.0: 1118; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1119; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,7] 1120; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 1121; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1122; AVX512-NEXT: vzeroupper 1123; AVX512-NEXT: retq 1124; 1125; AVX512VL-LABEL: constant_shift_v2i64: 1126; AVX512VL: # %bb.0: 1127; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 1128; AVX512VL-NEXT: retq 1129; 1130; X32-SSE-LABEL: constant_shift_v2i64: 1131; X32-SSE: # %bb.0: 1132; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648] 1133; X32-SSE-NEXT: movdqa %xmm1, %xmm2 1134; X32-SSE-NEXT: psrlq $1, %xmm2 1135; X32-SSE-NEXT: psrlq $7, %xmm1 1136; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = 
xmm2[0],xmm1[1] 1137; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1138; X32-SSE-NEXT: psrlq $1, %xmm2 1139; X32-SSE-NEXT: psrlq $7, %xmm0 1140; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 1141; X32-SSE-NEXT: xorpd %xmm1, %xmm0 1142; X32-SSE-NEXT: psubq %xmm1, %xmm0 1143; X32-SSE-NEXT: retl 1144 %shift = ashr <2 x i64> %a, <i64 1, i64 7> 1145 ret <2 x i64> %shift 1146} 1147 1148define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { 1149; SSE2-LABEL: constant_shift_v4i32: 1150; SSE2: # %bb.0: 1151; SSE2-NEXT: movdqa %xmm0, %xmm1 1152; SSE2-NEXT: psrad $7, %xmm1 1153; SSE2-NEXT: movdqa %xmm0, %xmm2 1154; SSE2-NEXT: psrad $6, %xmm2 1155; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] 1156; SSE2-NEXT: movdqa %xmm0, %xmm1 1157; SSE2-NEXT: psrad $5, %xmm1 1158; SSE2-NEXT: psrad $4, %xmm0 1159; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1160; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] 1161; SSE2-NEXT: retq 1162; 1163; SSE41-LABEL: constant_shift_v4i32: 1164; SSE41: # %bb.0: 1165; SSE41-NEXT: movdqa %xmm0, %xmm1 1166; SSE41-NEXT: psrad $7, %xmm1 1167; SSE41-NEXT: movdqa %xmm0, %xmm2 1168; SSE41-NEXT: psrad $5, %xmm2 1169; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1170; SSE41-NEXT: movdqa %xmm0, %xmm1 1171; SSE41-NEXT: psrad $6, %xmm1 1172; SSE41-NEXT: psrad $4, %xmm0 1173; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1174; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1175; SSE41-NEXT: retq 1176; 1177; AVX1-LABEL: constant_shift_v4i32: 1178; AVX1: # %bb.0: 1179; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 1180; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 1181; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1182; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 1183; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0 1184; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1185; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1186; AVX1-NEXT: retq 1187; 1188; 
AVX2-LABEL: constant_shift_v4i32: 1189; AVX2: # %bb.0: 1190; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1191; AVX2-NEXT: retq 1192; 1193; XOPAVX1-LABEL: constant_shift_v4i32: 1194; XOPAVX1: # %bb.0: 1195; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 1196; XOPAVX1-NEXT: retq 1197; 1198; XOPAVX2-LABEL: constant_shift_v4i32: 1199; XOPAVX2: # %bb.0: 1200; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1201; XOPAVX2-NEXT: retq 1202; 1203; AVX512-LABEL: constant_shift_v4i32: 1204; AVX512: # %bb.0: 1205; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1206; AVX512-NEXT: retq 1207; 1208; AVX512VL-LABEL: constant_shift_v4i32: 1209; AVX512VL: # %bb.0: 1210; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 1211; AVX512VL-NEXT: retq 1212; 1213; X32-SSE-LABEL: constant_shift_v4i32: 1214; X32-SSE: # %bb.0: 1215; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1216; X32-SSE-NEXT: psrad $7, %xmm1 1217; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1218; X32-SSE-NEXT: psrad $6, %xmm2 1219; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] 1220; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1221; X32-SSE-NEXT: psrad $5, %xmm1 1222; X32-SSE-NEXT: psrad $4, %xmm0 1223; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1224; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] 1225; X32-SSE-NEXT: retl 1226 %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 1227 ret <4 x i32> %shift 1228} 1229 1230define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { 1231; SSE2-LABEL: constant_shift_v8i16: 1232; SSE2: # %bb.0: 1233; SSE2-NEXT: movdqa %xmm0, %xmm1 1234; SSE2-NEXT: psraw $4, %xmm1 1235; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1236; SSE2-NEXT: movapd %xmm1, %xmm2 1237; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] 1238; SSE2-NEXT: psraw $2, %xmm1 1239; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 1240; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1241; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] 1242; SSE2-NEXT: movaps 
%xmm2, %xmm0 1243; SSE2-NEXT: andps %xmm1, %xmm0 1244; SSE2-NEXT: psraw $1, %xmm2 1245; SSE2-NEXT: andnps %xmm2, %xmm1 1246; SSE2-NEXT: orps %xmm1, %xmm0 1247; SSE2-NEXT: retq 1248; 1249; SSE41-LABEL: constant_shift_v8i16: 1250; SSE41: # %bb.0: 1251; SSE41-NEXT: movdqa %xmm0, %xmm1 1252; SSE41-NEXT: psraw $4, %xmm1 1253; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1254; SSE41-NEXT: movdqa %xmm1, %xmm2 1255; SSE41-NEXT: psraw $2, %xmm2 1256; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1257; SSE41-NEXT: movdqa %xmm2, %xmm0 1258; SSE41-NEXT: psraw $1, %xmm0 1259; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] 1260; SSE41-NEXT: retq 1261; 1262; AVX1-LABEL: constant_shift_v8i16: 1263; AVX1: # %bb.0: 1264; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 1265; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1266; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 1267; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1268; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 1269; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 1270; AVX1-NEXT: retq 1271; 1272; AVX2-LABEL: constant_shift_v8i16: 1273; AVX2: # %bb.0: 1274; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 1275; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1276; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1277; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 1278; AVX2-NEXT: vzeroupper 1279; AVX2-NEXT: retq 1280; 1281; XOP-LABEL: constant_shift_v8i16: 1282; XOP: # %bb.0: 1283; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 1284; XOP-NEXT: retq 1285; 1286; AVX512DQ-LABEL: constant_shift_v8i16: 1287; AVX512DQ: # %bb.0: 1288; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 1289; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1290; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 1291; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1292; AVX512DQ-NEXT: vzeroupper 1293; AVX512DQ-NEXT: retq 1294; 
1295; AVX512BW-LABEL: constant_shift_v8i16: 1296; AVX512BW: # %bb.0: 1297; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1298; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1299; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1300; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1301; AVX512BW-NEXT: vzeroupper 1302; AVX512BW-NEXT: retq 1303; 1304; AVX512DQVL-LABEL: constant_shift_v8i16: 1305; AVX512DQVL: # %bb.0: 1306; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 1307; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 1308; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 1309; AVX512DQVL-NEXT: vzeroupper 1310; AVX512DQVL-NEXT: retq 1311; 1312; AVX512BWVL-LABEL: constant_shift_v8i16: 1313; AVX512BWVL: # %bb.0: 1314; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 1315; AVX512BWVL-NEXT: retq 1316; 1317; X32-SSE-LABEL: constant_shift_v8i16: 1318; X32-SSE: # %bb.0: 1319; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1320; X32-SSE-NEXT: psraw $4, %xmm1 1321; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 1322; X32-SSE-NEXT: movapd %xmm1, %xmm2 1323; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] 1324; X32-SSE-NEXT: psraw $2, %xmm1 1325; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 1326; X32-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1327; X32-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] 1328; X32-SSE-NEXT: movaps %xmm2, %xmm0 1329; X32-SSE-NEXT: andps %xmm1, %xmm0 1330; X32-SSE-NEXT: psraw $1, %xmm2 1331; X32-SSE-NEXT: andnps %xmm2, %xmm1 1332; X32-SSE-NEXT: orps %xmm1, %xmm0 1333; X32-SSE-NEXT: retl 1334 %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1335 ret <8 x i16> %shift 1336} 1337 1338define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { 1339; SSE2-LABEL: constant_shift_v16i8: 1340; SSE2: # %bb.0: 1341; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1342; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,49376,32928,16480,32] 1343; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 1344; SSE2-NEXT: pxor %xmm2, %xmm2 1345; SSE2-NEXT: pxor %xmm5, %xmm5 1346; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 1347; SSE2-NEXT: movdqa %xmm5, %xmm6 1348; SSE2-NEXT: pandn %xmm1, %xmm6 1349; SSE2-NEXT: psraw $4, %xmm1 1350; SSE2-NEXT: pand %xmm5, %xmm1 1351; SSE2-NEXT: por %xmm6, %xmm1 1352; SSE2-NEXT: paddw %xmm4, %xmm4 1353; SSE2-NEXT: pxor %xmm5, %xmm5 1354; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 1355; SSE2-NEXT: movdqa %xmm5, %xmm6 1356; SSE2-NEXT: pandn %xmm1, %xmm6 1357; SSE2-NEXT: psraw $2, %xmm1 1358; SSE2-NEXT: pand %xmm5, %xmm1 1359; SSE2-NEXT: por %xmm6, %xmm1 1360; SSE2-NEXT: paddw %xmm4, %xmm4 1361; SSE2-NEXT: pxor %xmm5, %xmm5 1362; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 1363; SSE2-NEXT: movdqa %xmm5, %xmm4 1364; SSE2-NEXT: pandn %xmm1, %xmm4 1365; SSE2-NEXT: psraw $1, %xmm1 1366; SSE2-NEXT: pand %xmm5, %xmm1 1367; SSE2-NEXT: por %xmm4, %xmm1 1368; SSE2-NEXT: psrlw $8, %xmm1 1369; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1370; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1371; SSE2-NEXT: pxor %xmm4, %xmm4 1372; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 1373; SSE2-NEXT: movdqa %xmm4, %xmm5 1374; SSE2-NEXT: pandn %xmm0, %xmm5 1375; SSE2-NEXT: psraw $4, %xmm0 1376; SSE2-NEXT: pand %xmm4, %xmm0 1377; SSE2-NEXT: por %xmm5, %xmm0 1378; SSE2-NEXT: paddw %xmm3, %xmm3 1379; SSE2-NEXT: pxor %xmm4, %xmm4 1380; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 1381; SSE2-NEXT: movdqa %xmm4, %xmm5 1382; SSE2-NEXT: pandn %xmm0, %xmm5 1383; SSE2-NEXT: psraw $2, %xmm0 1384; SSE2-NEXT: pand %xmm4, %xmm0 1385; SSE2-NEXT: por %xmm5, %xmm0 1386; 
SSE2-NEXT: paddw %xmm3, %xmm3 1387; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 1388; SSE2-NEXT: movdqa %xmm2, %xmm3 1389; SSE2-NEXT: pandn %xmm0, %xmm3 1390; SSE2-NEXT: psraw $1, %xmm0 1391; SSE2-NEXT: pand %xmm2, %xmm0 1392; SSE2-NEXT: por %xmm3, %xmm0 1393; SSE2-NEXT: psrlw $8, %xmm0 1394; SSE2-NEXT: packuswb %xmm1, %xmm0 1395; SSE2-NEXT: retq 1396; 1397; SSE41-LABEL: constant_shift_v16i8: 1398; SSE41: # %bb.0: 1399; SSE41-NEXT: movdqa %xmm0, %xmm1 1400; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,49376,32928,16480,32] 1401; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 1402; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1403; SSE41-NEXT: movdqa %xmm2, %xmm4 1404; SSE41-NEXT: psraw $4, %xmm4 1405; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2 1406; SSE41-NEXT: movdqa %xmm2, %xmm4 1407; SSE41-NEXT: psraw $2, %xmm4 1408; SSE41-NEXT: paddw %xmm0, %xmm0 1409; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2 1410; SSE41-NEXT: movdqa %xmm2, %xmm4 1411; SSE41-NEXT: psraw $1, %xmm4 1412; SSE41-NEXT: paddw %xmm0, %xmm0 1413; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2 1414; SSE41-NEXT: psrlw $8, %xmm2 1415; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1416; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1417; SSE41-NEXT: movdqa %xmm1, %xmm3 1418; SSE41-NEXT: psraw $4, %xmm3 1419; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1420; SSE41-NEXT: movdqa %xmm1, %xmm3 1421; SSE41-NEXT: psraw $2, %xmm3 1422; SSE41-NEXT: paddw %xmm0, %xmm0 1423; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1424; SSE41-NEXT: movdqa %xmm1, %xmm3 1425; SSE41-NEXT: psraw $1, %xmm3 1426; SSE41-NEXT: paddw %xmm0, 
%xmm0 1427; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1428; SSE41-NEXT: psrlw $8, %xmm1 1429; SSE41-NEXT: packuswb %xmm2, %xmm1 1430; SSE41-NEXT: movdqa %xmm1, %xmm0 1431; SSE41-NEXT: retq 1432; 1433; AVX-LABEL: constant_shift_v16i8: 1434; AVX: # %bb.0: 1435; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8192,24640,41088,57536,49376,32928,16480,32] 1436; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1437; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1438; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 1439; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1440; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 1441; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1442; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 1443; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 1444; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 1445; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 1446; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 1447; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1448; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1449; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 1450; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1451; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 1452; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1453; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1454; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 1455; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 1456; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1457; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1458; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1459; AVX-NEXT: retq 1460; 1461; XOP-LABEL: constant_shift_v16i8: 1462; XOP: # %bb.0: 1463; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 1464; XOP-NEXT: retq 1465; 1466; AVX512DQ-LABEL: constant_shift_v16i8: 1467; AVX512DQ: # %bb.0: 1468; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 
1469; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 1470; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1471; AVX512DQ-NEXT: vzeroupper 1472; AVX512DQ-NEXT: retq 1473; 1474; AVX512BW-LABEL: constant_shift_v16i8: 1475; AVX512BW: # %bb.0: 1476; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] 1477; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1478; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 1479; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1480; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1481; AVX512BW-NEXT: vzeroupper 1482; AVX512BW-NEXT: retq 1483; 1484; AVX512DQVL-LABEL: constant_shift_v16i8: 1485; AVX512DQVL: # %bb.0: 1486; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 1487; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 1488; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1489; AVX512DQVL-NEXT: vzeroupper 1490; AVX512DQVL-NEXT: retq 1491; 1492; AVX512BWVL-LABEL: constant_shift_v16i8: 1493; AVX512BWVL: # %bb.0: 1494; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1495; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 1496; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1497; AVX512BWVL-NEXT: vzeroupper 1498; AVX512BWVL-NEXT: retq 1499; 1500; X32-SSE-LABEL: constant_shift_v16i8: 1501; X32-SSE: # %bb.0: 1502; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1503; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,49376,32928,16480,32] 1504; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 1505; X32-SSE-NEXT: pxor %xmm2, %xmm2 1506; X32-SSE-NEXT: pxor %xmm5, %xmm5 1507; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 1508; X32-SSE-NEXT: movdqa %xmm5, %xmm6 1509; X32-SSE-NEXT: pandn %xmm1, %xmm6 1510; X32-SSE-NEXT: psraw $4, %xmm1 1511; X32-SSE-NEXT: pand %xmm5, %xmm1 1512; X32-SSE-NEXT: por %xmm6, %xmm1 1513; 
X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: pxor %xmm5, %xmm5
; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm1, %xmm6
; X32-SSE-NEXT: psraw $2, %xmm1
; X32-SSE-NEXT: pand %xmm5, %xmm1
; X32-SSE-NEXT: por %xmm6, %xmm1
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: pxor %xmm5, %xmm5
; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm4
; X32-SSE-NEXT: pandn %xmm1, %xmm4
; X32-SSE-NEXT: psraw $1, %xmm1
; X32-SSE-NEXT: pand %xmm5, %xmm1
; X32-SSE-NEXT: por %xmm4, %xmm1
; X32-SSE-NEXT: psrlw $8, %xmm1
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm5
; X32-SSE-NEXT: psraw $4, %xmm0
; X32-SSE-NEXT: pand %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm5, %xmm0
; X32-SSE-NEXT: paddw %xmm3, %xmm3
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm5
; X32-SSE-NEXT: psraw $2, %xmm0
; X32-SSE-NEXT: pand %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm5, %xmm0
; X32-SSE-NEXT: paddw %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtw %xmm3, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psraw $1, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: psrlw $8, %xmm0
; X32-SSE-NEXT: packuswb %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}

;
; Uniform Constant Shifts
;
; NOTE: All CHECK lines in the functions below are autogenerated by
; utils/update_llc_test_checks.py from actual llc output; do not edit them by
; hand — rerun the script after any codegen change.

; ashr <2 x i64> by the uniform constant 7. Pre-AVX512 targets have no psraq,
; so SSE/AVX lower this by combining psrad (sign half) with psrlq (value half);
; AVX512 targets use vpsraq directly.
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $7, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: psrlq $7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrad $7, %xmm1
; SSE41-NEXT: psrlq $7, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatconstant_shift_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $7, %xmm0, %xmm1
; AVX2-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq $7, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrad $7, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X32-SSE-NEXT: psrlq $7, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: retl
  %shift = ashr <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

; ashr <4 x i32> by the uniform constant 5: maps directly to a single
; psrad/vpsrad on every target.
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: psrad $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vpsrad $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrad $5, %xmm0
; X32-SSE-NEXT: retl
  %shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

; ashr <8 x i16> by the uniform constant 3: maps directly to a single
; psraw/vpsraw on every target.
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psraw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpsraw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psraw $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

; ashr <16 x i8> by the uniform constant 3. x86 has no byte shifts, so non-XOP
; targets lower this as a word logical shift + mask, then a sign-extension
; fixup via xor/psubb against a splat of 16 (0x10); XOP uses vpshab directly.
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X32-SSE-NEXT: pxor %xmm1, %xmm0
; X32-SSE-NEXT: psubb %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}