; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Variable Shifts
;

define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshlq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %shift = lshr <4 x i64> %a, %b
  ret <4 x i64> %shift
}

define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshld %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %shift = lshr <8 x i32> %a, %b
  ret <8 x i32> %shift
}

define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5
; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
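; The low words of each 128-bit lane get the same treatment (widen to i32,
; vpsrlvd, vpsrld $16 to narrow); vpackusdw then recombines the two halves.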
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT: vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v16i16:
; AVX512: ## BB#0:
; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
  %shift = lshr <16 x i16> %a, %b
  ret <16 x i16> %shift
}

define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2: # BB#0:
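; There is no variable byte-shift instruction, so the v32i8 lowering shifts
; the whole vector by 4, 2 and 1 and selects each step per byte: vpsllw $5
; moves the relevant amount bit into the sign bit tested by vpblendvb, and
; the vpand masks clear bits shifted in from the neighbouring byte.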
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT: vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v32i8:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
  %shift = lshr <32 x i8> %a, %b
  ret <32 x i8> %shift
}

;
; Uniform Variable Shifts
;

define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
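; Splatted shift amounts can use the shift-by-scalar instructions, which take
; the count from the low 64 bits of an XMM register; the blend/move against
; zero below keeps just the splatted i32 amount.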
; AVX1-LABEL: splatvar_shift_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOPAVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovd %xmm1, %eax
; AVX1-NEXT: movzwl %ax, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovd %xmm1, %eax
; AVX2-NEXT: movzwl %ax, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovd %xmm1, %eax
; XOPAVX1-NEXT: movzwl %ax, %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm1
; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vmovd %xmm1, %eax
; XOPAVX2-NEXT: movzwl %ax, %eax
; XOPAVX2-NEXT: vmovd %eax, %xmm1
; XOPAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
; AVX512: ## BB#0:
; AVX512-NEXT: vmovd %xmm1, %eax
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
; AVX1: # BB#0:
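; The byte splat is materialized with a zero-index vpshufb; there is still no
; byte shift-by-scalar form, so the shift-and-blend sequence runs on each half.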
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6
; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT: vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v32i8:
; AVX512: ## BB#0:
; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
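; Final step: a conditional shift by 1 consumes the low bit of each amount.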
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Constant Shifts
;

define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vpshlq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
  %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}

define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $4, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $7, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $9, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrld $8, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
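; Non-uniform constant amounts: AVX1 needs one immediate vpsrld per distinct
; amount plus blends (above), while AVX2/AVX512 fold the amount vector into a
; single memory-operand vpsrlvd.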
  %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <8 x i32> %shift
}

define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vpshlw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vpshlw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v16i16:
; AVX512: ## BB#0:
; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
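; vpsrlvw requires AVX512BW, and without AVX512VL it is only available on
; ZMM, hence the implicit register widening around the single shift above.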
  %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <16 x i16> %shift
}

define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6
; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v32i8:
; AVX512: ## BB#0:
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
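; The remaining conditional shifts by 2 and 1 reuse successively doubled
; (vpaddb) copies of the amount mask.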
; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
  %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512-NEXT: retq
  %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}

define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpsrld $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrld $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrld $5, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX512-NEXT: retq
  %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}

define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
  %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}

define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512: ## BB#0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
  %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}