1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 12 13; Just one 32-bit run to make sure we do reasonable things for i64 rotates. 14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2 15 16; 17; Variable Rotates 18; 19 20define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 21; SSE2-LABEL: var_rotate_v2i64: 22; SSE2: # %bb.0: 23; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64] 24; SSE2-NEXT: psubq %xmm1, %xmm2 25; SSE2-NEXT: movdqa %xmm0, %xmm3 26; SSE2-NEXT: psllq %xmm1, %xmm3 27; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 28; SSE2-NEXT: movdqa %xmm0, %xmm4 29; SSE2-NEXT: psllq %xmm1, %xmm4 30; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] 31; SSE2-NEXT: movdqa %xmm0, %xmm1 32; SSE2-NEXT: psrlq %xmm2, %xmm1 33; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 34; SSE2-NEXT: psrlq %xmm2, %xmm0 35; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 36; SSE2-NEXT: orpd %xmm4, %xmm0 37; SSE2-NEXT: retq 38; 39; SSE41-LABEL: var_rotate_v2i64: 40; SSE41: # %bb.0: 41; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [64,64] 42; SSE41-NEXT: psubq %xmm1, %xmm2 43; SSE41-NEXT: movdqa %xmm0, %xmm3 44; SSE41-NEXT: psllq %xmm1, %xmm3 45; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 46; SSE41-NEXT: movdqa %xmm0, %xmm4 47; SSE41-NEXT: psllq %xmm1, %xmm4 48; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7] 49; SSE41-NEXT: movdqa %xmm0, %xmm1 50; SSE41-NEXT: psrlq %xmm2, %xmm1 51; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 52; SSE41-NEXT: psrlq %xmm2, %xmm0 53; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 54; SSE41-NEXT: por %xmm4, %xmm0 55; SSE41-NEXT: retq 56; 57; AVX1-LABEL: var_rotate_v2i64: 58; AVX1: # %bb.0: 59; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] 60; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 61; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 62; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 63; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 64; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] 65; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3 66; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 67; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 68; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 69; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 70; AVX1-NEXT: retq 71; 72; AVX2-LABEL: var_rotate_v2i64: 73; AVX2: # %bb.0: 74; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] 75; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 76; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 77; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 78; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 79; AVX2-NEXT: retq 80; 81; AVX512F-LABEL: var_rotate_v2i64: 82; AVX512F: # %bb.0: 83; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 84; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 85; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 86; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 87; AVX512F-NEXT: vzeroupper 88; AVX512F-NEXT: retq 89; 90; AVX512VL-LABEL: var_rotate_v2i64: 91; AVX512VL: # %bb.0: 92; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0 93; AVX512VL-NEXT: retq 94; 95; AVX512BW-LABEL: var_rotate_v2i64: 96; AVX512BW: # %bb.0: 97; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 98; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 99; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 100; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 101; AVX512BW-NEXT: vzeroupper 102; AVX512BW-NEXT: retq 103; 104; AVX512VLBW-LABEL: var_rotate_v2i64: 105; AVX512VLBW: # %bb.0: 106; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0 107; AVX512VLBW-NEXT: retq 108; 109; XOP-LABEL: var_rotate_v2i64: 110; XOP: # %bb.0: 111; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0 112; XOP-NEXT: retq 113; 114; X32-SSE-LABEL: var_rotate_v2i64: 115; X32-SSE: # %bb.0: 116; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [64,0,64,0] 117; X32-SSE-NEXT: psubq %xmm1, %xmm2 118; X32-SSE-NEXT: movdqa %xmm0, %xmm3 119; X32-SSE-NEXT: psllq %xmm1, %xmm3 120; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 121; X32-SSE-NEXT: movdqa %xmm0, %xmm4 122; X32-SSE-NEXT: psllq %xmm1, %xmm4 123; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] 124; X32-SSE-NEXT: movdqa %xmm0, %xmm1 125; X32-SSE-NEXT: psrlq %xmm2, %xmm1 126; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 127; X32-SSE-NEXT: psrlq %xmm2, %xmm0 128; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 129; X32-SSE-NEXT: orpd %xmm4, %xmm0 130; X32-SSE-NEXT: retl 131 %b64 = sub <2 x i64> <i64 64, i64 64>, %b 132 %shl = shl <2 x i64> %a, %b 133 %lshr = lshr <2 x i64> %a, %b64 134 %or = or <2 x i64> %shl, %lshr 135 ret <2 x i64> %or 136} 137 138define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 139; SSE2-LABEL: var_rotate_v4i32: 140; SSE2: # %bb.0: 141; SSE2-NEXT: pslld $23, %xmm1 142; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 143; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 144; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 145; SSE2-NEXT: pmuludq %xmm1, %xmm0 146; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 147; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 148; SSE2-NEXT: pmuludq %xmm2, %xmm1 149; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 150; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 151; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 152; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 153; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 154; SSE2-NEXT: por %xmm3, %xmm0 155; SSE2-NEXT: retq 156; 157; SSE41-LABEL: var_rotate_v4i32: 158; SSE41: # %bb.0: 159; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 160; SSE41-NEXT: pslld $23, %xmm1 161; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 162; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 163; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 164; SSE41-NEXT: pmuludq %xmm2, %xmm3 165; SSE41-NEXT: pmuludq %xmm1, %xmm0 166; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 167; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 168; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 169; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 170; SSE41-NEXT: por %xmm1, %xmm0 171; SSE41-NEXT: retq 172; 173; AVX1-LABEL: var_rotate_v4i32: 174; AVX1: # %bb.0: 175; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 176; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 177; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 178; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 179; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 180; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 181; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 182; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 183; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 184; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 185; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 186; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 187; AVX1-NEXT: retq 188; 189; AVX2-LABEL: var_rotate_v4i32: 190; AVX2: # %bb.0: 191; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 192; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 193; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 194; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 195; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 196; AVX2-NEXT: retq 197; 198; AVX512F-LABEL: var_rotate_v4i32: 199; AVX512F: # %bb.0: 200; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 201; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 202; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 203; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 204; AVX512F-NEXT: vzeroupper 205; AVX512F-NEXT: retq 206; 207; AVX512VL-LABEL: var_rotate_v4i32: 208; AVX512VL: # %bb.0: 209; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 210; AVX512VL-NEXT: retq 211; 212; AVX512BW-LABEL: var_rotate_v4i32: 213; AVX512BW: # %bb.0: 214; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 215; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 216; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 217; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 218; AVX512BW-NEXT: vzeroupper 219; AVX512BW-NEXT: retq 220; 221; AVX512VLBW-LABEL: var_rotate_v4i32: 222; AVX512VLBW: # %bb.0: 223; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 224; AVX512VLBW-NEXT: retq 225; 226; XOP-LABEL: var_rotate_v4i32: 227; XOP: # %bb.0: 228; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 229; XOP-NEXT: retq 230; 231; X32-SSE-LABEL: var_rotate_v4i32: 232; X32-SSE: # %bb.0: 233; X32-SSE-NEXT: pslld $23, %xmm1 234; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 235; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 236; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 237; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 238; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 239; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 240; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 241; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 242; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 243; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 244; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 245; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 246; X32-SSE-NEXT: por %xmm3, %xmm0 247; X32-SSE-NEXT: retl 248 %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b 249 %shl = shl <4 x i32> %a, %b 250 %lshr = lshr <4 x i32> %a, %b32 251 %or = or <4 x i32> %shl, %lshr 252 ret <4 x i32> %or 253} 254 255define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 256; SSE2-LABEL: var_rotate_v8i16: 257; SSE2: # %bb.0: 258; SSE2-NEXT: pxor %xmm2, %xmm2 259; SSE2-NEXT: movdqa %xmm1, %xmm3 260; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 261; SSE2-NEXT: pslld $23, %xmm3 262; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 263; SSE2-NEXT: paddd %xmm4, %xmm3 264; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 265; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 266; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 267; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 268; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 269; SSE2-NEXT: pslld $23, %xmm1 270; SSE2-NEXT: paddd %xmm4, %xmm1 271; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 272; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 273; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 274; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 275; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] 276; SSE2-NEXT: movdqa %xmm0, %xmm2 277; SSE2-NEXT: pmulhuw %xmm1, %xmm2 278; SSE2-NEXT: pmullw %xmm1, %xmm0 279; SSE2-NEXT: por %xmm2, %xmm0 280; SSE2-NEXT: retq 281; 282; SSE41-LABEL: var_rotate_v8i16: 283; SSE41: # %bb.0: 284; SSE41-NEXT: pxor %xmm2, %xmm2 285; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 286; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 287; SSE41-NEXT: pslld $23, %xmm1 288; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] 289; SSE41-NEXT: paddd %xmm2, %xmm1 290; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 291; SSE41-NEXT: pslld $23, %xmm3 292; SSE41-NEXT: paddd %xmm2, %xmm3 293; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 294; SSE41-NEXT: packusdw %xmm1, %xmm2 295; SSE41-NEXT: movdqa %xmm0, %xmm1 296; SSE41-NEXT: pmulhuw %xmm2, %xmm1 297; SSE41-NEXT: pmullw %xmm2, %xmm0 298; SSE41-NEXT: por %xmm1, %xmm0 299; SSE41-NEXT: retq 300; 301; AVX1-LABEL: var_rotate_v8i16: 302; AVX1: # %bb.0: 303; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 304; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 305; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 306; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 307; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 308; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 309; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 310; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 311; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 312; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 313; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 314; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 315; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 316; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 317; AVX1-NEXT: retq 318; 319; AVX2-LABEL: var_rotate_v8i16: 320; AVX2: # %bb.0: 321; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 322; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 323; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 324; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 325; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 326; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 327; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] 328; AVX2-NEXT: vpsubw %xmm1, %xmm4, %xmm1 329; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 330; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 331; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 332; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 333; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 334; AVX2-NEXT: vzeroupper 335; AVX2-NEXT: retq 336; 337; AVX512F-LABEL: var_rotate_v8i16: 338; AVX512F: # %bb.0: 339; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 340; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 341; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 342; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 343; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 344; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 345; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 346; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 347; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 348; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 349; AVX512F-NEXT: vzeroupper 350; AVX512F-NEXT: retq 351; 352; AVX512VL-LABEL: var_rotate_v8i16: 353; AVX512VL: # %bb.0: 354; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 355; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 356; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 357; AVX512VL-NEXT: vpmovdw %ymm2, %xmm2 358; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 359; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 360; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 361; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 362; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 363; AVX512VL-NEXT: vpor %xmm0, %xmm2, %xmm0 364; AVX512VL-NEXT: vzeroupper 365; AVX512VL-NEXT: retq 366; 367; AVX512BW-LABEL: var_rotate_v8i16: 368; AVX512BW: # %bb.0: 369; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 370; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 371; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 372; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 373; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 374; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 375; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 376; AVX512BW-NEXT: vzeroupper 377; AVX512BW-NEXT: retq 378; 379; AVX512VLBW-LABEL: var_rotate_v8i16: 380; AVX512VLBW: # %bb.0: 381; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 382; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 383; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 384; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 385; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 386; AVX512VLBW-NEXT: retq 387; 388; XOP-LABEL: var_rotate_v8i16: 389; XOP: # %bb.0: 390; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0 391; XOP-NEXT: retq 392; 393; X32-SSE-LABEL: var_rotate_v8i16: 394; X32-SSE: # %bb.0: 395; X32-SSE-NEXT: pxor %xmm2, %xmm2 396; X32-SSE-NEXT: movdqa %xmm1, %xmm3 397; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 398; X32-SSE-NEXT: pslld $23, %xmm3 399; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 400; X32-SSE-NEXT: paddd %xmm4, %xmm3 401; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3 402; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 403; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 404; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 405; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 406; X32-SSE-NEXT: pslld $23, %xmm1 407; X32-SSE-NEXT: paddd %xmm4, %xmm1 408; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 409; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 410; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 411; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 412; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] 413; X32-SSE-NEXT: movdqa %xmm0, %xmm2 414; X32-SSE-NEXT: pmulhuw %xmm1, %xmm2 415; X32-SSE-NEXT: pmullw %xmm1, %xmm0 416; X32-SSE-NEXT: por %xmm2, %xmm0 417; X32-SSE-NEXT: retl 418 %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b 419 %shl = shl <8 x i16> %a, %b 420 %lshr = lshr <8 x i16> %a, %b16 421 %or = or <8 x i16> %shl, %lshr 422 ret <8 x i16> %or 423} 424 425define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 426; SSE2-LABEL: var_rotate_v16i8: 427; SSE2: # %bb.0: 428; SSE2-NEXT: movdqa %xmm0, %xmm2 429; SSE2-NEXT: psllw $5, %xmm1 430; SSE2-NEXT: pxor %xmm0, %xmm0 431; SSE2-NEXT: pxor %xmm3, %xmm3 432; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 433; SSE2-NEXT: movdqa %xmm2, %xmm4 434; SSE2-NEXT: psrlw $4, %xmm4 435; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 436; SSE2-NEXT: movdqa %xmm2, %xmm5 437; SSE2-NEXT: psllw $4, %xmm5 438; SSE2-NEXT: pand {{.*}}(%rip), %xmm5 439; SSE2-NEXT: por %xmm4, %xmm5 440; SSE2-NEXT: pand %xmm3, %xmm5 441; SSE2-NEXT: pandn %xmm2, %xmm3 442; SSE2-NEXT: por %xmm5, %xmm3 443; SSE2-NEXT: movdqa %xmm3, %xmm2 444; SSE2-NEXT: psrlw $6, %xmm2 445; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 446; SSE2-NEXT: movdqa %xmm3, %xmm4 447; SSE2-NEXT: psllw $2, %xmm4 448; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 449; SSE2-NEXT: por %xmm2, %xmm4 450; SSE2-NEXT: paddb %xmm1, %xmm1 451; SSE2-NEXT: pxor %xmm2, %xmm2 452; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 453; SSE2-NEXT: pand %xmm2, %xmm4 454; SSE2-NEXT: pandn %xmm3, %xmm2 455; SSE2-NEXT: por %xmm4, %xmm2 456; SSE2-NEXT: movdqa %xmm2, %xmm3 457; SSE2-NEXT: paddb %xmm2, %xmm3 458; SSE2-NEXT: movdqa %xmm2, %xmm4 459; SSE2-NEXT: psrlw $7, %xmm4 460; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 461; SSE2-NEXT: por %xmm3, %xmm4 462; SSE2-NEXT: paddb %xmm1, %xmm1 463; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 464; SSE2-NEXT: pand %xmm0, %xmm4 465; SSE2-NEXT: pandn %xmm2, %xmm0 466; SSE2-NEXT: por %xmm4, %xmm0 467; SSE2-NEXT: retq 468; 469; SSE41-LABEL: var_rotate_v16i8: 470; SSE41: # %bb.0: 471; SSE41-NEXT: movdqa %xmm1, %xmm2 472; SSE41-NEXT: movdqa %xmm0, %xmm1 473; SSE41-NEXT: psrlw $4, %xmm0 474; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 475; SSE41-NEXT: movdqa %xmm1, %xmm3 476; SSE41-NEXT: psllw $4, %xmm3 477; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 478; SSE41-NEXT: por %xmm0, %xmm3 479; SSE41-NEXT: psllw $5, %xmm2 480; SSE41-NEXT: movdqa %xmm2, %xmm0 481; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 482; SSE41-NEXT: movdqa %xmm1, %xmm0 483; SSE41-NEXT: psrlw $6, %xmm0 484; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 485; SSE41-NEXT: movdqa %xmm1, %xmm3 486; SSE41-NEXT: psllw $2, %xmm3 487; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 488; SSE41-NEXT: por %xmm0, %xmm3 489; SSE41-NEXT: paddb %xmm2, %xmm2 490; SSE41-NEXT: movdqa %xmm2, %xmm0 491; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 492; SSE41-NEXT: movdqa %xmm1, %xmm0 493; SSE41-NEXT: paddb %xmm1, %xmm0 494; SSE41-NEXT: movdqa %xmm1, %xmm3 495; SSE41-NEXT: psrlw $7, %xmm3 496; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 497; SSE41-NEXT: por %xmm0, %xmm3 498; SSE41-NEXT: paddb %xmm2, %xmm2 499; SSE41-NEXT: movdqa %xmm2, %xmm0 500; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 501; SSE41-NEXT: movdqa %xmm1, %xmm0 502; SSE41-NEXT: retq 503; 504; AVX-LABEL: var_rotate_v16i8: 505; AVX: # %bb.0: 506; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 507; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 508; AVX-NEXT: vpsllw $4, %xmm0, %xmm3 509; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 510; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 511; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 512; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 513; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2 514; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 515; AVX-NEXT: vpsllw $2, %xmm0, %xmm3 516; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 517; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 518; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 519; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 520; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 521; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 522; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 523; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 524; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 525; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 526; AVX-NEXT: retq 527; 528; AVX512F-LABEL: var_rotate_v16i8: 529; AVX512F: # %bb.0: 530; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm2 531; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 532; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm3 533; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 534; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 535; AVX512F-NEXT: vpsllw $5, %xmm1, %xmm1 536; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 537; AVX512F-NEXT: vpsrlw $6, %xmm0, %xmm2 538; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 539; AVX512F-NEXT: vpsllw $2, %xmm0, %xmm3 540; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 541; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 542; AVX512F-NEXT: vpaddb %xmm1, %xmm1, %xmm1 543; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 544; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm2 545; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm3 546; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 547; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 548; AVX512F-NEXT: vpaddb %xmm1, %xmm1, %xmm1 549; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 550; AVX512F-NEXT: retq 551; 552; AVX512VL-LABEL: var_rotate_v16i8: 553; AVX512VL: # %bb.0: 554; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm2 555; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 556; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm3 557; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 558; AVX512VL-NEXT: vpor %xmm2, %xmm3, %xmm2 559; AVX512VL-NEXT: vpsllw $5, %xmm1, %xmm1 560; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 561; AVX512VL-NEXT: vpsrlw $6, %xmm0, %xmm2 562; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 563; AVX512VL-NEXT: vpsllw $2, %xmm0, %xmm3 564; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 565; AVX512VL-NEXT: vpor %xmm2, %xmm3, %xmm2 566; AVX512VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 567; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 568; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm2 569; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm3 570; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 571; AVX512VL-NEXT: vpor %xmm3, %xmm2, %xmm2 572; AVX512VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 573; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 574; AVX512VL-NEXT: retq 575; 576; AVX512BW-LABEL: var_rotate_v16i8: 577; AVX512BW: # %bb.0: 578; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 579; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 580; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 581; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 582; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 583; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1 584; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 585; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 586; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 587; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 588; AVX512BW-NEXT: vzeroupper 589; AVX512BW-NEXT: retq 590; 591; AVX512VLBW-LABEL: var_rotate_v16i8: 592; AVX512VLBW: # %bb.0: 593; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 594; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 595; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm2 596; AVX512VLBW-NEXT: vpmovwb %ymm2, %xmm2 597; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 598; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1 599; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 600; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 601; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 602; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 603; AVX512VLBW-NEXT: vzeroupper 604; AVX512VLBW-NEXT: retq 605; 606; XOP-LABEL: var_rotate_v16i8: 607; XOP: # %bb.0: 608; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0 609; XOP-NEXT: retq 610; 611; X32-SSE-LABEL: var_rotate_v16i8: 612; X32-SSE: # %bb.0: 613; X32-SSE-NEXT: movdqa %xmm0, %xmm2 614; X32-SSE-NEXT: psllw $5, %xmm1 615; X32-SSE-NEXT: pxor %xmm0, %xmm0 616; X32-SSE-NEXT: pxor %xmm3, %xmm3 617; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 618; X32-SSE-NEXT: movdqa %xmm2, %xmm4 619; X32-SSE-NEXT: psrlw $4, %xmm4 620; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 621; X32-SSE-NEXT: movdqa %xmm2, %xmm5 622; X32-SSE-NEXT: psllw $4, %xmm5 623; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm5 624; X32-SSE-NEXT: por %xmm4, %xmm5 625; X32-SSE-NEXT: pand %xmm3, %xmm5 626; X32-SSE-NEXT: pandn %xmm2, %xmm3 627; X32-SSE-NEXT: por %xmm5, %xmm3 628; X32-SSE-NEXT: movdqa %xmm3, %xmm2 629; X32-SSE-NEXT: psrlw $6, %xmm2 630; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 631; X32-SSE-NEXT: movdqa %xmm3, %xmm4 632; X32-SSE-NEXT: psllw $2, %xmm4 633; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 634; X32-SSE-NEXT: por %xmm2, %xmm4 635; X32-SSE-NEXT: paddb %xmm1, %xmm1 636; X32-SSE-NEXT: pxor %xmm2, %xmm2 637; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 638; X32-SSE-NEXT: pand %xmm2, %xmm4 639; X32-SSE-NEXT: pandn %xmm3, %xmm2 640; X32-SSE-NEXT: por %xmm4, %xmm2 641; X32-SSE-NEXT: movdqa %xmm2, %xmm3 642; X32-SSE-NEXT: paddb %xmm2, %xmm3 643; X32-SSE-NEXT: movdqa %xmm2, %xmm4 644; X32-SSE-NEXT: psrlw $7, %xmm4 645; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 646; X32-SSE-NEXT: por %xmm3, %xmm4 647; X32-SSE-NEXT: paddb %xmm1, %xmm1 648; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm0 649; X32-SSE-NEXT: pand %xmm0, %xmm4 650; X32-SSE-NEXT: pandn %xmm2, %xmm0 651; X32-SSE-NEXT: por %xmm4, %xmm0 652; X32-SSE-NEXT: retl 653 %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 654 %shl = shl <16 x i8> %a, %b 655 %lshr = lshr <16 x i8> %a, %b8 656 %or = or <16 x i8> %shl, %lshr 657 ret <16 x i8> %or 658} 659 660; 661; Uniform Variable Rotates 662; 663 664define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 665; SSE-LABEL: splatvar_rotate_v2i64: 666; SSE: # %bb.0: 667; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] 668; SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,64] 669; SSE-NEXT: psubq %xmm2, %xmm3 670; SSE-NEXT: movdqa %xmm0, %xmm2 671; SSE-NEXT: psllq %xmm1, %xmm2 672; SSE-NEXT: psrlq %xmm3, %xmm0 673; SSE-NEXT: por %xmm2, %xmm0 674; SSE-NEXT: retq 675; 676; AVX1-LABEL: splatvar_rotate_v2i64: 677; AVX1: # %bb.0: 678; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] 679; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64] 680; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 681; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 682; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 683; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 684; AVX1-NEXT: retq 685; 686; AVX2-LABEL: splatvar_rotate_v2i64: 687; AVX2: # %bb.0: 688; AVX2-NEXT: vpbroadcastq %xmm1, %xmm2 689; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64] 690; AVX2-NEXT: vpsubq %xmm2, %xmm3, %xmm2 691; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm1 692; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 693; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 694; AVX2-NEXT: retq 695; 696; AVX512F-LABEL: splatvar_rotate_v2i64: 697; AVX512F: # %bb.0: 698; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 699; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 700; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 701; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 702; AVX512F-NEXT: vzeroupper 703; AVX512F-NEXT: retq 704; 705; AVX512VL-LABEL: splatvar_rotate_v2i64: 706; AVX512VL: # %bb.0: 707; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 708; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0 709; AVX512VL-NEXT: retq 710; 711; AVX512BW-LABEL: splatvar_rotate_v2i64: 712; AVX512BW: # %bb.0: 713; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 714; AVX512BW-NEXT: vpbroadcastq %xmm1, %xmm1 715; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 716; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 717; AVX512BW-NEXT: vzeroupper 718; AVX512BW-NEXT: retq 719; 720; AVX512VLBW-LABEL: splatvar_rotate_v2i64: 721; AVX512VLBW: # %bb.0: 722; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %xmm1 723; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0 724; AVX512VLBW-NEXT: retq 725; 726; XOPAVX1-LABEL: splatvar_rotate_v2i64: 727; XOPAVX1: # %bb.0: 728; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 729; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 730; XOPAVX1-NEXT: retq 731; 732; XOPAVX2-LABEL: splatvar_rotate_v2i64: 733; XOPAVX2: # %bb.0: 734; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 735; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 736; XOPAVX2-NEXT: retq 737; 738; X32-SSE-LABEL: splatvar_rotate_v2i64: 739; X32-SSE: # %bb.0: 740; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] 741; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0] 742; X32-SSE-NEXT: psubq %xmm2, %xmm3 743; X32-SSE-NEXT: movdqa %xmm0, %xmm2 744; X32-SSE-NEXT: psllq %xmm1, %xmm2 745; X32-SSE-NEXT: movdqa %xmm0, %xmm1 746; X32-SSE-NEXT: psrlq %xmm3, %xmm1 747; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 748; X32-SSE-NEXT: psrlq %xmm3, %xmm0 749; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 750; X32-SSE-NEXT: orpd %xmm2, %xmm0 751; X32-SSE-NEXT: retl 752 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer 753 %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat 754 %shl = shl <2 x i64> %a, %splat 755 %lshr = lshr <2 x i64> %a, %splat64 756 %or = or <2 x i64> %shl, %lshr 757 ret <2 x i64> %or 758} 759 760define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 761; SSE2-LABEL: splatvar_rotate_v4i32: 762; SSE2: # %bb.0: 763; SSE2-NEXT: xorps %xmm2, %xmm2 764; SSE2-NEXT: xorps %xmm3, %xmm3 765; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] 766; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 767; SSE2-NEXT: movdqa %xmm0, %xmm4 768; SSE2-NEXT: pslld %xmm3, %xmm4 769; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32] 770; SSE2-NEXT: psubd %xmm1, %xmm3 771; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] 772; SSE2-NEXT: psrld %xmm2, %xmm0 773; SSE2-NEXT: por %xmm4, %xmm0 774; SSE2-NEXT: retq 775; 776; SSE41-LABEL: splatvar_rotate_v4i32: 777; SSE41: # %bb.0: 778; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 779; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 780; SSE41-NEXT: movdqa %xmm0, %xmm3 781; SSE41-NEXT: pslld %xmm2, %xmm3 782; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32] 783; SSE41-NEXT: psubd %xmm1, %xmm2 784; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero 785; SSE41-NEXT: psrld %xmm1, %xmm0 786; SSE41-NEXT: por %xmm3, %xmm0 787; SSE41-NEXT: retq 788; 789; AVX1-LABEL: splatvar_rotate_v4i32: 790; AVX1: # %bb.0: 791; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 792; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 793; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 794; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] 795; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 796; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 797; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 798; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 799; AVX1-NEXT: retq 800; 801; AVX2-LABEL: splatvar_rotate_v4i32: 802; AVX2: # %bb.0: 803; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 804; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 805; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 806; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 807; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 808; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 809; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 810; AVX2-NEXT: retq 811; 812; AVX512F-LABEL: splatvar_rotate_v4i32: 813; AVX512F: # %bb.0: 814; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 815; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 816; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 817; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 818; AVX512F-NEXT: vzeroupper 819; AVX512F-NEXT: retq 820; 821; AVX512VL-LABEL: splatvar_rotate_v4i32: 822; AVX512VL: # %bb.0: 823; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 824; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 825; AVX512VL-NEXT: retq 826; 827; AVX512BW-LABEL: splatvar_rotate_v4i32: 828; AVX512BW: # %bb.0: 829; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 830; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 831; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 832; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 833; AVX512BW-NEXT: vzeroupper 834; AVX512BW-NEXT: retq 835; 836; AVX512VLBW-LABEL: splatvar_rotate_v4i32: 837; AVX512VLBW: # %bb.0: 838; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %xmm1 839; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 840; AVX512VLBW-NEXT: retq 841; 842; XOPAVX1-LABEL: splatvar_rotate_v4i32: 843; XOPAVX1: # %bb.0: 844; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 845; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 846; XOPAVX1-NEXT: retq 847; 848; XOPAVX2-LABEL: splatvar_rotate_v4i32: 849; XOPAVX2: # %bb.0: 850; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 851; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 852; XOPAVX2-NEXT: retq 853; 854; X32-SSE-LABEL: splatvar_rotate_v4i32: 855; X32-SSE: # %bb.0: 856; X32-SSE-NEXT: xorps %xmm2, %xmm2 857; X32-SSE-NEXT: xorps %xmm3, %xmm3 858; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] 859; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 860; X32-SSE-NEXT: movdqa %xmm0, %xmm4 861; X32-SSE-NEXT: pslld %xmm3, %xmm4 862; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32] 863; X32-SSE-NEXT: psubd %xmm1, %xmm3 864; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] 865; X32-SSE-NEXT: psrld %xmm2, %xmm0 866; X32-SSE-NEXT: por %xmm4, %xmm0 867; X32-SSE-NEXT: retl 868 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer 869 %splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat 870 %shl = shl <4 x i32> %a, %splat 871 %lshr = lshr <4 x i32> %a, %splat32 872 %or = or <4 x i32> %shl, %lshr 873 ret <4 x i32> %or 874} 875 876define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 877; SSE2-LABEL: splatvar_rotate_v8i16: 878; SSE2: # %bb.0: 879; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7] 880; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 881; SSE2-NEXT: pextrw $0, %xmm1, %eax 882; SSE2-NEXT: movd %eax, %xmm1 883; SSE2-NEXT: movdqa %xmm0, %xmm3 884; SSE2-NEXT: psllw %xmm1, %xmm3 885; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] 886; SSE2-NEXT: psubw %xmm2, %xmm1 887; SSE2-NEXT: pextrw $0, %xmm1, %eax 888; SSE2-NEXT: movd %eax, %xmm1 889; SSE2-NEXT: psrlw %xmm1, %xmm0 890; SSE2-NEXT: por %xmm3, %xmm0 891; SSE2-NEXT: retq 892; 893; SSE41-LABEL: splatvar_rotate_v8i16: 894; SSE41: # %bb.0: 895; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 896; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 897; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 898; SSE41-NEXT: movdqa %xmm0, %xmm3 899; SSE41-NEXT: psllw %xmm2, %xmm3 900; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 901; SSE41-NEXT: psubw %xmm1, %xmm2 902; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 903; SSE41-NEXT: psrlw %xmm1, %xmm0 904; SSE41-NEXT: por %xmm3, %xmm0 905; SSE41-NEXT: retq 906; 907; AVX1-LABEL: splatvar_rotate_v8i16: 908; AVX1: # %bb.0: 909; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 910; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 911; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 912; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2 913; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 914; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 915; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 916; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 917; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 918; AVX1-NEXT: retq 919; 920; AVX2-LABEL: splatvar_rotate_v8i16: 921; AVX2: # %bb.0: 922; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 923; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 924; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm2 925; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 926; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1 927; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 928; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 929; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 930; AVX2-NEXT: retq 931; 932; AVX512F-LABEL: splatvar_rotate_v8i16: 933; AVX512F: # %bb.0: 934; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 935; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 936; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm2 937; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 938; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 939; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 940; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 941; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 942; AVX512F-NEXT: retq 943; 944; AVX512VL-LABEL: splatvar_rotate_v8i16: 945; AVX512VL: # %bb.0: 946; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 947; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 948; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm2 949; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 950; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 951; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 952; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 953; AVX512VL-NEXT: vpor %xmm0, %xmm2, %xmm0 954; AVX512VL-NEXT: retq 955; 956; AVX512BW-LABEL: splatvar_rotate_v8i16: 957; AVX512BW: # %bb.0: 958; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 959; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 960; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 961; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 962; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 963; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 964; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 965; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 966; AVX512BW-NEXT: vzeroupper 967; AVX512BW-NEXT: retq 968; 969; AVX512VLBW-LABEL: splatvar_rotate_v8i16: 970; AVX512VLBW: # %bb.0: 971; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 972; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 973; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm2 974; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 975; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 976; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 977; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 978; AVX512VLBW-NEXT: retq 979; 980; XOPAVX1-LABEL: splatvar_rotate_v8i16: 981; XOPAVX1: # %bb.0: 982; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 983; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 984; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 985; XOPAVX1-NEXT: retq 986; 987; XOPAVX2-LABEL: splatvar_rotate_v8i16: 988; XOPAVX2: # %bb.0: 989; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 990; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 991; XOPAVX2-NEXT: retq 992; 993; X32-SSE-LABEL: splatvar_rotate_v8i16: 994; X32-SSE: # %bb.0: 995; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7] 996; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 997; X32-SSE-NEXT: pextrw $0, %xmm1, %eax 998; X32-SSE-NEXT: movd %eax, %xmm1 999; X32-SSE-NEXT: movdqa %xmm0, %xmm3 1000; X32-SSE-NEXT: psllw %xmm1, %xmm3 1001; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] 1002; X32-SSE-NEXT: psubw %xmm2, %xmm1 1003; X32-SSE-NEXT: pextrw $0, %xmm1, %eax 1004; X32-SSE-NEXT: movd %eax, %xmm1 1005; X32-SSE-NEXT: psrlw %xmm1, %xmm0 1006; X32-SSE-NEXT: por %xmm3, %xmm0 1007; X32-SSE-NEXT: retl 1008 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer 1009 %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat 1010 %shl = shl <8 x i16> %a, %splat 1011 %lshr = lshr <8 x i16> %a, %splat16 1012 %or = or <8 x i16> %shl, %lshr 1013 ret <8 x i16> %or 1014} 1015 1016define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 1017; SSE2-LABEL: splatvar_rotate_v16i8: 1018; SSE2: # %bb.0: 1019; SSE2-NEXT: movdqa %xmm0, %xmm2 1020; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1021; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] 1022; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] 1023; SSE2-NEXT: movdqa %xmm2, %xmm0 1024; SSE2-NEXT: psrlw $4, %xmm0 1025; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1026; SSE2-NEXT: movdqa %xmm2, %xmm3 1027; SSE2-NEXT: psllw $4, %xmm3 1028; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1029; SSE2-NEXT: por %xmm0, %xmm3 1030; SSE2-NEXT: psllw $5, %xmm1 1031; SSE2-NEXT: pxor %xmm0, %xmm0 1032; SSE2-NEXT: pxor %xmm4, %xmm4 1033; SSE2-NEXT: pcmpgtb %xmm1, %xmm4 1034; SSE2-NEXT: pand %xmm4, %xmm3 1035; SSE2-NEXT: pandn %xmm2, %xmm4 1036; SSE2-NEXT: por %xmm3, %xmm4 1037; SSE2-NEXT: movdqa %xmm4, %xmm2 1038; SSE2-NEXT: psrlw $6, %xmm2 1039; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1040; SSE2-NEXT: movdqa %xmm4, %xmm3 1041; SSE2-NEXT: psllw $2, %xmm3 1042; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1043; SSE2-NEXT: por %xmm2, %xmm3 1044; SSE2-NEXT: paddb %xmm1, %xmm1 1045; SSE2-NEXT: pxor %xmm2, %xmm2 1046; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 1047; SSE2-NEXT: pand %xmm2, %xmm3 1048; SSE2-NEXT: pandn %xmm4, %xmm2 1049; SSE2-NEXT: por %xmm3, %xmm2 1050; SSE2-NEXT: movdqa %xmm2, %xmm3 1051; SSE2-NEXT: paddb %xmm2, %xmm3 1052; SSE2-NEXT: movdqa %xmm2, %xmm4 1053; SSE2-NEXT: psrlw $7, %xmm4 1054; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1055; SSE2-NEXT: por %xmm3, %xmm4 1056; SSE2-NEXT: paddb %xmm1, %xmm1 1057; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 1058; SSE2-NEXT: pand %xmm0, %xmm4 1059; SSE2-NEXT: pandn %xmm2, %xmm0 1060; SSE2-NEXT: por %xmm4, %xmm0 1061; SSE2-NEXT: retq 1062; 1063; SSE41-LABEL: splatvar_rotate_v16i8: 1064; SSE41: # %bb.0: 1065; SSE41-NEXT: movdqa %xmm1, %xmm2 1066; SSE41-NEXT: movdqa %xmm0, %xmm1 1067; SSE41-NEXT: pxor %xmm0, %xmm0 1068; SSE41-NEXT: pshufb %xmm0, %xmm2 1069; SSE41-NEXT: movdqa %xmm1, %xmm0 1070; SSE41-NEXT: psrlw $4, %xmm0 1071; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 1072; SSE41-NEXT: movdqa %xmm1, %xmm3 1073; SSE41-NEXT: psllw $4, %xmm3 1074; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1075; SSE41-NEXT: por %xmm0, %xmm3 1076; SSE41-NEXT: psllw $5, %xmm2 1077; SSE41-NEXT: movdqa %xmm2, %xmm0 1078; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1079; SSE41-NEXT: movdqa %xmm1, %xmm0 1080; SSE41-NEXT: psrlw $6, %xmm0 1081; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 1082; SSE41-NEXT: movdqa %xmm1, %xmm3 1083; SSE41-NEXT: psllw $2, %xmm3 1084; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1085; SSE41-NEXT: por %xmm0, %xmm3 1086; SSE41-NEXT: paddb %xmm2, %xmm2 1087; SSE41-NEXT: movdqa %xmm2, %xmm0 1088; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1089; SSE41-NEXT: movdqa %xmm1, %xmm0 1090; SSE41-NEXT: paddb %xmm1, %xmm0 1091; SSE41-NEXT: movdqa %xmm1, %xmm3 1092; SSE41-NEXT: psrlw $7, %xmm3 1093; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1094; SSE41-NEXT: por %xmm0, %xmm3 1095; SSE41-NEXT: paddb %xmm2, %xmm2 1096; SSE41-NEXT: movdqa %xmm2, %xmm0 1097; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1098; SSE41-NEXT: movdqa %xmm1, %xmm0 1099; SSE41-NEXT: retq 1100; 1101; AVX1-LABEL: splatvar_rotate_v16i8: 1102; AVX1: # %bb.0: 1103; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1104; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1105; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 1106; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1107; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 1108; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1109; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 1110; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 1111; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1112; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm2 1113; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1114; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 1115; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1116; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 1117; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1118; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1119; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 1120; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3 1121; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1122; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 1123; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1124; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1125; AVX1-NEXT: retq 1126; 1127; AVX2-LABEL: splatvar_rotate_v16i8: 1128; AVX2: # %bb.0: 1129; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1130; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2 1131; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1132; AVX2-NEXT: vpsllw $4, %xmm0, %xmm3 1133; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1134; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 1135; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 1136; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1137; AVX2-NEXT: vpsrlw $6, %xmm0, %xmm2 1138; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1139; AVX2-NEXT: vpsllw $2, %xmm0, %xmm3 1140; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1141; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 1142; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1143; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1144; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 1145; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm3 1146; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1147; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 1148; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1149; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1150; AVX2-NEXT: retq 1151; 1152; AVX512F-LABEL: splatvar_rotate_v16i8: 1153; AVX512F: # %bb.0: 1154; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 1155; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm2 1156; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1157; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm3 1158; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1159; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 1160; AVX512F-NEXT: vpsllw $5, %xmm1, %xmm1 1161; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1162; AVX512F-NEXT: vpsrlw $6, %xmm0, %xmm2 1163; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1164; AVX512F-NEXT: vpsllw $2, %xmm0, %xmm3 1165; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1166; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 1167; AVX512F-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1168; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1169; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm2 1170; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm3 1171; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1172; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 1173; AVX512F-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1174; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1175; AVX512F-NEXT: retq 1176; 1177; AVX512VL-LABEL: splatvar_rotate_v16i8: 1178; AVX512VL: # %bb.0: 1179; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 1180; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm2 1181; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1182; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm3 1183; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1184; AVX512VL-NEXT: vpor %xmm2, %xmm3, %xmm2 1185; AVX512VL-NEXT: vpsllw $5, %xmm1, %xmm1 1186; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1187; AVX512VL-NEXT: vpsrlw $6, %xmm0, %xmm2 1188; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1189; AVX512VL-NEXT: vpsllw $2, %xmm0, %xmm3 1190; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1191; AVX512VL-NEXT: vpor %xmm2, %xmm3, %xmm2 1192; AVX512VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1193; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1194; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm2 1195; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm3 1196; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1197; AVX512VL-NEXT: vpor %xmm3, %xmm2, %xmm2 1198; AVX512VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1199; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1200; AVX512VL-NEXT: retq 1201; 1202; AVX512BW-LABEL: splatvar_rotate_v16i8: 1203; AVX512BW: # %bb.0: 1204; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 1205; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1206; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1207; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 1208; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 1209; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1210; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1211; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1212; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1213; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1214; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 1215; AVX512BW-NEXT: vzeroupper 1216; AVX512BW-NEXT: retq 1217; 1218; AVX512VLBW-LABEL: splatvar_rotate_v16i8: 1219; AVX512VLBW: # %bb.0: 1220; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 1221; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1222; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1223; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm2 1224; AVX512VLBW-NEXT: vpmovwb %ymm2, %xmm2 1225; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1226; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1227; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1228; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 1229; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1230; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 1231; AVX512VLBW-NEXT: vzeroupper 1232; AVX512VLBW-NEXT: retq 1233; 1234; XOPAVX1-LABEL: splatvar_rotate_v16i8: 1235; XOPAVX1: # %bb.0: 1236; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1237; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1238; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 1239; XOPAVX1-NEXT: retq 1240; 1241; XOPAVX2-LABEL: splatvar_rotate_v16i8: 1242; XOPAVX2: # %bb.0: 1243; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1244; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 1245; XOPAVX2-NEXT: retq 1246; 1247; X32-SSE-LABEL: splatvar_rotate_v16i8: 1248; X32-SSE: # %bb.0: 1249; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1250; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1251; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] 1252; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] 1253; X32-SSE-NEXT: movdqa %xmm2, %xmm0 1254; X32-SSE-NEXT: psrlw $4, %xmm0 1255; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 1256; X32-SSE-NEXT: movdqa %xmm2, %xmm3 1257; X32-SSE-NEXT: psllw $4, %xmm3 1258; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 1259; X32-SSE-NEXT: por %xmm0, %xmm3 1260; X32-SSE-NEXT: psllw $5, %xmm1 1261; X32-SSE-NEXT: pxor %xmm0, %xmm0 1262; X32-SSE-NEXT: pxor %xmm4, %xmm4 1263; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm4 1264; X32-SSE-NEXT: pand %xmm4, %xmm3 1265; X32-SSE-NEXT: pandn %xmm2, %xmm4 1266; X32-SSE-NEXT: por %xmm3, %xmm4 1267; X32-SSE-NEXT: movdqa %xmm4, %xmm2 1268; X32-SSE-NEXT: psrlw $6, %xmm2 1269; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 1270; X32-SSE-NEXT: movdqa %xmm4, %xmm3 1271; X32-SSE-NEXT: psllw $2, %xmm3 1272; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 1273; X32-SSE-NEXT: por %xmm2, %xmm3 1274; X32-SSE-NEXT: paddb %xmm1, %xmm1 1275; X32-SSE-NEXT: pxor %xmm2, %xmm2 1276; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 1277; X32-SSE-NEXT: pand %xmm2, %xmm3 1278; X32-SSE-NEXT: pandn %xmm4, %xmm2 1279; X32-SSE-NEXT: por %xmm3, %xmm2 1280; X32-SSE-NEXT: movdqa %xmm2, %xmm3 1281; X32-SSE-NEXT: paddb %xmm2, %xmm3 1282; X32-SSE-NEXT: movdqa %xmm2, %xmm4 1283; X32-SSE-NEXT: psrlw $7, %xmm4 1284; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 1285; X32-SSE-NEXT: por %xmm3, %xmm4 1286; X32-SSE-NEXT: paddb %xmm1, %xmm1 1287; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm0 1288; X32-SSE-NEXT: pand %xmm0, %xmm4 1289; X32-SSE-NEXT: pandn %xmm2, %xmm0 1290; X32-SSE-NEXT: por %xmm4, %xmm0 1291; X32-SSE-NEXT: retl 1292 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer 1293 %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat 1294 %shl = shl <16 x i8> %a, %splat 1295 %lshr = lshr <16 x i8> %a, %splat8 1296 %or = or <16 x i8> %shl, %lshr 1297 ret <16 x i8> %or 1298} 1299 1300; 1301; Constant Rotates 1302; 1303 1304define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { 1305; SSE2-LABEL: constant_rotate_v2i64: 1306; SSE2: # %bb.0: 1307; SSE2-NEXT: movdqa %xmm0, %xmm1 1308; SSE2-NEXT: psllq $4, %xmm1 1309; SSE2-NEXT: movdqa %xmm0, %xmm2 1310; SSE2-NEXT: psllq $14, %xmm2 1311; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1312; SSE2-NEXT: movdqa %xmm0, %xmm1 1313; SSE2-NEXT: psrlq $60, %xmm1 1314; SSE2-NEXT: psrlq $50, %xmm0 1315; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1316; SSE2-NEXT: orpd %xmm2, %xmm0 1317; SSE2-NEXT: retq 1318; 1319; SSE41-LABEL: constant_rotate_v2i64: 1320; SSE41: # %bb.0: 1321; SSE41-NEXT: movdqa %xmm0, %xmm1 1322; SSE41-NEXT: psllq $14, %xmm1 1323; SSE41-NEXT: movdqa %xmm0, %xmm2 1324; SSE41-NEXT: psllq $4, %xmm2 1325; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1326; SSE41-NEXT: movdqa %xmm0, %xmm1 1327; SSE41-NEXT: psrlq $50, %xmm1 1328; SSE41-NEXT: psrlq $60, %xmm0 1329; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1330; SSE41-NEXT: por %xmm2, %xmm0 1331; SSE41-NEXT: retq 1332; 1333; AVX1-LABEL: constant_rotate_v2i64: 1334; AVX1: # %bb.0: 1335; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1 1336; AVX1-NEXT: vpsllq $4, %xmm0, %xmm2 1337; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1338; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm2 1339; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 1340; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1341; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 1342; AVX1-NEXT: retq 1343; 1344; AVX2-LABEL: constant_rotate_v2i64: 1345; AVX2: # %bb.0: 1346; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 1347; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 1348; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 1349; AVX2-NEXT: retq 1350; 1351; AVX512F-LABEL: constant_rotate_v2i64: 1352; AVX512F: # %bb.0: 1353; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1354; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1355; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1356; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1357; AVX512F-NEXT: vzeroupper 1358; AVX512F-NEXT: retq 1359; 1360; AVX512VL-LABEL: constant_rotate_v2i64: 1361; AVX512VL: # %bb.0: 1362; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 1363; AVX512VL-NEXT: retq 1364; 1365; AVX512BW-LABEL: constant_rotate_v2i64: 1366; AVX512BW: # %bb.0: 1367; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1368; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1369; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1370; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1371; AVX512BW-NEXT: vzeroupper 1372; AVX512BW-NEXT: retq 1373; 1374; AVX512VLBW-LABEL: constant_rotate_v2i64: 1375; AVX512VLBW: # %bb.0: 1376; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 1377; AVX512VLBW-NEXT: retq 1378; 1379; XOP-LABEL: constant_rotate_v2i64: 1380; XOP: # %bb.0: 1381; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 1382; XOP-NEXT: retq 1383; 1384; X32-SSE-LABEL: constant_rotate_v2i64: 1385; X32-SSE: # %bb.0: 1386; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1387; X32-SSE-NEXT: psllq $4, %xmm1 1388; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1389; X32-SSE-NEXT: psllq $14, %xmm2 1390; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1391; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1392; X32-SSE-NEXT: psrlq $60, %xmm1 1393; X32-SSE-NEXT: psrlq $50, %xmm0 1394; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1395; X32-SSE-NEXT: orpd %xmm2, %xmm0 1396; X32-SSE-NEXT: retl 1397 %shl = shl <2 x i64> %a, <i64 4, i64 14> 1398 %lshr = lshr <2 x i64> %a, <i64 60, i64 50> 1399 %or = or <2 x i64> %shl, %lshr 1400 ret <2 x i64> %or 1401} 1402 1403define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { 1404; SSE2-LABEL: constant_rotate_v4i32: 1405; SSE2: # %bb.0: 1406; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1407; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1408; SSE2-NEXT: pmuludq %xmm1, %xmm0 1409; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 1410; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1411; SSE2-NEXT: pmuludq %xmm2, %xmm1 1412; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1413; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1414; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1415; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1416; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1417; SSE2-NEXT: por %xmm3, %xmm0 1418; SSE2-NEXT: retq 1419; 1420; SSE41-LABEL: constant_rotate_v4i32: 1421; SSE41: # %bb.0: 1422; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1423; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1424; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1425; SSE41-NEXT: pmuludq %xmm2, %xmm3 1426; SSE41-NEXT: pmuludq %xmm1, %xmm0 1427; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1428; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1429; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 1430; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1431; SSE41-NEXT: por %xmm1, %xmm0 1432; SSE41-NEXT: retq 1433; 1434; AVX1-LABEL: constant_rotate_v4i32: 1435; AVX1: # %bb.0: 1436; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,32,64,128] 1437; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1438; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1439; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 1440; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1441; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1442; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1443; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 1444; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1445; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1446; AVX1-NEXT: retq 1447; 1448; AVX2-LABEL: constant_rotate_v4i32: 1449; AVX2: # %bb.0: 1450; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 1451; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 1452; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1453; AVX2-NEXT: retq 1454; 1455; AVX512F-LABEL: constant_rotate_v4i32: 1456; AVX512F: # %bb.0: 1457; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1458; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1459; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1460; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1461; AVX512F-NEXT: vzeroupper 1462; AVX512F-NEXT: retq 1463; 1464; AVX512VL-LABEL: constant_rotate_v4i32: 1465; AVX512VL: # %bb.0: 1466; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 1467; AVX512VL-NEXT: retq 1468; 1469; AVX512BW-LABEL: constant_rotate_v4i32: 1470; AVX512BW: # %bb.0: 1471; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1472; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1473; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1474; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1475; AVX512BW-NEXT: vzeroupper 1476; AVX512BW-NEXT: retq 1477; 1478; AVX512VLBW-LABEL: constant_rotate_v4i32: 1479; AVX512VLBW: # %bb.0: 1480; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 1481; AVX512VLBW-NEXT: retq 1482; 1483; XOP-LABEL: constant_rotate_v4i32: 1484; XOP: # %bb.0: 1485; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 1486; XOP-NEXT: retq 1487; 1488; X32-SSE-LABEL: constant_rotate_v4i32: 1489; X32-SSE: # %bb.0: 1490; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1491; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1492; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 1493; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 1494; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1495; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 1496; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1497; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1498; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1499; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1500; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1501; X32-SSE-NEXT: por %xmm3, %xmm0 1502; X32-SSE-NEXT: retl 1503 %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 1504 %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25> 1505 %or = or <4 x i32> %shl, %lshr 1506 ret <4 x i32> %or 1507} 1508 1509define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { 1510; SSE-LABEL: constant_rotate_v8i16: 1511; SSE: # %bb.0: 1512; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1513; SSE-NEXT: movdqa %xmm0, %xmm2 1514; SSE-NEXT: pmulhuw %xmm1, %xmm2 1515; SSE-NEXT: pmullw %xmm1, %xmm0 1516; SSE-NEXT: por %xmm2, %xmm0 1517; SSE-NEXT: retq 1518; 1519; AVX-LABEL: constant_rotate_v8i16: 1520; AVX: # %bb.0: 1521; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1522; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1523; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1524; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 1525; AVX-NEXT: retq 1526; 1527; AVX512F-LABEL: constant_rotate_v8i16: 1528; AVX512F: # %bb.0: 1529; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1530; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1531; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1532; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 1533; AVX512F-NEXT: retq 1534; 1535; AVX512VL-LABEL: constant_rotate_v8i16: 1536; AVX512VL: # %bb.0: 1537; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1538; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1539; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1540; AVX512VL-NEXT: vpor %xmm2, %xmm0, %xmm0 1541; AVX512VL-NEXT: retq 1542; 1543; AVX512BW-LABEL: constant_rotate_v8i16: 1544; AVX512BW: # %bb.0: 1545; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1546; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1547; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9] 1548; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 1549; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 1550; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0 1551; AVX512BW-NEXT: vzeroupper 1552; AVX512BW-NEXT: retq 1553; 1554; AVX512VLBW-LABEL: constant_rotate_v8i16: 1555; AVX512VLBW: # %bb.0: 1556; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1 1557; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 1558; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1559; AVX512VLBW-NEXT: retq 1560; 1561; XOP-LABEL: constant_rotate_v8i16: 1562; XOP: # %bb.0: 1563; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 1564; XOP-NEXT: retq 1565; 1566; X32-SSE-LABEL: constant_rotate_v8i16: 1567; X32-SSE: # %bb.0: 1568; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1569; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1570; X32-SSE-NEXT: pmulhuw %xmm1, %xmm2 1571; X32-SSE-NEXT: pmullw %xmm1, %xmm0 1572; X32-SSE-NEXT: por %xmm2, %xmm0 1573; X32-SSE-NEXT: retl 1574 %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1575 %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9> 1576 %or = or <8 x i16> %shl, %lshr 1577 ret <8 x i16> %or 1578} 1579 1580define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { 1581; SSE2-LABEL: constant_rotate_v16i8: 1582; SSE2: # %bb.0: 1583; SSE2-NEXT: movdqa %xmm0, %xmm1 1584; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1585; SSE2-NEXT: pxor %xmm0, %xmm0 1586; SSE2-NEXT: pxor %xmm3, %xmm3 1587; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 1588; SSE2-NEXT: movdqa %xmm1, %xmm4 1589; SSE2-NEXT: psrlw $4, %xmm4 1590; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1591; SSE2-NEXT: movdqa %xmm1, %xmm5 1592; SSE2-NEXT: psllw $4, %xmm5 1593; SSE2-NEXT: pand {{.*}}(%rip), %xmm5 1594; SSE2-NEXT: por %xmm4, %xmm5 1595; SSE2-NEXT: pand %xmm3, %xmm5 1596; SSE2-NEXT: pandn %xmm1, %xmm3 1597; SSE2-NEXT: por %xmm5, %xmm3 1598; SSE2-NEXT: movdqa %xmm3, %xmm1 1599; SSE2-NEXT: psrlw $6, %xmm1 1600; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1601; SSE2-NEXT: movdqa %xmm3, %xmm4 1602; SSE2-NEXT: psllw $2, %xmm4 1603; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1604; SSE2-NEXT: por %xmm1, %xmm4 1605; SSE2-NEXT: paddb %xmm2, %xmm2 1606; SSE2-NEXT: pxor %xmm1, %xmm1 1607; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 1608; SSE2-NEXT: pand %xmm1, %xmm4 1609; SSE2-NEXT: pandn %xmm3, %xmm1 1610; SSE2-NEXT: por %xmm4, %xmm1 1611; SSE2-NEXT: movdqa %xmm1, %xmm3 1612; SSE2-NEXT: paddb %xmm1, %xmm3 1613; SSE2-NEXT: movdqa %xmm1, %xmm4 1614; SSE2-NEXT: psrlw $7, %xmm4 1615; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1616; SSE2-NEXT: por %xmm3, %xmm4 1617; SSE2-NEXT: paddb %xmm2, %xmm2 1618; SSE2-NEXT: pcmpgtb %xmm2, %xmm0 1619; SSE2-NEXT: pand %xmm0, %xmm4 1620; SSE2-NEXT: pandn %xmm1, %xmm0 1621; SSE2-NEXT: por %xmm4, %xmm0 1622; SSE2-NEXT: retq 1623; 1624; SSE41-LABEL: constant_rotate_v16i8: 1625; SSE41: # %bb.0: 1626; SSE41-NEXT: movdqa %xmm0, %xmm1 1627; SSE41-NEXT: psrlw $4, %xmm0 1628; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 1629; SSE41-NEXT: movdqa %xmm1, %xmm2 1630; SSE41-NEXT: psllw $4, %xmm2 1631; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 1632; SSE41-NEXT: por %xmm0, %xmm2 1633; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,57600,41152,24704,8256] 1634; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1635; SSE41-NEXT: movdqa %xmm1, %xmm2 1636; SSE41-NEXT: psrlw $6, %xmm2 1637; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 1638; SSE41-NEXT: movdqa %xmm1, %xmm3 1639; SSE41-NEXT: psllw $2, %xmm3 1640; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1641; SSE41-NEXT: por %xmm2, %xmm3 1642; SSE41-NEXT: paddb %xmm0, %xmm0 1643; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1644; SSE41-NEXT: movdqa %xmm1, %xmm2 1645; SSE41-NEXT: paddb %xmm1, %xmm2 1646; SSE41-NEXT: movdqa %xmm1, %xmm3 1647; SSE41-NEXT: psrlw $7, %xmm3 1648; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1649; SSE41-NEXT: por %xmm2, %xmm3 1650; SSE41-NEXT: paddb %xmm0, %xmm0 1651; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1652; SSE41-NEXT: movdqa %xmm1, %xmm0 1653; SSE41-NEXT: retq 1654; 1655; AVX-LABEL: constant_rotate_v16i8: 1656; AVX: # %bb.0: 1657; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 1658; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1659; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 1660; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1661; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 1662; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1663; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1664; AVX-NEXT: vpsrlw $6, %xmm0, %xmm1 1665; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1666; AVX-NEXT: vpsllw $2, %xmm0, %xmm3 1667; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1668; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 1669; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1670; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1671; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm1 1672; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 1673; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1674; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 1675; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1676; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1677; AVX-NEXT: retq 1678; 1679; AVX512F-LABEL: constant_rotate_v16i8: 1680; AVX512F: # %bb.0: 1681; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm1 1682; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1683; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2 1684; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1685; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 1686; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1687; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1688; AVX512F-NEXT: vpsrlw $6, %xmm0, %xmm1 1689; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1690; AVX512F-NEXT: vpsllw $2, %xmm0, %xmm3 1691; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1692; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 1693; AVX512F-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1694; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1695; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm1 1696; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm3 1697; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1698; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 1699; AVX512F-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1700; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1701; AVX512F-NEXT: retq 1702; 1703; AVX512VL-LABEL: constant_rotate_v16i8: 1704; AVX512VL: # %bb.0: 1705; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm1 1706; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1707; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2 1708; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1709; AVX512VL-NEXT: vpor %xmm1, %xmm2, %xmm1 1710; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1711; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1712; AVX512VL-NEXT: vpsrlw $6, %xmm0, %xmm1 1713; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1714; AVX512VL-NEXT: vpsllw $2, %xmm0, %xmm3 1715; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1716; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1 1717; AVX512VL-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1718; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1719; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm1 1720; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm3 1721; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1722; AVX512VL-NEXT: vpor %xmm3, %xmm1, %xmm1 1723; AVX512VL-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1724; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1725; AVX512VL-NEXT: retq 1726; 1727; AVX512BW-LABEL: constant_rotate_v16i8: 1728; AVX512BW: # %bb.0: 1729; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1730; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1731; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 1732; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 1733; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1734; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1735; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1736; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1737; AVX512BW-NEXT: vzeroupper 1738; AVX512BW-NEXT: retq 1739; 1740; AVX512VLBW-LABEL: constant_rotate_v16i8: 1741; AVX512VLBW: # %bb.0: 1742; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1743; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1 1744; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1 1745; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 1746; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1747; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1748; AVX512VLBW-NEXT: vzeroupper 1749; AVX512VLBW-NEXT: retq 1750; 1751; XOP-LABEL: constant_rotate_v16i8: 1752; XOP: # %bb.0: 1753; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 1754; XOP-NEXT: retq 1755; 1756; X32-SSE-LABEL: constant_rotate_v16i8: 1757; X32-SSE: # %bb.0: 1758; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1759; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1760; X32-SSE-NEXT: pxor %xmm0, %xmm0 1761; X32-SSE-NEXT: pxor %xmm3, %xmm3 1762; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3 1763; X32-SSE-NEXT: movdqa %xmm1, %xmm4 1764; X32-SSE-NEXT: psrlw $4, %xmm4 1765; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 1766; X32-SSE-NEXT: movdqa %xmm1, %xmm5 1767; X32-SSE-NEXT: psllw $4, %xmm5 1768; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm5 1769; X32-SSE-NEXT: por %xmm4, %xmm5 1770; X32-SSE-NEXT: pand %xmm3, %xmm5 1771; X32-SSE-NEXT: pandn %xmm1, %xmm3 1772; X32-SSE-NEXT: por %xmm5, %xmm3 1773; X32-SSE-NEXT: movdqa %xmm3, %xmm1 1774; X32-SSE-NEXT: psrlw $6, %xmm1 1775; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 1776; X32-SSE-NEXT: movdqa %xmm3, %xmm4 1777; X32-SSE-NEXT: psllw $2, %xmm4 1778; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 1779; X32-SSE-NEXT: por %xmm1, %xmm4 1780; X32-SSE-NEXT: paddb %xmm2, %xmm2 1781; X32-SSE-NEXT: pxor %xmm1, %xmm1 1782; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1 1783; X32-SSE-NEXT: pand %xmm1, %xmm4 1784; X32-SSE-NEXT: pandn %xmm3, %xmm1 1785; X32-SSE-NEXT: por %xmm4, %xmm1 1786; X32-SSE-NEXT: movdqa %xmm1, %xmm3 1787; X32-SSE-NEXT: paddb %xmm1, %xmm3 1788; X32-SSE-NEXT: movdqa %xmm1, %xmm4 1789; X32-SSE-NEXT: psrlw $7, %xmm4 1790; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 1791; X32-SSE-NEXT: por %xmm3, %xmm4 1792; X32-SSE-NEXT: paddb %xmm2, %xmm2 1793; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm0 1794; X32-SSE-NEXT: pand %xmm0, %xmm4 1795; X32-SSE-NEXT: pandn %xmm1, %xmm0 1796; X32-SSE-NEXT: por %xmm4, %xmm0 1797; X32-SSE-NEXT: retl 1798 %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 1799 %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 1800 %or = or <16 x i8> %shl, %lshr 1801 ret <16 x i8> %or 1802} 1803 1804; 1805; Uniform Constant Rotates 1806; 1807 1808define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind { 1809; SSE-LABEL: splatconstant_rotate_v2i64: 1810; SSE: # %bb.0: 1811; SSE-NEXT: movdqa %xmm0, %xmm1 1812; SSE-NEXT: psllq $14, %xmm1 1813; SSE-NEXT: psrlq $50, %xmm0 1814; SSE-NEXT: por %xmm1, %xmm0 1815; SSE-NEXT: retq 1816; 1817; AVX-LABEL: splatconstant_rotate_v2i64: 1818; AVX: # %bb.0: 1819; AVX-NEXT: vpsllq $14, %xmm0, %xmm1 1820; AVX-NEXT: vpsrlq $50, %xmm0, %xmm0 1821; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1822; AVX-NEXT: retq 1823; 1824; AVX512F-LABEL: splatconstant_rotate_v2i64: 1825; AVX512F: # %bb.0: 1826; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1827; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0 1828; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1829; AVX512F-NEXT: vzeroupper 1830; AVX512F-NEXT: retq 1831; 1832; AVX512VL-LABEL: splatconstant_rotate_v2i64: 1833; AVX512VL: # %bb.0: 1834; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0 1835; AVX512VL-NEXT: retq 1836; 1837; AVX512BW-LABEL: splatconstant_rotate_v2i64: 1838; AVX512BW: # %bb.0: 1839; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1840; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 1841; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1842; AVX512BW-NEXT: vzeroupper 1843; AVX512BW-NEXT: retq 1844; 1845; AVX512VLBW-LABEL: splatconstant_rotate_v2i64: 1846; AVX512VLBW: # %bb.0: 1847; AVX512VLBW-NEXT: vprolq $14, %xmm0, %xmm0 1848; AVX512VLBW-NEXT: retq 1849; 1850; XOP-LABEL: splatconstant_rotate_v2i64: 1851; XOP: # %bb.0: 1852; XOP-NEXT: vprotq $14, %xmm0, %xmm0 1853; XOP-NEXT: retq 1854; 1855; X32-SSE-LABEL: splatconstant_rotate_v2i64: 1856; X32-SSE: # %bb.0: 1857; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1858; X32-SSE-NEXT: psllq $14, %xmm1 1859; X32-SSE-NEXT: psrlq $50, %xmm0 1860; X32-SSE-NEXT: por %xmm1, %xmm0 1861; X32-SSE-NEXT: retl 1862 %shl = shl <2 x i64> %a, <i64 14, i64 14> 1863 %lshr = lshr <2 x i64> %a, <i64 50, i64 50> 1864 %or = or <2 x i64> %shl, %lshr 1865 ret <2 x i64> %or 1866} 1867 1868define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind { 1869; SSE-LABEL: splatconstant_rotate_v4i32: 1870; SSE: # %bb.0: 1871; SSE-NEXT: movdqa %xmm0, %xmm1 1872; SSE-NEXT: psrld $28, %xmm1 1873; SSE-NEXT: pslld $4, %xmm0 1874; SSE-NEXT: por %xmm1, %xmm0 1875; SSE-NEXT: retq 1876; 1877; AVX-LABEL: splatconstant_rotate_v4i32: 1878; AVX: # %bb.0: 1879; AVX-NEXT: vpsrld $28, %xmm0, %xmm1 1880; AVX-NEXT: vpslld $4, %xmm0, %xmm0 1881; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1882; AVX-NEXT: retq 1883; 1884; AVX512F-LABEL: splatconstant_rotate_v4i32: 1885; AVX512F: # %bb.0: 1886; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1887; AVX512F-NEXT: vprold $4, %zmm0, %zmm0 1888; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1889; AVX512F-NEXT: vzeroupper 1890; AVX512F-NEXT: retq 1891; 1892; AVX512VL-LABEL: splatconstant_rotate_v4i32: 1893; AVX512VL: # %bb.0: 1894; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 1895; AVX512VL-NEXT: retq 1896; 1897; AVX512BW-LABEL: splatconstant_rotate_v4i32: 1898; AVX512BW: # %bb.0: 1899; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1900; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 1901; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1902; AVX512BW-NEXT: vzeroupper 1903; AVX512BW-NEXT: retq 1904; 1905; AVX512VLBW-LABEL: splatconstant_rotate_v4i32: 1906; AVX512VLBW: # %bb.0: 1907; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0 1908; AVX512VLBW-NEXT: retq 1909; 1910; XOP-LABEL: splatconstant_rotate_v4i32: 1911; XOP: # %bb.0: 1912; XOP-NEXT: vprotd $4, %xmm0, %xmm0 1913; XOP-NEXT: retq 1914; 1915; X32-SSE-LABEL: splatconstant_rotate_v4i32: 1916; X32-SSE: # %bb.0: 1917; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1918; X32-SSE-NEXT: psrld $28, %xmm1 1919; X32-SSE-NEXT: pslld $4, %xmm0 1920; X32-SSE-NEXT: por %xmm1, %xmm0 1921; X32-SSE-NEXT: retl 1922 %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4> 1923 %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28> 1924 %or = or <4 x i32> %shl, %lshr 1925 ret <4 x i32> %or 1926} 1927 1928define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind { 1929; SSE-LABEL: splatconstant_rotate_v8i16: 1930; SSE: # %bb.0: 1931; SSE-NEXT: movdqa %xmm0, %xmm1 1932; SSE-NEXT: psrlw $9, %xmm1 1933; SSE-NEXT: psllw $7, %xmm0 1934; SSE-NEXT: por %xmm1, %xmm0 1935; SSE-NEXT: retq 1936; 1937; AVX-LABEL: splatconstant_rotate_v8i16: 1938; AVX: # %bb.0: 1939; AVX-NEXT: vpsrlw $9, %xmm0, %xmm1 1940; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 1941; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1942; AVX-NEXT: retq 1943; 1944; AVX512-LABEL: splatconstant_rotate_v8i16: 1945; AVX512: # %bb.0: 1946; AVX512-NEXT: vpsrlw $9, %xmm0, %xmm1 1947; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 1948; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 1949; AVX512-NEXT: retq 1950; 1951; XOP-LABEL: splatconstant_rotate_v8i16: 1952; XOP: # %bb.0: 1953; XOP-NEXT: vprotw $7, %xmm0, %xmm0 1954; XOP-NEXT: retq 1955; 1956; X32-SSE-LABEL: splatconstant_rotate_v8i16: 1957; X32-SSE: # %bb.0: 1958; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1959; X32-SSE-NEXT: psrlw $9, %xmm1 1960; X32-SSE-NEXT: psllw $7, %xmm0 1961; X32-SSE-NEXT: por %xmm1, %xmm0 1962; X32-SSE-NEXT: retl 1963 %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 1964 %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9> 1965 %or = or <8 x i16> %shl, %lshr 1966 ret <8 x i16> %or 1967} 1968 1969define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { 1970; SSE-LABEL: splatconstant_rotate_v16i8: 1971; SSE: # %bb.0: 1972; SSE-NEXT: movdqa %xmm0, %xmm1 1973; SSE-NEXT: psrlw $4, %xmm1 1974; SSE-NEXT: pand {{.*}}(%rip), %xmm1 1975; SSE-NEXT: psllw $4, %xmm0 1976; SSE-NEXT: pand {{.*}}(%rip), %xmm0 1977; SSE-NEXT: por %xmm1, %xmm0 1978; SSE-NEXT: retq 1979; 1980; AVX-LABEL: splatconstant_rotate_v16i8: 1981; AVX: # %bb.0: 1982; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 1983; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1984; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 1985; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1986; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1987; AVX-NEXT: retq 1988; 1989; AVX512-LABEL: splatconstant_rotate_v16i8: 1990; AVX512: # %bb.0: 1991; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1 1992; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1993; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 1994; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1995; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 1996; AVX512-NEXT: retq 1997; 1998; XOP-LABEL: splatconstant_rotate_v16i8: 1999; XOP: # %bb.0: 2000; XOP-NEXT: vprotb $4, %xmm0, %xmm0 2001; XOP-NEXT: retq 2002; 2003; X32-SSE-LABEL: splatconstant_rotate_v16i8: 2004; X32-SSE: # %bb.0: 2005; X32-SSE-NEXT: movdqa %xmm0, %xmm1 2006; X32-SSE-NEXT: psrlw $4, %xmm1 2007; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 2008; X32-SSE-NEXT: psllw $4, %xmm0 2009; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2010; X32-SSE-NEXT: por %xmm1, %xmm0 2011; X32-SSE-NEXT: retl 2012 %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2013 %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2014 %or = or <16 x i8> %shl, %lshr 2015 ret <16 x i8> %or 2016} 2017 2018; 2019; Masked Uniform Constant Rotates 2020; 2021 2022define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind { 2023; SSE-LABEL: splatconstant_rotate_mask_v2i64: 2024; SSE: # %bb.0: 2025; SSE-NEXT: psrlq $49, %xmm0 2026; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2027; SSE-NEXT: retq 2028; 2029; AVX-LABEL: splatconstant_rotate_mask_v2i64: 2030; AVX: # %bb.0: 2031; AVX-NEXT: vpsrlq $49, %xmm0, %xmm0 2032; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2033; AVX-NEXT: retq 2034; 2035; AVX512F-LABEL: splatconstant_rotate_mask_v2i64: 2036; AVX512F: # %bb.0: 2037; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2038; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0 2039; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2040; AVX512F-NEXT: vzeroupper 2041; AVX512F-NEXT: retq 2042; 2043; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64: 2044; AVX512VL: # %bb.0: 2045; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0 2046; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2047; AVX512VL-NEXT: retq 2048; 2049; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64: 2050; AVX512BW: # %bb.0: 2051; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2052; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 2053; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2054; AVX512BW-NEXT: vzeroupper 2055; AVX512BW-NEXT: retq 2056; 2057; AVX512VLBW-LABEL: splatconstant_rotate_mask_v2i64: 2058; AVX512VLBW: # %bb.0: 2059; AVX512VLBW-NEXT: vprolq $15, %xmm0, %xmm0 2060; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2061; AVX512VLBW-NEXT: retq 2062; 2063; XOP-LABEL: splatconstant_rotate_mask_v2i64: 2064; XOP: # %bb.0: 2065; XOP-NEXT: vprotq $15, %xmm0, %xmm0 2066; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2067; XOP-NEXT: retq 2068; 2069; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64: 2070; X32-SSE: # %bb.0: 2071; X32-SSE-NEXT: psrlq $49, %xmm0 2072; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2073; X32-SSE-NEXT: retl 2074 %shl = shl <2 x i64> %a, <i64 15, i64 15> 2075 %lshr = lshr <2 x i64> %a, <i64 49, i64 49> 2076 %rmask = and <2 x i64> %lshr, <i64 255, i64 127> 2077 %lmask = and <2 x i64> %shl, <i64 65, i64 33> 2078 %or = or <2 x i64> %lmask, %rmask 2079 ret <2 x i64> %or 2080} 2081 2082define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind { 2083; SSE-LABEL: splatconstant_rotate_mask_v4i32: 2084; SSE: # %bb.0: 2085; SSE-NEXT: movdqa %xmm0, %xmm1 2086; SSE-NEXT: psrld $28, %xmm1 2087; SSE-NEXT: pslld $4, %xmm0 2088; SSE-NEXT: por %xmm1, %xmm0 2089; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2090; SSE-NEXT: retq 2091; 2092; AVX-LABEL: splatconstant_rotate_mask_v4i32: 2093; AVX: # %bb.0: 2094; AVX-NEXT: vpsrld $28, %xmm0, %xmm1 2095; AVX-NEXT: vpslld $4, %xmm0, %xmm0 2096; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2097; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2098; AVX-NEXT: retq 2099; 2100; AVX512F-LABEL: splatconstant_rotate_mask_v4i32: 2101; AVX512F: # %bb.0: 2102; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2103; AVX512F-NEXT: vprold $4, %zmm0, %zmm0 2104; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2105; AVX512F-NEXT: vzeroupper 2106; AVX512F-NEXT: retq 2107; 2108; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32: 2109; AVX512VL: # %bb.0: 2110; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 2111; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2112; AVX512VL-NEXT: retq 2113; 2114; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32: 2115; AVX512BW: # %bb.0: 2116; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2117; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 2118; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2119; AVX512BW-NEXT: vzeroupper 2120; AVX512BW-NEXT: retq 2121; 2122; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i32: 2123; AVX512VLBW: # %bb.0: 2124; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0 2125; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2126; AVX512VLBW-NEXT: retq 2127; 2128; XOP-LABEL: splatconstant_rotate_mask_v4i32: 2129; XOP: # %bb.0: 2130; XOP-NEXT: vprotd $4, %xmm0, %xmm0 2131; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2132; XOP-NEXT: retq 2133; 2134; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32: 2135; X32-SSE: # %bb.0: 2136; X32-SSE-NEXT: movdqa %xmm0, %xmm1 2137; X32-SSE-NEXT: psrld $28, %xmm1 2138; X32-SSE-NEXT: pslld $4, %xmm0 2139; X32-SSE-NEXT: por %xmm1, %xmm0 2140; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2141; X32-SSE-NEXT: retl 2142 %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4> 2143 %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28> 2144 %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023> 2145 %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127> 2146 %or = or <4 x i32> %lmask, %rmask 2147 ret <4 x i32> %or 2148} 2149 2150define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind { 2151; SSE-LABEL: splatconstant_rotate_mask_v8i16: 2152; SSE: # %bb.0: 2153; SSE-NEXT: movdqa %xmm0, %xmm1 2154; SSE-NEXT: psrlw $11, %xmm1 2155; SSE-NEXT: psllw $5, %xmm0 2156; SSE-NEXT: por %xmm1, %xmm0 2157; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2158; SSE-NEXT: retq 2159; 2160; AVX-LABEL: splatconstant_rotate_mask_v8i16: 2161; AVX: # %bb.0: 2162; AVX-NEXT: vpsrlw $11, %xmm0, %xmm1 2163; AVX-NEXT: vpsllw $5, %xmm0, %xmm0 2164; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2165; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2166; AVX-NEXT: retq 2167; 2168; AVX512-LABEL: splatconstant_rotate_mask_v8i16: 2169; AVX512: # %bb.0: 2170; AVX512-NEXT: vpsrlw $11, %xmm0, %xmm1 2171; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0 2172; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 2173; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2174; AVX512-NEXT: retq 2175; 2176; XOP-LABEL: splatconstant_rotate_mask_v8i16: 2177; XOP: # %bb.0: 2178; XOP-NEXT: vprotw $5, %xmm0, %xmm0 2179; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2180; XOP-NEXT: retq 2181; 2182; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16: 2183; X32-SSE: # %bb.0: 2184; X32-SSE-NEXT: movdqa %xmm0, %xmm1 2185; X32-SSE-NEXT: psrlw $11, %xmm1 2186; X32-SSE-NEXT: psllw $5, %xmm0 2187; X32-SSE-NEXT: por %xmm1, %xmm0 2188; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2189; X32-SSE-NEXT: retl 2190 %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> 2191 %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11> 2192 %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55> 2193 %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33> 2194 %or = or <8 x i16> %lmask, %rmask 2195 ret <8 x i16> %or 2196} 2197 2198define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { 2199; SSE-LABEL: splatconstant_rotate_mask_v16i8: 2200; SSE: # %bb.0: 2201; SSE-NEXT: movdqa %xmm0, %xmm1 2202; SSE-NEXT: psrlw $4, %xmm1 2203; SSE-NEXT: pand {{.*}}(%rip), %xmm1 2204; SSE-NEXT: psllw $4, %xmm0 2205; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2206; SSE-NEXT: por %xmm1, %xmm0 2207; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2208; SSE-NEXT: retq 2209; 2210; AVX-LABEL: splatconstant_rotate_mask_v16i8: 2211; AVX: # %bb.0: 2212; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 2213; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 2214; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 2215; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2216; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2217; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2218; AVX-NEXT: retq 2219; 2220; AVX512-LABEL: splatconstant_rotate_mask_v16i8: 2221; AVX512: # %bb.0: 2222; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1 2223; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 2224; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 2225; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2226; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 2227; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2228; AVX512-NEXT: retq 2229; 2230; XOP-LABEL: splatconstant_rotate_mask_v16i8: 2231; XOP: # %bb.0: 2232; XOP-NEXT: vprotb $4, %xmm0, %xmm0 2233; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2234; XOP-NEXT: retq 2235; 2236; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8: 2237; X32-SSE: # %bb.0: 2238; X32-SSE-NEXT: movdqa %xmm0, %xmm1 2239; X32-SSE-NEXT: psrlw $4, %xmm1 2240; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 2241; X32-SSE-NEXT: psllw $4, %xmm0 2242; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2243; X32-SSE-NEXT: por %xmm1, %xmm0 2244; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2245; X32-SSE-NEXT: retl 2246 %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2247 %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2248 %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55> 2249 %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33> 2250 %or = or <16 x i8> %lmask, %rmask 2251 ret <16 x i8> %or 2252} 2253