; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
;
; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Rotates
;

define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_rotate_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
; SSE2-NEXT: psubq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllq %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllq %xmm1, %xmm3
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq %xmm3, %xmm1
; SSE2-NEXT: psrlq %xmm2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: orpd %xmm4, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
; SSE41-NEXT: psubq %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psllq %xmm1, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psllq %xmm1, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT: psrlq %xmm2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_rotate_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; XOP-LABEL: var_rotate_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [64,0,64,0]
; X32-SSE-NEXT: psubq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm3, %xmm4
; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psllq %xmm1, %xmm3
; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq %xmm3, %xmm1
; X32-SSE-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero
; X32-SSE-NEXT: psrlq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT: orpd %xmm4, %xmm1
; X32-SSE-NEXT: movapd %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %b64 = sub <2 x i64> <i64 64, i64 64>, %b
  %shl = shl <2 x i64> %a, %b
  %lshr = lshr <2 x i64> %a, %b64
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_rotate_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
; SSE2-NEXT: psubd %xmm1, %xmm2
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psrlq $32, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrld %xmm3, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: psrld %xmm5, %xmm6
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
; SSE41-NEXT: psubd %xmm1, %xmm2
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrld %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlq $32, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm3, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: psrld %xmm4, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_rotate_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; XOP-LABEL: var_rotate_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
; X32-SSE-NEXT: psubd %xmm1, %xmm2
; X32-SSE-NEXT: pslld $23, %xmm1
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm0, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm3, %xmm4
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: psrlq $32, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psrld %xmm3, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: movdqa %xmm2, %xmm5
; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm6
; X32-SSE-NEXT: psrld %xmm5, %xmm6
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X32-SSE-NEXT: psrld %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <4 x i32> %a, %b
  %lshr = lshr <4 x i32> %a, %b32
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_rotate_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; SSE2-NEXT: psubw %xmm1, %xmm3
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllw $8, %xmm4
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm5
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm5
; SSE2-NEXT: psllw $2, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: psllw $1, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $12, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: paddw %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: paddw %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: paddw %xmm3, %xmm3
; SSE2-NEXT: psraw $15, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; SSE41-NEXT: psubw %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: paddw %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm3, %xmm6
; SSE41-NEXT: psllw $8, %xmm6
; SSE41-NEXT: movdqa %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm6, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm1
; SSE41-NEXT: psllw $4, %xmm1
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm1
; SSE41-NEXT: psllw $2, %xmm1
; SSE41-NEXT: paddw %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm1
; SSE41-NEXT: psllw $1, %xmm1
; SSE41-NEXT: paddw %xmm4, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: paddw %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: paddw %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: paddw %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm3
; SSE41-NEXT: por %xmm5, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_rotate_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm4
; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm4
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $2, %xmm1, %xmm4
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm4
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: var_rotate_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; X32-SSE-NEXT: psubw %xmm1, %xmm3
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllw $8, %xmm4
; X32-SSE-NEXT: pand %xmm2, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm4, %xmm2
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: psraw $15, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm2, %xmm5
; X32-SSE-NEXT: psllw $4, %xmm2
; X32-SSE-NEXT: pand %xmm4, %xmm2
; X32-SSE-NEXT: por %xmm5, %xmm2
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: psraw $15, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm2, %xmm5
; X32-SSE-NEXT: psllw $2, %xmm2
; X32-SSE-NEXT: pand %xmm4, %xmm2
; X32-SSE-NEXT: por %xmm5, %xmm2
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: pandn %xmm2, %xmm4
; X32-SSE-NEXT: psllw $1, %xmm2
; X32-SSE-NEXT: pand %xmm1, %xmm2
; X32-SSE-NEXT: psllw $12, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm5
; X32-SSE-NEXT: psrlw $8, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm5, %xmm0
; X32-SSE-NEXT: paddw %xmm3, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm5
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm5, %xmm0
; X32-SSE-NEXT: paddw %xmm3, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm1
; X32-SSE-NEXT: psraw $15, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm5
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm5, %xmm0
; X32-SSE-NEXT: paddw %xmm3, %xmm3
; X32-SSE-NEXT: psraw $15, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <8 x i16> %a, %b
  %lshr = lshr <8 x i16> %a, %b16
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_rotate_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT: psubb %xmm1, %xmm4
; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: pand {{.*}}(%rip), %xmm5
; SSE2-NEXT: pand %xmm2, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtb %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm2, %xmm6
; SSE2-NEXT: psllw $2, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtb %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: psllw $5, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtb %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm0, %xmm6
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtb %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm0, %xmm6
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pcmpgtb %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE41-NEXT: psubb %xmm3, %xmm2
; SSE41-NEXT: psllw $5, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psllw $4, %xmm5
; SSE41-NEXT: pand {{.*}}(%rip), %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psllw $2, %xmm5
; SSE41-NEXT: pand {{.*}}(%rip), %xmm5
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: paddb %xmm5, %xmm5
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm5, %xmm4
; SSE41-NEXT: psllw $5, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlw $4, %xmm5
; SSE41-NEXT: pand {{.*}}(%rip), %xmm5
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
; SSE41-NEXT: por %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_rotate_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX-NEXT: vpsubb %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpsllw $2, %xmm3, %xmm4
; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm4, %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm4
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm4, %xmm3, %xmm1
; AVX-NEXT: vpsllw $5, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: var_rotate_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; X32-SSE-NEXT: psubb %xmm1, %xmm4
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psllw $4, %xmm5
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm5
; X32-SSE-NEXT: pand %xmm2, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm5, %xmm2
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm5, %xmm5
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm2, %xmm6
; X32-SSE-NEXT: psllw $2, %xmm2
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: pand %xmm5, %xmm2
; X32-SSE-NEXT: por %xmm6, %xmm2
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm5, %xmm5
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm1
; X32-SSE-NEXT: pandn %xmm2, %xmm1
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pand %xmm5, %xmm2
; X32-SSE-NEXT: psllw $5, %xmm4
; X32-SSE-NEXT: pxor %xmm5, %xmm5
; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
; X32-SSE-NEXT: pxor %xmm5, %xmm5
; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <16 x i8> %a, %b
  %lshr = lshr <16 x i8> %a, %b8
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Constant Rotates
;

define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_rotate_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psllq $14, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllq $4, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $50, %xmm1
; SSE2-NEXT: psrlq $60, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: orpd %xmm2, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_rotate_v2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psllq $14, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllq $4, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq $50, %xmm1
; SSE41-NEXT: psrlq $60, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_rotate_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT: vpsllq $4, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v2i64:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v2i64:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: constant_rotate_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psllq $14, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq $4, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq $50, %xmm1
; X32-SSE-NEXT: psrlq $60, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT: orpd %xmm2, %xmm1
; X32-SSE-NEXT: movapd %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <2 x i64> %a, <i64 4, i64 14>
  %lshr = lshr <2 x i64> %a, <i64 60, i64 50>
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_rotate_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $25, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld $27, %xmm3
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld $26, %xmm3
; SSE2-NEXT: psrld $28, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_rotate_v4i32:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld $25, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld $27, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld $26, %xmm2
; SSE41-NEXT: psrld $28, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_rotate_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpsrld $25, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $27, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrld $26, %xmm0, %xmm3
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v4i32:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v4i32:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: constant_rotate_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pmuludq %xmm1, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm1, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrld $25, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld $27, %xmm3
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld $26, %xmm3
; X32-SSE-NEXT: psrld $28, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25>
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_rotate_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-NEXT: pmullw %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,0,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_rotate_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4],xmm2[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_rotate_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: constant_rotate_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_rotate_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; X32-SSE-NEXT: pmullw %xmm0, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,0,0]
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0]
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm3, %xmm1
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pandn %xmm0, %xmm3
; X32-SSE-NEXT: por %xmm2, %xmm3
; X32-SSE-NEXT: por %xmm3, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9>
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_rotate_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpgtb %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm5
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: paddb %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpgtb %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; SSE2-NEXT: psllw $5, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtb %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm0, %xmm6
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtb %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm0, %xmm6
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pcmpgtb %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_rotate_v16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; SSE41-NEXT: psllw $5, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psllw $4, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psllw $2, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: paddb %xmm3, %xmm3
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; SSE41-NEXT: psllw $5, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psrlw $4, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: pblendvb %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psrlw $2, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psrlw $1, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_rotate_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm2
; AVX-NEXT: vpsllw $2, %xmm2, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; AVX-NEXT: vpsllw $5, %xmm2, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $2, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: constant_rotate_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_rotate_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; X32-SSE-NEXT: psllw $5, %xmm3
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pcmpgtb %xmm3, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm4
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: pand %xmm1, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm1
; X32-SSE-NEXT: por %xmm4, %xmm1
; X32-SSE-NEXT: paddb %xmm3, %xmm3
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: pcmpgtb %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm1, %xmm5
; X32-SSE-NEXT: psllw $2, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pand %xmm4, %xmm1
; X32-SSE-NEXT: por %xmm5, %xmm1
; X32-SSE-NEXT: paddb %xmm3, %xmm3
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: pcmpgtb %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm3
; X32-SSE-NEXT: pandn %xmm1, %xmm3
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pand %xmm4, %xmm1
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; X32-SSE-NEXT: psllw $5, %xmm4
; X32-SSE-NEXT: pxor %xmm5, %xmm5
; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
; X32-SSE-NEXT: pxor %xmm5, %xmm5
; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $2, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Uniform Constant Rotates
;

define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v2i64:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $14, %xmm1
; SSE-NEXT: psrlq $50, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpsllq $14, %xmm0, %xmm1
; AVX-NEXT: vpsrlq $50, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vprotq $14, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq $14, %xmm1
; X32-SSE-NEXT: psrlq $50, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <2 x i64> %a, <i64 14, i64 14>
  %lshr = lshr <2 x i64> %a, <i64 50, i64 50>
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v4i32:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $4, %xmm1
; SSE-NEXT: psrld $28, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpslld $4, %xmm0, %xmm1
; AVX-NEXT: vpsrld $28, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pslld $4, %xmm1
; X32-SSE-NEXT: psrld $28, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v8i16:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $7, %xmm1
; SSE-NEXT: psrlw $9, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $7, %xmm0, %xmm1
; AVX-NEXT: vpsrlw $9, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vprotw $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $7, %xmm1
; X32-SSE-NEXT: psrlw $9, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $4, %xmm1
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: psrlw $4, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v2i64:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $15, %xmm1
; SSE-NEXT: psrlq $49, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpsllq $15, %xmm0, %xmm1
; AVX-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vprotq $15, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq $15, %xmm1
; X32-SSE-NEXT: psrlq $49, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <2 x i64> %a, <i64 15, i64 15>
  %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
  %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
  %lmask = and <2 x i64> %shl, <i64 65, i64 33>
  %or = or <2 x i64> %lmask, %rmask
  ret <2 x i64> %or
}

define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v4i32:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $4, %xmm1
; SSE-NEXT: psrld $28, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpslld $4, %xmm0, %xmm1
; AVX-NEXT: vpsrld $28, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pslld $4, %xmm1
; X32-SSE-NEXT: psrld $28, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
  %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
  %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
  %or = or <4 x i32> %lmask, %rmask
  ret <4 x i32> %or
}

define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v8i16:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $5, %xmm1
; SSE-NEXT: psrlw $11, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $5, %xmm0, %xmm1
; AVX-NEXT: vpsrlw $11, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vprotw $5, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: psrlw $11, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <8 x i16> %lmask, %rmask
  ret <8 x i16> %or
}

define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $4, %xmm1
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: psrlw $4, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <16 x i8> %lmask, %rmask
  ret <16 x i8> %or
}