; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
;
; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Rotates
;

define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_rotate_v2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
; SSE2-NEXT:    psubq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psllq %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psllq %xmm1, %xmm3
; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq %xmm3, %xmm1
; SSE2-NEXT:    psrlq %xmm2, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    orpd %xmm4, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_rotate_v2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
; SSE41-NEXT:    psubq %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psllq %xmm1, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psllq %xmm1, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlq %xmm2, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT:    psrlq %xmm2, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    por %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_rotate_v2i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v2i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_rotate_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: var_rotate_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64,0,64,0]
; X32-SSE-NEXT:    psubq %xmm1, %xmm2
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
; X32-SSE-NEXT:    psllq %xmm3, %xmm4
; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
; X32-SSE-NEXT:    psllq %xmm1, %xmm3
; X32-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
; X32-SSE-NEXT:    movq {{.*#+}} xmm2 = xmm2[0],zero
; X32-SSE-NEXT:    psrlq %xmm2, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT:    orpd %xmm4, %xmm1
; X32-SSE-NEXT:    movapd %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %b64 = sub <2 x i64> <i64 64, i64 64>, %b
  %shl = shl <2 x i64> %a, %b
  %lshr = lshr <2 x i64> %a, %b64
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_rotate_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
; SSE2-NEXT:    psubd %xmm1, %xmm2
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    psrlq $32, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrld %xmm3, %xmm5
; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    psrld %xmm5, %xmm6
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT:    psrld %xmm2, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_rotate_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrld %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlq $32, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm3, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    psrld %xmm4, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_rotate_v4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_rotate_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: var_rotate_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
; X32-SSE-NEXT:    psubd %xmm1, %xmm2
; X32-SSE-NEXT:    pslld $23, %xmm1
; X32-SSE-NEXT:    paddd .LCPI1_1, %xmm1
; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-SSE-NEXT:    pmuludq %xmm0, %xmm1
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; X32-SSE-NEXT:    pmuludq %xmm3, %xmm4
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
; X32-SSE-NEXT:    psrld %xmm3, %xmm4
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    psrlq $32, %xmm3
; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
; X32-SSE-NEXT:    psrld %xmm3, %xmm5
; X32-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
; X32-SSE-NEXT:    pxor %xmm4, %xmm4
; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm6
; X32-SSE-NEXT:    psrld %xmm5, %xmm6
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X32-SSE-NEXT:    psrld %xmm2, %xmm0
;
X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] 245; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] 246; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 247; X32-SSE-NEXT: por %xmm1, %xmm0 248; X32-SSE-NEXT: retl 249 %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b 250 %shl = shl <4 x i32> %a, %b 251 %lshr = lshr <4 x i32> %a, %b32 252 %or = or <4 x i32> %shl, %lshr 253 ret <4 x i32> %or 254} 255 256define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 257; SSE2-LABEL: var_rotate_v8i16: 258; SSE2: # BB#0: 259; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 260; SSE2-NEXT: psubw %xmm1, %xmm3 261; SSE2-NEXT: psllw $12, %xmm1 262; SSE2-NEXT: movdqa %xmm1, %xmm2 263; SSE2-NEXT: psraw $15, %xmm2 264; SSE2-NEXT: movdqa %xmm0, %xmm4 265; SSE2-NEXT: psllw $8, %xmm4 266; SSE2-NEXT: pand %xmm2, %xmm4 267; SSE2-NEXT: pandn %xmm0, %xmm2 268; SSE2-NEXT: por %xmm4, %xmm2 269; SSE2-NEXT: paddw %xmm1, %xmm1 270; SSE2-NEXT: movdqa %xmm1, %xmm4 271; SSE2-NEXT: psraw $15, %xmm4 272; SSE2-NEXT: movdqa %xmm4, %xmm5 273; SSE2-NEXT: pandn %xmm2, %xmm5 274; SSE2-NEXT: psllw $4, %xmm2 275; SSE2-NEXT: pand %xmm4, %xmm2 276; SSE2-NEXT: por %xmm5, %xmm2 277; SSE2-NEXT: paddw %xmm1, %xmm1 278; SSE2-NEXT: movdqa %xmm1, %xmm4 279; SSE2-NEXT: psraw $15, %xmm4 280; SSE2-NEXT: movdqa %xmm4, %xmm5 281; SSE2-NEXT: pandn %xmm2, %xmm5 282; SSE2-NEXT: psllw $2, %xmm2 283; SSE2-NEXT: pand %xmm4, %xmm2 284; SSE2-NEXT: por %xmm5, %xmm2 285; SSE2-NEXT: paddw %xmm1, %xmm1 286; SSE2-NEXT: psraw $15, %xmm1 287; SSE2-NEXT: movdqa %xmm1, %xmm4 288; SSE2-NEXT: pandn %xmm2, %xmm4 289; SSE2-NEXT: psllw $1, %xmm2 290; SSE2-NEXT: pand %xmm1, %xmm2 291; SSE2-NEXT: psllw $12, %xmm3 292; SSE2-NEXT: movdqa %xmm3, %xmm1 293; SSE2-NEXT: psraw $15, %xmm1 294; SSE2-NEXT: movdqa %xmm1, %xmm5 295; SSE2-NEXT: pandn %xmm0, %xmm5 296; SSE2-NEXT: psrlw $8, %xmm0 297; SSE2-NEXT: pand %xmm1, %xmm0 298; SSE2-NEXT: por %xmm5, %xmm0 299; SSE2-NEXT: paddw %xmm3, %xmm3 300; SSE2-NEXT: movdqa %xmm3, %xmm1 301; SSE2-NEXT: psraw $15, %xmm1 302; SSE2-NEXT: movdqa %xmm1, %xmm5 303; SSE2-NEXT: pandn %xmm0, %xmm5 304; SSE2-NEXT: psrlw $4, %xmm0 305; SSE2-NEXT: pand %xmm1, %xmm0 306; SSE2-NEXT: por %xmm5, %xmm0 307; SSE2-NEXT: paddw %xmm3, %xmm3 308; SSE2-NEXT: movdqa %xmm3, %xmm1 309; SSE2-NEXT: psraw $15, %xmm1 310; SSE2-NEXT: movdqa %xmm1, %xmm5 311; SSE2-NEXT: pandn %xmm0, %xmm5 312; SSE2-NEXT: psrlw $2, %xmm0 313; SSE2-NEXT: pand %xmm1, %xmm0 314; SSE2-NEXT: por %xmm5, %xmm0 315; SSE2-NEXT: paddw %xmm3, %xmm3 316; SSE2-NEXT: psraw $15, %xmm3 317; SSE2-NEXT: movdqa %xmm3, %xmm1 318; SSE2-NEXT: pandn %xmm0, %xmm1 319; SSE2-NEXT: psrlw $1, %xmm0 320; SSE2-NEXT: pand %xmm3, %xmm0 321; SSE2-NEXT: por %xmm1, %xmm0 322; SSE2-NEXT: por %xmm4, %xmm0 323; SSE2-NEXT: por %xmm2, %xmm0 324; SSE2-NEXT: retq 325; 326; SSE41-LABEL: var_rotate_v8i16: 327; SSE41: # BB#0: 328; SSE41-NEXT: movdqa %xmm0, %xmm3 329; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 330; SSE41-NEXT: psubw %xmm1, %xmm2 331; SSE41-NEXT: movdqa %xmm1, %xmm0 332; SSE41-NEXT: psllw $12, %xmm0 333; SSE41-NEXT: psllw $4, %xmm1 334; SSE41-NEXT: por %xmm0, %xmm1 335; SSE41-NEXT: movdqa %xmm1, %xmm4 336; SSE41-NEXT: paddw %xmm4, %xmm4 337; SSE41-NEXT: movdqa %xmm3, %xmm6 338; SSE41-NEXT: psllw $8, %xmm6 339; SSE41-NEXT: movdqa %xmm3, %xmm5 340; SSE41-NEXT: movdqa %xmm1, %xmm0 341; SSE41-NEXT: pblendvb %xmm6, %xmm5 342; SSE41-NEXT: movdqa %xmm5, %xmm1 343; SSE41-NEXT: psllw $4, %xmm1 344; SSE41-NEXT: movdqa %xmm4, %xmm0 345; 
SSE41-NEXT: pblendvb %xmm1, %xmm5 346; SSE41-NEXT: movdqa %xmm5, %xmm1 347; SSE41-NEXT: psllw $2, %xmm1 348; SSE41-NEXT: paddw %xmm4, %xmm4 349; SSE41-NEXT: movdqa %xmm4, %xmm0 350; SSE41-NEXT: pblendvb %xmm1, %xmm5 351; SSE41-NEXT: movdqa %xmm5, %xmm1 352; SSE41-NEXT: psllw $1, %xmm1 353; SSE41-NEXT: paddw %xmm4, %xmm4 354; SSE41-NEXT: movdqa %xmm4, %xmm0 355; SSE41-NEXT: pblendvb %xmm1, %xmm5 356; SSE41-NEXT: movdqa %xmm2, %xmm0 357; SSE41-NEXT: psllw $12, %xmm0 358; SSE41-NEXT: psllw $4, %xmm2 359; SSE41-NEXT: por %xmm0, %xmm2 360; SSE41-NEXT: movdqa %xmm2, %xmm1 361; SSE41-NEXT: paddw %xmm1, %xmm1 362; SSE41-NEXT: movdqa %xmm3, %xmm4 363; SSE41-NEXT: psrlw $8, %xmm4 364; SSE41-NEXT: movdqa %xmm2, %xmm0 365; SSE41-NEXT: pblendvb %xmm4, %xmm3 366; SSE41-NEXT: movdqa %xmm3, %xmm2 367; SSE41-NEXT: psrlw $4, %xmm2 368; SSE41-NEXT: movdqa %xmm1, %xmm0 369; SSE41-NEXT: pblendvb %xmm2, %xmm3 370; SSE41-NEXT: movdqa %xmm3, %xmm2 371; SSE41-NEXT: psrlw $2, %xmm2 372; SSE41-NEXT: paddw %xmm1, %xmm1 373; SSE41-NEXT: movdqa %xmm1, %xmm0 374; SSE41-NEXT: pblendvb %xmm2, %xmm3 375; SSE41-NEXT: movdqa %xmm3, %xmm2 376; SSE41-NEXT: psrlw $1, %xmm2 377; SSE41-NEXT: paddw %xmm1, %xmm1 378; SSE41-NEXT: movdqa %xmm1, %xmm0 379; SSE41-NEXT: pblendvb %xmm2, %xmm3 380; SSE41-NEXT: por %xmm5, %xmm3 381; SSE41-NEXT: movdqa %xmm3, %xmm0 382; SSE41-NEXT: retq 383; 384; AVX1-LABEL: var_rotate_v8i16: 385; AVX1: # BB#0: 386; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 387; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm2 388; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 389; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 390; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 391; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 392; AVX1-NEXT: vpsllw $8, %xmm0, %xmm4 393; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm1 394; AVX1-NEXT: vpsllw $4, %xmm1, %xmm4 395; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1 396; AVX1-NEXT: vpsllw $2, %xmm1, %xmm4 397; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 398; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1 399; AVX1-NEXT: vpsllw $1, %xmm1, %xmm4 400; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 401; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1 402; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 403; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 404; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 405; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 406; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4 407; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 408; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 409; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 410; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 411; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 412; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 413; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 414; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 415; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 416; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 417; AVX1-NEXT: retq 418; 419; AVX2-LABEL: var_rotate_v8i16: 420; AVX2: # BB#0: 421; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 422; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2 423; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 424; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 425; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 426; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] 427; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 428; AVX2-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[0,2,2,3] 429; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 430; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 431; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 432; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 433; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 434; AVX2-NEXT: vzeroupper 435; AVX2-NEXT: retq 436; 437; XOP-LABEL: var_rotate_v8i16: 438; XOP: # BB#0: 439; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0 440; XOP-NEXT: retq 441; 442; X32-SSE-LABEL: var_rotate_v8i16: 443; X32-SSE: # BB#0: 444; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 445; X32-SSE-NEXT: psubw %xmm1, %xmm3 446; X32-SSE-NEXT: psllw $12, %xmm1 447; X32-SSE-NEXT: movdqa %xmm1, %xmm2 448; X32-SSE-NEXT: psraw $15, %xmm2 449; X32-SSE-NEXT: movdqa %xmm0, %xmm4 450; X32-SSE-NEXT: psllw $8, %xmm4 451; X32-SSE-NEXT: pand %xmm2, %xmm4 452; X32-SSE-NEXT: pandn %xmm0, %xmm2 453; X32-SSE-NEXT: por %xmm4, %xmm2 454; X32-SSE-NEXT: paddw %xmm1, %xmm1 455; X32-SSE-NEXT: movdqa %xmm1, %xmm4 456; X32-SSE-NEXT: psraw $15, %xmm4 457; X32-SSE-NEXT: movdqa %xmm4, %xmm5 458; X32-SSE-NEXT: pandn %xmm2, %xmm5 459; X32-SSE-NEXT: psllw $4, %xmm2 460; X32-SSE-NEXT: pand %xmm4, %xmm2 461; X32-SSE-NEXT: por %xmm5, %xmm2 462; X32-SSE-NEXT: paddw %xmm1, %xmm1 463; X32-SSE-NEXT: movdqa %xmm1, %xmm4 464; X32-SSE-NEXT: psraw $15, %xmm4 465; X32-SSE-NEXT: movdqa %xmm4, %xmm5 466; X32-SSE-NEXT: pandn %xmm2, %xmm5 467; X32-SSE-NEXT: psllw $2, %xmm2 468; X32-SSE-NEXT: pand %xmm4, %xmm2 469; X32-SSE-NEXT: por %xmm5, %xmm2 470; X32-SSE-NEXT: paddw %xmm1, %xmm1 471; X32-SSE-NEXT: psraw $15, %xmm1 472; X32-SSE-NEXT: movdqa %xmm1, %xmm4 473; X32-SSE-NEXT: pandn %xmm2, %xmm4 474; X32-SSE-NEXT: psllw $1, %xmm2 475; X32-SSE-NEXT: pand %xmm1, %xmm2 476; X32-SSE-NEXT: psllw $12, %xmm3 477; X32-SSE-NEXT: movdqa %xmm3, %xmm1 478; X32-SSE-NEXT: psraw $15, %xmm1 479; X32-SSE-NEXT: movdqa %xmm1, %xmm5 480; X32-SSE-NEXT: pandn %xmm0, %xmm5 481; X32-SSE-NEXT: psrlw $8, %xmm0 482; X32-SSE-NEXT: pand %xmm1, %xmm0 483; X32-SSE-NEXT: por %xmm5, %xmm0 484; X32-SSE-NEXT: paddw %xmm3, %xmm3 485; X32-SSE-NEXT: movdqa %xmm3, %xmm1 486; X32-SSE-NEXT: psraw $15, %xmm1 487; X32-SSE-NEXT: movdqa %xmm1, %xmm5 488; X32-SSE-NEXT: pandn %xmm0, %xmm5 489; X32-SSE-NEXT: psrlw $4, %xmm0 490; X32-SSE-NEXT: pand %xmm1, %xmm0 491; X32-SSE-NEXT: por %xmm5, %xmm0 492; X32-SSE-NEXT: paddw %xmm3, %xmm3 493; X32-SSE-NEXT: movdqa %xmm3, %xmm1 494; X32-SSE-NEXT: psraw $15, %xmm1 495; X32-SSE-NEXT: movdqa %xmm1, %xmm5 496; X32-SSE-NEXT: pandn %xmm0, %xmm5 497; X32-SSE-NEXT: psrlw $2, %xmm0 498; X32-SSE-NEXT: pand %xmm1, %xmm0 499; X32-SSE-NEXT: por %xmm5, %xmm0 500; X32-SSE-NEXT: paddw %xmm3, %xmm3 501; X32-SSE-NEXT: psraw $15, %xmm3 502; X32-SSE-NEXT: movdqa %xmm3, %xmm1 503; X32-SSE-NEXT: pandn %xmm0, %xmm1 504; X32-SSE-NEXT: psrlw $1, %xmm0 505; X32-SSE-NEXT: pand %xmm3, %xmm0 506; X32-SSE-NEXT: por %xmm1, %xmm0 507; X32-SSE-NEXT: por %xmm4, %xmm0 508; X32-SSE-NEXT: por %xmm2, %xmm0 509; X32-SSE-NEXT: retl 510 %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b 511 %shl = shl <8 x i16> %a, %b 512 %lshr = lshr <8 x i16> %a, %b16 513 %or = or <8 x i16> %shl, %lshr 514 ret <8 x i16> %or 515} 516 517define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 518; SSE2-LABEL: var_rotate_v16i8: 519; SSE2: # BB#0: 520; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 521; SSE2-NEXT: psubb %xmm1, %xmm4 522; SSE2-NEXT: psllw $5, 
%xmm1 523; SSE2-NEXT: pxor %xmm3, %xmm3 524; SSE2-NEXT: pxor %xmm2, %xmm2 525; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 526; SSE2-NEXT: movdqa %xmm0, %xmm5 527; SSE2-NEXT: psllw $4, %xmm5 528; SSE2-NEXT: pand {{.*}}(%rip), %xmm5 529; SSE2-NEXT: pand %xmm2, %xmm5 530; SSE2-NEXT: pandn %xmm0, %xmm2 531; SSE2-NEXT: por %xmm5, %xmm2 532; SSE2-NEXT: paddb %xmm1, %xmm1 533; SSE2-NEXT: pxor %xmm5, %xmm5 534; SSE2-NEXT: pcmpgtb %xmm1, %xmm5 535; SSE2-NEXT: movdqa %xmm5, %xmm6 536; SSE2-NEXT: pandn %xmm2, %xmm6 537; SSE2-NEXT: psllw $2, %xmm2 538; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 539; SSE2-NEXT: pand %xmm5, %xmm2 540; SSE2-NEXT: por %xmm6, %xmm2 541; SSE2-NEXT: paddb %xmm1, %xmm1 542; SSE2-NEXT: pxor %xmm5, %xmm5 543; SSE2-NEXT: pcmpgtb %xmm1, %xmm5 544; SSE2-NEXT: movdqa %xmm5, %xmm1 545; SSE2-NEXT: pandn %xmm2, %xmm1 546; SSE2-NEXT: paddb %xmm2, %xmm2 547; SSE2-NEXT: pand %xmm5, %xmm2 548; SSE2-NEXT: psllw $5, %xmm4 549; SSE2-NEXT: pxor %xmm5, %xmm5 550; SSE2-NEXT: pcmpgtb %xmm4, %xmm5 551; SSE2-NEXT: movdqa %xmm5, %xmm6 552; SSE2-NEXT: pandn %xmm0, %xmm6 553; SSE2-NEXT: psrlw $4, %xmm0 554; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 555; SSE2-NEXT: pand %xmm5, %xmm0 556; SSE2-NEXT: por %xmm6, %xmm0 557; SSE2-NEXT: paddb %xmm4, %xmm4 558; SSE2-NEXT: pxor %xmm5, %xmm5 559; SSE2-NEXT: pcmpgtb %xmm4, %xmm5 560; SSE2-NEXT: movdqa %xmm5, %xmm6 561; SSE2-NEXT: pandn %xmm0, %xmm6 562; SSE2-NEXT: psrlw $2, %xmm0 563; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 564; SSE2-NEXT: pand %xmm5, %xmm0 565; SSE2-NEXT: por %xmm6, %xmm0 566; SSE2-NEXT: paddb %xmm4, %xmm4 567; SSE2-NEXT: pcmpgtb %xmm4, %xmm3 568; SSE2-NEXT: movdqa %xmm3, %xmm4 569; SSE2-NEXT: pandn %xmm0, %xmm4 570; SSE2-NEXT: psrlw $1, %xmm0 571; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 572; SSE2-NEXT: pand %xmm3, %xmm0 573; SSE2-NEXT: por %xmm4, %xmm0 574; SSE2-NEXT: por %xmm1, %xmm0 575; SSE2-NEXT: por %xmm2, %xmm0 576; SSE2-NEXT: retq 577; 578; SSE41-LABEL: var_rotate_v16i8: 579; SSE41: # BB#0: 580; SSE41-NEXT: movdqa %xmm1, %xmm3 581; SSE41-NEXT: movdqa %xmm0, %xmm1 582; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 583; SSE41-NEXT: psubb %xmm3, %xmm2 584; SSE41-NEXT: psllw $5, %xmm3 585; SSE41-NEXT: movdqa %xmm1, %xmm5 586; SSE41-NEXT: psllw $4, %xmm5 587; SSE41-NEXT: pand {{.*}}(%rip), %xmm5 588; SSE41-NEXT: movdqa %xmm1, %xmm4 589; SSE41-NEXT: movdqa %xmm3, %xmm0 590; SSE41-NEXT: pblendvb %xmm5, %xmm4 591; SSE41-NEXT: movdqa %xmm4, %xmm5 592; SSE41-NEXT: psllw $2, %xmm5 593; SSE41-NEXT: pand {{.*}}(%rip), %xmm5 594; SSE41-NEXT: paddb %xmm3, %xmm3 595; SSE41-NEXT: movdqa %xmm3, %xmm0 596; SSE41-NEXT: pblendvb %xmm5, %xmm4 597; SSE41-NEXT: movdqa %xmm4, %xmm5 598; SSE41-NEXT: paddb %xmm5, %xmm5 599; SSE41-NEXT: paddb %xmm3, %xmm3 600; SSE41-NEXT: movdqa %xmm3, %xmm0 601; SSE41-NEXT: pblendvb %xmm5, %xmm4 602; SSE41-NEXT: psllw $5, %xmm2 603; SSE41-NEXT: movdqa %xmm2, %xmm3 604; SSE41-NEXT: paddb %xmm3, %xmm3 605; SSE41-NEXT: movdqa %xmm1, %xmm5 606; SSE41-NEXT: psrlw $4, %xmm5 607; SSE41-NEXT: pand {{.*}}(%rip), %xmm5 608; SSE41-NEXT: movdqa %xmm2, %xmm0 609; SSE41-NEXT: pblendvb %xmm5, %xmm1 610; SSE41-NEXT: movdqa %xmm1, %xmm2 611; SSE41-NEXT: psrlw $2, %xmm2 612; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 613; SSE41-NEXT: movdqa %xmm3, %xmm0 614; SSE41-NEXT: pblendvb %xmm2, %xmm1 615; SSE41-NEXT: movdqa %xmm1, %xmm2 616; SSE41-NEXT: psrlw $1, %xmm2 617; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 618; SSE41-NEXT: paddb %xmm3, %xmm3 619; SSE41-NEXT: movdqa %xmm3, %xmm0 620; SSE41-NEXT: pblendvb %xmm2, %xmm1 621; SSE41-NEXT: por %xmm4, %xmm1 622; SSE41-NEXT: 
movdqa %xmm1, %xmm0 623; SSE41-NEXT: retq 624; 625; AVX-LABEL: var_rotate_v16i8: 626; AVX: # BB#0: 627; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 628; AVX-NEXT: vpsubb %xmm1, %xmm2, %xmm2 629; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 630; AVX-NEXT: vpsllw $4, %xmm0, %xmm3 631; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 632; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm3 633; AVX-NEXT: vpsllw $2, %xmm3, %xmm4 634; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 635; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 636; AVX-NEXT: vpblendvb %xmm1, %xmm4, %xmm3, %xmm3 637; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm4 638; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 639; AVX-NEXT: vpblendvb %xmm1, %xmm4, %xmm3, %xmm1 640; AVX-NEXT: vpsllw $5, %xmm2, %xmm2 641; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3 642; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 643; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 644; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 645; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 646; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 647; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 648; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 649; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 650; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3 651; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 652; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 653; AVX-NEXT: retq 654; 655; XOP-LABEL: var_rotate_v16i8: 656; XOP: # BB#0: 657; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0 658; XOP-NEXT: retq 659; 660; X32-SSE-LABEL: var_rotate_v16i8: 661; X32-SSE: # BB#0: 662; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 663; X32-SSE-NEXT: psubb %xmm1, %xmm4 664; X32-SSE-NEXT: psllw $5, %xmm1 665; X32-SSE-NEXT: pxor %xmm3, %xmm3 666; X32-SSE-NEXT: pxor %xmm2, %xmm2 667; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 668; X32-SSE-NEXT: movdqa %xmm0, %xmm5 669; X32-SSE-NEXT: psllw $4, %xmm5 670; X32-SSE-NEXT: pand .LCPI3_1, %xmm5 671; X32-SSE-NEXT: pand %xmm2, %xmm5 672; X32-SSE-NEXT: pandn %xmm0, %xmm2 673; X32-SSE-NEXT: por %xmm5, %xmm2 674; X32-SSE-NEXT: paddb %xmm1, %xmm1 675; X32-SSE-NEXT: pxor %xmm5, %xmm5 676; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm5 677; X32-SSE-NEXT: movdqa %xmm5, %xmm6 678; X32-SSE-NEXT: pandn %xmm2, %xmm6 679; X32-SSE-NEXT: psllw $2, %xmm2 680; X32-SSE-NEXT: pand .LCPI3_2, %xmm2 681; X32-SSE-NEXT: pand %xmm5, %xmm2 682; X32-SSE-NEXT: por %xmm6, %xmm2 683; X32-SSE-NEXT: paddb %xmm1, %xmm1 684; X32-SSE-NEXT: pxor %xmm5, %xmm5 685; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm5 686; X32-SSE-NEXT: movdqa %xmm5, %xmm1 687; X32-SSE-NEXT: pandn %xmm2, %xmm1 688; X32-SSE-NEXT: paddb %xmm2, %xmm2 689; X32-SSE-NEXT: pand %xmm5, %xmm2 690; X32-SSE-NEXT: psllw $5, %xmm4 691; X32-SSE-NEXT: pxor %xmm5, %xmm5 692; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5 693; X32-SSE-NEXT: movdqa %xmm5, %xmm6 694; X32-SSE-NEXT: pandn %xmm0, %xmm6 695; X32-SSE-NEXT: psrlw $4, %xmm0 696; X32-SSE-NEXT: pand .LCPI3_3, %xmm0 697; X32-SSE-NEXT: pand %xmm5, %xmm0 698; X32-SSE-NEXT: por %xmm6, %xmm0 699; X32-SSE-NEXT: paddb %xmm4, %xmm4 700; X32-SSE-NEXT: pxor %xmm5, %xmm5 701; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5 702; X32-SSE-NEXT: movdqa %xmm5, %xmm6 703; X32-SSE-NEXT: pandn %xmm0, %xmm6 704; X32-SSE-NEXT: psrlw $2, %xmm0 705; X32-SSE-NEXT: pand .LCPI3_4, %xmm0 706; X32-SSE-NEXT: pand %xmm5, %xmm0 707; X32-SSE-NEXT: por %xmm6, %xmm0 708; X32-SSE-NEXT: paddb %xmm4, %xmm4 709; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm3 710; X32-SSE-NEXT: movdqa %xmm3, %xmm4 711; X32-SSE-NEXT: pandn %xmm0, %xmm4 712; X32-SSE-NEXT: psrlw $1, %xmm0 713; X32-SSE-NEXT: pand .LCPI3_5, %xmm0 714; X32-SSE-NEXT: pand %xmm3, %xmm0 715; X32-SSE-NEXT: por 
%xmm4, %xmm0 716; X32-SSE-NEXT: por %xmm1, %xmm0 717; X32-SSE-NEXT: por %xmm2, %xmm0 718; X32-SSE-NEXT: retl 719 %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 720 %shl = shl <16 x i8> %a, %b 721 %lshr = lshr <16 x i8> %a, %b8 722 %or = or <16 x i8> %shl, %lshr 723 ret <16 x i8> %or 724} 725 726; 727; Constant Rotates 728; 729 730define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { 731; SSE2-LABEL: constant_rotate_v2i64: 732; SSE2: # BB#0: 733; SSE2-NEXT: movdqa %xmm0, %xmm2 734; SSE2-NEXT: psllq $14, %xmm2 735; SSE2-NEXT: movdqa %xmm0, %xmm1 736; SSE2-NEXT: psllq $4, %xmm1 737; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 738; SSE2-NEXT: movdqa %xmm0, %xmm1 739; SSE2-NEXT: psrlq $50, %xmm1 740; SSE2-NEXT: psrlq $60, %xmm0 741; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 742; SSE2-NEXT: orpd %xmm2, %xmm1 743; SSE2-NEXT: movapd %xmm1, %xmm0 744; SSE2-NEXT: retq 745; 746; SSE41-LABEL: constant_rotate_v2i64: 747; SSE41: # BB#0: 748; SSE41-NEXT: movdqa %xmm0, %xmm1 749; SSE41-NEXT: psllq $14, %xmm1 750; SSE41-NEXT: movdqa %xmm0, %xmm2 751; SSE41-NEXT: psllq $4, %xmm2 752; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 753; SSE41-NEXT: movdqa %xmm0, %xmm1 754; SSE41-NEXT: psrlq $50, %xmm1 755; SSE41-NEXT: psrlq $60, %xmm0 756; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 757; SSE41-NEXT: por %xmm2, %xmm0 758; SSE41-NEXT: retq 759; 760; AVX1-LABEL: constant_rotate_v2i64: 761; AVX1: # BB#0: 762; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1 763; AVX1-NEXT: vpsllq $4, %xmm0, %xmm2 764; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 765; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm2 766; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 767; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 768; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 769; AVX1-NEXT: retq 770; 771; AVX2-LABEL: constant_rotate_v2i64: 772; AVX2: # BB#0: 773; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 774; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 775; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 776; AVX2-NEXT: retq 777; 778; XOPAVX1-LABEL: constant_rotate_v2i64: 779; XOPAVX1: # BB#0: 780; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1 781; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 782; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2 783; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0 784; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 785; XOPAVX1-NEXT: retq 786; 787; XOPAVX2-LABEL: constant_rotate_v2i64: 788; XOPAVX2: # BB#0: 789; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 790; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 791; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 792; XOPAVX2-NEXT: retq 793; 794; X32-SSE-LABEL: constant_rotate_v2i64: 795; X32-SSE: # BB#0: 796; X32-SSE-NEXT: movdqa %xmm0, %xmm2 797; X32-SSE-NEXT: psllq $14, %xmm2 798; X32-SSE-NEXT: movdqa %xmm0, %xmm1 799; X32-SSE-NEXT: psllq $4, %xmm1 800; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 801; X32-SSE-NEXT: movdqa %xmm0, %xmm1 802; X32-SSE-NEXT: psrlq $50, %xmm1 803; X32-SSE-NEXT: psrlq $60, %xmm0 804; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 805; X32-SSE-NEXT: orpd %xmm2, %xmm1 806; X32-SSE-NEXT: movapd %xmm1, %xmm0 807; X32-SSE-NEXT: retl 808 %shl = shl <2 x i64> %a, <i64 4, i64 14> 809 %lshr = lshr <2 x i64> %a, <i64 60, i64 50> 810 %or = or <2 x i64> %shl, %lshr 811 ret <2 x i64> %or 812} 813 814define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { 815; SSE2-LABEL: constant_rotate_v4i32: 816; SSE2: # BB#0: 817; 
SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 818; SSE2-NEXT: movdqa %xmm0, %xmm2 819; SSE2-NEXT: pmuludq %xmm1, %xmm2 820; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 821; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 822; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 823; SSE2-NEXT: pmuludq %xmm1, %xmm3 824; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] 825; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 826; SSE2-NEXT: movdqa %xmm0, %xmm1 827; SSE2-NEXT: psrld $25, %xmm1 828; SSE2-NEXT: movdqa %xmm0, %xmm3 829; SSE2-NEXT: psrld $27, %xmm3 830; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] 831; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 832; SSE2-NEXT: movdqa %xmm0, %xmm3 833; SSE2-NEXT: psrld $26, %xmm3 834; SSE2-NEXT: psrld $28, %xmm0 835; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 836; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 837; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 838; SSE2-NEXT: por %xmm2, %xmm0 839; SSE2-NEXT: retq 840; 841; SSE41-LABEL: constant_rotate_v4i32: 842; SSE41: # BB#0: 843; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 844; SSE41-NEXT: pmulld %xmm0, %xmm1 845; SSE41-NEXT: movdqa %xmm0, %xmm2 846; SSE41-NEXT: psrld $25, %xmm2 847; SSE41-NEXT: movdqa %xmm0, %xmm3 848; SSE41-NEXT: psrld $27, %xmm3 849; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 850; SSE41-NEXT: movdqa %xmm0, %xmm2 851; SSE41-NEXT: psrld $26, %xmm2 852; SSE41-NEXT: psrld $28, %xmm0 853; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 854; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 855; SSE41-NEXT: por %xmm1, %xmm0 856; SSE41-NEXT: retq 857; 858; AVX1-LABEL: constant_rotate_v4i32: 859; AVX1: # BB#0: 860; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 861; AVX1-NEXT: vpsrld $25, %xmm0, %xmm2 862; AVX1-NEXT: vpsrld $27, %xmm0, %xmm3 863; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 864; AVX1-NEXT: vpsrld $26, %xmm0, %xmm3 865; AVX1-NEXT: vpsrld $28, %xmm0, %xmm0 866; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 867; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 868; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 869; AVX1-NEXT: retq 870; 871; AVX2-LABEL: constant_rotate_v4i32: 872; AVX2: # BB#0: 873; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 874; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 875; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 876; AVX2-NEXT: retq 877; 878; XOPAVX1-LABEL: constant_rotate_v4i32: 879; XOPAVX1: # BB#0: 880; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 881; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 882; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 883; XOPAVX1-NEXT: retq 884; 885; XOPAVX2-LABEL: constant_rotate_v4i32: 886; XOPAVX2: # BB#0: 887; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 888; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 889; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 890; XOPAVX2-NEXT: retq 891; 892; X32-SSE-LABEL: constant_rotate_v4i32: 893; X32-SSE: # BB#0: 894; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 895; X32-SSE-NEXT: movdqa %xmm0, %xmm2 896; X32-SSE-NEXT: pmuludq %xmm1, %xmm2 897; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 898; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 899; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 900; X32-SSE-NEXT: pmuludq %xmm1, %xmm3 901; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] 902; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 903; X32-SSE-NEXT: 
movdqa %xmm0, %xmm1 904; X32-SSE-NEXT: psrld $25, %xmm1 905; X32-SSE-NEXT: movdqa %xmm0, %xmm3 906; X32-SSE-NEXT: psrld $27, %xmm3 907; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] 908; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 909; X32-SSE-NEXT: movdqa %xmm0, %xmm3 910; X32-SSE-NEXT: psrld $26, %xmm3 911; X32-SSE-NEXT: psrld $28, %xmm0 912; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] 913; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 914; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 915; X32-SSE-NEXT: por %xmm2, %xmm0 916; X32-SSE-NEXT: retl 917 %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 918 %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25> 919 %or = or <4 x i32> %shl, %lshr 920 ret <4 x i32> %or 921} 922 923define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { 924; SSE2-LABEL: constant_rotate_v8i16: 925; SSE2: # BB#0: 926; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] 927; SSE2-NEXT: pmullw %xmm0, %xmm2 928; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] 929; SSE2-NEXT: movdqa %xmm1, %xmm3 930; SSE2-NEXT: pandn %xmm0, %xmm3 931; SSE2-NEXT: psrlw $8, %xmm0 932; SSE2-NEXT: pand %xmm1, %xmm0 933; SSE2-NEXT: por %xmm3, %xmm0 934; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,0,0] 935; SSE2-NEXT: movdqa %xmm1, %xmm3 936; SSE2-NEXT: pandn %xmm0, %xmm3 937; SSE2-NEXT: psrlw $4, %xmm0 938; SSE2-NEXT: pand %xmm1, %xmm0 939; SSE2-NEXT: por %xmm3, %xmm0 940; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0] 941; SSE2-NEXT: movdqa %xmm1, %xmm3 942; SSE2-NEXT: pandn %xmm0, %xmm3 943; SSE2-NEXT: psrlw $2, %xmm0 944; SSE2-NEXT: pand %xmm1, %xmm0 945; SSE2-NEXT: por %xmm3, %xmm0 946; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] 947; SSE2-NEXT: movdqa %xmm0, %xmm1 948; SSE2-NEXT: pand %xmm3, %xmm1 949; SSE2-NEXT: psrlw $1, %xmm0 950; SSE2-NEXT: pandn %xmm0, %xmm3 951; SSE2-NEXT: por %xmm2, %xmm3 952; SSE2-NEXT: por %xmm3, %xmm1 953; SSE2-NEXT: movdqa %xmm1, %xmm0 954; SSE2-NEXT: retq 955; 956; SSE41-LABEL: constant_rotate_v8i16: 957; SSE41: # BB#0: 958; SSE41-NEXT: movdqa %xmm0, %xmm1 959; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] 960; SSE41-NEXT: pmullw %xmm1, %xmm2 961; SSE41-NEXT: movdqa %xmm1, %xmm3 962; SSE41-NEXT: psrlw $8, %xmm3 963; SSE41-NEXT: movaps {{.*#+}} xmm0 = [256,61680,57568,53456,49344,45232,41120,37008] 964; SSE41-NEXT: pblendvb %xmm3, %xmm1 965; SSE41-NEXT: movdqa %xmm1, %xmm3 966; SSE41-NEXT: psrlw $4, %xmm3 967; SSE41-NEXT: movaps {{.*#+}} xmm0 = [512,57824,49600,41376,33152,24928,16704,8480] 968; SSE41-NEXT: pblendvb %xmm3, %xmm1 969; SSE41-NEXT: movdqa %xmm1, %xmm3 970; SSE41-NEXT: psrlw $2, %xmm3 971; SSE41-NEXT: movaps {{.*#+}} xmm0 = [1024,50112,33664,17216,768,49856,33408,16960] 972; SSE41-NEXT: pblendvb %xmm3, %xmm1 973; SSE41-NEXT: movdqa %xmm1, %xmm3 974; SSE41-NEXT: psrlw $1, %xmm3 975; SSE41-NEXT: movaps {{.*#+}} xmm0 = [2048,34688,1792,34432,1536,34176,1280,33920] 976; SSE41-NEXT: pblendvb %xmm3, %xmm1 977; SSE41-NEXT: por %xmm2, %xmm1 978; SSE41-NEXT: movdqa %xmm1, %xmm0 979; SSE41-NEXT: retq 980; 981; AVX1-LABEL: constant_rotate_v8i16: 982; AVX1: # BB#0: 983; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 984; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 985; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,61680,57568,53456,49344,45232,41120,37008] 986; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 987; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 988; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[512,57824,49600,41376,33152,24928,16704,8480] 989; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 990; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 991; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,50112,33664,17216,768,49856,33408,16960] 992; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 993; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 994; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2048,34688,1792,34432,1536,34176,1280,33920] 995; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 996; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 997; AVX1-NEXT: retq 998; 999; AVX2-LABEL: constant_rotate_v8i16: 1000; AVX2: # BB#0: 1001; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 1002; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1003; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1004; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 1005; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero 1006; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1007; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 1008; AVX2-NEXT: vzeroupper 1009; AVX2-NEXT: retq 1010; 1011; XOP-LABEL: constant_rotate_v8i16: 1012; XOP: # BB#0: 1013; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 1014; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 1015; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 1016; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 1017; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 1018; XOP-NEXT: retq 1019; 1020; X32-SSE-LABEL: constant_rotate_v8i16: 1021; X32-SSE: # BB#0: 1022; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] 1023; X32-SSE-NEXT: pmullw %xmm0, %xmm2 1024; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] 1025; X32-SSE-NEXT: movdqa %xmm1, %xmm3 1026; X32-SSE-NEXT: pandn %xmm0, %xmm3 1027; X32-SSE-NEXT: psrlw $8, %xmm0 1028; X32-SSE-NEXT: pand %xmm1, %xmm0 1029; X32-SSE-NEXT: por %xmm3, %xmm0 1030; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,0,0] 1031; X32-SSE-NEXT: movdqa %xmm1, %xmm3 1032; X32-SSE-NEXT: pandn %xmm0, %xmm3 1033; X32-SSE-NEXT: psrlw $4, %xmm0 1034; X32-SSE-NEXT: pand %xmm1, %xmm0 1035; X32-SSE-NEXT: por %xmm3, %xmm0 1036; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0] 1037; X32-SSE-NEXT: movdqa %xmm1, %xmm3 1038; X32-SSE-NEXT: pandn %xmm0, %xmm3 1039; X32-SSE-NEXT: psrlw $2, %xmm0 1040; X32-SSE-NEXT: pand %xmm1, %xmm0 1041; X32-SSE-NEXT: por %xmm3, %xmm0 1042; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] 1043; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1044; X32-SSE-NEXT: pand %xmm3, %xmm1 1045; X32-SSE-NEXT: psrlw $1, %xmm0 1046; X32-SSE-NEXT: pandn %xmm0, %xmm3 1047; X32-SSE-NEXT: por %xmm2, %xmm3 1048; X32-SSE-NEXT: por %xmm3, %xmm1 1049; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1050; X32-SSE-NEXT: retl 1051 %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1052 %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9> 1053 %or = or <8 x i16> %shl, %lshr 1054 ret <8 x i16> %or 1055} 1056 1057define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { 1058; SSE2-LABEL: constant_rotate_v16i8: 1059; SSE2: # BB#0: 1060; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1061; SSE2-NEXT: psllw $5, %xmm3 1062; SSE2-NEXT: pxor %xmm2, %xmm2 1063; SSE2-NEXT: pxor %xmm1, %xmm1 1064; SSE2-NEXT: pcmpgtb %xmm3, 
%xmm1 1065; SSE2-NEXT: movdqa %xmm0, %xmm4 1066; SSE2-NEXT: psllw $4, %xmm4 1067; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1068; SSE2-NEXT: pand %xmm1, %xmm4 1069; SSE2-NEXT: pandn %xmm0, %xmm1 1070; SSE2-NEXT: por %xmm4, %xmm1 1071; SSE2-NEXT: paddb %xmm3, %xmm3 1072; SSE2-NEXT: pxor %xmm4, %xmm4 1073; SSE2-NEXT: pcmpgtb %xmm3, %xmm4 1074; SSE2-NEXT: movdqa %xmm4, %xmm5 1075; SSE2-NEXT: pandn %xmm1, %xmm5 1076; SSE2-NEXT: psllw $2, %xmm1 1077; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1078; SSE2-NEXT: pand %xmm4, %xmm1 1079; SSE2-NEXT: por %xmm5, %xmm1 1080; SSE2-NEXT: paddb %xmm3, %xmm3 1081; SSE2-NEXT: pxor %xmm4, %xmm4 1082; SSE2-NEXT: pcmpgtb %xmm3, %xmm4 1083; SSE2-NEXT: movdqa %xmm4, %xmm3 1084; SSE2-NEXT: pandn %xmm1, %xmm3 1085; SSE2-NEXT: paddb %xmm1, %xmm1 1086; SSE2-NEXT: pand %xmm4, %xmm1 1087; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1088; SSE2-NEXT: psllw $5, %xmm4 1089; SSE2-NEXT: pxor %xmm5, %xmm5 1090; SSE2-NEXT: pcmpgtb %xmm4, %xmm5 1091; SSE2-NEXT: movdqa %xmm5, %xmm6 1092; SSE2-NEXT: pandn %xmm0, %xmm6 1093; SSE2-NEXT: psrlw $4, %xmm0 1094; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1095; SSE2-NEXT: pand %xmm5, %xmm0 1096; SSE2-NEXT: por %xmm6, %xmm0 1097; SSE2-NEXT: paddb %xmm4, %xmm4 1098; SSE2-NEXT: pxor %xmm5, %xmm5 1099; SSE2-NEXT: pcmpgtb %xmm4, %xmm5 1100; SSE2-NEXT: movdqa %xmm5, %xmm6 1101; SSE2-NEXT: pandn %xmm0, %xmm6 1102; SSE2-NEXT: psrlw $2, %xmm0 1103; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1104; SSE2-NEXT: pand %xmm5, %xmm0 1105; SSE2-NEXT: por %xmm6, %xmm0 1106; SSE2-NEXT: paddb %xmm4, %xmm4 1107; SSE2-NEXT: pcmpgtb %xmm4, %xmm2 1108; SSE2-NEXT: movdqa %xmm2, %xmm4 1109; SSE2-NEXT: pandn %xmm0, %xmm4 1110; SSE2-NEXT: psrlw $1, %xmm0 1111; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1112; SSE2-NEXT: pand %xmm2, %xmm0 1113; SSE2-NEXT: por %xmm4, %xmm0 1114; SSE2-NEXT: por %xmm3, %xmm0 1115; SSE2-NEXT: por %xmm1, %xmm0 1116; SSE2-NEXT: retq 1117; 1118; SSE41-LABEL: constant_rotate_v16i8: 1119; SSE41: # BB#0: 1120; SSE41-NEXT: movdqa %xmm0, %xmm1 1121; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1122; SSE41-NEXT: psllw $5, %xmm0 1123; SSE41-NEXT: movdqa %xmm1, %xmm3 1124; SSE41-NEXT: psllw $4, %xmm3 1125; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1126; SSE41-NEXT: movdqa %xmm1, %xmm2 1127; SSE41-NEXT: pblendvb %xmm3, %xmm2 1128; SSE41-NEXT: movdqa %xmm2, %xmm3 1129; SSE41-NEXT: psllw $2, %xmm3 1130; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1131; SSE41-NEXT: paddb %xmm0, %xmm0 1132; SSE41-NEXT: pblendvb %xmm3, %xmm2 1133; SSE41-NEXT: movdqa %xmm2, %xmm3 1134; SSE41-NEXT: paddb %xmm3, %xmm3 1135; SSE41-NEXT: paddb %xmm0, %xmm0 1136; SSE41-NEXT: pblendvb %xmm3, %xmm2 1137; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1138; SSE41-NEXT: psllw $5, %xmm0 1139; SSE41-NEXT: movdqa %xmm1, %xmm3 1140; SSE41-NEXT: psrlw $4, %xmm3 1141; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1142; SSE41-NEXT: pblendvb %xmm3, %xmm1 1143; SSE41-NEXT: movdqa %xmm1, %xmm3 1144; SSE41-NEXT: psrlw $2, %xmm3 1145; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1146; SSE41-NEXT: paddb %xmm0, %xmm0 1147; SSE41-NEXT: pblendvb %xmm3, %xmm1 1148; SSE41-NEXT: movdqa %xmm1, %xmm3 1149; SSE41-NEXT: psrlw $1, %xmm3 1150; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1151; SSE41-NEXT: paddb %xmm0, %xmm0 1152; SSE41-NEXT: pblendvb %xmm3, %xmm1 1153; SSE41-NEXT: por %xmm2, %xmm1 1154; SSE41-NEXT: movdqa %xmm1, %xmm0 1155; SSE41-NEXT: retq 1156; 1157; AVX-LABEL: constant_rotate_v16i8: 1158; AVX: # BB#0: 1159; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1160; 
AVX-NEXT: vpsllw $5, %xmm1, %xmm1 1161; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 1162; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1163; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm2 1164; AVX-NEXT: vpsllw $2, %xmm2, %xmm3 1165; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1166; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1167; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2 1168; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3 1169; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1170; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm1 1171; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1172; AVX-NEXT: vpsllw $5, %xmm2, %xmm2 1173; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 1174; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1175; AVX-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 1176; AVX-NEXT: vpsrlw $2, %xmm0, %xmm3 1177; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1178; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1179; AVX-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 1180; AVX-NEXT: vpsrlw $1, %xmm0, %xmm3 1181; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1182; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1183; AVX-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 1184; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1185; AVX-NEXT: retq 1186; 1187; XOP-LABEL: constant_rotate_v16i8: 1188; XOP: # BB#0: 1189; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm1 1190; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 1191; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm2 1192; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 1193; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 1194; XOP-NEXT: retq 1195; 1196; X32-SSE-LABEL: constant_rotate_v16i8: 1197; X32-SSE: # BB#0: 1198; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1199; X32-SSE-NEXT: psllw $5, %xmm3 1200; X32-SSE-NEXT: pxor %xmm2, %xmm2 1201; X32-SSE-NEXT: pxor %xmm1, %xmm1 1202; X32-SSE-NEXT: pcmpgtb %xmm3, %xmm1 1203; X32-SSE-NEXT: movdqa %xmm0, %xmm4 1204; X32-SSE-NEXT: psllw $4, %xmm4 1205; X32-SSE-NEXT: pand .LCPI7_1, %xmm4 1206; X32-SSE-NEXT: pand %xmm1, %xmm4 1207; X32-SSE-NEXT: pandn %xmm0, %xmm1 1208; X32-SSE-NEXT: por %xmm4, %xmm1 1209; X32-SSE-NEXT: paddb %xmm3, %xmm3 1210; X32-SSE-NEXT: pxor %xmm4, %xmm4 1211; X32-SSE-NEXT: pcmpgtb %xmm3, %xmm4 1212; X32-SSE-NEXT: movdqa %xmm4, %xmm5 1213; X32-SSE-NEXT: pandn %xmm1, %xmm5 1214; X32-SSE-NEXT: psllw $2, %xmm1 1215; X32-SSE-NEXT: pand .LCPI7_2, %xmm1 1216; X32-SSE-NEXT: pand %xmm4, %xmm1 1217; X32-SSE-NEXT: por %xmm5, %xmm1 1218; X32-SSE-NEXT: paddb %xmm3, %xmm3 1219; X32-SSE-NEXT: pxor %xmm4, %xmm4 1220; X32-SSE-NEXT: pcmpgtb %xmm3, %xmm4 1221; X32-SSE-NEXT: movdqa %xmm4, %xmm3 1222; X32-SSE-NEXT: pandn %xmm1, %xmm3 1223; X32-SSE-NEXT: paddb %xmm1, %xmm1 1224; X32-SSE-NEXT: pand %xmm4, %xmm1 1225; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1226; X32-SSE-NEXT: psllw $5, %xmm4 1227; X32-SSE-NEXT: pxor %xmm5, %xmm5 1228; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5 1229; X32-SSE-NEXT: movdqa %xmm5, %xmm6 1230; X32-SSE-NEXT: pandn %xmm0, %xmm6 1231; X32-SSE-NEXT: psrlw $4, %xmm0 1232; X32-SSE-NEXT: pand .LCPI7_4, %xmm0 1233; X32-SSE-NEXT: pand %xmm5, %xmm0 1234; X32-SSE-NEXT: por %xmm6, %xmm0 1235; X32-SSE-NEXT: paddb %xmm4, %xmm4 1236; X32-SSE-NEXT: pxor %xmm5, %xmm5 1237; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm5 1238; X32-SSE-NEXT: movdqa %xmm5, %xmm6 1239; X32-SSE-NEXT: pandn %xmm0, %xmm6 1240; X32-SSE-NEXT: psrlw $2, %xmm0 1241; X32-SSE-NEXT: pand .LCPI7_5, %xmm0 1242; X32-SSE-NEXT: pand %xmm5, %xmm0 1243; X32-SSE-NEXT: por %xmm6, %xmm0 1244; X32-SSE-NEXT: paddb %xmm4, %xmm4 1245; X32-SSE-NEXT: pcmpgtb %xmm4, %xmm2 1246; X32-SSE-NEXT: movdqa %xmm2, %xmm4 1247; 
X32-SSE-NEXT: pandn %xmm0, %xmm4 1248; X32-SSE-NEXT: psrlw $1, %xmm0 1249; X32-SSE-NEXT: pand .LCPI7_6, %xmm0 1250; X32-SSE-NEXT: pand %xmm2, %xmm0 1251; X32-SSE-NEXT: por %xmm4, %xmm0 1252; X32-SSE-NEXT: por %xmm3, %xmm0 1253; X32-SSE-NEXT: por %xmm1, %xmm0 1254; X32-SSE-NEXT: retl 1255 %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 1256 %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 1257 %or = or <16 x i8> %shl, %lshr 1258 ret <16 x i8> %or 1259} 1260 1261; 1262; Uniform Constant Rotates 1263; 1264 1265define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind { 1266; SSE-LABEL: splatconstant_rotate_v2i64: 1267; SSE: # BB#0: 1268; SSE-NEXT: movdqa %xmm0, %xmm1 1269; SSE-NEXT: psllq $14, %xmm1 1270; SSE-NEXT: psrlq $50, %xmm0 1271; SSE-NEXT: por %xmm1, %xmm0 1272; SSE-NEXT: retq 1273; 1274; AVX-LABEL: splatconstant_rotate_v2i64: 1275; AVX: # BB#0: 1276; AVX-NEXT: vpsllq $14, %xmm0, %xmm1 1277; AVX-NEXT: vpsrlq $50, %xmm0, %xmm0 1278; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1279; AVX-NEXT: retq 1280; 1281; XOP-LABEL: splatconstant_rotate_v2i64: 1282; XOP: # BB#0: 1283; XOP-NEXT: vprotq $14, %xmm0, %xmm0 1284; XOP-NEXT: retq 1285; 1286; X32-SSE-LABEL: splatconstant_rotate_v2i64: 1287; X32-SSE: # BB#0: 1288; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1289; X32-SSE-NEXT: psllq $14, %xmm1 1290; X32-SSE-NEXT: psrlq $50, %xmm0 1291; X32-SSE-NEXT: por %xmm1, %xmm0 1292; X32-SSE-NEXT: retl 1293 %shl = shl <2 x i64> %a, <i64 14, i64 14> 1294 %lshr = lshr <2 x i64> %a, <i64 50, i64 50> 1295 %or = or <2 x i64> %shl, %lshr 1296 ret <2 x i64> %or 1297} 1298 1299define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind { 1300; SSE-LABEL: splatconstant_rotate_v4i32: 1301; SSE: # BB#0: 1302; SSE-NEXT: movdqa %xmm0, %xmm1 1303; SSE-NEXT: pslld $4, %xmm1 1304; SSE-NEXT: psrld $28, %xmm0 1305; SSE-NEXT: por %xmm1, %xmm0 1306; SSE-NEXT: retq 1307; 1308; AVX-LABEL: splatconstant_rotate_v4i32: 1309; AVX: # BB#0: 1310; AVX-NEXT: vpslld $4, %xmm0, %xmm1 1311; AVX-NEXT: vpsrld $28, %xmm0, %xmm0 1312; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1313; AVX-NEXT: retq 1314; 1315; XOP-LABEL: splatconstant_rotate_v4i32: 1316; XOP: # BB#0: 1317; XOP-NEXT: vprotd $4, %xmm0, %xmm0 1318; XOP-NEXT: retq 1319; 1320; X32-SSE-LABEL: splatconstant_rotate_v4i32: 1321; X32-SSE: # BB#0: 1322; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1323; X32-SSE-NEXT: pslld $4, %xmm1 1324; X32-SSE-NEXT: psrld $28, %xmm0 1325; X32-SSE-NEXT: por %xmm1, %xmm0 1326; X32-SSE-NEXT: retl 1327 %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4> 1328 %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28> 1329 %or = or <4 x i32> %shl, %lshr 1330 ret <4 x i32> %or 1331} 1332 1333define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind { 1334; SSE-LABEL: splatconstant_rotate_v8i16: 1335; SSE: # BB#0: 1336; SSE-NEXT: movdqa %xmm0, %xmm1 1337; SSE-NEXT: psllw $7, %xmm1 1338; SSE-NEXT: psrlw $9, %xmm0 1339; SSE-NEXT: por %xmm1, %xmm0 1340; SSE-NEXT: retq 1341; 1342; AVX-LABEL: splatconstant_rotate_v8i16: 1343; AVX: # BB#0: 1344; AVX-NEXT: vpsllw $7, %xmm0, %xmm1 1345; AVX-NEXT: vpsrlw $9, %xmm0, %xmm0 1346; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1347; AVX-NEXT: retq 1348; 1349; XOP-LABEL: splatconstant_rotate_v8i16: 1350; XOP: # BB#0: 1351; XOP-NEXT: vprotw $7, %xmm0, %xmm0 1352; XOP-NEXT: retq 1353; 1354; X32-SSE-LABEL: splatconstant_rotate_v8i16: 1355; X32-SSE: # BB#0: 1356; X32-SSE-NEXT: movdqa %xmm0, %xmm1 
1357; X32-SSE-NEXT: psllw $7, %xmm1 1358; X32-SSE-NEXT: psrlw $9, %xmm0 1359; X32-SSE-NEXT: por %xmm1, %xmm0 1360; X32-SSE-NEXT: retl 1361 %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 1362 %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9> 1363 %or = or <8 x i16> %shl, %lshr 1364 ret <8 x i16> %or 1365} 1366 1367define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { 1368; SSE-LABEL: splatconstant_rotate_v16i8: 1369; SSE: # BB#0: 1370; SSE-NEXT: movdqa %xmm0, %xmm1 1371; SSE-NEXT: psllw $4, %xmm1 1372; SSE-NEXT: pand {{.*}}(%rip), %xmm1 1373; SSE-NEXT: psrlw $4, %xmm0 1374; SSE-NEXT: pand {{.*}}(%rip), %xmm0 1375; SSE-NEXT: por %xmm1, %xmm0 1376; SSE-NEXT: retq 1377; 1378; AVX-LABEL: splatconstant_rotate_v16i8: 1379; AVX: # BB#0: 1380; AVX-NEXT: vpsllw $4, %xmm0, %xmm1 1381; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1382; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1383; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1384; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1385; AVX-NEXT: retq 1386; 1387; XOP-LABEL: splatconstant_rotate_v16i8: 1388; XOP: # BB#0: 1389; XOP-NEXT: vprotb $4, %xmm0, %xmm0 1390; XOP-NEXT: retq 1391; 1392; X32-SSE-LABEL: splatconstant_rotate_v16i8: 1393; X32-SSE: # BB#0: 1394; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1395; X32-SSE-NEXT: psllw $4, %xmm1 1396; X32-SSE-NEXT: pand .LCPI11_0, %xmm1 1397; X32-SSE-NEXT: psrlw $4, %xmm0 1398; X32-SSE-NEXT: pand .LCPI11_1, %xmm0 1399; X32-SSE-NEXT: por %xmm1, %xmm0 1400; X32-SSE-NEXT: retl 1401 %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 1402 %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 1403 %or = or <16 x i8> %shl, %lshr 1404 ret <16 x i8> %or 1405} 1406 1407; 1408; Masked Uniform Constant Rotates 1409; 1410 1411define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind { 1412; SSE-LABEL: splatconstant_rotate_mask_v2i64: 1413; SSE: # BB#0: 1414; SSE-NEXT: movdqa %xmm0, %xmm1 1415; SSE-NEXT: psllq $15, %xmm1 1416; SSE-NEXT: psrlq $49, %xmm0 1417; SSE-NEXT: pand {{.*}}(%rip), %xmm0 1418; SSE-NEXT: pand {{.*}}(%rip), %xmm1 1419; SSE-NEXT: por %xmm0, %xmm1 1420; SSE-NEXT: movdqa %xmm1, %xmm0 1421; SSE-NEXT: retq 1422; 1423; AVX-LABEL: splatconstant_rotate_mask_v2i64: 1424; AVX: # BB#0: 1425; AVX-NEXT: vpsllq $15, %xmm0, %xmm1 1426; AVX-NEXT: vpsrlq $49, %xmm0, %xmm0 1427; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1428; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1429; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1430; AVX-NEXT: retq 1431; 1432; XOP-LABEL: splatconstant_rotate_mask_v2i64: 1433; XOP: # BB#0: 1434; XOP-NEXT: vprotq $15, %xmm0, %xmm0 1435; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1436; XOP-NEXT: retq 1437; 1438; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64: 1439; X32-SSE: # BB#0: 1440; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1441; X32-SSE-NEXT: psllq $15, %xmm1 1442; X32-SSE-NEXT: psrlq $49, %xmm0 1443; X32-SSE-NEXT: pand .LCPI12_0, %xmm0 1444; X32-SSE-NEXT: pand .LCPI12_1, %xmm1 1445; X32-SSE-NEXT: por %xmm0, %xmm1 1446; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1447; X32-SSE-NEXT: retl 1448 %shl = shl <2 x i64> %a, <i64 15, i64 15> 1449 %lshr = lshr <2 x i64> %a, <i64 49, i64 49> 1450 %rmask = and <2 x i64> %lshr, <i64 255, i64 127> 1451 %lmask = and <2 x i64> %shl, <i64 65, i64 33> 1452 %or = or <2 x i64> %lmask, %rmask 1453 ret <2 x i64> %or 1454} 1455 1456define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x 
i32> %a) nounwind { 1457; SSE-LABEL: splatconstant_rotate_mask_v4i32: 1458; SSE: # BB#0: 1459; SSE-NEXT: movdqa %xmm0, %xmm1 1460; SSE-NEXT: pslld $4, %xmm1 1461; SSE-NEXT: psrld $28, %xmm0 1462; SSE-NEXT: pand {{.*}}(%rip), %xmm0 1463; SSE-NEXT: pand {{.*}}(%rip), %xmm1 1464; SSE-NEXT: por %xmm0, %xmm1 1465; SSE-NEXT: movdqa %xmm1, %xmm0 1466; SSE-NEXT: retq 1467; 1468; AVX-LABEL: splatconstant_rotate_mask_v4i32: 1469; AVX: # BB#0: 1470; AVX-NEXT: vpslld $4, %xmm0, %xmm1 1471; AVX-NEXT: vpsrld $28, %xmm0, %xmm0 1472; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1473; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1474; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1475; AVX-NEXT: retq 1476; 1477; XOP-LABEL: splatconstant_rotate_mask_v4i32: 1478; XOP: # BB#0: 1479; XOP-NEXT: vprotd $4, %xmm0, %xmm0 1480; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1481; XOP-NEXT: retq 1482; 1483; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32: 1484; X32-SSE: # BB#0: 1485; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1486; X32-SSE-NEXT: pslld $4, %xmm1 1487; X32-SSE-NEXT: psrld $28, %xmm0 1488; X32-SSE-NEXT: pand .LCPI13_0, %xmm0 1489; X32-SSE-NEXT: pand .LCPI13_1, %xmm1 1490; X32-SSE-NEXT: por %xmm0, %xmm1 1491; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1492; X32-SSE-NEXT: retl 1493 %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4> 1494 %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28> 1495 %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023> 1496 %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127> 1497 %or = or <4 x i32> %lmask, %rmask 1498 ret <4 x i32> %or 1499} 1500 1501define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind { 1502; SSE-LABEL: splatconstant_rotate_mask_v8i16: 1503; SSE: # BB#0: 1504; SSE-NEXT: movdqa %xmm0, %xmm1 1505; SSE-NEXT: psllw $5, %xmm1 1506; SSE-NEXT: psrlw $11, %xmm0 1507; SSE-NEXT: pand {{.*}}(%rip), %xmm0 1508; SSE-NEXT: pand {{.*}}(%rip), %xmm1 1509; SSE-NEXT: por %xmm0, %xmm1 1510; SSE-NEXT: movdqa %xmm1, %xmm0 1511; SSE-NEXT: retq 1512; 1513; AVX-LABEL: splatconstant_rotate_mask_v8i16: 1514; AVX: # BB#0: 1515; AVX-NEXT: vpsllw $5, %xmm0, %xmm1 1516; AVX-NEXT: vpsrlw $11, %xmm0, %xmm0 1517; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1518; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1519; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1520; AVX-NEXT: retq 1521; 1522; XOP-LABEL: splatconstant_rotate_mask_v8i16: 1523; XOP: # BB#0: 1524; XOP-NEXT: vprotw $5, %xmm0, %xmm0 1525; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1526; XOP-NEXT: retq 1527; 1528; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16: 1529; X32-SSE: # BB#0: 1530; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1531; X32-SSE-NEXT: psllw $5, %xmm1 1532; X32-SSE-NEXT: psrlw $11, %xmm0 1533; X32-SSE-NEXT: pand .LCPI14_0, %xmm0 1534; X32-SSE-NEXT: pand .LCPI14_1, %xmm1 1535; X32-SSE-NEXT: por %xmm0, %xmm1 1536; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1537; X32-SSE-NEXT: retl 1538 %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> 1539 %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11> 1540 %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55> 1541 %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33> 1542 %or = or <8 x i16> %lmask, %rmask 1543 ret <8 x i16> %or 1544} 1545 1546define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { 1547; SSE-LABEL: splatconstant_rotate_mask_v16i8: 1548; SSE: # BB#0: 1549; SSE-NEXT: movdqa %xmm0, %xmm1 1550; SSE-NEXT: psllw 
$4, %xmm1
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrlw $4, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllw $4, %xmm1
; X32-SSE-NEXT:    pand .LCPI15_0, %xmm1
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand .LCPI15_1, %xmm0
; X32-SSE-NEXT:    pand .LCPI15_2, %xmm0
; X32-SSE-NEXT:    pand .LCPI15_3, %xmm1
; X32-SSE-NEXT:    por %xmm0, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <16 x i8> %lmask, %rmask
  ret <16 x i8> %or
}