1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 6 7; 8; Variable Rotates 9; 10 11define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { 12; AVX1-LABEL: var_rotate_v4i64: 13; AVX1: # BB#0: 14; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] 15; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 16; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 17; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 18; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 19; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6 20; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 21; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 22; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] 23; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6 24; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 25; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 26; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] 27; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 28; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4 29; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 30; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2 31; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] 32; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 33; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 34; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0 35; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] 36; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 37; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 38; AVX1-NEXT: retq 39; 40; AVX2-LABEL: var_rotate_v4i64: 41; AVX2: # BB#0: 42; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 43; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 44; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 45; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 46; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 47; AVX2-NEXT: retq 48; 49; XOPAVX1-LABEL: var_rotate_v4i64: 50; XOPAVX1: # BB#0: 51; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 52; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 53; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2 54; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 55; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 56; XOPAVX1-NEXT: retq 57; 58; XOPAVX2-LABEL: var_rotate_v4i64: 59; XOPAVX2: # BB#0: 60; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 61; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 62; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2 63; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 64; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 65; XOPAVX2-NEXT: retq 66 %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b 67 %shl = shl <4 x i64> %a, %b 68 %lshr = lshr <4 x i64> %a, %b64 69 %or = or <4 x i64> %shl, %lshr 70 ret <4 x i64> %or 71} 72 73define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { 74; AVX1-LABEL: var_rotate_v8i32: 75; AVX1: # BB#0: 76; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] 77; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm2 78; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 79; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3 80; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 81; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = 
[1065353216,1065353216,1065353216,1065353216] 82; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 83; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 84; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 85; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4 86; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 87; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1 88; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 89; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm1 90; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 91; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 92; AVX1-NEXT: vpsrld %xmm4, %xmm6, %xmm4 93; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 94; AVX1-NEXT: vpsrld %xmm5, %xmm6, %xmm5 95; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 96; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 97; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] 98; AVX1-NEXT: vpsrld %xmm7, %xmm6, %xmm7 99; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 100; AVX1-NEXT: vpsrld %xmm3, %xmm6, %xmm3 101; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7] 102; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 103; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 104; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 105; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 106; AVX1-NEXT: vpsrld %xmm6, %xmm0, %xmm6 107; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] 108; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] 109; AVX1-NEXT: vpsrld %xmm5, %xmm0, %xmm5 110; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 111; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm0 112; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7] 113; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] 114; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 115; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 116; AVX1-NEXT: retq 117; 118; AVX2-LABEL: var_rotate_v8i32: 119; AVX2: # BB#0: 120; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 121; AVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2 122; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 123; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 124; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 125; AVX2-NEXT: retq 126; 127; XOPAVX1-LABEL: var_rotate_v8i32: 128; XOPAVX1: # BB#0: 129; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 130; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 131; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2 132; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 133; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 134; XOPAVX1-NEXT: retq 135; 136; XOPAVX2-LABEL: var_rotate_v8i32: 137; XOPAVX2: # BB#0: 138; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 139; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 140; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2 141; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 142; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 143; XOPAVX2-NEXT: retq 144 %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b 145 %shl = shl <8 x i32> %a, %b 146 %lshr = lshr <8 x i32> %a, %b32 147 %or = or <8 x i32> %shl, %lshr 148 ret <8 x i32> %or 149} 150 151define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { 152; AVX1-LABEL: var_rotate_v16i16: 153; AVX1: # BB#0: 154; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 155; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm2 156; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 157; AVX1-NEXT: vpsubw %xmm4, %xmm3, %xmm3 158; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5 159; AVX1-NEXT: 
vpsllw $4, %xmm4, %xmm4 160; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm5 161; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm6 162; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 163; AVX1-NEXT: vpsllw $8, %xmm4, %xmm7 164; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm4, %xmm5 165; AVX1-NEXT: vpsllw $4, %xmm5, %xmm7 166; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5 167; AVX1-NEXT: vpsllw $2, %xmm5, %xmm7 168; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 169; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5 170; AVX1-NEXT: vpsllw $1, %xmm5, %xmm7 171; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 172; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5 173; AVX1-NEXT: vpsllw $12, %xmm1, %xmm6 174; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 175; AVX1-NEXT: vpor %xmm6, %xmm1, %xmm1 176; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm6 177; AVX1-NEXT: vpsllw $8, %xmm0, %xmm7 178; AVX1-NEXT: vpblendvb %xmm1, %xmm7, %xmm0, %xmm1 179; AVX1-NEXT: vpsllw $4, %xmm1, %xmm7 180; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm1, %xmm1 181; AVX1-NEXT: vpsllw $2, %xmm1, %xmm7 182; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 183; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm1, %xmm1 184; AVX1-NEXT: vpsllw $1, %xmm1, %xmm7 185; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 186; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm1, %xmm1 187; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 188; AVX1-NEXT: vpsllw $12, %xmm3, %xmm5 189; AVX1-NEXT: vpsllw $4, %xmm3, %xmm3 190; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 191; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm5 192; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm6 193; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm4, %xmm3 194; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4 195; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 196; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4 197; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 198; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 199; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4 200; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 201; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 202; AVX1-NEXT: vpsllw $12, %xmm2, %xmm4 203; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 204; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 205; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm4 206; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm5 207; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm0, %xmm0 208; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 209; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 210; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 211; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 212; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 213; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 214; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 215; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 216; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 217; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 218; AVX1-NEXT: retq 219; 220; AVX2-LABEL: var_rotate_v16i16: 221; AVX2: # BB#0: 222; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 223; AVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm2 224; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 225; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 226; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] 227; AVX2-NEXT: vpsllvd %ymm4, %ymm5, %ymm4 228; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4 229; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 230; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] 231; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 232; 
AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 233; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1 234; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] 235; AVX2-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4 236; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4 237; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] 238; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 239; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 240; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0 241; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 242; AVX2-NEXT: retq 243; 244; XOPAVX1-LABEL: var_rotate_v16i16: 245; XOPAVX1: # BB#0: 246; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 247; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 248; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2 249; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 250; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 251; XOPAVX1-NEXT: retq 252; 253; XOPAVX2-LABEL: var_rotate_v16i16: 254; XOPAVX2: # BB#0: 255; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 256; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 257; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2 258; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 259; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 260; XOPAVX2-NEXT: retq 261 %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b 262 %shl = shl <16 x i16> %a, %b 263 %lshr = lshr <16 x i16> %a, %b16 264 %or = or <16 x i16> %shl, %lshr 265 ret <16 x i16> %or 266} 267 268define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { 269; AVX1-LABEL: var_rotate_v32i8: 270; AVX1: # BB#0: 271; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 272; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm8 273; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 274; AVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm9 275; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 276; AVX1-NEXT: vpsllw $4, %xmm5, %xmm6 277; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 278; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 279; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 280; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm5, %xmm6 281; AVX1-NEXT: vpsllw $2, %xmm6, %xmm2 282; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 283; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 284; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 285; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm6, %xmm2 286; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm6 287; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 288; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm2, %xmm2 289; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 290; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 291; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 292; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm4 293; AVX1-NEXT: vpsllw $2, %xmm4, %xmm6 294; AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 295; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 296; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm4, %xmm3 297; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm4 298; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 299; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm3, %xmm1 300; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 301; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm2 302; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 303; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 304; AVX1-NEXT: vpsllw $5, %xmm9, %xmm4 305; AVX1-NEXT: vpblendvb 
%xmm4, %xmm2, %xmm5, %xmm2 306; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm5 307; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 308; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 309; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 310; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm2, %xmm2 311; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm5 312; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 313; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 314; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 315; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm2, %xmm2 316; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 317; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 318; AVX1-NEXT: vpsllw $5, %xmm8, %xmm4 319; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 320; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 321; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 322; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 323; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 324; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 325; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 326; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 327; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 328; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 329; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 330; AVX1-NEXT: retq 331; 332; AVX2-LABEL: var_rotate_v32i8: 333; AVX2: # BB#0: 334; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 335; AVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm2 336; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 337; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3 338; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 339; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm3 340; AVX2-NEXT: vpsllw $2, %ymm3, %ymm4 341; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 342; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 343; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3 344; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm4 345; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 346; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1 347; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2 348; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3 349; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 350; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 351; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 352; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 353; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 354; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 355; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 356; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 357; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3 358; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 359; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 360; AVX2-NEXT: retq 361; 362; XOPAVX1-LABEL: var_rotate_v32i8: 363; XOPAVX1: # BB#0: 364; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 365; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 366; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2 367; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 368; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 369; XOPAVX1-NEXT: retq 370; 371; XOPAVX2-LABEL: var_rotate_v32i8: 372; XOPAVX2: # BB#0: 373; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 374; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 375; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2 376; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 377; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 378; XOPAVX2-NEXT: retq 379 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 380 %shl = shl <32 x i8> %a, %b 381 %lshr = lshr <32 x i8> %a, %b8 382 %or = or <32 x i8> %shl, %lshr 383 ret 
<32 x i8> %or 384} 385 386; 387; Constant Rotates 388; 389 390define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { 391; AVX1-LABEL: constant_rotate_v4i64: 392; AVX1: # BB#0: 393; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 394; AVX1-NEXT: vpsllq $60, %xmm1, %xmm2 395; AVX1-NEXT: vpsllq $50, %xmm1, %xmm3 396; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 397; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3 398; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4 399; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 400; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 401; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm3 402; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1 403; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 404; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3 405; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 406; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 407; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 408; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 409; AVX1-NEXT: retq 410; 411; AVX2-LABEL: constant_rotate_v4i64: 412; AVX2: # BB#0: 413; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 414; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 415; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 416; AVX2-NEXT: retq 417; 418; XOPAVX1-LABEL: constant_rotate_v4i64: 419; XOPAVX1: # BB#0: 420; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1 421; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 422; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm2, %xmm3 423; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 424; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 425; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm4 426; XOPAVX1-NEXT: vpshlq %xmm4, %xmm2, %xmm2 427; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 428; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0 429; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 430; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 431; XOPAVX1-NEXT: retq 432; 433; XOPAVX2-LABEL: constant_rotate_v4i64: 434; XOPAVX2: # BB#0: 435; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 436; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 437; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 438; XOPAVX2-NEXT: retq 439 %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60> 440 %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 2> 441 %or = or <4 x i64> %shl, %lshr 442 ret <4 x i64> %or 443} 444 445define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { 446; AVX1-LABEL: constant_rotate_v8i32: 447; AVX1: # BB#0: 448; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 449; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 450; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm3 451; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 452; AVX1-NEXT: vpsrld $21, %xmm2, %xmm3 453; AVX1-NEXT: vpsrld $23, %xmm2, %xmm4 454; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 455; AVX1-NEXT: vpsrld $22, %xmm2, %xmm4 456; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 457; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] 458; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 459; AVX1-NEXT: vpsrld $25, %xmm0, %xmm3 460; AVX1-NEXT: vpsrld $27, %xmm0, %xmm4 461; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 462; AVX1-NEXT: vpsrld $26, %xmm0, %xmm4 463; AVX1-NEXT: vpsrld $28, %xmm0, %xmm0 464; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] 465; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 466; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 467; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 468; AVX1-NEXT: retq 469; 470; AVX2-LABEL: 
constant_rotate_v8i32: 471; AVX2: # BB#0: 472; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 473; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 474; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 475; AVX2-NEXT: retq 476; 477; XOPAVX1-LABEL: constant_rotate_v8i32: 478; XOPAVX1: # BB#0: 479; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 480; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 481; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm3 482; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 483; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 484; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm2 485; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 486; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 487; XOPAVX1-NEXT: retq 488; 489; XOPAVX2-LABEL: constant_rotate_v8i32: 490; XOPAVX2: # BB#0: 491; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 492; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 493; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 494; XOPAVX2-NEXT: retq 495 %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 496 %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21> 497 %or = or <8 x i32> %shl, %lshr 498 ret <8 x i32> %or 499} 500 501define <16 x i16> @constant_rotate_v8i16(<16 x i16> %a) nounwind { 502; AVX1-LABEL: constant_rotate_v8i16: 503; AVX1: # BB#0: 504; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 505; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 506; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm3 507; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 508; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm3 509; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,28784,24672,20560,16448,12336,8224,4112] 510; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 511; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 512; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,57568,49344,41120,32896,24672,16448,8224] 513; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 514; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 515; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,49600,33152,16704,256,49344,32896,16448] 516; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 517; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 518; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,33664,768,33408,512,33152,256,32896] 519; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 520; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 521; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,61680,57568,53456,49344,45232,41120,37008] 522; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 523; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 524; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,57824,49600,41376,33152,24928,16704,8480] 525; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 526; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 527; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,50112,33664,17216,768,49856,33408,16960] 528; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 529; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 530; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2048,34688,1792,34432,1536,34176,1280,33920] 531; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 532; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 533; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 534; AVX1-NEXT: retq 535; 536; AVX2-LABEL: constant_rotate_v8i16: 537; AVX2: # BB#0: 538; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1 539; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 540; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] 541; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] 542; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = 
ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] 543; AVX2-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4 544; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4 545; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 546; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] 547; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 548; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 549; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0 550; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 551; AVX2-NEXT: retq 552; 553; XOPAVX1-LABEL: constant_rotate_v8i16: 554; XOPAVX1: # BB#0: 555; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 556; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 557; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm2, %xmm3 558; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 559; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 560; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm4 561; XOPAVX1-NEXT: vpshlw %xmm4, %xmm2, %xmm2 562; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm3 563; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0 564; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 565; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 566; XOPAVX1-NEXT: retq 567; 568; XOPAVX2-LABEL: constant_rotate_v8i16: 569; XOPAVX2: # BB#0: 570; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1 571; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 572; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm3 573; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 574; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3 575; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 576; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 577; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 578; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 579; XOPAVX2-NEXT: retq 580 %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 581 %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1> 582 %or = or <16 x i16> %shl, %lshr 583 ret <16 x i16> %or 584} 585 586define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { 587; AVX1-LABEL: constant_rotate_v32i8: 588; AVX1: # BB#0: 589; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 590; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2 591; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 592; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2 593; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 594; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 595; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm2 596; AVX1-NEXT: vpsllw $2, %xmm2, %xmm5 597; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 598; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 599; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm7 600; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm2, %xmm2 601; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm5 602; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm3 603; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm2, %xmm2 604; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5 605; AVX1-NEXT: vpand %xmm8, %xmm5, %xmm5 606; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm0, %xmm4 607; AVX1-NEXT: vpsllw $2, %xmm4, %xmm5 608; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 609; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm4, %xmm4 610; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5 611; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3 612; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm9 613; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm3 614; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 615; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3 616; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 617; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 618; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1 619; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3 620; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 621; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 622; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm7 623; AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm1, %xmm1 624; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3 625; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 626; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 627; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm2 628; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1 629; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 630; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3 631; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm0, %xmm0 632; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 633; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 634; AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm0, %xmm0 635; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 636; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 637; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 638; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 639; AVX1-NEXT: vorps %ymm0, %ymm9, %ymm0 640; AVX1-NEXT: retq 641; 642; AVX2-LABEL: constant_rotate_v32i8: 643; AVX2: # BB#0: 644; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 645; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 646; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 647; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 648; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm2 649; AVX2-NEXT: vpsllw $2, %ymm2, %ymm3 650; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 651; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 652; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm2 653; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3 654; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 655; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm1 656; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 657; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2 658; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 659; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 660; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 661; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm3 662; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 663; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 664; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 665; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm3 666; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 667; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 668; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 669; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 670; AVX2-NEXT: retq 671; 672; XOPAVX1-LABEL: constant_rotate_v32i8: 673; XOPAVX1: # BB#0: 674; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 675; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 676; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm3 677; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1 678; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 679; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 680; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 681; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 682; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 683; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 684; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 685; XOPAVX1-NEXT: retq 686; 687; XOPAVX2-LABEL: constant_rotate_v32i8: 688; XOPAVX2: # BB#0: 689; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 690; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 691; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm3 692; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1 693; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 694; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 695; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 696; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2 697; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0 698; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 699; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 700; XOPAVX2-NEXT: retq 701 %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 702 %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 703 %or = or <32 x i8> %shl, %lshr 704 ret <32 x i8> %or 705} 706 707; 708; Uniform Constant Rotates 709; 710 711define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind { 712; AVX1-LABEL: splatconstant_rotate_v4i64: 713; AVX1: # BB#0: 714; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1 715; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 716; AVX1-NEXT: vpsllq $14, %xmm2, %xmm3 717; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 718; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm0 719; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm2 720; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 721; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 722; AVX1-NEXT: retq 723; 724; AVX2-LABEL: splatconstant_rotate_v4i64: 725; AVX2: # BB#0: 726; AVX2-NEXT: vpsllq $14, %ymm0, %ymm1 727; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm0 728; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 729; AVX2-NEXT: retq 730; 731; XOPAVX1-LABEL: splatconstant_rotate_v4i64: 732; XOPAVX1: # BB#0: 733; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1 734; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 735; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0 736; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 737; XOPAVX1-NEXT: retq 738; 739; XOPAVX2-LABEL: splatconstant_rotate_v4i64: 740; XOPAVX2: # BB#0: 741; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1 742; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 743; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0 744; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 745; XOPAVX2-NEXT: retq 746 %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14> 747 %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50> 748 %or = or <4 x i64> %shl, %lshr 749 ret <4 x i64> %or 750} 751 752define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind { 753; AVX1-LABEL: splatconstant_rotate_v8i32: 754; AVX1: # BB#0: 755; AVX1-NEXT: vpslld $4, %xmm0, %xmm1 756; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 757; AVX1-NEXT: vpslld $4, %xmm2, %xmm3 758; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 759; AVX1-NEXT: vpsrld $28, %xmm0, %xmm0 760; AVX1-NEXT: vpsrld $28, %xmm2, %xmm2 761; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 762; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 763; AVX1-NEXT: retq 764; 765; AVX2-LABEL: splatconstant_rotate_v8i32: 766; AVX2: # BB#0: 767; AVX2-NEXT: vpslld $4, %ymm0, %ymm1 768; AVX2-NEXT: vpsrld $28, %ymm0, %ymm0 769; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 770; AVX2-NEXT: retq 771; 772; XOPAVX1-LABEL: splatconstant_rotate_v8i32: 773; XOPAVX1: # BB#0: 774; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1 775; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 776; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0 777; XOPAVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 778; XOPAVX1-NEXT: retq 779; 780; XOPAVX2-LABEL: splatconstant_rotate_v8i32: 781; XOPAVX2: # BB#0: 782; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1 783; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 784; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0 785; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 786; XOPAVX2-NEXT: retq 787 %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> 788 %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28> 789 %or = or <8 x i32> %shl, %lshr 790 ret <8 x i32> %or 791} 792 793define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind { 794; AVX1-LABEL: splatconstant_rotate_v16i16: 795; AVX1: # BB#0: 796; AVX1-NEXT: vpsllw $7, %xmm0, %xmm1 797; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 798; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3 799; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 800; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm0 801; AVX1-NEXT: vpsrlw $9, %xmm2, %xmm2 802; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 803; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 804; AVX1-NEXT: retq 805; 806; AVX2-LABEL: splatconstant_rotate_v16i16: 807; AVX2: # BB#0: 808; AVX2-NEXT: vpsllw $7, %ymm0, %ymm1 809; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm0 810; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 811; AVX2-NEXT: retq 812; 813; XOPAVX1-LABEL: splatconstant_rotate_v16i16: 814; XOPAVX1: # BB#0: 815; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1 816; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 817; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0 818; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 819; XOPAVX1-NEXT: retq 820; 821; XOPAVX2-LABEL: splatconstant_rotate_v16i16: 822; XOPAVX2: # BB#0: 823; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1 824; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 825; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0 826; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 827; XOPAVX2-NEXT: retq 828 %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 829 %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9> 830 %or = or <16 x i16> %shl, %lshr 831 ret <16 x i16> %or 832} 833 834define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { 835; AVX1-LABEL: splatconstant_rotate_v32i8: 836; AVX1: # BB#0: 837; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 838; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2 839; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 840; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 841; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 842; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 843; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 844; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 845; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 846; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 847; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 848; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 849; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 850; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 851; AVX1-NEXT: retq 852; 853; AVX2-LABEL: splatconstant_rotate_v32i8: 854; AVX2: # BB#0: 855; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 856; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 857; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 858; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 859; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 860; AVX2-NEXT: retq 861; 862; XOPAVX1-LABEL: splatconstant_rotate_v32i8: 863; XOPAVX1: # BB#0: 864; XOPAVX1-NEXT: vprotb $4, %xmm0, 
%xmm1 865; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 866; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0 867; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 868; XOPAVX1-NEXT: retq 869; 870; XOPAVX2-LABEL: splatconstant_rotate_v32i8: 871; XOPAVX2: # BB#0: 872; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1 873; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 874; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0 875; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 876; XOPAVX2-NEXT: retq 877 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 878 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 879 %or = or <32 x i8> %shl, %lshr 880 ret <32 x i8> %or 881} 882 883; 884; Masked Uniform Constant Rotates 885; 886 887define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind { 888; AVX1-LABEL: splatconstant_rotate_mask_v4i64: 889; AVX1: # BB#0: 890; AVX1-NEXT: vpsllq $15, %xmm0, %xmm1 891; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 892; AVX1-NEXT: vpsllq $15, %xmm2, %xmm3 893; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 894; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0 895; AVX1-NEXT: vpsrlq $49, %xmm2, %xmm2 896; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 897; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 898; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 899; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 900; AVX1-NEXT: retq 901; 902; AVX2-LABEL: splatconstant_rotate_mask_v4i64: 903; AVX2: # BB#0: 904; AVX2-NEXT: vpsllq $15, %ymm0, %ymm1 905; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0 906; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 907; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 908; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 909; AVX2-NEXT: retq 910; 911; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64: 912; XOPAVX1: # BB#0: 913; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1 914; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 915; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0 916; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 917; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 918; XOPAVX1-NEXT: retq 919; 920; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64: 921; XOPAVX2: # BB#0: 922; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1 923; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 924; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0 925; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 926; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 927; XOPAVX2-NEXT: retq 928 %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15> 929 %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49> 930 %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255> 931 %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257> 932 %or = or <4 x i64> %lmask, %rmask 933 ret <4 x i64> %or 934} 935 936define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind { 937; AVX1-LABEL: splatconstant_rotate_mask_v8i32: 938; AVX1: # BB#0: 939; AVX1-NEXT: vpslld $4, %xmm0, %xmm1 940; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 941; AVX1-NEXT: vpslld $4, %xmm2, %xmm3 942; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 943; AVX1-NEXT: vpsrld $28, %xmm0, %xmm0 944; AVX1-NEXT: vpsrld $28, %xmm2, %xmm2 945; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 946; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 947; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, 
%ymm1 948; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 949; AVX1-NEXT: retq 950; 951; AVX2-LABEL: splatconstant_rotate_mask_v8i32: 952; AVX2: # BB#0: 953; AVX2-NEXT: vpslld $4, %ymm0, %ymm1 954; AVX2-NEXT: vpsrld $28, %ymm0, %ymm0 955; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 956; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 957; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 958; AVX2-NEXT: retq 959; 960; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32: 961; XOPAVX1: # BB#0: 962; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1 963; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 964; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0 965; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 966; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 967; XOPAVX1-NEXT: retq 968; 969; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32: 970; XOPAVX2: # BB#0: 971; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1 972; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 973; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0 974; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 975; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 976; XOPAVX2-NEXT: retq 977 %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> 978 %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28> 979 %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511> 980 %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3> 981 %or = or <8 x i32> %lmask, %rmask 982 ret <8 x i32> %or 983} 984 985define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind { 986; AVX1-LABEL: splatconstant_rotate_mask_v16i16: 987; AVX1: # BB#0: 988; AVX1-NEXT: vpsllw $5, %xmm0, %xmm1 989; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 990; AVX1-NEXT: vpsllw $5, %xmm2, %xmm3 991; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 992; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm0 993; AVX1-NEXT: vpsrlw $11, %xmm2, %xmm2 994; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 995; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 996; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 997; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 998; AVX1-NEXT: retq 999; 1000; AVX2-LABEL: splatconstant_rotate_mask_v16i16: 1001; AVX2: # BB#0: 1002; AVX2-NEXT: vpsllw $5, %ymm0, %ymm1 1003; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm0 1004; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1005; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 1006; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 1007; AVX2-NEXT: retq 1008; 1009; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16: 1010; XOPAVX1: # BB#0: 1011; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1 1012; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1013; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0 1014; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1015; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 1016; XOPAVX1-NEXT: retq 1017; 1018; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16: 1019; XOPAVX2: # BB#0: 1020; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1 1021; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1022; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0 1023; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 1024; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1025; XOPAVX2-NEXT: retq 1026 %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> 1027 %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11> 1028 %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 
55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55> 1029 %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33> 1030 %or = or <16 x i16> %lmask, %rmask 1031 ret <16 x i16> %or 1032} 1033 1034define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { 1035; AVX1-LABEL: splatconstant_rotate_mask_v32i8: 1036; AVX1: # BB#0: 1037; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1038; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2 1039; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1040; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1041; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 1042; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 1043; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1044; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1045; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1046; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1047; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1048; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1049; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1050; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 1051; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm1 1052; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 1053; AVX1-NEXT: retq 1054; 1055; AVX2-LABEL: splatconstant_rotate_mask_v32i8: 1056; AVX2: # BB#0: 1057; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 1058; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 1059; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1060; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1061; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1062; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 1063; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 1064; AVX2-NEXT: retq 1065; 1066; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8: 1067; XOPAVX1: # BB#0: 1068; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1 1069; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1070; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0 1071; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1072; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 1073; XOPAVX1-NEXT: retq 1074; 1075; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8: 1076; XOPAVX2: # BB#0: 1077; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1 1078; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1079; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0 1080; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 1081; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1082; XOPAVX2-NEXT: retq 1083 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 1084 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 1085 %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55> 1086 %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33> 1087 %or = or <32 x i8> %lmask, %rmask 1088 ret <32 x i8> %or 1089} 1090
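;
; Illustrative sketch (hand-written, not autogenerated; no assertions are
; checked for it). Every test above builds a rotate from the same idiom:
; a rotate-left of %a by %amt is (shl %a, %amt) | (lshr %a, (bitwidth - %amt)).
; The function below assumes that same idiom for a rotate amount splatted from
; a scalar argument; its name and the splat construction are illustrative
; only. As with the variable-rotate tests above, an amount of 0 makes the
; complementary shift equal to the bit width, which the shift instructions
; leave undefined.

define <4 x i64> @illustrative_splatvar_rotate_v4i64(<4 x i64> %a, i64 %n) nounwind {
  ; Splat the scalar rotate amount into all four lanes.
  %ins = insertelement <4 x i64> undef, i64 %n, i32 0
  %amt = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
  ; bitwidth - amount, matching the %b64/%b32/%b16/%b8 values computed above.
  %inv = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %amt
  %shl = shl <4 x i64> %a, %amt
  %lshr = lshr <4 x i64> %a, %inv
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}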