; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2

; Verify that the following shifts are lowered into a sequence of two shifts plus
; a blend. On pre-avx2 targets, instead of scalarizing logical and arithmetic
; packed shift right by a constant build_vector the backend should always try to
; emit a simpler sequence of two shifts + blend when possible.

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <8 x i16> @test9(<8 x i16> %a) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
; AVX-NEXT:    retq
  %lshr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <8 x i16> %lshr
}

define <8 x i32> @test10(<8 x i32>* %a) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    psrad $1, %xmm1
; SSE-NEXT:    psrad $1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test10:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i32>, <8 x i32>* %a, align 32
  %ashr = ashr <8 x i32> %ld, <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %ashr
}

; test11 vs test12 - show difference between v16i16 that is repeated/non-repeated at v8i16 level (for PBLENDW masks).

define <16 x i16> @test11(<16 x i16> %a) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 1, i16 1, i16 1, i16 3, i16 1>
  ret <16 x i16> %lshr
}

define <16 x i16> @test12(<16 x i16> %a) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $3, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT:    retq
  %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <16 x i16> %lshr
}