; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.

define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $1, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.
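; Note: a left shift by a constant amount k is equivalent to a multiply by
; 2^k, so the per-lane shift amounts <1,1,2,3,7,0,9,11> used below map to the
; multiply constant <2,2,4,8,128,1,512,2048> that appears in the SSE checks.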

define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.

define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-LABEL: test6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE2 instead, we split the shift into four
; parts and then convert each part into a pmullw.
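; Note: AVX2 only provides variable per-element shifts for 32-bit and 64-bit
; elements (vpsllvd/vpsllvq); a per-element 16-bit variable shift (vpsllvw)
; requires AVX512BW, which none of the RUN lines above enable, so the 16-bit
; cases are lowered to vpmullw even on the AVX2 and AVX512 targets.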

define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm4, %xmm1
; SSE-NEXT:    pmullw %xmm4, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support
; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.

define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-LABEL: test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm4, %xmm1
; SSE41-NEXT:    pmulld %xmm4, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; Without AVX2/AVX512F support, the shifts in 'test9' are performed as separate
; immediate shifts whose results are then blended back together.

define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-LABEL: test9:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm3
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    paddq %xmm0, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm2
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test9:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    paddq %xmm0, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm2
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}