; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
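; There is no SSE/AVX vector popcount instruction, so these tests cover the
; two lowering strategies visible in the checks below: without SSSE3, a
; bitwise parallel-sum sequence (shift/mask/add), typically finished with
; psadbw against zero to sum the per-byte counts; with SSSE3 or AVX, a pshufb
; lookup of per-nibble popcounts from an in-register table.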
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlq $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $4, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3: # BB#0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubq %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlq $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddq %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $4, %xmm1
; SSE3-NEXT: paddq %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: psadbw %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: psadbw %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
  ret <2 x i64> %out
}

define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrld $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
; SSE3: # BB#0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubd %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrld $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddd %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $4, %xmm1
; SSE3-NEXT: paddd %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT: psadbw %xmm0, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: psadbw %xmm0, %xmm1
; SSE3-NEXT: packuswb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: paddb %xmm4, %xmm1
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: psadbw %xmm0, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: psadbw %xmm0, %xmm1
; SSSE3-NEXT: packuswb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pshufb %xmm3, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm4, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE41-NEXT: psadbw %xmm0, %xmm2
; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE41-NEXT: psadbw %xmm0, %xmm1
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv4i32:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
  ret <4 x i32> %out
}
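; For i16 elements, the per-byte counts are combined into per-word counts by
; adding the low byte of each word into its high byte (psllw $8 + paddb) and
; shifting the sum back down (psrlw $8), rather than going through psadbw.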
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16:
; SSE3: # BB#0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubw %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddw %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddw %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psllw $8, %xmm0
; SSSE3-NEXT: paddb %xmm3, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
  ret <8 x i16> %out
}
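; For i8 elements, the per-byte counts are already the final result, so no
; horizontal reduction step follows the byte popcount.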
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
; SSE3: # BB#0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubb %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: paddb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pshufb %xmm3, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
  ret <16 x i8> %out
}
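; The foldv* tests check that ctpop of a constant vector is constant-folded
; at compile time, leaving only a constant-pool load of the results.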
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,64]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
; AVX-NEXT: retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
  ret <2 x i64> %out
}

define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,32,0,8]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv4i32:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; AVX-NEXT: retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
  ret <4 x i32> %out
}

define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv8i16:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; AVX-NEXT: retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
  ret <8 x i16> %out
}

define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; AVX-NEXT: retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
  ret <16 x i8> %out
}

declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)