; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
;
; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41

define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: bsfq %rax, %rax
; SSE2-NEXT: movl $64, %ecx
; SSE2-NEXT: cmoveq %rcx, %rax
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: bsfq %rax, %rax
; SSE2-NEXT: cmoveq %rcx, %rax
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3: # BB#0:
; SSE3-NEXT: movd %xmm0, %rax
; SSE3-NEXT: bsfq %rax, %rax
; SSE3-NEXT: movl $64, %ecx
; SSE3-NEXT: cmoveq %rcx, %rax
; SSE3-NEXT: movd %rax, %xmm1
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %rax
; SSE3-NEXT: bsfq %rax, %rax
; SSE3-NEXT: cmoveq %rcx, %rax
; SSE3-NEXT: movd %rax, %xmm0
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: bsfq %rax, %rax
; SSSE3-NEXT: movl $64, %ecx
; SSSE3-NEXT: cmoveq %rcx, %rax
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: bsfq %rax, %rax
; SSSE3-NEXT: cmoveq %rcx, %rax
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
; SSE41: # BB#0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: bsfq %rax, %rax
; SSE41-NEXT: movl $64, %ecx
; SSE41-NEXT: cmoveq %rcx, %rax
; SSE41-NEXT: movd %rax, %xmm1
; SSE41-NEXT: movd %xmm0, %rax
; SSE41-NEXT: bsfq %rax, %rax
; SSE41-NEXT: cmoveq %rcx, %rax
; SSE41-NEXT: movd %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: bsfq %rax, %rax
; AVX-NEXT: movl $64, %ecx
; AVX-NEXT: cmoveq %rcx, %rax
; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: bsfq %rax, %rax
; AVX-NEXT: cmoveq %rcx, %rax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
; X32-SSE-LABEL: testv2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pushl %esi
; X32-SSE-NEXT: pextrd $3, %xmm0, %eax
; X32-SSE-NEXT: bsfl %eax, %eax
; X32-SSE-NEXT: movl $32, %ecx
; X32-SSE-NEXT: cmovel %ecx, %eax
; X32-SSE-NEXT: addl $32, %eax
; X32-SSE-NEXT: pextrd $2, %xmm0, %edx
; X32-SSE-NEXT: bsfl %edx, %esi
; X32-SSE-NEXT: testl %edx, %edx
; X32-SSE-NEXT: cmovel %eax, %esi
; X32-SSE-NEXT: movd %esi, %xmm1
; X32-SSE-NEXT: pextrd $1, %xmm0, %eax
; X32-SSE-NEXT: bsfl %eax, %eax
; X32-SSE-NEXT: cmovel %ecx, %eax
; X32-SSE-NEXT: addl $32, %eax
; X32-SSE-NEXT: movd %xmm0, %ecx
; X32-SSE-NEXT: bsfl %ecx, %edx
; X32-SSE-NEXT: testl %ecx, %ecx
; X32-SSE-NEXT: cmovel %eax, %edx
; X32-SSE-NEXT: movd %edx, %xmm0
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT: popl %esi
; X32-SSE-NEXT: retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
  ret <2 x i64> %out
}

define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64u:
; SSE2: # BB#0:
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: bsfq %rax, %rax
; SSE2-NEXT: movd %rax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: bsfq %rax, %rax
; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64u:
; SSE3: # BB#0:
; SSE3-NEXT: movd %xmm0, %rax
; SSE3-NEXT: bsfq %rax, %rax
; SSE3-NEXT: movd %rax, %xmm1
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %rax
; SSE3-NEXT: bsfq %rax, %rax
; SSE3-NEXT: movd %rax, %xmm0
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64u:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: bsfq %rax, %rax
; SSSE3-NEXT: movd %rax, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSSE3-NEXT: movd %xmm0, %rax
; SSSE3-NEXT: bsfq %rax, %rax
; SSSE3-NEXT: movd %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64u:
; SSE41: # BB#0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: bsfq %rax, %rax
; SSE41-NEXT: movd %rax, %xmm1
; SSE41-NEXT: movd %xmm0, %rax
; SSE41-NEXT: bsfq %rax, %rax
; SSE41-NEXT: movd %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv2i64u:
; AVX1: # BB#0:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: bsfq %rax, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: bsfq %rax, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv2i64u:
; AVX2: # BB#0:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: bsfq %rax, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: bsfq %rax, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv2i64u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [63,63]
; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv2i64u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64u:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pextrd $2, %xmm0, %eax
; X32-SSE-NEXT: bsfl %eax, %ecx
; X32-SSE-NEXT: pextrd $3, %xmm0, %edx
; X32-SSE-NEXT: bsfl %edx, %edx
; X32-SSE-NEXT: addl $32, %edx
; X32-SSE-NEXT: testl %eax, %eax
; X32-SSE-NEXT: cmovnel %ecx, %edx
; X32-SSE-NEXT: movd %edx, %xmm1
; X32-SSE-NEXT: movd %xmm0, %eax
; X32-SSE-NEXT: bsfl %eax, %ecx
; X32-SSE-NEXT: pextrd $1, %xmm0, %edx
; X32-SSE-NEXT: bsfl %edx, %edx
; X32-SSE-NEXT: addl $32, %edx
; X32-SSE-NEXT: testl %eax, %eax
; X32-SSE-NEXT: cmovnel %ecx, %edx
; X32-SSE-NEXT: movd %edx, %xmm0
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT: retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
  ret <2 x i64> %out
}

define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: psubd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psubd {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubd %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: psrld $2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrld $4, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: psadbw %xmm1, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
; SSE3: # BB#0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm2, %xmm2
; SSE3-NEXT: psubd %xmm0, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psubd {{.*}}(%rip), %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psrld $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubd %xmm0, %xmm2
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT: movdqa %xmm2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
; SSE3-NEXT: psrld $2, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: paddd %xmm3, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psrld $4, %xmm0
; SSE3-NEXT: paddd %xmm2, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT: psadbw %xmm1, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: packuswb %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: psubd %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: psubd {{.*}}(%rip), %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pand %xmm3, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: pshufb %xmm4, %xmm5
; SSSE3-NEXT: psrlw $4, %xmm2
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: paddb %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT: psadbw %xmm1, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: psadbw %xmm1, %xmm0
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: psubd %xmm0, %xmm2
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: psubd {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pshufb %xmm4, %xmm5
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: paddb %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE41-NEXT: psadbw %xmm1, %xmm2
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: psadbw %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i32:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX512CDVL-NEXT: vpandd %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpsubd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %xmm2, %xmm0, %xmm3
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpandq %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX512CDVL-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i32:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
; AVX512CD-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm3
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX512CD-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512CD-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX512CD-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: psubd %xmm0, %xmm2
; X32-SSE-NEXT: pand %xmm0, %xmm2
; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: pand %xmm3, %xmm4
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: pshufb %xmm4, %xmm5
; X32-SSE-NEXT: psrlw $4, %xmm2
; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: pshufb %xmm2, %xmm0
; X32-SSE-NEXT: paddb %xmm5, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-SSE-NEXT: psadbw %xmm1, %xmm2
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: psadbw %xmm1, %xmm0
; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
  ret <4 x i32> %out
}

define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32u:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: psubd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psubd {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubd %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: psrld $2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrld $4, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: psadbw %xmm1, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32u:
; SSE3: # BB#0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm2, %xmm2
; SSE3-NEXT: psubd %xmm0, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psubd {{.*}}(%rip), %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psrld $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubd %xmm0, %xmm2
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT: movdqa %xmm2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
; SSE3-NEXT: psrld $2, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: paddd %xmm3, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psrld $4, %xmm0
; SSE3-NEXT: paddd %xmm2, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT: psadbw %xmm1, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: packuswb %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32u:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: psubd %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: psubd {{.*}}(%rip), %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pand %xmm3, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: pshufb %xmm4, %xmm5
; SSSE3-NEXT: psrlw $4, %xmm2
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: paddb %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT: psadbw %xmm1, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: psadbw %xmm1, %xmm0
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32u:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: psubd %xmm0, %xmm2
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: psubd {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pshufb %xmm4, %xmm5
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: paddb %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE41-NEXT: psadbw %xmm1, %xmm2
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: psadbw %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv4i32u:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i32u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i32u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpandd %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0
; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i32u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32u:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: psubd %xmm0, %xmm2
; X32-SSE-NEXT: pand %xmm0, %xmm2
; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: pand %xmm3, %xmm4
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: pshufb %xmm4, %xmm5
; X32-SSE-NEXT: psrlw $4, %xmm2
; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: pshufb %xmm2, %xmm0
; X32-SSE-NEXT: paddb %xmm5, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-SSE-NEXT: psadbw %xmm1, %xmm2
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: psadbw %xmm1, %xmm0
; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
  ret <4 x i32> %out
}

define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psubw {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddw %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: paddw %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16:
; SSE3: # BB#0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: psubw %xmm0, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: psubw {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubw %xmm0, %xmm1
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: paddw %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: psrlw $4, %xmm2
; SSE3-NEXT: paddw %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: psubw %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: psubw {{.*}}(%rip), %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psllw $8, %xmm0
; SSSE3-NEXT: paddb %xmm3, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: psubw {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i16:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i16:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: psubw %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: psubw {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pand %xmm0, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pshufb %xmm2, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: pshufb %xmm1, %xmm3
; X32-SSE-NEXT: paddb %xmm4, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm0
; X32-SSE-NEXT: psllw $8, %xmm0
; X32-SSE-NEXT: paddb %xmm3, %xmm0
; X32-SSE-NEXT: psrlw $8, %xmm0
; X32-SSE-NEXT: retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0)
  ret <8 x i16> %out
}

define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16u:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psubw {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddw %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
; SSE2-NEXT: paddw %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16u:
; SSE3: # BB#0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: psubw %xmm0, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: psubw {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubw %xmm0, %xmm1
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: paddw %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: psrlw $4, %xmm2
; SSE3-NEXT: paddw %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16u:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: psubw %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: psubw {{.*}}(%rip), %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psllw $8, %xmm0
; SSSE3-NEXT: paddb %xmm3, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16u:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: psubw {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv8i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i16u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i16u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i16u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv8i16u:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: psubw %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: psubw {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pand %xmm0, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pshufb %xmm2, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: pshufb %xmm1, %xmm3
; X32-SSE-NEXT: paddb %xmm4, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm0
; X32-SSE-NEXT: psllw $8, %xmm0
; X32-SSE-NEXT: paddb %xmm3, %xmm0
; X32-SSE-NEXT: psrlw $8, %xmm0
; X32-SSE-NEXT: retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1)
  ret <8 x i16> %out
}

define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psubb {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubb %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
; SSE3: # BB#0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: psubb %xmm0, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: psubb {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubb %xmm0, %xmm1
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $4, %xmm0
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: psubb %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: psubb {{.*}}(%rip), %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: paddb %xmm4, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubb %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: psubb {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pshufb %xmm3, %xmm4
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: paddb %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv16i8:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv16i8:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: psubb %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: psubb {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: pshufb %xmm3, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pshufb %xmm1, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm0
; X32-SSE-NEXT: retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0)
  ret <16 x i8> %out
}

define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8u:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psubb {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psubb %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8u:
; SSE3: # BB#0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: psubb %xmm0, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: psubb {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psubb %xmm0, %xmm1
; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $4, %xmm0
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8u:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: psubb %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: psubb {{.*}}(%rip), %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: paddb %xmm4, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8u:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubb %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: psubb {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pshufb %xmm3, %xmm4
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: paddb %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv16i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i8u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv16i8u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv16i8u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv16i8u:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: psubb %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: psubb {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: pshufb %xmm3, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pshufb %xmm1, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm0
; X32-SSE-NEXT: retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
  ret <16 x i8> %out
}

define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # BB#0:
; SSE-NEXT: movl $8, %eax
; SSE-NEXT: movd %rax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
; AVX: # BB#0:
; AVX-NEXT: movl $8, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
;
; X32-SSE-LABEL: foldv2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movl $8, %eax
; X32-SSE-NEXT: movd %eax, %xmm0
; X32-SSE-NEXT: retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
  ret <2 x i64> %out
}

define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
; SSE: # BB#0:
; SSE-NEXT: movl $8, %eax
; SSE-NEXT: movd %rax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64u:
; AVX: # BB#0:
; AVX-NEXT: movl $8, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
;
; X32-SSE-LABEL: foldv2i64u:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movl $8, %eax
; X32-SSE-NEXT: movd %eax, %xmm0
; X32-SSE-NEXT: retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
  ret <2 x i64> %out
}

define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: foldv4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv4i32:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv4i32:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: foldv4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; X32-SSE-NEXT: retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
  ret <4 x i32> %out
}

define <4 x i32> @foldv4i32u() nounwind {
; SSE-LABEL: foldv4i32u:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: foldv4i32u:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv4i32u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv4i32u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv4i32u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: foldv4i32u:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; X32-SSE-NEXT: retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
  ret <4 x i32> %out
}

define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: foldv8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv8i16:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv8i16:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: foldv8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; X32-SSE-NEXT: retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
  ret <8 x i16> %out
}

define <8 x i16> @foldv8i16u() nounwind {
; SSE-LABEL: foldv8i16u:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: foldv8i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv8i16u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv8i16u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv8i16u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: foldv8i16u:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; X32-SSE-NEXT: retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
  ret <8 x i16> %out
}

define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT: retq
;
; AVX1-LABEL: foldv16i8:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv16i8:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv16i8:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: foldv16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; X32-SSE-NEXT: retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
  ret <16 x i8> %out
}

define <16 x i8> @foldv16i8u() nounwind {
; SSE-LABEL: foldv16i8u:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT: retq
;
; AVX1-LABEL: foldv16i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv16i8u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv16i8u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv16i8u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: foldv16i8u:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; X32-SSE-NEXT: retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
  ret <16 x i8> %out
}

declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)