1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 8; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD 9; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD 10; 11; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt. 12; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41 13 14define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { 15; SSE2-LABEL: testv2i64: 16; SSE2: # BB#0: 17; SSE2-NEXT: movd %xmm0, %rax 18; SSE2-NEXT: bsrq %rax, %rax 19; SSE2-NEXT: movl $127, %ecx 20; SSE2-NEXT: cmoveq %rcx, %rax 21; SSE2-NEXT: xorq $63, %rax 22; SSE2-NEXT: movd %rax, %xmm1 23; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 24; SSE2-NEXT: movd %xmm0, %rax 25; SSE2-NEXT: bsrq %rax, %rax 26; SSE2-NEXT: cmoveq %rcx, %rax 27; SSE2-NEXT: xorq $63, %rax 28; SSE2-NEXT: movd %rax, %xmm0 29; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 30; SSE2-NEXT: movdqa %xmm1, %xmm0 31; SSE2-NEXT: retq 32; 33; SSE3-LABEL: testv2i64: 34; SSE3: # BB#0: 35; SSE3-NEXT: movd %xmm0, %rax 36; SSE3-NEXT: bsrq %rax, %rax 37; SSE3-NEXT: movl $127, %ecx 38; SSE3-NEXT: cmoveq %rcx, %rax 39; SSE3-NEXT: xorq $63, %rax 40; SSE3-NEXT: movd %rax, %xmm1 41; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 42; SSE3-NEXT: movd %xmm0, %rax 43; SSE3-NEXT: bsrq %rax, %rax 44; SSE3-NEXT: cmoveq %rcx, %rax 45; SSE3-NEXT: xorq $63, %rax 46; SSE3-NEXT: movd %rax, %xmm0 47; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 48; SSE3-NEXT: movdqa %xmm1, %xmm0 49; SSE3-NEXT: retq 50; 51; SSSE3-LABEL: testv2i64: 52; SSSE3: # BB#0: 53; SSSE3-NEXT: movd %xmm0, %rax 54; SSSE3-NEXT: bsrq %rax, %rax 55; SSSE3-NEXT: movl $127, %ecx 56; SSSE3-NEXT: cmoveq %rcx, %rax 57; SSSE3-NEXT: xorq $63, %rax 58; SSSE3-NEXT: movd %rax, %xmm1 59; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 60; SSSE3-NEXT: movd %xmm0, %rax 61; SSSE3-NEXT: bsrq %rax, %rax 62; SSSE3-NEXT: cmoveq %rcx, %rax 63; SSSE3-NEXT: xorq $63, %rax 64; SSSE3-NEXT: movd %rax, %xmm0 65; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 66; SSSE3-NEXT: movdqa %xmm1, %xmm0 67; SSSE3-NEXT: retq 68; 69; SSE41-LABEL: testv2i64: 70; SSE41: # BB#0: 71; SSE41-NEXT: pextrq $1, %xmm0, %rax 72; SSE41-NEXT: bsrq %rax, %rax 73; SSE41-NEXT: movl $127, %ecx 74; SSE41-NEXT: cmoveq %rcx, %rax 75; SSE41-NEXT: xorq $63, %rax 76; SSE41-NEXT: movd %rax, %xmm1 77; SSE41-NEXT: movd %xmm0, %rax 78; SSE41-NEXT: bsrq %rax, %rax 79; SSE41-NEXT: cmoveq %rcx, %rax 80; SSE41-NEXT: xorq $63, %rax 81; SSE41-NEXT: movd %rax, %xmm0 82; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 83; SSE41-NEXT: retq 84; 85; AVX-LABEL: testv2i64: 86; AVX: # BB#0: 87; AVX-NEXT: vpextrq $1, %xmm0, %rax 88; AVX-NEXT: bsrq %rax, %rax 89; AVX-NEXT: movl $127, %ecx 90; AVX-NEXT: cmoveq %rcx, %rax 91; AVX-NEXT: xorq $63, %rax 92; AVX-NEXT: vmovq %rax, %xmm1 93; AVX-NEXT: vmovq %xmm0, %rax 94; AVX-NEXT: bsrq %rax, %rax 95; AVX-NEXT: cmoveq %rcx, %rax 96; AVX-NEXT: xorq $63, %rax 97; AVX-NEXT: vmovq %rax, %xmm0 98; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 99; AVX-NEXT: retq 100; 101; AVX512VLCD-LABEL: testv2i64: 102; AVX512VLCD: ## BB#0: 103; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0 104; AVX512VLCD-NEXT: retq 105; 106; AVX512CD-LABEL: testv2i64: 107; AVX512CD: ## BB#0: 108; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> 109; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 110; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 111; AVX512CD-NEXT: retq 112; 113; X32-SSE-LABEL: testv2i64: 114; X32-SSE: # BB#0: 115; X32-SSE-NEXT: pushl %esi 116; X32-SSE-NEXT: pextrd $2, %xmm0, %eax 117; X32-SSE-NEXT: bsrl %eax, %eax 118; X32-SSE-NEXT: movl $63, %ecx 119; X32-SSE-NEXT: cmovel %ecx, %eax 120; X32-SSE-NEXT: xorl $31, %eax 121; X32-SSE-NEXT: addl $32, %eax 122; X32-SSE-NEXT: pextrd $3, %xmm0, %edx 123; X32-SSE-NEXT: bsrl %edx, %esi 124; X32-SSE-NEXT: xorl $31, %esi 125; X32-SSE-NEXT: testl %edx, %edx 126; X32-SSE-NEXT: cmovel %eax, %esi 127; X32-SSE-NEXT: movd %esi, %xmm1 128; X32-SSE-NEXT: movd %xmm0, %eax 129; X32-SSE-NEXT: bsrl %eax, %eax 130; X32-SSE-NEXT: cmovel %ecx, %eax 131; X32-SSE-NEXT: xorl $31, %eax 132; X32-SSE-NEXT: addl $32, %eax 133; X32-SSE-NEXT: pextrd $1, %xmm0, %ecx 134; X32-SSE-NEXT: bsrl %ecx, %edx 135; X32-SSE-NEXT: xorl $31, %edx 136; X32-SSE-NEXT: testl %ecx, %ecx 137; X32-SSE-NEXT: cmovel %eax, %edx 138; X32-SSE-NEXT: movd %edx, %xmm0 139; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 140; X32-SSE-NEXT: popl %esi 141; X32-SSE-NEXT: retl 142 143 %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) 144 ret <2 x i64> %out 145} 146 147define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { 148; SSE2-LABEL: testv2i64u: 149; SSE2: # BB#0: 150; SSE2-NEXT: movd %xmm0, %rax 151; SSE2-NEXT: bsrq %rax, %rax 152; SSE2-NEXT: xorq $63, %rax 153; SSE2-NEXT: movd %rax, %xmm1 154; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 155; SSE2-NEXT: movd %xmm0, %rax 156; SSE2-NEXT: bsrq %rax, %rax 157; SSE2-NEXT: xorq $63, %rax 158; SSE2-NEXT: movd %rax, %xmm0 159; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 160; SSE2-NEXT: movdqa %xmm1, %xmm0 161; SSE2-NEXT: retq 162; 163; SSE3-LABEL: testv2i64u: 164; SSE3: # BB#0: 165; SSE3-NEXT: movd %xmm0, %rax 166; SSE3-NEXT: bsrq %rax, %rax 167; SSE3-NEXT: xorq $63, %rax 168; SSE3-NEXT: movd %rax, %xmm1 169; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 170; SSE3-NEXT: movd %xmm0, %rax 171; SSE3-NEXT: bsrq %rax, %rax 172; SSE3-NEXT: xorq $63, %rax 173; SSE3-NEXT: movd %rax, %xmm0 174; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 175; SSE3-NEXT: movdqa %xmm1, %xmm0 176; SSE3-NEXT: retq 177; 178; SSSE3-LABEL: testv2i64u: 179; SSSE3: # BB#0: 180; SSSE3-NEXT: movd %xmm0, %rax 181; SSSE3-NEXT: bsrq %rax, %rax 182; SSSE3-NEXT: xorq $63, %rax 183; SSSE3-NEXT: movd %rax, %xmm1 184; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 185; SSSE3-NEXT: movd %xmm0, %rax 186; SSSE3-NEXT: bsrq %rax, %rax 187; SSSE3-NEXT: xorq $63, %rax 188; SSSE3-NEXT: movd %rax, %xmm0 189; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 190; SSSE3-NEXT: movdqa %xmm1, %xmm0 191; SSSE3-NEXT: retq 192; 193; SSE41-LABEL: testv2i64u: 194; SSE41: # BB#0: 195; SSE41-NEXT: pextrq $1, %xmm0, %rax 196; SSE41-NEXT: bsrq %rax, %rax 197; SSE41-NEXT: xorq $63, %rax 198; SSE41-NEXT: movd %rax, %xmm1 199; SSE41-NEXT: movd %xmm0, %rax 200; SSE41-NEXT: bsrq %rax, %rax 201; SSE41-NEXT: xorq $63, %rax 202; SSE41-NEXT: movd %rax, %xmm0 203; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 204; SSE41-NEXT: retq 205; 206; AVX-LABEL: testv2i64u: 207; AVX: # BB#0: 208; AVX-NEXT: vpextrq $1, %xmm0, %rax 209; AVX-NEXT: bsrq %rax, %rax 210; AVX-NEXT: xorq $63, %rax 211; AVX-NEXT: vmovq %rax, %xmm1 212; AVX-NEXT: vmovq %xmm0, %rax 213; AVX-NEXT: bsrq %rax, %rax 214; AVX-NEXT: xorq $63, %rax 215; AVX-NEXT: vmovq %rax, %xmm0 216; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 217; AVX-NEXT: retq 218; 219; AVX512VLCD-LABEL: testv2i64u: 220; AVX512VLCD: ## BB#0: 221; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0 222; AVX512VLCD-NEXT: retq 223; 224; AVX512CD-LABEL: testv2i64u: 225; AVX512CD: ## BB#0: 226; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> 227; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 228; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 229; AVX512CD-NEXT: retq 230; 231; X32-SSE-LABEL: testv2i64u: 232; X32-SSE: # BB#0: 233; X32-SSE-NEXT: pextrd $3, %xmm0, %eax 234; X32-SSE-NEXT: bsrl %eax, %ecx 235; X32-SSE-NEXT: xorl $31, %ecx 236; X32-SSE-NEXT: pextrd $2, %xmm0, %edx 237; X32-SSE-NEXT: bsrl %edx, %edx 238; X32-SSE-NEXT: xorl $31, %edx 239; X32-SSE-NEXT: addl $32, %edx 240; X32-SSE-NEXT: testl %eax, %eax 241; X32-SSE-NEXT: cmovnel %ecx, %edx 242; X32-SSE-NEXT: movd %edx, %xmm1 243; X32-SSE-NEXT: pextrd $1, %xmm0, %eax 244; X32-SSE-NEXT: bsrl %eax, %ecx 245; X32-SSE-NEXT: xorl $31, %ecx 246; X32-SSE-NEXT: movd %xmm0, %edx 247; X32-SSE-NEXT: bsrl %edx, %edx 248; X32-SSE-NEXT: xorl $31, %edx 249; X32-SSE-NEXT: addl $32, %edx 250; X32-SSE-NEXT: testl %eax, %eax 251; X32-SSE-NEXT: cmovnel %ecx, %edx 252; X32-SSE-NEXT: movd %edx, %xmm0 253; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 254; X32-SSE-NEXT: retl 255 256 %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1) 257 ret <2 x i64> %out 258} 259 260define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { 261; SSE2-LABEL: testv4i32: 262; SSE2: # BB#0: 263; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 264; SSE2-NEXT: movd %xmm1, %eax 265; SSE2-NEXT: bsrl %eax, %eax 266; SSE2-NEXT: movl $63, %ecx 267; SSE2-NEXT: cmovel %ecx, %eax 268; SSE2-NEXT: xorl $31, %eax 269; SSE2-NEXT: movd %eax, %xmm1 270; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] 271; SSE2-NEXT: movd %xmm2, %eax 272; SSE2-NEXT: bsrl %eax, %eax 273; SSE2-NEXT: cmovel %ecx, %eax 274; SSE2-NEXT: xorl $31, %eax 275; SSE2-NEXT: movd %eax, %xmm2 276; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 277; SSE2-NEXT: movd %xmm0, %eax 278; SSE2-NEXT: bsrl %eax, %eax 279; SSE2-NEXT: cmovel %ecx, %eax 280; SSE2-NEXT: xorl $31, %eax 281; SSE2-NEXT: movd %eax, %xmm1 282; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 283; SSE2-NEXT: movd %xmm0, %eax 284; SSE2-NEXT: bsrl %eax, %eax 285; SSE2-NEXT: cmovel %ecx, %eax 286; SSE2-NEXT: xorl $31, %eax 287; SSE2-NEXT: movd %eax, %xmm0 288; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 289; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 290; SSE2-NEXT: movdqa %xmm1, %xmm0 291; SSE2-NEXT: retq 292; 293; SSE3-LABEL: testv4i32: 294; SSE3: # BB#0: 295; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 296; SSE3-NEXT: movd %xmm1, %eax 297; SSE3-NEXT: bsrl %eax, %eax 298; SSE3-NEXT: movl $63, %ecx 299; SSE3-NEXT: cmovel %ecx, %eax 300; SSE3-NEXT: xorl $31, %eax 301; SSE3-NEXT: movd %eax, %xmm1 302; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] 303; SSE3-NEXT: movd %xmm2, %eax 304; SSE3-NEXT: bsrl %eax, %eax 305; SSE3-NEXT: cmovel %ecx, %eax 306; SSE3-NEXT: xorl $31, %eax 307; SSE3-NEXT: movd %eax, %xmm2 308; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 309; SSE3-NEXT: movd %xmm0, %eax 310; SSE3-NEXT: bsrl %eax, %eax 311; SSE3-NEXT: cmovel %ecx, %eax 312; SSE3-NEXT: xorl $31, %eax 313; SSE3-NEXT: movd %eax, %xmm1 314; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 315; SSE3-NEXT: movd %xmm0, %eax 316; SSE3-NEXT: bsrl %eax, %eax 317; SSE3-NEXT: cmovel %ecx, %eax 318; SSE3-NEXT: xorl $31, %eax 319; SSE3-NEXT: movd %eax, %xmm0 320; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 321; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 322; SSE3-NEXT: movdqa %xmm1, %xmm0 323; SSE3-NEXT: retq 324; 325; SSSE3-LABEL: testv4i32: 326; SSSE3: # BB#0: 327; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 328; SSSE3-NEXT: movd %xmm1, %eax 329; SSSE3-NEXT: bsrl %eax, %eax 330; SSSE3-NEXT: movl $63, %ecx 331; SSSE3-NEXT: cmovel %ecx, %eax 332; SSSE3-NEXT: xorl $31, %eax 333; SSSE3-NEXT: movd %eax, %xmm1 334; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] 335; SSSE3-NEXT: movd %xmm2, %eax 336; SSSE3-NEXT: bsrl %eax, %eax 337; SSSE3-NEXT: cmovel %ecx, %eax 338; SSSE3-NEXT: xorl $31, %eax 339; SSSE3-NEXT: movd %eax, %xmm2 340; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 341; SSSE3-NEXT: movd %xmm0, %eax 342; SSSE3-NEXT: bsrl %eax, %eax 343; SSSE3-NEXT: cmovel %ecx, %eax 344; SSSE3-NEXT: xorl $31, %eax 345; SSSE3-NEXT: movd %eax, %xmm1 346; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 347; SSSE3-NEXT: movd %xmm0, %eax 348; SSSE3-NEXT: bsrl %eax, %eax 349; SSSE3-NEXT: cmovel %ecx, %eax 350; SSSE3-NEXT: xorl $31, %eax 351; SSSE3-NEXT: movd %eax, %xmm0 352; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 353; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 354; SSSE3-NEXT: movdqa %xmm1, %xmm0 355; SSSE3-NEXT: retq 356; 357; SSE41-LABEL: testv4i32: 358; SSE41: # BB#0: 359; SSE41-NEXT: pextrd $1, %xmm0, %eax 360; SSE41-NEXT: bsrl %eax, %eax 361; SSE41-NEXT: movl $63, %ecx 362; SSE41-NEXT: cmovel %ecx, %eax 363; SSE41-NEXT: xorl $31, %eax 364; SSE41-NEXT: movd %xmm0, %edx 365; SSE41-NEXT: bsrl %edx, %edx 366; SSE41-NEXT: cmovel %ecx, %edx 367; SSE41-NEXT: xorl $31, %edx 368; SSE41-NEXT: movd %edx, %xmm1 369; SSE41-NEXT: pinsrd $1, %eax, %xmm1 370; SSE41-NEXT: pextrd $2, %xmm0, %eax 371; SSE41-NEXT: bsrl %eax, %eax 372; SSE41-NEXT: cmovel %ecx, %eax 373; SSE41-NEXT: xorl $31, %eax 374; SSE41-NEXT: pinsrd $2, %eax, %xmm1 375; SSE41-NEXT: pextrd $3, %xmm0, %eax 376; SSE41-NEXT: bsrl %eax, %eax 377; SSE41-NEXT: cmovel %ecx, %eax 378; SSE41-NEXT: xorl $31, %eax 379; SSE41-NEXT: pinsrd $3, %eax, %xmm1 380; SSE41-NEXT: movdqa %xmm1, %xmm0 381; SSE41-NEXT: retq 382; 383; AVX-LABEL: testv4i32: 384; AVX: # BB#0: 385; AVX-NEXT: vpextrd $1, %xmm0, %eax 386; AVX-NEXT: bsrl %eax, %eax 387; AVX-NEXT: movl $63, %ecx 388; AVX-NEXT: cmovel %ecx, %eax 389; AVX-NEXT: xorl $31, %eax 390; AVX-NEXT: vmovd %xmm0, %edx 391; AVX-NEXT: bsrl %edx, %edx 392; AVX-NEXT: cmovel %ecx, %edx 393; AVX-NEXT: xorl $31, %edx 394; AVX-NEXT: vmovd %edx, %xmm1 395; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 396; AVX-NEXT: vpextrd $2, %xmm0, %eax 397; AVX-NEXT: bsrl %eax, %eax 398; AVX-NEXT: cmovel %ecx, %eax 399; AVX-NEXT: xorl $31, %eax 400; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 401; AVX-NEXT: vpextrd $3, %xmm0, %eax 402; AVX-NEXT: bsrl %eax, %eax 403; AVX-NEXT: cmovel %ecx, %eax 404; AVX-NEXT: xorl $31, %eax 405; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 406; AVX-NEXT: retq 407; 408; AVX512VLCD-LABEL: testv4i32: 409; AVX512VLCD: ## BB#0: 410; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0 411; AVX512VLCD-NEXT: retq 412; 413; AVX512CD-LABEL: testv4i32: 414; AVX512CD: ## BB#0: 415; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> 416; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 417; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 418; AVX512CD-NEXT: retq 419; 420; X32-SSE-LABEL: testv4i32: 421; X32-SSE: # BB#0: 422; X32-SSE-NEXT: pextrd $1, %xmm0, %eax 423; X32-SSE-NEXT: bsrl %eax, %eax 424; X32-SSE-NEXT: movl $63, %ecx 425; X32-SSE-NEXT: cmovel %ecx, %eax 426; X32-SSE-NEXT: xorl $31, %eax 427; X32-SSE-NEXT: movd %xmm0, %edx 428; X32-SSE-NEXT: bsrl %edx, %edx 429; X32-SSE-NEXT: cmovel %ecx, %edx 430; X32-SSE-NEXT: xorl $31, %edx 431; X32-SSE-NEXT: movd %edx, %xmm1 432; X32-SSE-NEXT: pinsrd $1, %eax, %xmm1 433; X32-SSE-NEXT: pextrd $2, %xmm0, %eax 434; X32-SSE-NEXT: bsrl %eax, %eax 435; X32-SSE-NEXT: cmovel %ecx, %eax 436; X32-SSE-NEXT: xorl $31, %eax 437; X32-SSE-NEXT: pinsrd $2, %eax, %xmm1 438; X32-SSE-NEXT: pextrd $3, %xmm0, %eax 439; X32-SSE-NEXT: bsrl %eax, %eax 440; X32-SSE-NEXT: cmovel %ecx, %eax 441; X32-SSE-NEXT: xorl $31, %eax 442; X32-SSE-NEXT: pinsrd $3, %eax, %xmm1 443; X32-SSE-NEXT: movdqa %xmm1, %xmm0 444; X32-SSE-NEXT: retl 445 446 %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0) 447 ret <4 x i32> %out 448} 449 450define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { 451; SSE2-LABEL: testv4i32u: 452; SSE2: # BB#0: 453; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 454; SSE2-NEXT: movd %xmm1, %eax 455; SSE2-NEXT: bsrl %eax, %eax 456; SSE2-NEXT: xorl $31, %eax 457; SSE2-NEXT: movd %eax, %xmm1 458; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] 459; SSE2-NEXT: movd %xmm2, %eax 460; SSE2-NEXT: bsrl %eax, %eax 461; SSE2-NEXT: xorl $31, %eax 462; SSE2-NEXT: movd %eax, %xmm2 463; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 464; SSE2-NEXT: movd %xmm0, %eax 465; SSE2-NEXT: bsrl %eax, %eax 466; SSE2-NEXT: xorl $31, %eax 467; SSE2-NEXT: movd %eax, %xmm1 468; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 469; SSE2-NEXT: movd %xmm0, %eax 470; SSE2-NEXT: bsrl %eax, %eax 471; SSE2-NEXT: xorl $31, %eax 472; SSE2-NEXT: movd %eax, %xmm0 473; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 474; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 475; SSE2-NEXT: movdqa %xmm1, %xmm0 476; SSE2-NEXT: retq 477; 478; SSE3-LABEL: testv4i32u: 479; SSE3: # BB#0: 480; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 481; SSE3-NEXT: movd %xmm1, %eax 482; SSE3-NEXT: bsrl %eax, %eax 483; SSE3-NEXT: xorl $31, %eax 484; SSE3-NEXT: movd %eax, %xmm1 485; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] 486; SSE3-NEXT: movd %xmm2, %eax 487; SSE3-NEXT: bsrl %eax, %eax 488; SSE3-NEXT: xorl $31, %eax 489; SSE3-NEXT: movd %eax, %xmm2 490; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 491; SSE3-NEXT: movd %xmm0, %eax 492; SSE3-NEXT: bsrl %eax, %eax 493; SSE3-NEXT: xorl $31, %eax 494; SSE3-NEXT: movd %eax, %xmm1 495; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 496; SSE3-NEXT: movd %xmm0, %eax 497; SSE3-NEXT: bsrl %eax, %eax 498; SSE3-NEXT: xorl $31, %eax 499; SSE3-NEXT: movd %eax, %xmm0 500; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 501; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 502; SSE3-NEXT: movdqa %xmm1, %xmm0 503; SSE3-NEXT: retq 504; 505; SSSE3-LABEL: testv4i32u: 506; SSSE3: # BB#0: 507; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 508; SSSE3-NEXT: movd %xmm1, %eax 509; SSSE3-NEXT: bsrl %eax, %eax 510; SSSE3-NEXT: xorl $31, %eax 511; SSSE3-NEXT: movd %eax, %xmm1 512; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] 513; SSSE3-NEXT: movd %xmm2, %eax 514; SSSE3-NEXT: bsrl %eax, %eax 515; SSSE3-NEXT: xorl $31, %eax 516; SSSE3-NEXT: movd %eax, %xmm2 517; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 518; SSSE3-NEXT: movd %xmm0, %eax 519; SSSE3-NEXT: bsrl %eax, %eax 520; SSSE3-NEXT: xorl $31, %eax 521; SSSE3-NEXT: movd %eax, %xmm1 522; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 523; SSSE3-NEXT: movd %xmm0, %eax 524; SSSE3-NEXT: bsrl %eax, %eax 525; SSSE3-NEXT: xorl $31, %eax 526; SSSE3-NEXT: movd %eax, %xmm0 527; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 528; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 529; SSSE3-NEXT: movdqa %xmm1, %xmm0 530; SSSE3-NEXT: retq 531; 532; SSE41-LABEL: testv4i32u: 533; SSE41: # BB#0: 534; SSE41-NEXT: pextrd $1, %xmm0, %eax 535; SSE41-NEXT: bsrl %eax, %eax 536; SSE41-NEXT: xorl $31, %eax 537; SSE41-NEXT: movd %xmm0, %ecx 538; SSE41-NEXT: bsrl %ecx, %ecx 539; SSE41-NEXT: xorl $31, %ecx 540; SSE41-NEXT: movd %ecx, %xmm1 541; SSE41-NEXT: pinsrd $1, %eax, %xmm1 542; SSE41-NEXT: pextrd $2, %xmm0, %eax 543; SSE41-NEXT: bsrl %eax, %eax 544; SSE41-NEXT: xorl $31, %eax 545; SSE41-NEXT: pinsrd $2, %eax, %xmm1 546; SSE41-NEXT: pextrd $3, %xmm0, %eax 547; SSE41-NEXT: bsrl %eax, %eax 548; SSE41-NEXT: xorl $31, %eax 549; SSE41-NEXT: pinsrd $3, %eax, %xmm1 550; SSE41-NEXT: movdqa %xmm1, %xmm0 551; SSE41-NEXT: retq 552; 553; AVX-LABEL: testv4i32u: 554; AVX: # BB#0: 555; AVX-NEXT: vpextrd $1, %xmm0, %eax 556; AVX-NEXT: bsrl %eax, %eax 557; AVX-NEXT: xorl $31, %eax 558; AVX-NEXT: vmovd %xmm0, %ecx 559; AVX-NEXT: bsrl %ecx, %ecx 560; AVX-NEXT: xorl $31, %ecx 561; AVX-NEXT: vmovd %ecx, %xmm1 562; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 563; AVX-NEXT: vpextrd $2, %xmm0, %eax 564; AVX-NEXT: bsrl %eax, %eax 565; AVX-NEXT: xorl $31, %eax 566; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 567; AVX-NEXT: vpextrd $3, %xmm0, %eax 568; AVX-NEXT: bsrl %eax, %eax 569; AVX-NEXT: xorl $31, %eax 570; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 571; AVX-NEXT: retq 572; 573; AVX512VLCD-LABEL: testv4i32u: 574; AVX512VLCD: ## BB#0: 575; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0 576; AVX512VLCD-NEXT: retq 577; 578; AVX512CD-LABEL: testv4i32u: 579; AVX512CD: ## BB#0: 580; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> 581; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 582; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 583; AVX512CD-NEXT: retq 584; 585; X32-SSE-LABEL: testv4i32u: 586; X32-SSE: # BB#0: 587; X32-SSE-NEXT: pextrd $1, %xmm0, %eax 588; X32-SSE-NEXT: bsrl %eax, %eax 589; X32-SSE-NEXT: xorl $31, %eax 590; X32-SSE-NEXT: movd %xmm0, %ecx 591; X32-SSE-NEXT: bsrl %ecx, %ecx 592; X32-SSE-NEXT: xorl $31, %ecx 593; X32-SSE-NEXT: movd %ecx, %xmm1 594; X32-SSE-NEXT: pinsrd $1, %eax, %xmm1 595; X32-SSE-NEXT: pextrd $2, %xmm0, %eax 596; X32-SSE-NEXT: bsrl %eax, %eax 597; X32-SSE-NEXT: xorl $31, %eax 598; X32-SSE-NEXT: pinsrd $2, %eax, %xmm1 599; X32-SSE-NEXT: pextrd $3, %xmm0, %eax 600; X32-SSE-NEXT: bsrl %eax, %eax 601; X32-SSE-NEXT: xorl $31, %eax 602; X32-SSE-NEXT: pinsrd $3, %eax, %xmm1 603; X32-SSE-NEXT: movdqa %xmm1, %xmm0 604; X32-SSE-NEXT: retl 605 606 %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1) 607 ret <4 x i32> %out 608} 609 610define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { 611; SSE2-LABEL: testv8i16: 612; SSE2: # BB#0: 613; SSE2-NEXT: pextrw $7, %xmm0, %eax 614; SSE2-NEXT: bsrw %ax, %cx 615; SSE2-NEXT: movw $31, %ax 616; SSE2-NEXT: cmovew %ax, %cx 617; SSE2-NEXT: xorl $15, %ecx 618; SSE2-NEXT: movd %ecx, %xmm1 619; SSE2-NEXT: pextrw $3, %xmm0, %ecx 620; SSE2-NEXT: bsrw %cx, %cx 621; SSE2-NEXT: cmovew %ax, %cx 622; SSE2-NEXT: xorl $15, %ecx 623; SSE2-NEXT: movd %ecx, %xmm2 624; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 625; SSE2-NEXT: pextrw $5, %xmm0, %ecx 626; SSE2-NEXT: bsrw %cx, %cx 627; SSE2-NEXT: cmovew %ax, %cx 628; SSE2-NEXT: xorl $15, %ecx 629; SSE2-NEXT: movd %ecx, %xmm3 630; SSE2-NEXT: pextrw $1, %xmm0, %ecx 631; SSE2-NEXT: bsrw %cx, %cx 632; SSE2-NEXT: cmovew %ax, %cx 633; SSE2-NEXT: xorl $15, %ecx 634; SSE2-NEXT: movd %ecx, %xmm1 635; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 636; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 637; SSE2-NEXT: pextrw $6, %xmm0, %ecx 638; SSE2-NEXT: bsrw %cx, %cx 639; SSE2-NEXT: cmovew %ax, %cx 640; SSE2-NEXT: xorl $15, %ecx 641; SSE2-NEXT: movd %ecx, %xmm2 642; SSE2-NEXT: pextrw $2, %xmm0, %ecx 643; SSE2-NEXT: bsrw %cx, %cx 644; SSE2-NEXT: cmovew %ax, %cx 645; SSE2-NEXT: xorl $15, %ecx 646; SSE2-NEXT: movd %ecx, %xmm3 647; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 648; SSE2-NEXT: pextrw $4, %xmm0, %ecx 649; SSE2-NEXT: bsrw %cx, %cx 650; SSE2-NEXT: cmovew %ax, %cx 651; SSE2-NEXT: xorl $15, %ecx 652; SSE2-NEXT: movd %ecx, %xmm2 653; SSE2-NEXT: movd %xmm0, %ecx 654; SSE2-NEXT: bsrw %cx, %cx 655; SSE2-NEXT: cmovew %ax, %cx 656; SSE2-NEXT: xorl $15, %ecx 657; SSE2-NEXT: movd %ecx, %xmm0 658; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 659; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 660; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 661; SSE2-NEXT: retq 662; 663; SSE3-LABEL: testv8i16: 664; SSE3: # BB#0: 665; SSE3-NEXT: pextrw $7, %xmm0, %eax 666; SSE3-NEXT: bsrw %ax, %cx 667; SSE3-NEXT: movw $31, %ax 668; SSE3-NEXT: cmovew %ax, %cx 669; SSE3-NEXT: xorl $15, %ecx 670; SSE3-NEXT: movd %ecx, %xmm1 671; SSE3-NEXT: pextrw $3, %xmm0, %ecx 672; SSE3-NEXT: bsrw %cx, %cx 673; SSE3-NEXT: cmovew %ax, %cx 674; SSE3-NEXT: xorl $15, %ecx 675; SSE3-NEXT: movd %ecx, %xmm2 676; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 677; SSE3-NEXT: pextrw $5, %xmm0, %ecx 678; SSE3-NEXT: bsrw %cx, %cx 679; SSE3-NEXT: cmovew %ax, %cx 680; SSE3-NEXT: xorl $15, %ecx 681; SSE3-NEXT: movd %ecx, %xmm3 682; SSE3-NEXT: pextrw $1, %xmm0, %ecx 683; SSE3-NEXT: bsrw %cx, %cx 684; SSE3-NEXT: cmovew %ax, %cx 685; SSE3-NEXT: xorl $15, %ecx 686; SSE3-NEXT: movd %ecx, %xmm1 687; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 688; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 689; SSE3-NEXT: pextrw $6, %xmm0, %ecx 690; SSE3-NEXT: bsrw %cx, %cx 691; SSE3-NEXT: cmovew %ax, %cx 692; SSE3-NEXT: xorl $15, %ecx 693; SSE3-NEXT: movd %ecx, %xmm2 694; SSE3-NEXT: pextrw $2, %xmm0, %ecx 695; SSE3-NEXT: bsrw %cx, %cx 696; SSE3-NEXT: cmovew %ax, %cx 697; SSE3-NEXT: xorl $15, %ecx 698; SSE3-NEXT: movd %ecx, %xmm3 699; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 700; SSE3-NEXT: pextrw $4, %xmm0, %ecx 701; SSE3-NEXT: bsrw %cx, %cx 702; SSE3-NEXT: cmovew %ax, %cx 703; SSE3-NEXT: xorl $15, %ecx 704; SSE3-NEXT: movd %ecx, %xmm2 705; SSE3-NEXT: movd %xmm0, %ecx 706; SSE3-NEXT: bsrw %cx, %cx 707; SSE3-NEXT: cmovew %ax, %cx 708; SSE3-NEXT: xorl $15, %ecx 709; SSE3-NEXT: movd %ecx, %xmm0 710; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 711; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 712; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 713; SSE3-NEXT: retq 714; 715; SSSE3-LABEL: testv8i16: 716; SSSE3: # BB#0: 717; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 718; SSSE3-NEXT: movdqa %xmm0, %xmm1 719; SSSE3-NEXT: pand %xmm2, %xmm1 720; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 721; SSSE3-NEXT: movdqa %xmm3, %xmm4 722; SSSE3-NEXT: pshufb %xmm1, %xmm4 723; SSSE3-NEXT: movdqa %xmm0, %xmm1 724; SSSE3-NEXT: psrlw $4, %xmm1 725; SSSE3-NEXT: pand %xmm2, %xmm1 726; SSSE3-NEXT: pxor %xmm2, %xmm2 727; SSSE3-NEXT: pshufb %xmm1, %xmm3 728; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 729; SSSE3-NEXT: pand %xmm4, %xmm1 730; SSSE3-NEXT: paddb %xmm3, %xmm1 731; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 732; SSSE3-NEXT: psrlw $8, %xmm0 733; SSSE3-NEXT: pand %xmm1, %xmm0 734; SSSE3-NEXT: psrlw $8, %xmm1 735; SSSE3-NEXT: paddw %xmm0, %xmm1 736; SSSE3-NEXT: movdqa %xmm1, %xmm0 737; SSSE3-NEXT: retq 738; 739; SSE41-LABEL: testv8i16: 740; SSE41: # BB#0: 741; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 742; SSE41-NEXT: movdqa %xmm0, %xmm1 743; SSE41-NEXT: pand %xmm2, %xmm1 744; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 745; SSE41-NEXT: movdqa %xmm3, %xmm4 746; SSE41-NEXT: pshufb %xmm1, %xmm4 747; SSE41-NEXT: movdqa %xmm0, %xmm1 748; SSE41-NEXT: psrlw $4, %xmm1 749; SSE41-NEXT: pand %xmm2, %xmm1 750; SSE41-NEXT: pxor %xmm2, %xmm2 751; SSE41-NEXT: pshufb %xmm1, %xmm3 752; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 753; SSE41-NEXT: pand %xmm4, %xmm1 754; SSE41-NEXT: paddb %xmm3, %xmm1 755; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 756; SSE41-NEXT: psrlw $8, %xmm0 757; SSE41-NEXT: pand %xmm1, %xmm0 758; SSE41-NEXT: psrlw $8, %xmm1 759; SSE41-NEXT: paddw %xmm0, %xmm1 760; SSE41-NEXT: movdqa %xmm1, %xmm0 761; SSE41-NEXT: retq 762; 763; AVX-LABEL: testv8i16: 764; AVX: # BB#0: 765; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 766; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 767; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 768; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 769; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 770; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 771; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 772; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 773; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 774; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 775; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 776; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 777; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 778; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 779; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 780; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 781; AVX-NEXT: retq 782; 783; AVX512VLCD-LABEL: testv8i16: 784; AVX512VLCD: ## BB#0: 785; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 786; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 787; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 788; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 789; AVX512VLCD-NEXT: retq 790; 791; AVX512CD-LABEL: testv8i16: 792; AVX512CD: ## BB#0: 793; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 794; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 795; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 796; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 797; AVX512CD-NEXT: retq 798; 799; X32-SSE-LABEL: testv8i16: 800; X32-SSE: # BB#0: 801; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 802; X32-SSE-NEXT: movdqa %xmm0, %xmm1 803; X32-SSE-NEXT: pand %xmm2, %xmm1 804; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 805; X32-SSE-NEXT: movdqa %xmm3, %xmm4 806; X32-SSE-NEXT: pshufb %xmm1, %xmm4 807; X32-SSE-NEXT: movdqa %xmm0, %xmm1 808; X32-SSE-NEXT: psrlw $4, %xmm1 809; X32-SSE-NEXT: pand %xmm2, %xmm1 810; X32-SSE-NEXT: pxor %xmm2, %xmm2 811; X32-SSE-NEXT: pshufb %xmm1, %xmm3 812; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 813; X32-SSE-NEXT: pand %xmm4, %xmm1 814; X32-SSE-NEXT: paddb %xmm3, %xmm1 815; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 816; X32-SSE-NEXT: psrlw $8, %xmm0 817; X32-SSE-NEXT: pand %xmm1, %xmm0 818; X32-SSE-NEXT: psrlw $8, %xmm1 819; X32-SSE-NEXT: paddw %xmm0, %xmm1 820; X32-SSE-NEXT: movdqa %xmm1, %xmm0 821; X32-SSE-NEXT: retl 822 %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0) 823 ret <8 x i16> %out 824} 825 826define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { 827; SSE2-LABEL: testv8i16u: 828; SSE2: # BB#0: 829; SSE2-NEXT: pextrw $7, %xmm0, %eax 830; SSE2-NEXT: bsrw %ax, %ax 831; SSE2-NEXT: xorl $15, %eax 832; SSE2-NEXT: movd %eax, %xmm1 833; SSE2-NEXT: pextrw $3, %xmm0, %eax 834; SSE2-NEXT: bsrw %ax, %ax 835; SSE2-NEXT: xorl $15, %eax 836; SSE2-NEXT: movd %eax, %xmm2 837; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 838; SSE2-NEXT: pextrw $5, %xmm0, %eax 839; SSE2-NEXT: bsrw %ax, %ax 840; SSE2-NEXT: xorl $15, %eax 841; SSE2-NEXT: movd %eax, %xmm3 842; SSE2-NEXT: pextrw $1, %xmm0, %eax 843; SSE2-NEXT: bsrw %ax, %ax 844; SSE2-NEXT: xorl $15, %eax 845; SSE2-NEXT: movd %eax, %xmm1 846; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 847; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 848; SSE2-NEXT: pextrw $6, %xmm0, %eax 849; SSE2-NEXT: bsrw %ax, %ax 850; SSE2-NEXT: xorl $15, %eax 851; SSE2-NEXT: movd %eax, %xmm2 852; SSE2-NEXT: pextrw $2, %xmm0, %eax 853; SSE2-NEXT: bsrw %ax, %ax 854; SSE2-NEXT: xorl $15, %eax 855; SSE2-NEXT: movd %eax, %xmm3 856; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 857; SSE2-NEXT: pextrw $4, %xmm0, %eax 858; SSE2-NEXT: bsrw %ax, %ax 859; SSE2-NEXT: xorl $15, %eax 860; SSE2-NEXT: movd %eax, %xmm2 861; SSE2-NEXT: movd %xmm0, %eax 862; SSE2-NEXT: bsrw %ax, %ax 863; SSE2-NEXT: xorl $15, %eax 864; SSE2-NEXT: movd %eax, %xmm0 865; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 866; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 867; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 868; SSE2-NEXT: retq 869; 870; SSE3-LABEL: testv8i16u: 871; SSE3: # BB#0: 872; SSE3-NEXT: pextrw $7, %xmm0, %eax 873; SSE3-NEXT: bsrw %ax, %ax 874; SSE3-NEXT: xorl $15, %eax 875; SSE3-NEXT: movd %eax, %xmm1 876; SSE3-NEXT: pextrw $3, %xmm0, %eax 877; SSE3-NEXT: bsrw %ax, %ax 878; SSE3-NEXT: xorl $15, %eax 879; SSE3-NEXT: movd %eax, %xmm2 880; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 881; SSE3-NEXT: pextrw $5, %xmm0, %eax 882; SSE3-NEXT: bsrw %ax, %ax 883; SSE3-NEXT: xorl $15, %eax 884; SSE3-NEXT: movd %eax, %xmm3 885; SSE3-NEXT: pextrw $1, %xmm0, %eax 886; SSE3-NEXT: bsrw %ax, %ax 887; SSE3-NEXT: xorl $15, %eax 888; SSE3-NEXT: movd %eax, %xmm1 889; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 890; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 891; SSE3-NEXT: pextrw $6, %xmm0, %eax 892; SSE3-NEXT: bsrw %ax, %ax 893; SSE3-NEXT: xorl $15, %eax 894; SSE3-NEXT: movd %eax, %xmm2 895; SSE3-NEXT: pextrw $2, %xmm0, %eax 896; SSE3-NEXT: bsrw %ax, %ax 897; SSE3-NEXT: xorl $15, %eax 898; SSE3-NEXT: movd %eax, %xmm3 899; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 900; SSE3-NEXT: pextrw $4, %xmm0, %eax 901; SSE3-NEXT: bsrw %ax, %ax 902; SSE3-NEXT: xorl $15, %eax 903; SSE3-NEXT: movd %eax, %xmm2 904; SSE3-NEXT: movd %xmm0, %eax 905; SSE3-NEXT: bsrw %ax, %ax 906; SSE3-NEXT: xorl $15, %eax 907; SSE3-NEXT: movd %eax, %xmm0 908; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 909; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 910; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 911; SSE3-NEXT: retq 912; 913; SSSE3-LABEL: testv8i16u: 914; SSSE3: # BB#0: 915; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 916; SSSE3-NEXT: movdqa %xmm0, %xmm1 917; SSSE3-NEXT: pand %xmm2, %xmm1 918; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 919; SSSE3-NEXT: movdqa %xmm3, %xmm4 920; SSSE3-NEXT: pshufb %xmm1, %xmm4 921; SSSE3-NEXT: movdqa %xmm0, %xmm1 922; SSSE3-NEXT: psrlw $4, %xmm1 923; SSSE3-NEXT: pand %xmm2, %xmm1 924; SSSE3-NEXT: pxor %xmm2, %xmm2 925; SSSE3-NEXT: pshufb %xmm1, %xmm3 926; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 927; SSSE3-NEXT: pand %xmm4, %xmm1 928; SSSE3-NEXT: paddb %xmm3, %xmm1 929; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 930; SSSE3-NEXT: psrlw $8, %xmm0 931; SSSE3-NEXT: pand %xmm1, %xmm0 932; SSSE3-NEXT: psrlw $8, %xmm1 933; SSSE3-NEXT: paddw %xmm0, %xmm1 934; SSSE3-NEXT: movdqa %xmm1, %xmm0 935; SSSE3-NEXT: retq 936; 937; SSE41-LABEL: testv8i16u: 938; SSE41: # BB#0: 939; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 940; SSE41-NEXT: movdqa %xmm0, %xmm1 941; SSE41-NEXT: pand %xmm2, %xmm1 942; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 943; SSE41-NEXT: movdqa %xmm3, %xmm4 944; SSE41-NEXT: pshufb %xmm1, %xmm4 945; SSE41-NEXT: movdqa %xmm0, %xmm1 946; SSE41-NEXT: psrlw $4, %xmm1 947; SSE41-NEXT: pand %xmm2, %xmm1 948; SSE41-NEXT: pxor %xmm2, %xmm2 949; SSE41-NEXT: pshufb %xmm1, %xmm3 950; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 951; SSE41-NEXT: pand %xmm4, %xmm1 952; SSE41-NEXT: paddb %xmm3, %xmm1 953; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 954; SSE41-NEXT: psrlw $8, %xmm0 955; SSE41-NEXT: pand %xmm1, %xmm0 956; SSE41-NEXT: psrlw $8, %xmm1 957; SSE41-NEXT: paddw %xmm0, %xmm1 958; SSE41-NEXT: movdqa %xmm1, %xmm0 959; SSE41-NEXT: retq 960; 961; AVX-LABEL: testv8i16u: 962; AVX: # BB#0: 963; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 964; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 965; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 966; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 967; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 968; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 969; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 970; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 971; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 972; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 973; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 974; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 975; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 976; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 977; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 978; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 979; AVX-NEXT: retq 980; 981; AVX512VLCD-LABEL: testv8i16u: 982; AVX512VLCD: ## BB#0: 983; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 984; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 985; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 986; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 987; AVX512VLCD-NEXT: retq 988; 989; AVX512CD-LABEL: testv8i16u: 990; AVX512CD: ## BB#0: 991; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 992; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 993; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 994; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 995; AVX512CD-NEXT: retq 996; 997; X32-SSE-LABEL: testv8i16u: 998; X32-SSE: # BB#0: 999; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1000; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1001; X32-SSE-NEXT: pand %xmm2, %xmm1 1002; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1003; X32-SSE-NEXT: movdqa %xmm3, %xmm4 1004; X32-SSE-NEXT: pshufb %xmm1, %xmm4 1005; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1006; X32-SSE-NEXT: psrlw $4, %xmm1 1007; X32-SSE-NEXT: pand %xmm2, %xmm1 1008; X32-SSE-NEXT: pxor %xmm2, %xmm2 1009; X32-SSE-NEXT: pshufb %xmm1, %xmm3 1010; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 1011; X32-SSE-NEXT: pand %xmm4, %xmm1 1012; X32-SSE-NEXT: paddb %xmm3, %xmm1 1013; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 1014; X32-SSE-NEXT: psrlw $8, %xmm0 1015; X32-SSE-NEXT: pand %xmm1, %xmm0 1016; X32-SSE-NEXT: psrlw $8, %xmm1 1017; X32-SSE-NEXT: paddw %xmm0, %xmm1 1018; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1019; X32-SSE-NEXT: retl 1020 %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1) 1021 ret <8 x i16> %out 1022} 1023 1024define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { 1025; SSE2-LABEL: testv16i8: 1026; SSE2: # BB#0: 1027; SSE2-NEXT: pushq %rbp 1028; SSE2-NEXT: pushq %rbx 1029; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 1030; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1031; SSE2-NEXT: bsrl %eax, %ecx 1032; SSE2-NEXT: movl $15, %eax 1033; SSE2-NEXT: cmovel %eax, %ecx 1034; SSE2-NEXT: xorl $7, %ecx 1035; SSE2-NEXT: movd %ecx, %xmm0 1036; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 1037; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi 1038; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d 1039; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx 1040; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d 1041; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi 1042; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d 1043; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1044; SSE2-NEXT: bsrl %ecx, %ecx 1045; SSE2-NEXT: cmovel %eax, %ecx 1046; SSE2-NEXT: xorl $7, %ecx 1047; SSE2-NEXT: movd %ecx, %xmm1 1048; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1049; SSE2-NEXT: bsrl %edx, %ecx 1050; SSE2-NEXT: cmovel %eax, %ecx 1051; SSE2-NEXT: xorl $7, %ecx 1052; SSE2-NEXT: movd %ecx, %xmm2 1053; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx 1054; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1055; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d 1056; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp 1057; SSE2-NEXT: bsrl %ebp, %ebp 1058; SSE2-NEXT: cmovel %eax, %ebp 1059; SSE2-NEXT: xorl $7, %ebp 1060; SSE2-NEXT: movd %ebp, %xmm0 1061; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1062; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1063; SSE2-NEXT: bsrl %edi, %edi 1064; SSE2-NEXT: cmovel %eax, %edi 1065; SSE2-NEXT: xorl $7, %edi 1066; SSE2-NEXT: movd %edi, %xmm1 1067; SSE2-NEXT: bsrl %ecx, %ecx 1068; SSE2-NEXT: cmovel %eax, %ecx 1069; SSE2-NEXT: xorl $7, %ecx 1070; SSE2-NEXT: movd %ecx, %xmm2 1071; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1072; SSE2-NEXT: bsrl %esi, %ecx 1073; SSE2-NEXT: cmovel %eax, %ecx 1074; SSE2-NEXT: xorl $7, %ecx 1075; SSE2-NEXT: movd %ecx, %xmm3 1076; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi 1077; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1078; SSE2-NEXT: bsrl %ecx, %ecx 1079; SSE2-NEXT: cmovel %eax, %ecx 1080; SSE2-NEXT: xorl $7, %ecx 1081; SSE2-NEXT: movd %ecx, %xmm1 1082; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 1083; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1084; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1085; SSE2-NEXT: bsrl %ebx, %ecx 1086; SSE2-NEXT: cmovel %eax, %ecx 1087; SSE2-NEXT: xorl $7, %ecx 1088; SSE2-NEXT: movd %ecx, %xmm0 1089; SSE2-NEXT: bsrl %edx, %ecx 1090; SSE2-NEXT: cmovel %eax, %ecx 1091; SSE2-NEXT: xorl $7, %ecx 1092; SSE2-NEXT: movd %ecx, %xmm3 1093; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 1094; SSE2-NEXT: bsrl %r11d, %ecx 1095; SSE2-NEXT: cmovel %eax, %ecx 1096; SSE2-NEXT: xorl $7, %ecx 1097; SSE2-NEXT: movd %ecx, %xmm0 1098; SSE2-NEXT: bsrl %esi, %ecx 1099; SSE2-NEXT: cmovel %eax, %ecx 1100; SSE2-NEXT: xorl $7, %ecx 1101; SSE2-NEXT: movd %ecx, %xmm2 1102; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1103; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 1104; SSE2-NEXT: bsrl %r9d, %ecx 1105; SSE2-NEXT: cmovel %eax, %ecx 1106; SSE2-NEXT: xorl $7, %ecx 1107; SSE2-NEXT: movd %ecx, %xmm0 1108; SSE2-NEXT: bsrl %r10d, %ecx 1109; SSE2-NEXT: cmovel %eax, %ecx 1110; SSE2-NEXT: xorl $7, %ecx 1111; SSE2-NEXT: movd %ecx, %xmm3 1112; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 1113; SSE2-NEXT: bsrl %r8d, %ecx 1114; SSE2-NEXT: cmovel %eax, %ecx 1115; SSE2-NEXT: xorl $7, %ecx 1116; SSE2-NEXT: movd %ecx, %xmm4 1117; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1118; SSE2-NEXT: bsrl %ecx, %ecx 1119; SSE2-NEXT: cmovel %eax, %ecx 1120; SSE2-NEXT: xorl $7, %ecx 1121; SSE2-NEXT: movd %ecx, %xmm0 1122; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1123; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1124; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1125; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1126; SSE2-NEXT: popq %rbx 1127; SSE2-NEXT: popq %rbp 1128; SSE2-NEXT: retq 1129; 1130; SSE3-LABEL: testv16i8: 1131; SSE3: # BB#0: 1132; SSE3-NEXT: pushq %rbp 1133; SSE3-NEXT: pushq %rbx 1134; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 1135; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1136; SSE3-NEXT: bsrl %eax, %ecx 1137; SSE3-NEXT: movl $15, %eax 1138; SSE3-NEXT: cmovel %eax, %ecx 1139; SSE3-NEXT: xorl $7, %ecx 1140; SSE3-NEXT: movd %ecx, %xmm0 1141; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 1142; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi 1143; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d 1144; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx 1145; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d 1146; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi 1147; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d 1148; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1149; SSE3-NEXT: bsrl %ecx, %ecx 1150; SSE3-NEXT: cmovel %eax, %ecx 1151; SSE3-NEXT: xorl $7, %ecx 1152; SSE3-NEXT: movd %ecx, %xmm1 1153; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1154; SSE3-NEXT: bsrl %edx, %ecx 1155; SSE3-NEXT: cmovel %eax, %ecx 1156; SSE3-NEXT: xorl $7, %ecx 1157; SSE3-NEXT: movd %ecx, %xmm2 1158; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx 1159; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1160; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d 1161; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp 1162; SSE3-NEXT: bsrl %ebp, %ebp 1163; SSE3-NEXT: cmovel %eax, %ebp 1164; SSE3-NEXT: xorl $7, %ebp 1165; SSE3-NEXT: movd %ebp, %xmm0 1166; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1167; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1168; SSE3-NEXT: bsrl %edi, %edi 1169; SSE3-NEXT: cmovel %eax, %edi 1170; SSE3-NEXT: xorl $7, %edi 1171; SSE3-NEXT: movd %edi, %xmm1 1172; SSE3-NEXT: bsrl %ecx, %ecx 1173; SSE3-NEXT: cmovel %eax, %ecx 1174; SSE3-NEXT: xorl $7, %ecx 1175; SSE3-NEXT: movd %ecx, %xmm2 1176; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1177; SSE3-NEXT: bsrl %esi, %ecx 1178; SSE3-NEXT: cmovel %eax, %ecx 1179; SSE3-NEXT: xorl $7, %ecx 1180; SSE3-NEXT: movd %ecx, %xmm3 1181; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi 1182; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1183; SSE3-NEXT: bsrl %ecx, %ecx 1184; SSE3-NEXT: cmovel %eax, %ecx 1185; SSE3-NEXT: xorl $7, %ecx 1186; SSE3-NEXT: movd %ecx, %xmm1 1187; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 1188; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1189; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1190; SSE3-NEXT: bsrl %ebx, %ecx 1191; SSE3-NEXT: cmovel %eax, %ecx 1192; SSE3-NEXT: xorl $7, %ecx 1193; SSE3-NEXT: movd %ecx, %xmm0 1194; SSE3-NEXT: bsrl %edx, %ecx 1195; SSE3-NEXT: cmovel %eax, %ecx 1196; SSE3-NEXT: xorl $7, %ecx 1197; SSE3-NEXT: movd %ecx, %xmm3 1198; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 1199; SSE3-NEXT: bsrl %r11d, %ecx 1200; SSE3-NEXT: cmovel %eax, %ecx 1201; SSE3-NEXT: xorl $7, %ecx 1202; SSE3-NEXT: movd %ecx, %xmm0 1203; SSE3-NEXT: bsrl %esi, %ecx 1204; SSE3-NEXT: cmovel %eax, %ecx 1205; SSE3-NEXT: xorl $7, %ecx 1206; SSE3-NEXT: movd %ecx, %xmm2 1207; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1208; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 1209; SSE3-NEXT: bsrl %r9d, %ecx 1210; SSE3-NEXT: cmovel %eax, %ecx 1211; SSE3-NEXT: xorl $7, %ecx 1212; SSE3-NEXT: movd %ecx, %xmm0 1213; SSE3-NEXT: bsrl %r10d, %ecx 1214; SSE3-NEXT: cmovel %eax, %ecx 1215; SSE3-NEXT: xorl $7, %ecx 1216; SSE3-NEXT: movd %ecx, %xmm3 1217; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 1218; SSE3-NEXT: bsrl %r8d, %ecx 1219; SSE3-NEXT: cmovel %eax, %ecx 1220; SSE3-NEXT: xorl $7, %ecx 1221; SSE3-NEXT: movd %ecx, %xmm4 1222; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1223; SSE3-NEXT: bsrl %ecx, %ecx 1224; SSE3-NEXT: cmovel %eax, %ecx 1225; SSE3-NEXT: xorl $7, %ecx 1226; SSE3-NEXT: movd %ecx, %xmm0 1227; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1228; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1229; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1230; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1231; SSE3-NEXT: popq %rbx 1232; SSE3-NEXT: popq %rbp 1233; SSE3-NEXT: retq 1234; 1235; SSSE3-LABEL: testv16i8: 1236; SSSE3: # BB#0: 1237; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1238; SSSE3-NEXT: movdqa %xmm0, %xmm3 1239; SSSE3-NEXT: pand %xmm2, %xmm3 1240; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1241; SSSE3-NEXT: movdqa %xmm1, %xmm4 1242; SSSE3-NEXT: pshufb %xmm3, %xmm4 1243; SSSE3-NEXT: psrlw $4, %xmm0 1244; SSSE3-NEXT: pand %xmm2, %xmm0 1245; SSSE3-NEXT: pxor %xmm2, %xmm2 1246; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 1247; SSSE3-NEXT: pand %xmm4, %xmm2 1248; SSSE3-NEXT: pshufb %xmm0, %xmm1 1249; SSSE3-NEXT: paddb %xmm2, %xmm1 1250; SSSE3-NEXT: movdqa %xmm1, %xmm0 1251; SSSE3-NEXT: retq 1252; 1253; SSE41-LABEL: testv16i8: 1254; SSE41: # BB#0: 1255; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1256; SSE41-NEXT: movdqa %xmm0, %xmm3 1257; SSE41-NEXT: pand %xmm2, %xmm3 1258; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1259; SSE41-NEXT: movdqa %xmm1, %xmm4 1260; SSE41-NEXT: pshufb %xmm3, %xmm4 1261; SSE41-NEXT: psrlw $4, %xmm0 1262; SSE41-NEXT: pand %xmm2, %xmm0 1263; SSE41-NEXT: pxor %xmm2, %xmm2 1264; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 1265; SSE41-NEXT: pand %xmm4, %xmm2 1266; SSE41-NEXT: pshufb %xmm0, %xmm1 1267; SSE41-NEXT: paddb %xmm2, %xmm1 1268; SSE41-NEXT: movdqa %xmm1, %xmm0 1269; SSE41-NEXT: retq 1270; 1271; AVX-LABEL: testv16i8: 1272; AVX: # BB#0: 1273; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1274; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1275; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1276; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1277; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1278; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1279; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1280; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 1281; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1 1282; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1283; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 1284; AVX-NEXT: retq 1285; 1286; AVX512-LABEL: testv16i8: 1287; AVX512: ## BB#0: 1288; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1289; AVX512-NEXT: vplzcntd %zmm0, %zmm0 1290; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1291; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1292; AVX512-NEXT: retq 1293; 1294; X32-SSE-LABEL: testv16i8: 1295; X32-SSE: # BB#0: 1296; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1297; X32-SSE-NEXT: movdqa %xmm0, %xmm3 1298; X32-SSE-NEXT: pand %xmm2, %xmm3 1299; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1300; X32-SSE-NEXT: movdqa %xmm1, %xmm4 1301; X32-SSE-NEXT: pshufb %xmm3, %xmm4 1302; X32-SSE-NEXT: psrlw $4, %xmm0 1303; X32-SSE-NEXT: pand %xmm2, %xmm0 1304; X32-SSE-NEXT: pxor %xmm2, %xmm2 1305; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2 1306; X32-SSE-NEXT: pand %xmm4, %xmm2 1307; X32-SSE-NEXT: pshufb %xmm0, %xmm1 1308; X32-SSE-NEXT: paddb %xmm2, %xmm1 1309; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1310; X32-SSE-NEXT: retl 1311 %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0) 1312 ret <16 x i8> %out 1313} 1314 1315define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { 1316; SSE2-LABEL: testv16i8u: 1317; SSE2: # BB#0: 1318; SSE2-NEXT: pushq %rbx 1319; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 1320; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1321; SSE2-NEXT: bsrl %eax, %eax 1322; SSE2-NEXT: xorl $7, %eax 1323; SSE2-NEXT: movd %eax, %xmm0 1324; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi 1325; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx 1326; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d 1327; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1328; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d 1329; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1330; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d 1331; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi 1332; SSE2-NEXT: bsrl %esi, %esi 1333; SSE2-NEXT: xorl $7, %esi 1334; SSE2-NEXT: movd %esi, %xmm1 1335; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1336; SSE2-NEXT: bsrl %eax, %eax 1337; SSE2-NEXT: xorl $7, %eax 1338; SSE2-NEXT: movd %eax, %xmm0 1339; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1340; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi 1341; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d 1342; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 1343; SSE2-NEXT: bsrl %ebx, %ebx 1344; SSE2-NEXT: xorl $7, %ebx 1345; SSE2-NEXT: movd %ebx, %xmm2 1346; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1347; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1348; SSE2-NEXT: bsrl %edx, %edx 1349; SSE2-NEXT: xorl $7, %edx 1350; SSE2-NEXT: movd %edx, %xmm0 1351; SSE2-NEXT: bsrl %esi, %edx 1352; SSE2-NEXT: xorl $7, %edx 1353; SSE2-NEXT: movd %edx, %xmm3 1354; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 1355; SSE2-NEXT: bsrl %ecx, %ecx 1356; SSE2-NEXT: xorl $7, %ecx 1357; SSE2-NEXT: movd %ecx, %xmm0 1358; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1359; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx 1360; SSE2-NEXT: bsrl %edx, %edx 1361; SSE2-NEXT: xorl $7, %edx 1362; SSE2-NEXT: movd %edx, %xmm1 1363; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1364; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 1365; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1366; SSE2-NEXT: bsrl %edi, %edx 1367; SSE2-NEXT: xorl $7, %edx 1368; SSE2-NEXT: movd %edx, %xmm0 1369; SSE2-NEXT: bsrl %eax, %eax 1370; SSE2-NEXT: xorl $7, %eax 1371; SSE2-NEXT: movd %eax, %xmm2 1372; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1373; SSE2-NEXT: bsrl %r10d, %eax 1374; SSE2-NEXT: xorl $7, %eax 1375; SSE2-NEXT: movd %eax, %xmm0 1376; SSE2-NEXT: bsrl %ecx, %eax 1377; SSE2-NEXT: xorl $7, %eax 1378; SSE2-NEXT: movd %eax, %xmm3 1379; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 1380; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 1381; SSE2-NEXT: bsrl %r9d, %eax 1382; SSE2-NEXT: xorl $7, %eax 1383; SSE2-NEXT: movd %eax, %xmm0 1384; SSE2-NEXT: bsrl %r11d, %eax 1385; SSE2-NEXT: xorl $7, %eax 1386; SSE2-NEXT: movd %eax, %xmm2 1387; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1388; SSE2-NEXT: bsrl %r8d, %eax 1389; SSE2-NEXT: xorl $7, %eax 1390; SSE2-NEXT: movd %eax, %xmm4 1391; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1392; SSE2-NEXT: bsrl %eax, %eax 1393; SSE2-NEXT: xorl $7, %eax 1394; SSE2-NEXT: movd %eax, %xmm0 1395; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1396; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1397; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1398; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1399; SSE2-NEXT: popq %rbx 1400; SSE2-NEXT: retq 1401; 1402; SSE3-LABEL: testv16i8u: 1403; SSE3: # BB#0: 1404; SSE3-NEXT: pushq %rbx 1405; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 1406; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1407; SSE3-NEXT: bsrl %eax, %eax 1408; SSE3-NEXT: xorl $7, %eax 1409; SSE3-NEXT: movd %eax, %xmm0 1410; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi 1411; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx 1412; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d 1413; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1414; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d 1415; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1416; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d 1417; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi 1418; SSE3-NEXT: bsrl %esi, %esi 1419; SSE3-NEXT: xorl $7, %esi 1420; SSE3-NEXT: movd %esi, %xmm1 1421; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1422; SSE3-NEXT: bsrl %eax, %eax 1423; SSE3-NEXT: xorl $7, %eax 1424; SSE3-NEXT: movd %eax, %xmm0 1425; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1426; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi 1427; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d 1428; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 1429; SSE3-NEXT: bsrl %ebx, %ebx 1430; SSE3-NEXT: xorl $7, %ebx 1431; SSE3-NEXT: movd %ebx, %xmm2 1432; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1433; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1434; SSE3-NEXT: bsrl %edx, %edx 1435; SSE3-NEXT: xorl $7, %edx 1436; SSE3-NEXT: movd %edx, %xmm0 1437; SSE3-NEXT: bsrl %esi, %edx 1438; SSE3-NEXT: xorl $7, %edx 1439; SSE3-NEXT: movd %edx, %xmm3 1440; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 1441; SSE3-NEXT: bsrl %ecx, %ecx 1442; SSE3-NEXT: xorl $7, %ecx 1443; SSE3-NEXT: movd %ecx, %xmm0 1444; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 1445; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx 1446; SSE3-NEXT: bsrl %edx, %edx 1447; SSE3-NEXT: xorl $7, %edx 1448; SSE3-NEXT: movd %edx, %xmm1 1449; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1450; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 1451; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1452; SSE3-NEXT: bsrl %edi, %edx 1453; SSE3-NEXT: xorl $7, %edx 1454; SSE3-NEXT: movd %edx, %xmm0 1455; SSE3-NEXT: bsrl %eax, %eax 1456; SSE3-NEXT: xorl $7, %eax 1457; SSE3-NEXT: movd %eax, %xmm2 1458; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1459; SSE3-NEXT: bsrl %r10d, %eax 1460; SSE3-NEXT: xorl $7, %eax 1461; SSE3-NEXT: movd %eax, %xmm0 1462; SSE3-NEXT: bsrl %ecx, %eax 1463; SSE3-NEXT: xorl $7, %eax 1464; SSE3-NEXT: movd %eax, %xmm3 1465; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 1466; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 1467; SSE3-NEXT: bsrl %r9d, %eax 1468; SSE3-NEXT: xorl $7, %eax 1469; SSE3-NEXT: movd %eax, %xmm0 1470; SSE3-NEXT: bsrl %r11d, %eax 1471; SSE3-NEXT: xorl $7, %eax 1472; SSE3-NEXT: movd %eax, %xmm2 1473; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1474; SSE3-NEXT: bsrl %r8d, %eax 1475; SSE3-NEXT: xorl $7, %eax 1476; SSE3-NEXT: movd %eax, %xmm4 1477; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1478; SSE3-NEXT: bsrl %eax, %eax 1479; SSE3-NEXT: xorl $7, %eax 1480; SSE3-NEXT: movd %eax, %xmm0 1481; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1482; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1483; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1484; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1485; SSE3-NEXT: popq %rbx 1486; SSE3-NEXT: retq 1487; 1488; SSSE3-LABEL: testv16i8u: 1489; SSSE3: # BB#0: 1490; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1491; SSSE3-NEXT: movdqa %xmm0, %xmm3 1492; SSSE3-NEXT: pand %xmm2, %xmm3 1493; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1494; SSSE3-NEXT: movdqa %xmm1, %xmm4 1495; SSSE3-NEXT: pshufb %xmm3, %xmm4 1496; SSSE3-NEXT: psrlw $4, %xmm0 1497; SSSE3-NEXT: pand %xmm2, %xmm0 1498; SSSE3-NEXT: pxor %xmm2, %xmm2 1499; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 1500; SSSE3-NEXT: pand %xmm4, %xmm2 1501; SSSE3-NEXT: pshufb %xmm0, %xmm1 1502; SSSE3-NEXT: paddb %xmm2, %xmm1 1503; SSSE3-NEXT: movdqa %xmm1, %xmm0 1504; SSSE3-NEXT: retq 1505; 1506; SSE41-LABEL: testv16i8u: 1507; SSE41: # BB#0: 1508; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1509; SSE41-NEXT: movdqa %xmm0, %xmm3 1510; SSE41-NEXT: pand %xmm2, %xmm3 1511; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1512; SSE41-NEXT: movdqa %xmm1, %xmm4 1513; SSE41-NEXT: pshufb %xmm3, %xmm4 1514; SSE41-NEXT: psrlw $4, %xmm0 1515; SSE41-NEXT: pand %xmm2, %xmm0 1516; SSE41-NEXT: pxor %xmm2, %xmm2 1517; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 1518; SSE41-NEXT: pand %xmm4, %xmm2 1519; SSE41-NEXT: pshufb %xmm0, %xmm1 1520; SSE41-NEXT: paddb %xmm2, %xmm1 1521; SSE41-NEXT: movdqa %xmm1, %xmm0 1522; SSE41-NEXT: retq 1523; 1524; AVX-LABEL: testv16i8u: 1525; AVX: # BB#0: 1526; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1527; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1528; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1529; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1530; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1531; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1532; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1533; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 1534; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1 1535; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1536; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 1537; AVX-NEXT: retq 1538; 1539; AVX512-LABEL: testv16i8u: 1540; AVX512: ## BB#0: 1541; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1542; AVX512-NEXT: vplzcntd %zmm0, %zmm0 1543; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1544; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1545; AVX512-NEXT: retq 1546; 1547; X32-SSE-LABEL: testv16i8u: 1548; X32-SSE: # BB#0: 1549; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1550; X32-SSE-NEXT: movdqa %xmm0, %xmm3 1551; X32-SSE-NEXT: pand %xmm2, %xmm3 1552; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1553; X32-SSE-NEXT: movdqa %xmm1, %xmm4 1554; X32-SSE-NEXT: pshufb %xmm3, %xmm4 1555; X32-SSE-NEXT: psrlw $4, %xmm0 1556; X32-SSE-NEXT: pand %xmm2, %xmm0 1557; X32-SSE-NEXT: pxor %xmm2, %xmm2 1558; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2 1559; X32-SSE-NEXT: pand %xmm4, %xmm2 1560; X32-SSE-NEXT: pshufb %xmm0, %xmm1 1561; X32-SSE-NEXT: paddb %xmm2, %xmm1 1562; X32-SSE-NEXT: movdqa %xmm1, %xmm0 1563; X32-SSE-NEXT: retl 1564 %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1) 1565 ret <16 x i8> %out 1566} 1567 1568define <2 x i64> @foldv2i64() nounwind { 1569; SSE-LABEL: foldv2i64: 1570; SSE: # BB#0: 1571; SSE-NEXT: movl $55, %eax 1572; SSE-NEXT: movd %rax, %xmm0 1573; SSE-NEXT: retq 1574; 1575; AVX-LABEL: foldv2i64: 1576; AVX: # BB#0: 1577; AVX-NEXT: movl $55, %eax 1578; AVX-NEXT: vmovq %rax, %xmm0 1579; AVX-NEXT: retq 1580; 1581; AVX512-LABEL: foldv2i64: 1582; AVX512: ## BB#0: 1583; AVX512-NEXT: movl $55, %eax 1584; AVX512-NEXT: vmovq %rax, %xmm0 1585; AVX512-NEXT: retq 1586; 1587; X32-SSE-LABEL: foldv2i64: 1588; X32-SSE: # BB#0: 1589; X32-SSE-NEXT: movl $55, %eax 1590; X32-SSE-NEXT: movd %eax, %xmm0 1591; X32-SSE-NEXT: retl 1592 %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0) 1593 ret <2 x i64> %out 1594} 1595 1596define <2 x i64> @foldv2i64u() nounwind { 1597; SSE-LABEL: foldv2i64u: 1598; SSE: # BB#0: 1599; SSE-NEXT: movl $55, %eax 1600; SSE-NEXT: movd %rax, %xmm0 1601; SSE-NEXT: retq 1602; 1603; AVX-LABEL: foldv2i64u: 1604; AVX: # BB#0: 1605; AVX-NEXT: movl $55, %eax 1606; AVX-NEXT: vmovq %rax, %xmm0 1607; AVX-NEXT: retq 1608; 1609; AVX512-LABEL: foldv2i64u: 1610; AVX512: ## BB#0: 1611; AVX512-NEXT: movl $55, %eax 1612; AVX512-NEXT: vmovq %rax, %xmm0 1613; AVX512-NEXT: retq 1614; 1615; X32-SSE-LABEL: foldv2i64u: 1616; X32-SSE: # BB#0: 1617; X32-SSE-NEXT: movl $55, %eax 1618; X32-SSE-NEXT: movd %eax, %xmm0 1619; X32-SSE-NEXT: retl 1620 %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1) 1621 ret <2 x i64> %out 1622} 1623 1624define <4 x i32> @foldv4i32() nounwind { 1625; SSE-LABEL: foldv4i32: 1626; SSE: # BB#0: 1627; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] 1628; SSE-NEXT: retq 1629; 1630; AVX-LABEL: foldv4i32: 1631; AVX: # BB#0: 1632; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] 1633; AVX-NEXT: retq 1634; 1635; AVX512VLCD-LABEL: foldv4i32: 1636; AVX512VLCD: ## BB#0: 1637; AVX512VLCD-NEXT: vmovdqa32 {{.*#+}} xmm0 = [23,0,32,24] 1638; AVX512VLCD-NEXT: retq 1639; 1640; AVX512CD-LABEL: foldv4i32: 1641; AVX512CD: ## BB#0: 1642; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] 1643; AVX512CD-NEXT: retq 1644; 1645; X32-SSE-LABEL: foldv4i32: 1646; X32-SSE: # BB#0: 1647; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] 1648; X32-SSE-NEXT: retl 1649 %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0) 1650 ret <4 x i32> %out 1651} 1652 1653define <4 x i32> @foldv4i32u() nounwind { 1654; SSE-LABEL: foldv4i32u: 1655; SSE: # BB#0: 1656; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] 1657; SSE-NEXT: retq 1658; 1659; AVX-LABEL: foldv4i32u: 1660; AVX: # BB#0: 1661; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] 1662; AVX-NEXT: retq 1663; 1664; AVX512VLCD-LABEL: foldv4i32u: 1665; AVX512VLCD: ## BB#0: 1666; AVX512VLCD-NEXT: vmovdqa32 {{.*#+}} xmm0 = [23,0,32,24] 1667; AVX512VLCD-NEXT: retq 1668; 1669; AVX512CD-LABEL: foldv4i32u: 1670; AVX512CD: ## BB#0: 1671; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] 1672; AVX512CD-NEXT: retq 1673; 1674; X32-SSE-LABEL: foldv4i32u: 1675; X32-SSE: # BB#0: 1676; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] 1677; X32-SSE-NEXT: retl 1678 %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1) 1679 ret <4 x i32> %out 1680} 1681 1682define <8 x i16> @foldv8i16() nounwind { 1683; SSE-LABEL: foldv8i16: 1684; SSE: # BB#0: 1685; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1686; SSE-NEXT: retq 1687; 1688; AVX-LABEL: foldv8i16: 1689; AVX: # BB#0: 1690; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1691; AVX-NEXT: retq 1692; 1693; AVX512VLCD-LABEL: foldv8i16: 1694; AVX512VLCD: ## BB#0: 1695; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1696; AVX512VLCD-NEXT: retq 1697; 1698; AVX512CD-LABEL: foldv8i16: 1699; AVX512CD: ## BB#0: 1700; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1701; AVX512CD-NEXT: retq 1702; 1703; X32-SSE-LABEL: foldv8i16: 1704; X32-SSE: # BB#0: 1705; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1706; X32-SSE-NEXT: retl 1707 %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0) 1708 ret <8 x i16> %out 1709} 1710 1711define <8 x i16> @foldv8i16u() nounwind { 1712; SSE-LABEL: foldv8i16u: 1713; SSE: # BB#0: 1714; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1715; SSE-NEXT: retq 1716; 1717; AVX-LABEL: foldv8i16u: 1718; AVX: # BB#0: 1719; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1720; AVX-NEXT: retq 1721; 1722; AVX512VLCD-LABEL: foldv8i16u: 1723; AVX512VLCD: ## BB#0: 1724; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1725; AVX512VLCD-NEXT: retq 1726; 1727; AVX512CD-LABEL: foldv8i16u: 1728; AVX512CD: ## BB#0: 1729; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1730; AVX512CD-NEXT: retq 1731; 1732; X32-SSE-LABEL: foldv8i16u: 1733; X32-SSE: # BB#0: 1734; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] 1735; X32-SSE-NEXT: retl 1736 %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1) 1737 ret <8 x i16> %out 1738} 1739 1740define <16 x i8> @foldv16i8() nounwind { 1741; SSE-LABEL: foldv16i8: 1742; SSE: # BB#0: 1743; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1744; SSE-NEXT: retq 1745; 1746; AVX-LABEL: foldv16i8: 1747; AVX: # BB#0: 1748; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1749; AVX-NEXT: retq 1750; 1751; AVX512VLCD-LABEL: foldv16i8: 1752; AVX512VLCD: ## BB#0: 1753; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1754; AVX512VLCD-NEXT: retq 1755; 1756; AVX512CD-LABEL: foldv16i8: 1757; AVX512CD: ## BB#0: 1758; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1759; AVX512CD-NEXT: retq 1760; 1761; X32-SSE-LABEL: foldv16i8: 1762; X32-SSE: # BB#0: 1763; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1764; X32-SSE-NEXT: retl 1765 %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0) 1766 ret <16 x i8> %out 1767} 1768 1769define <16 x i8> @foldv16i8u() nounwind { 1770; SSE-LABEL: foldv16i8u: 1771; SSE: # BB#0: 1772; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1773; SSE-NEXT: retq 1774; 1775; AVX-LABEL: foldv16i8u: 1776; AVX: # BB#0: 1777; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1778; AVX-NEXT: retq 1779; 1780; AVX512VLCD-LABEL: foldv16i8u: 1781; AVX512VLCD: ## BB#0: 1782; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1783; AVX512VLCD-NEXT: retq 1784; 1785; AVX512CD-LABEL: foldv16i8u: 1786; AVX512CD: ## BB#0: 1787; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1788; AVX512CD-NEXT: retq 1789; 1790; X32-SSE-LABEL: foldv16i8u: 1791; X32-SSE: # BB#0: 1792; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] 1793; X32-SSE-NEXT: retl 1794 %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1) 1795 ret <16 x i8> %out 1796} 1797 1798declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) 1799declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) 1800declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) 1801declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) 1802