1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=X32 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=X64 4; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=X32-CLZ 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=X64-CLZ 6 7declare i8 @llvm.cttz.i8(i8, i1) 8declare i16 @llvm.cttz.i16(i16, i1) 9declare i32 @llvm.cttz.i32(i32, i1) 10declare i64 @llvm.cttz.i64(i64, i1) 11 12declare i8 @llvm.ctlz.i8(i8, i1) 13declare i16 @llvm.ctlz.i16(i16, i1) 14declare i32 @llvm.ctlz.i32(i32, i1) 15declare i64 @llvm.ctlz.i64(i64, i1) 16 17define i8 @cttz_i8(i8 %x) { 18; X32-LABEL: cttz_i8: 19; X32: # %bb.0: 20; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 21; X32-NEXT: bsfl %eax, %eax 22; X32-NEXT: # kill: def $al killed $al killed $eax 23; X32-NEXT: retl 24; 25; X64-LABEL: cttz_i8: 26; X64: # %bb.0: 27; X64-NEXT: movzbl %dil, %eax 28; X64-NEXT: bsfl %eax, %eax 29; X64-NEXT: # kill: def $al killed $al killed $eax 30; X64-NEXT: retq 31; 32; X32-CLZ-LABEL: cttz_i8: 33; X32-CLZ: # %bb.0: 34; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax 35; X32-CLZ-NEXT: tzcntl %eax, %eax 36; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax 37; X32-CLZ-NEXT: retl 38; 39; X64-CLZ-LABEL: cttz_i8: 40; X64-CLZ: # %bb.0: 41; X64-CLZ-NEXT: movzbl %dil, %eax 42; X64-CLZ-NEXT: tzcntl %eax, %eax 43; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax 44; X64-CLZ-NEXT: retq 45 %tmp = call i8 @llvm.cttz.i8( i8 %x, i1 true ) 46 ret i8 %tmp 47} 48 49define i16 @cttz_i16(i16 %x) { 50; X32-LABEL: cttz_i16: 51; X32: # %bb.0: 52; X32-NEXT: bsfw {{[0-9]+}}(%esp), %ax 53; X32-NEXT: retl 54; 55; X64-LABEL: cttz_i16: 56; X64: # %bb.0: 57; X64-NEXT: bsfw %di, %ax 58; X64-NEXT: retq 59; 60; X32-CLZ-LABEL: cttz_i16: 
61; X32-CLZ: # %bb.0: 62; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax 63; X32-CLZ-NEXT: retl 64; 65; X64-CLZ-LABEL: cttz_i16: 66; X64-CLZ: # %bb.0: 67; X64-CLZ-NEXT: tzcntw %di, %ax 68; X64-CLZ-NEXT: retq 69 %tmp = call i16 @llvm.cttz.i16( i16 %x, i1 true ) 70 ret i16 %tmp 71} 72 73define i32 @cttz_i32(i32 %x) { 74; X32-LABEL: cttz_i32: 75; X32: # %bb.0: 76; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax 77; X32-NEXT: retl 78; 79; X64-LABEL: cttz_i32: 80; X64: # %bb.0: 81; X64-NEXT: bsfl %edi, %eax 82; X64-NEXT: retq 83; 84; X32-CLZ-LABEL: cttz_i32: 85; X32-CLZ: # %bb.0: 86; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax 87; X32-CLZ-NEXT: retl 88; 89; X64-CLZ-LABEL: cttz_i32: 90; X64-CLZ: # %bb.0: 91; X64-CLZ-NEXT: tzcntl %edi, %eax 92; X64-CLZ-NEXT: retq 93 %tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true ) 94 ret i32 %tmp 95} 96 97define i64 @cttz_i64(i64 %x) { 98; X32-LABEL: cttz_i64: 99; X32: # %bb.0: 100; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 101; X32-NEXT: testl %eax, %eax 102; X32-NEXT: jne .LBB3_1 103; X32-NEXT: # %bb.2: 104; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax 105; X32-NEXT: addl $32, %eax 106; X32-NEXT: xorl %edx, %edx 107; X32-NEXT: retl 108; X32-NEXT: .LBB3_1: 109; X32-NEXT: bsfl %eax, %eax 110; X32-NEXT: xorl %edx, %edx 111; X32-NEXT: retl 112; 113; X64-LABEL: cttz_i64: 114; X64: # %bb.0: 115; X64-NEXT: bsfq %rdi, %rax 116; X64-NEXT: retq 117; 118; X32-CLZ-LABEL: cttz_i64: 119; X32-CLZ: # %bb.0: 120; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax 121; X32-CLZ-NEXT: testl %eax, %eax 122; X32-CLZ-NEXT: jne .LBB3_1 123; X32-CLZ-NEXT: # %bb.2: 124; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax 125; X32-CLZ-NEXT: addl $32, %eax 126; X32-CLZ-NEXT: xorl %edx, %edx 127; X32-CLZ-NEXT: retl 128; X32-CLZ-NEXT: .LBB3_1: 129; X32-CLZ-NEXT: tzcntl %eax, %eax 130; X32-CLZ-NEXT: xorl %edx, %edx 131; X32-CLZ-NEXT: retl 132; 133; X64-CLZ-LABEL: cttz_i64: 134; X64-CLZ: # %bb.0: 135; X64-CLZ-NEXT: tzcntq %rdi, %rax 136; X64-CLZ-NEXT: retq 137 %tmp = call i64 @llvm.cttz.i64( i64 %x, i1 true 
) 138 ret i64 %tmp 139} 140 141define i8 @ctlz_i8(i8 %x) { 142; X32-LABEL: ctlz_i8: 143; X32: # %bb.0: 144; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 145; X32-NEXT: bsrl %eax, %eax 146; X32-NEXT: xorl $7, %eax 147; X32-NEXT: # kill: def $al killed $al killed $eax 148; X32-NEXT: retl 149; 150; X64-LABEL: ctlz_i8: 151; X64: # %bb.0: 152; X64-NEXT: movzbl %dil, %eax 153; X64-NEXT: bsrl %eax, %eax 154; X64-NEXT: xorl $7, %eax 155; X64-NEXT: # kill: def $al killed $al killed $eax 156; X64-NEXT: retq 157; 158; X32-CLZ-LABEL: ctlz_i8: 159; X32-CLZ: # %bb.0: 160; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax 161; X32-CLZ-NEXT: lzcntl %eax, %eax 162; X32-CLZ-NEXT: addl $-24, %eax 163; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax 164; X32-CLZ-NEXT: retl 165; 166; X64-CLZ-LABEL: ctlz_i8: 167; X64-CLZ: # %bb.0: 168; X64-CLZ-NEXT: movzbl %dil, %eax 169; X64-CLZ-NEXT: lzcntl %eax, %eax 170; X64-CLZ-NEXT: addl $-24, %eax 171; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax 172; X64-CLZ-NEXT: retq 173 %tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true ) 174 ret i8 %tmp2 175} 176 177define i16 @ctlz_i16(i16 %x) { 178; X32-LABEL: ctlz_i16: 179; X32: # %bb.0: 180; X32-NEXT: bsrw {{[0-9]+}}(%esp), %ax 181; X32-NEXT: xorl $15, %eax 182; X32-NEXT: # kill: def $ax killed $ax killed $eax 183; X32-NEXT: retl 184; 185; X64-LABEL: ctlz_i16: 186; X64: # %bb.0: 187; X64-NEXT: bsrw %di, %ax 188; X64-NEXT: xorl $15, %eax 189; X64-NEXT: # kill: def $ax killed $ax killed $eax 190; X64-NEXT: retq 191; 192; X32-CLZ-LABEL: ctlz_i16: 193; X32-CLZ: # %bb.0: 194; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax 195; X32-CLZ-NEXT: retl 196; 197; X64-CLZ-LABEL: ctlz_i16: 198; X64-CLZ: # %bb.0: 199; X64-CLZ-NEXT: lzcntw %di, %ax 200; X64-CLZ-NEXT: retq 201 %tmp2 = call i16 @llvm.ctlz.i16( i16 %x, i1 true ) 202 ret i16 %tmp2 203} 204 205define i32 @ctlz_i32(i32 %x) { 206; X32-LABEL: ctlz_i32: 207; X32: # %bb.0: 208; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax 209; X32-NEXT: xorl $31, %eax 210; X32-NEXT: retl 211; 
212; X64-LABEL: ctlz_i32: 213; X64: # %bb.0: 214; X64-NEXT: bsrl %edi, %eax 215; X64-NEXT: xorl $31, %eax 216; X64-NEXT: retq 217; 218; X32-CLZ-LABEL: ctlz_i32: 219; X32-CLZ: # %bb.0: 220; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax 221; X32-CLZ-NEXT: retl 222; 223; X64-CLZ-LABEL: ctlz_i32: 224; X64-CLZ: # %bb.0: 225; X64-CLZ-NEXT: lzcntl %edi, %eax 226; X64-CLZ-NEXT: retq 227 %tmp = call i32 @llvm.ctlz.i32( i32 %x, i1 true ) 228 ret i32 %tmp 229} 230 231define i64 @ctlz_i64(i64 %x) { 232; X32-LABEL: ctlz_i64: 233; X32: # %bb.0: 234; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 235; X32-NEXT: testl %eax, %eax 236; X32-NEXT: jne .LBB7_1 237; X32-NEXT: # %bb.2: 238; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax 239; X32-NEXT: xorl $31, %eax 240; X32-NEXT: addl $32, %eax 241; X32-NEXT: xorl %edx, %edx 242; X32-NEXT: retl 243; X32-NEXT: .LBB7_1: 244; X32-NEXT: bsrl %eax, %eax 245; X32-NEXT: xorl $31, %eax 246; X32-NEXT: xorl %edx, %edx 247; X32-NEXT: retl 248; 249; X64-LABEL: ctlz_i64: 250; X64: # %bb.0: 251; X64-NEXT: bsrq %rdi, %rax 252; X64-NEXT: xorq $63, %rax 253; X64-NEXT: retq 254; 255; X32-CLZ-LABEL: ctlz_i64: 256; X32-CLZ: # %bb.0: 257; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax 258; X32-CLZ-NEXT: testl %eax, %eax 259; X32-CLZ-NEXT: jne .LBB7_1 260; X32-CLZ-NEXT: # %bb.2: 261; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax 262; X32-CLZ-NEXT: addl $32, %eax 263; X32-CLZ-NEXT: xorl %edx, %edx 264; X32-CLZ-NEXT: retl 265; X32-CLZ-NEXT: .LBB7_1: 266; X32-CLZ-NEXT: lzcntl %eax, %eax 267; X32-CLZ-NEXT: xorl %edx, %edx 268; X32-CLZ-NEXT: retl 269; 270; X64-CLZ-LABEL: ctlz_i64: 271; X64-CLZ: # %bb.0: 272; X64-CLZ-NEXT: lzcntq %rdi, %rax 273; X64-CLZ-NEXT: retq 274 %tmp = call i64 @llvm.ctlz.i64( i64 %x, i1 true ) 275 ret i64 %tmp 276} 277 278; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. 
279define i8 @ctlz_i8_zero_test(i8 %n) { 280; X32-LABEL: ctlz_i8_zero_test: 281; X32: # %bb.0: 282; X32-NEXT: movb {{[0-9]+}}(%esp), %al 283; X32-NEXT: testb %al, %al 284; X32-NEXT: je .LBB8_1 285; X32-NEXT: # %bb.2: # %cond.false 286; X32-NEXT: movzbl %al, %eax 287; X32-NEXT: bsrl %eax, %eax 288; X32-NEXT: xorl $7, %eax 289; X32-NEXT: # kill: def $al killed $al killed $eax 290; X32-NEXT: retl 291; X32-NEXT: .LBB8_1: 292; X32-NEXT: movb $8, %al 293; X32-NEXT: # kill: def $al killed $al killed $eax 294; X32-NEXT: retl 295; 296; X64-LABEL: ctlz_i8_zero_test: 297; X64: # %bb.0: 298; X64-NEXT: testb %dil, %dil 299; X64-NEXT: je .LBB8_1 300; X64-NEXT: # %bb.2: # %cond.false 301; X64-NEXT: movzbl %dil, %eax 302; X64-NEXT: bsrl %eax, %eax 303; X64-NEXT: xorl $7, %eax 304; X64-NEXT: # kill: def $al killed $al killed $eax 305; X64-NEXT: retq 306; X64-NEXT: .LBB8_1: 307; X64-NEXT: movb $8, %al 308; X64-NEXT: # kill: def $al killed $al killed $eax 309; X64-NEXT: retq 310; 311; X32-CLZ-LABEL: ctlz_i8_zero_test: 312; X32-CLZ: # %bb.0: 313; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax 314; X32-CLZ-NEXT: lzcntl %eax, %eax 315; X32-CLZ-NEXT: addl $-24, %eax 316; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax 317; X32-CLZ-NEXT: retl 318; 319; X64-CLZ-LABEL: ctlz_i8_zero_test: 320; X64-CLZ: # %bb.0: 321; X64-CLZ-NEXT: movzbl %dil, %eax 322; X64-CLZ-NEXT: lzcntl %eax, %eax 323; X64-CLZ-NEXT: addl $-24, %eax 324; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax 325; X64-CLZ-NEXT: retq 326 %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false) 327 ret i8 %tmp1 328} 329 330; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. 
331define i16 @ctlz_i16_zero_test(i16 %n) { 332; X32-LABEL: ctlz_i16_zero_test: 333; X32: # %bb.0: 334; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax 335; X32-NEXT: testw %ax, %ax 336; X32-NEXT: je .LBB9_1 337; X32-NEXT: # %bb.2: # %cond.false 338; X32-NEXT: bsrw %ax, %ax 339; X32-NEXT: xorl $15, %eax 340; X32-NEXT: # kill: def $ax killed $ax killed $eax 341; X32-NEXT: retl 342; X32-NEXT: .LBB9_1: 343; X32-NEXT: movw $16, %ax 344; X32-NEXT: # kill: def $ax killed $ax killed $eax 345; X32-NEXT: retl 346; 347; X64-LABEL: ctlz_i16_zero_test: 348; X64: # %bb.0: 349; X64-NEXT: testw %di, %di 350; X64-NEXT: je .LBB9_1 351; X64-NEXT: # %bb.2: # %cond.false 352; X64-NEXT: bsrw %di, %ax 353; X64-NEXT: xorl $15, %eax 354; X64-NEXT: # kill: def $ax killed $ax killed $eax 355; X64-NEXT: retq 356; X64-NEXT: .LBB9_1: 357; X64-NEXT: movw $16, %ax 358; X64-NEXT: # kill: def $ax killed $ax killed $eax 359; X64-NEXT: retq 360; 361; X32-CLZ-LABEL: ctlz_i16_zero_test: 362; X32-CLZ: # %bb.0: 363; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax 364; X32-CLZ-NEXT: retl 365; 366; X64-CLZ-LABEL: ctlz_i16_zero_test: 367; X64-CLZ: # %bb.0: 368; X64-CLZ-NEXT: lzcntw %di, %ax 369; X64-CLZ-NEXT: retq 370 %tmp1 = call i16 @llvm.ctlz.i16(i16 %n, i1 false) 371 ret i16 %tmp1 372} 373 374; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. 
375define i32 @ctlz_i32_zero_test(i32 %n) { 376; X32-LABEL: ctlz_i32_zero_test: 377; X32: # %bb.0: 378; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 379; X32-NEXT: testl %eax, %eax 380; X32-NEXT: je .LBB10_1 381; X32-NEXT: # %bb.2: # %cond.false 382; X32-NEXT: bsrl %eax, %eax 383; X32-NEXT: xorl $31, %eax 384; X32-NEXT: retl 385; X32-NEXT: .LBB10_1: 386; X32-NEXT: movl $32, %eax 387; X32-NEXT: retl 388; 389; X64-LABEL: ctlz_i32_zero_test: 390; X64: # %bb.0: 391; X64-NEXT: testl %edi, %edi 392; X64-NEXT: je .LBB10_1 393; X64-NEXT: # %bb.2: # %cond.false 394; X64-NEXT: bsrl %edi, %eax 395; X64-NEXT: xorl $31, %eax 396; X64-NEXT: retq 397; X64-NEXT: .LBB10_1: 398; X64-NEXT: movl $32, %eax 399; X64-NEXT: retq 400; 401; X32-CLZ-LABEL: ctlz_i32_zero_test: 402; X32-CLZ: # %bb.0: 403; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax 404; X32-CLZ-NEXT: retl 405; 406; X64-CLZ-LABEL: ctlz_i32_zero_test: 407; X64-CLZ: # %bb.0: 408; X64-CLZ-NEXT: lzcntl %edi, %eax 409; X64-CLZ-NEXT: retq 410 %tmp1 = call i32 @llvm.ctlz.i32(i32 %n, i1 false) 411 ret i32 %tmp1 412} 413 414; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. 
415define i64 @ctlz_i64_zero_test(i64 %n) { 416; X32-LABEL: ctlz_i64_zero_test: 417; X32: # %bb.0: 418; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx 419; X32-NEXT: bsrl {{[0-9]+}}(%esp), %edx 420; X32-NEXT: movl $63, %eax 421; X32-NEXT: je .LBB11_2 422; X32-NEXT: # %bb.1: 423; X32-NEXT: movl %edx, %eax 424; X32-NEXT: .LBB11_2: 425; X32-NEXT: testl %ecx, %ecx 426; X32-NEXT: jne .LBB11_3 427; X32-NEXT: # %bb.4: 428; X32-NEXT: xorl $31, %eax 429; X32-NEXT: addl $32, %eax 430; X32-NEXT: xorl %edx, %edx 431; X32-NEXT: retl 432; X32-NEXT: .LBB11_3: 433; X32-NEXT: bsrl %ecx, %eax 434; X32-NEXT: xorl $31, %eax 435; X32-NEXT: xorl %edx, %edx 436; X32-NEXT: retl 437; 438; X64-LABEL: ctlz_i64_zero_test: 439; X64: # %bb.0: 440; X64-NEXT: testq %rdi, %rdi 441; X64-NEXT: je .LBB11_1 442; X64-NEXT: # %bb.2: # %cond.false 443; X64-NEXT: bsrq %rdi, %rax 444; X64-NEXT: xorq $63, %rax 445; X64-NEXT: retq 446; X64-NEXT: .LBB11_1: 447; X64-NEXT: movl $64, %eax 448; X64-NEXT: retq 449; 450; X32-CLZ-LABEL: ctlz_i64_zero_test: 451; X32-CLZ: # %bb.0: 452; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax 453; X32-CLZ-NEXT: testl %eax, %eax 454; X32-CLZ-NEXT: jne .LBB11_1 455; X32-CLZ-NEXT: # %bb.2: 456; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax 457; X32-CLZ-NEXT: addl $32, %eax 458; X32-CLZ-NEXT: xorl %edx, %edx 459; X32-CLZ-NEXT: retl 460; X32-CLZ-NEXT: .LBB11_1: 461; X32-CLZ-NEXT: lzcntl %eax, %eax 462; X32-CLZ-NEXT: xorl %edx, %edx 463; X32-CLZ-NEXT: retl 464; 465; X64-CLZ-LABEL: ctlz_i64_zero_test: 466; X64-CLZ: # %bb.0: 467; X64-CLZ-NEXT: lzcntq %rdi, %rax 468; X64-CLZ-NEXT: retq 469 %tmp1 = call i64 @llvm.ctlz.i64(i64 %n, i1 false) 470 ret i64 %tmp1 471} 472 473; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. 
474define i8 @cttz_i8_zero_test(i8 %n) { 475; X32-LABEL: cttz_i8_zero_test: 476; X32: # %bb.0: 477; X32-NEXT: movb {{[0-9]+}}(%esp), %al 478; X32-NEXT: testb %al, %al 479; X32-NEXT: je .LBB12_1 480; X32-NEXT: # %bb.2: # %cond.false 481; X32-NEXT: movzbl %al, %eax 482; X32-NEXT: bsfl %eax, %eax 483; X32-NEXT: # kill: def $al killed $al killed $eax 484; X32-NEXT: retl 485; X32-NEXT: .LBB12_1: 486; X32-NEXT: movb $8, %al 487; X32-NEXT: # kill: def $al killed $al killed $eax 488; X32-NEXT: retl 489; 490; X64-LABEL: cttz_i8_zero_test: 491; X64: # %bb.0: 492; X64-NEXT: testb %dil, %dil 493; X64-NEXT: je .LBB12_1 494; X64-NEXT: # %bb.2: # %cond.false 495; X64-NEXT: movzbl %dil, %eax 496; X64-NEXT: bsfl %eax, %eax 497; X64-NEXT: # kill: def $al killed $al killed $eax 498; X64-NEXT: retq 499; X64-NEXT: .LBB12_1: 500; X64-NEXT: movb $8, %al 501; X64-NEXT: # kill: def $al killed $al killed $eax 502; X64-NEXT: retq 503; 504; X32-CLZ-LABEL: cttz_i8_zero_test: 505; X32-CLZ: # %bb.0: 506; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax 507; X32-CLZ-NEXT: orl $256, %eax # imm = 0x100 508; X32-CLZ-NEXT: tzcntl %eax, %eax 509; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax 510; X32-CLZ-NEXT: retl 511; 512; X64-CLZ-LABEL: cttz_i8_zero_test: 513; X64-CLZ: # %bb.0: 514; X64-CLZ-NEXT: movzbl %dil, %eax 515; X64-CLZ-NEXT: orl $256, %eax # imm = 0x100 516; X64-CLZ-NEXT: tzcntl %eax, %eax 517; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax 518; X64-CLZ-NEXT: retq 519 %tmp1 = call i8 @llvm.cttz.i8(i8 %n, i1 false) 520 ret i8 %tmp1 521} 522 523; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. 
524define i16 @cttz_i16_zero_test(i16 %n) { 525; X32-LABEL: cttz_i16_zero_test: 526; X32: # %bb.0: 527; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax 528; X32-NEXT: testw %ax, %ax 529; X32-NEXT: je .LBB13_1 530; X32-NEXT: # %bb.2: # %cond.false 531; X32-NEXT: bsfw %ax, %ax 532; X32-NEXT: retl 533; X32-NEXT: .LBB13_1: 534; X32-NEXT: movw $16, %ax 535; X32-NEXT: retl 536; 537; X64-LABEL: cttz_i16_zero_test: 538; X64: # %bb.0: 539; X64-NEXT: testw %di, %di 540; X64-NEXT: je .LBB13_1 541; X64-NEXT: # %bb.2: # %cond.false 542; X64-NEXT: bsfw %di, %ax 543; X64-NEXT: retq 544; X64-NEXT: .LBB13_1: 545; X64-NEXT: movw $16, %ax 546; X64-NEXT: retq 547; 548; X32-CLZ-LABEL: cttz_i16_zero_test: 549; X32-CLZ: # %bb.0: 550; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax 551; X32-CLZ-NEXT: retl 552; 553; X64-CLZ-LABEL: cttz_i16_zero_test: 554; X64-CLZ: # %bb.0: 555; X64-CLZ-NEXT: tzcntw %di, %ax 556; X64-CLZ-NEXT: retq 557 %tmp1 = call i16 @llvm.cttz.i16(i16 %n, i1 false) 558 ret i16 %tmp1 559} 560 561; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. 
562define i32 @cttz_i32_zero_test(i32 %n) { 563; X32-LABEL: cttz_i32_zero_test: 564; X32: # %bb.0: 565; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 566; X32-NEXT: testl %eax, %eax 567; X32-NEXT: je .LBB14_1 568; X32-NEXT: # %bb.2: # %cond.false 569; X32-NEXT: bsfl %eax, %eax 570; X32-NEXT: retl 571; X32-NEXT: .LBB14_1: 572; X32-NEXT: movl $32, %eax 573; X32-NEXT: retl 574; 575; X64-LABEL: cttz_i32_zero_test: 576; X64: # %bb.0: 577; X64-NEXT: testl %edi, %edi 578; X64-NEXT: je .LBB14_1 579; X64-NEXT: # %bb.2: # %cond.false 580; X64-NEXT: bsfl %edi, %eax 581; X64-NEXT: retq 582; X64-NEXT: .LBB14_1: 583; X64-NEXT: movl $32, %eax 584; X64-NEXT: retq 585; 586; X32-CLZ-LABEL: cttz_i32_zero_test: 587; X32-CLZ: # %bb.0: 588; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax 589; X32-CLZ-NEXT: retl 590; 591; X64-CLZ-LABEL: cttz_i32_zero_test: 592; X64-CLZ: # %bb.0: 593; X64-CLZ-NEXT: tzcntl %edi, %eax 594; X64-CLZ-NEXT: retq 595 %tmp1 = call i32 @llvm.cttz.i32(i32 %n, i1 false) 596 ret i32 %tmp1 597} 598 599; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. 
600define i64 @cttz_i64_zero_test(i64 %n) { 601; X32-LABEL: cttz_i64_zero_test: 602; X32: # %bb.0: 603; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx 604; X32-NEXT: bsfl {{[0-9]+}}(%esp), %edx 605; X32-NEXT: movl $32, %eax 606; X32-NEXT: je .LBB15_2 607; X32-NEXT: # %bb.1: 608; X32-NEXT: movl %edx, %eax 609; X32-NEXT: .LBB15_2: 610; X32-NEXT: testl %ecx, %ecx 611; X32-NEXT: jne .LBB15_3 612; X32-NEXT: # %bb.4: 613; X32-NEXT: addl $32, %eax 614; X32-NEXT: xorl %edx, %edx 615; X32-NEXT: retl 616; X32-NEXT: .LBB15_3: 617; X32-NEXT: bsfl %ecx, %eax 618; X32-NEXT: xorl %edx, %edx 619; X32-NEXT: retl 620; 621; X64-LABEL: cttz_i64_zero_test: 622; X64: # %bb.0: 623; X64-NEXT: testq %rdi, %rdi 624; X64-NEXT: je .LBB15_1 625; X64-NEXT: # %bb.2: # %cond.false 626; X64-NEXT: bsfq %rdi, %rax 627; X64-NEXT: retq 628; X64-NEXT: .LBB15_1: 629; X64-NEXT: movl $64, %eax 630; X64-NEXT: retq 631; 632; X32-CLZ-LABEL: cttz_i64_zero_test: 633; X32-CLZ: # %bb.0: 634; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax 635; X32-CLZ-NEXT: testl %eax, %eax 636; X32-CLZ-NEXT: jne .LBB15_1 637; X32-CLZ-NEXT: # %bb.2: 638; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax 639; X32-CLZ-NEXT: addl $32, %eax 640; X32-CLZ-NEXT: xorl %edx, %edx 641; X32-CLZ-NEXT: retl 642; X32-CLZ-NEXT: .LBB15_1: 643; X32-CLZ-NEXT: tzcntl %eax, %eax 644; X32-CLZ-NEXT: xorl %edx, %edx 645; X32-CLZ-NEXT: retl 646; 647; X64-CLZ-LABEL: cttz_i64_zero_test: 648; X64-CLZ: # %bb.0: 649; X64-CLZ-NEXT: tzcntq %rdi, %rax 650; X64-CLZ-NEXT: retq 651 %tmp1 = call i64 @llvm.cttz.i64(i64 %n, i1 false) 652 ret i64 %tmp1 653} 654 655; Don't generate the cmovne when the source is known non-zero (and bsr would 656; not set ZF). 657; rdar://9490949 658; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and 659; codegen doesn't know how to delete the movl and je. 
660define i32 @ctlz_i32_fold_cmov(i32 %n) { 661; X32-LABEL: ctlz_i32_fold_cmov: 662; X32: # %bb.0: 663; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 664; X32-NEXT: orl $1, %eax 665; X32-NEXT: je .LBB16_1 666; X32-NEXT: # %bb.2: # %cond.false 667; X32-NEXT: bsrl %eax, %eax 668; X32-NEXT: xorl $31, %eax 669; X32-NEXT: retl 670; X32-NEXT: .LBB16_1: 671; X32-NEXT: movl $32, %eax 672; X32-NEXT: retl 673; 674; X64-LABEL: ctlz_i32_fold_cmov: 675; X64: # %bb.0: 676; X64-NEXT: orl $1, %edi 677; X64-NEXT: je .LBB16_1 678; X64-NEXT: # %bb.2: # %cond.false 679; X64-NEXT: bsrl %edi, %eax 680; X64-NEXT: xorl $31, %eax 681; X64-NEXT: retq 682; X64-NEXT: .LBB16_1: 683; X64-NEXT: movl $32, %eax 684; X64-NEXT: retq 685; 686; X32-CLZ-LABEL: ctlz_i32_fold_cmov: 687; X32-CLZ: # %bb.0: 688; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax 689; X32-CLZ-NEXT: orl $1, %eax 690; X32-CLZ-NEXT: lzcntl %eax, %eax 691; X32-CLZ-NEXT: retl 692; 693; X64-CLZ-LABEL: ctlz_i32_fold_cmov: 694; X64-CLZ: # %bb.0: 695; X64-CLZ-NEXT: orl $1, %edi 696; X64-CLZ-NEXT: lzcntl %edi, %eax 697; X64-CLZ-NEXT: retq 698 %or = or i32 %n, 1 699 %tmp1 = call i32 @llvm.ctlz.i32(i32 %or, i1 false) 700 ret i32 %tmp1 701} 702 703; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute 704; the most significant bit, which is what 'bsr' does natively. 705; FIXME: We should probably select BSR instead of LZCNT in these circumstances. 
706define i32 @ctlz_bsr(i32 %n) { 707; X32-LABEL: ctlz_bsr: 708; X32: # %bb.0: 709; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax 710; X32-NEXT: retl 711; 712; X64-LABEL: ctlz_bsr: 713; X64: # %bb.0: 714; X64-NEXT: bsrl %edi, %eax 715; X64-NEXT: retq 716; 717; X32-CLZ-LABEL: ctlz_bsr: 718; X32-CLZ: # %bb.0: 719; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax 720; X32-CLZ-NEXT: xorl $31, %eax 721; X32-CLZ-NEXT: retl 722; 723; X64-CLZ-LABEL: ctlz_bsr: 724; X64-CLZ: # %bb.0: 725; X64-CLZ-NEXT: lzcntl %edi, %eax 726; X64-CLZ-NEXT: xorl $31, %eax 727; X64-CLZ-NEXT: retq 728 %ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 true) 729 %bsr = xor i32 %ctlz, 31 730 ret i32 %bsr 731} 732 733; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. 734; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and 735; codegen doesn't know how to combine the $32 and $31 into $63. 736define i32 @ctlz_bsr_zero_test(i32 %n) { 737; X32-LABEL: ctlz_bsr_zero_test: 738; X32: # %bb.0: 739; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 740; X32-NEXT: testl %eax, %eax 741; X32-NEXT: je .LBB18_1 742; X32-NEXT: # %bb.2: # %cond.false 743; X32-NEXT: bsrl %eax, %eax 744; X32-NEXT: xorl $31, %eax 745; X32-NEXT: xorl $31, %eax 746; X32-NEXT: retl 747; X32-NEXT: .LBB18_1: 748; X32-NEXT: movl $32, %eax 749; X32-NEXT: xorl $31, %eax 750; X32-NEXT: retl 751; 752; X64-LABEL: ctlz_bsr_zero_test: 753; X64: # %bb.0: 754; X64-NEXT: testl %edi, %edi 755; X64-NEXT: je .LBB18_1 756; X64-NEXT: # %bb.2: # %cond.false 757; X64-NEXT: bsrl %edi, %eax 758; X64-NEXT: xorl $31, %eax 759; X64-NEXT: xorl $31, %eax 760; X64-NEXT: retq 761; X64-NEXT: .LBB18_1: 762; X64-NEXT: movl $32, %eax 763; X64-NEXT: xorl $31, %eax 764; X64-NEXT: retq 765; 766; X32-CLZ-LABEL: ctlz_bsr_zero_test: 767; X32-CLZ: # %bb.0: 768; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax 769; X32-CLZ-NEXT: xorl $31, %eax 770; X32-CLZ-NEXT: retl 771; 772; X64-CLZ-LABEL: ctlz_bsr_zero_test: 773; X64-CLZ: # %bb.0: 774; X64-CLZ-NEXT: 
lzcntl %edi, %eax 775; X64-CLZ-NEXT: xorl $31, %eax 776; X64-CLZ-NEXT: retq 777 %ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 false) 778 %bsr = xor i32 %ctlz, 31 779 ret i32 %bsr 780} 781 782define i8 @cttz_i8_knownbits(i8 %x) { 783; X32-LABEL: cttz_i8_knownbits: 784; X32: # %bb.0: 785; X32-NEXT: movb {{[0-9]+}}(%esp), %al 786; X32-NEXT: orb $2, %al 787; X32-NEXT: movzbl %al, %eax 788; X32-NEXT: bsfl %eax, %eax 789; X32-NEXT: # kill: def $al killed $al killed $eax 790; X32-NEXT: retl 791; 792; X64-LABEL: cttz_i8_knownbits: 793; X64: # %bb.0: 794; X64-NEXT: orb $2, %dil 795; X64-NEXT: movzbl %dil, %eax 796; X64-NEXT: bsfl %eax, %eax 797; X64-NEXT: # kill: def $al killed $al killed $eax 798; X64-NEXT: retq 799; 800; X32-CLZ-LABEL: cttz_i8_knownbits: 801; X32-CLZ: # %bb.0: 802; X32-CLZ-NEXT: movb {{[0-9]+}}(%esp), %al 803; X32-CLZ-NEXT: orb $2, %al 804; X32-CLZ-NEXT: movzbl %al, %eax 805; X32-CLZ-NEXT: tzcntl %eax, %eax 806; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax 807; X32-CLZ-NEXT: retl 808; 809; X64-CLZ-LABEL: cttz_i8_knownbits: 810; X64-CLZ: # %bb.0: 811; X64-CLZ-NEXT: orb $2, %dil 812; X64-CLZ-NEXT: movzbl %dil, %eax 813; X64-CLZ-NEXT: tzcntl %eax, %eax 814; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax 815; X64-CLZ-NEXT: retq 816 %x2 = or i8 %x, 2 817 %tmp = call i8 @llvm.cttz.i8(i8 %x2, i1 true ) 818 %tmp2 = and i8 %tmp, 1 819 ret i8 %tmp2 820} 821 822define i8 @ctlz_i8_knownbits(i8 %x) { 823; X32-LABEL: ctlz_i8_knownbits: 824; X32: # %bb.0: 825; X32-NEXT: movb {{[0-9]+}}(%esp), %al 826; X32-NEXT: orb $64, %al 827; X32-NEXT: movzbl %al, %eax 828; X32-NEXT: bsrl %eax, %eax 829; X32-NEXT: xorl $7, %eax 830; X32-NEXT: # kill: def $al killed $al killed $eax 831; X32-NEXT: retl 832; 833; X64-LABEL: ctlz_i8_knownbits: 834; X64: # %bb.0: 835; X64-NEXT: orb $64, %dil 836; X64-NEXT: movzbl %dil, %eax 837; X64-NEXT: bsrl %eax, %eax 838; X64-NEXT: xorl $7, %eax 839; X64-NEXT: # kill: def $al killed $al killed $eax 840; X64-NEXT: retq 841; 842; 
X32-CLZ-LABEL: ctlz_i8_knownbits: 843; X32-CLZ: # %bb.0: 844; X32-CLZ-NEXT: movb {{[0-9]+}}(%esp), %al 845; X32-CLZ-NEXT: orb $64, %al 846; X32-CLZ-NEXT: movzbl %al, %eax 847; X32-CLZ-NEXT: lzcntl %eax, %eax 848; X32-CLZ-NEXT: addl $-24, %eax 849; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax 850; X32-CLZ-NEXT: retl 851; 852; X64-CLZ-LABEL: ctlz_i8_knownbits: 853; X64-CLZ: # %bb.0: 854; X64-CLZ-NEXT: orb $64, %dil 855; X64-CLZ-NEXT: movzbl %dil, %eax 856; X64-CLZ-NEXT: lzcntl %eax, %eax 857; X64-CLZ-NEXT: addl $-24, %eax 858; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax 859; X64-CLZ-NEXT: retq 860 861 %x2 = or i8 %x, 64 862 %tmp = call i8 @llvm.ctlz.i8(i8 %x2, i1 true ) 863 %tmp2 = and i8 %tmp, 1 864 ret i8 %tmp2 865} 866 867; Make sure we can detect that the input is non-zero and avoid cmov after BSR 868; This is relevant for 32-bit mode without lzcnt 869define i64 @ctlz_i64_zero_test_knownneverzero(i64 %n) { 870; X32-LABEL: ctlz_i64_zero_test_knownneverzero: 871; X32: # %bb.0: 872; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 873; X32-NEXT: testl %eax, %eax 874; X32-NEXT: jne .LBB21_1 875; X32-NEXT: # %bb.2: 876; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 877; X32-NEXT: orl $1, %eax 878; X32-NEXT: bsrl %eax, %eax 879; X32-NEXT: xorl $31, %eax 880; X32-NEXT: orl $32, %eax 881; X32-NEXT: xorl %edx, %edx 882; X32-NEXT: retl 883; X32-NEXT: .LBB21_1: 884; X32-NEXT: bsrl %eax, %eax 885; X32-NEXT: xorl $31, %eax 886; X32-NEXT: xorl %edx, %edx 887; X32-NEXT: retl 888; 889; X64-LABEL: ctlz_i64_zero_test_knownneverzero: 890; X64: # %bb.0: 891; X64-NEXT: orq $1, %rdi 892; X64-NEXT: je .LBB21_1 893; X64-NEXT: # %bb.2: # %cond.false 894; X64-NEXT: bsrq %rdi, %rax 895; X64-NEXT: xorq $63, %rax 896; X64-NEXT: retq 897; X64-NEXT: .LBB21_1: 898; X64-NEXT: movl $64, %eax 899; X64-NEXT: retq 900; 901; X32-CLZ-LABEL: ctlz_i64_zero_test_knownneverzero: 902; X32-CLZ: # %bb.0: 903; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax 904; X32-CLZ-NEXT: testl %eax, %eax 905; X32-CLZ-NEXT: jne 
.LBB21_1 906; X32-CLZ-NEXT: # %bb.2: 907; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax 908; X32-CLZ-NEXT: orl $1, %eax 909; X32-CLZ-NEXT: lzcntl %eax, %eax 910; X32-CLZ-NEXT: orl $32, %eax 911; X32-CLZ-NEXT: xorl %edx, %edx 912; X32-CLZ-NEXT: retl 913; X32-CLZ-NEXT: .LBB21_1: 914; X32-CLZ-NEXT: lzcntl %eax, %eax 915; X32-CLZ-NEXT: xorl %edx, %edx 916; X32-CLZ-NEXT: retl 917; 918; X64-CLZ-LABEL: ctlz_i64_zero_test_knownneverzero: 919; X64-CLZ: # %bb.0: 920; X64-CLZ-NEXT: orq $1, %rdi 921; X64-CLZ-NEXT: lzcntq %rdi, %rax 922; X64-CLZ-NEXT: retq 923 %o = or i64 %n, 1 924 %tmp1 = call i64 @llvm.ctlz.i64(i64 %o, i1 false) 925 ret i64 %tmp1 926} 927 928; Make sure we can detect that the input is non-zero and avoid cmov after BSF 929; This is relevant for 32-bit mode without tzcnt 930define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) { 931; X32-LABEL: cttz_i64_zero_test_knownneverzero: 932; X32: # %bb.0: 933; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 934; X32-NEXT: testl %eax, %eax 935; X32-NEXT: jne .LBB22_1 936; X32-NEXT: # %bb.2: 937; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000 938; X32-NEXT: orl {{[0-9]+}}(%esp), %eax 939; X32-NEXT: bsfl %eax, %eax 940; X32-NEXT: orl $32, %eax 941; X32-NEXT: xorl %edx, %edx 942; X32-NEXT: retl 943; X32-NEXT: .LBB22_1: 944; X32-NEXT: bsfl %eax, %eax 945; X32-NEXT: xorl %edx, %edx 946; X32-NEXT: retl 947; 948; X64-LABEL: cttz_i64_zero_test_knownneverzero: 949; X64: # %bb.0: 950; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 951; X64-NEXT: orq %rdi, %rax 952; X64-NEXT: je .LBB22_1 953; X64-NEXT: # %bb.2: # %cond.false 954; X64-NEXT: bsfq %rax, %rax 955; X64-NEXT: retq 956; X64-NEXT: .LBB22_1: 957; X64-NEXT: movl $64, %eax 958; X64-NEXT: retq 959; 960; X32-CLZ-LABEL: cttz_i64_zero_test_knownneverzero: 961; X32-CLZ: # %bb.0: 962; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax 963; X32-CLZ-NEXT: testl %eax, %eax 964; X32-CLZ-NEXT: jne .LBB22_1 965; X32-CLZ-NEXT: # %bb.2: 966; X32-CLZ-NEXT: movl $-2147483648, %eax 
# imm = 0x80000000 967; X32-CLZ-NEXT: orl {{[0-9]+}}(%esp), %eax 968; X32-CLZ-NEXT: tzcntl %eax, %eax 969; X32-CLZ-NEXT: orl $32, %eax 970; X32-CLZ-NEXT: xorl %edx, %edx 971; X32-CLZ-NEXT: retl 972; X32-CLZ-NEXT: .LBB22_1: 973; X32-CLZ-NEXT: tzcntl %eax, %eax 974; X32-CLZ-NEXT: xorl %edx, %edx 975; X32-CLZ-NEXT: retl 976; 977; X64-CLZ-LABEL: cttz_i64_zero_test_knownneverzero: 978; X64-CLZ: # %bb.0: 979; X64-CLZ-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 980; X64-CLZ-NEXT: orq %rdi, %rax 981; X64-CLZ-NEXT: tzcntq %rax, %rax 982; X64-CLZ-NEXT: retq 983 %o = or i64 %n, -9223372036854775808 ; 0x8000000000000000 984 %tmp1 = call i64 @llvm.cttz.i64(i64 %o, i1 false) 985 ret i64 %tmp1 986} 987