; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE:       # BB#0:
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    shlb $7, %al
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shlb $5, %cl
; SSE-NEXT:    andb $64, %cl
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shlb $3, %dl
; SSE-NEXT:    andb $32, %dl
; SSE-NEXT:    orb %cl, %dl
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    addb %cl, %cl
; SSE-NEXT:    andb $16, %cl
; SSE-NEXT:    orb %dl, %cl
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrb %dl
; SSE-NEXT:    andb $8, %dl
; SSE-NEXT:    orb %cl, %dl
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrb $3, %cl
; SSE-NEXT:    andb $4, %cl
; SSE-NEXT:    orb %dl, %cl
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrb $5, %dl
; SSE-NEXT:    andb $2, %dl
; SSE-NEXT:    orb %cl, %dl
; SSE-NEXT:    shrb $7, %dil
; SSE-NEXT:    orb %dl, %dil
; SSE-NEXT:    orb %al, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX:       # BB#0:
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    shlb $7, %al
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shlb $5, %cl
; AVX-NEXT:    andb $64, %cl
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shlb $3, %dl
; AVX-NEXT:    andb $32, %dl
; AVX-NEXT:    orb %cl, %dl
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    addb %cl, %cl
; AVX-NEXT:    andb $16, %cl
; AVX-NEXT:    orb %dl, %cl
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrb %dl
; AVX-NEXT:    andb $8, %dl
; AVX-NEXT:    orb %cl, %dl
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrb $3, %cl
; AVX-NEXT:    andb $4, %cl
; AVX-NEXT:    orb %dl, %cl
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrb $5, %dl
; AVX-NEXT:    andb $2, %dl
; AVX-NEXT:    orb %cl, %dl
; AVX-NEXT:    shrb $7, %dil
; AVX-NEXT:    orb %dl, %dil
; AVX-NEXT:    orb %al, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP:       # BB#0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vpextrb $0, %xmm0, %eax
; XOP-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; XOP-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE:       # BB#0:
; SSE-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $32768, %ecx # imm = 0x8000
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    shll $15, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $2, %edx
; SSE-NEXT:    shll $13, %edx
; SSE-NEXT:    leal (%rdx,%rax), %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $4, %edx
; SSE-NEXT:    shll $11, %edx
; SSE-NEXT:    orl %edx, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $8, %edx
; SSE-NEXT:    shll $9, %edx
; SSE-NEXT:    orl %edx, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $16, %edx
; SSE-NEXT:    shll $7, %edx
; SSE-NEXT:    orl %edx, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $32, %edx
; SSE-NEXT:    shll $5, %edx
; SSE-NEXT:    orl %edx, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $64, %edx
; SSE-NEXT:    shll $3, %edx
; SSE-NEXT:    leal (%rdi,%rdi), %esi
; SSE-NEXT:    andl $256, %esi # imm = 0x100
; SSE-NEXT:    orl %edx, %esi
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl %edx
; SSE-NEXT:    andl $128, %edx
; SSE-NEXT:    orl %esi, %edx
; SSE-NEXT:    movl %edi, %esi
; SSE-NEXT:    shrl $3, %esi
; SSE-NEXT:    andl $64, %esi
; SSE-NEXT:    orl %edx, %esi
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $5, %edx
; SSE-NEXT:    andl $32, %edx
; SSE-NEXT:    orl %esi, %edx
; SSE-NEXT:    movl %edi, %esi
; SSE-NEXT:    shrl $7, %esi
; SSE-NEXT:    andl $16, %esi
; SSE-NEXT:    orl %edx, %esi
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $9, %edx
; SSE-NEXT:    andl $8, %edx
; SSE-NEXT:    orl %esi, %edx
; SSE-NEXT:    movl %edi, %esi
; SSE-NEXT:    shrl $11, %esi
; SSE-NEXT:    andl $4, %esi
; SSE-NEXT:    orl %edx, %esi
; SSE-NEXT:    shrl $13, %edi
; SSE-NEXT:    andl $2, %edi
; SSE-NEXT:    orl %esi, %edi
; SSE-NEXT:    shrl $15, %ecx
; SSE-NEXT:    orl %edi, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX:       # BB#0:
; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $32768, %ecx # imm = 0x8000
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    shll $15, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $2, %edx
; AVX-NEXT:    shll $13, %edx
; AVX-NEXT:    leal (%rdx,%rax), %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $4, %edx
; AVX-NEXT:    shll $11, %edx
; AVX-NEXT:    orl %edx, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $8, %edx
; AVX-NEXT:    shll $9, %edx
; AVX-NEXT:    orl %edx, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $16, %edx
; AVX-NEXT:    shll $7, %edx
; AVX-NEXT:    orl %edx, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $32, %edx
; AVX-NEXT:    shll $5, %edx
; AVX-NEXT:    orl %edx, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $64, %edx
; AVX-NEXT:    shll $3, %edx
; AVX-NEXT:    leal (%rdi,%rdi), %esi
; AVX-NEXT:    andl $256, %esi # imm = 0x100
; AVX-NEXT:    orl %edx, %esi
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl %edx
; AVX-NEXT:    andl $128, %edx
; AVX-NEXT:    orl %esi, %edx
; AVX-NEXT:    movl %edi, %esi
; AVX-NEXT:    shrl $3, %esi
; AVX-NEXT:    andl $64, %esi
; AVX-NEXT:    orl %edx, %esi
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $5, %edx
; AVX-NEXT:    andl $32, %edx
; AVX-NEXT:    orl %esi, %edx
; AVX-NEXT:    movl %edi, %esi
; AVX-NEXT:    shrl $7, %esi
; AVX-NEXT:    andl $16, %esi
; AVX-NEXT:    orl %edx, %esi
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $9, %edx
; AVX-NEXT:    andl $8, %edx
; AVX-NEXT:    orl %esi, %edx
; AVX-NEXT:    movl %edi, %esi
; AVX-NEXT:    shrl $11, %esi
; AVX-NEXT:    andl $4, %esi
; AVX-NEXT:    orl %edx, %esi
; AVX-NEXT:    shrl $13, %edi
; AVX-NEXT:    andl $2, %edi
; AVX-NEXT:    orl %esi, %edi
; AVX-NEXT:    shrl $15, %ecx
; AVX-NEXT:    orl %edi, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP:       # BB#0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
; XOP-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE:       # BB#0:
; SSE-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    shll $31, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $2, %ecx
; SSE-NEXT:    shll $29, %ecx
; SSE-NEXT:    leal (%rcx,%rax), %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $4, %ecx
; SSE-NEXT:    shll $27, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $8, %ecx
; SSE-NEXT:    shll $25, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $16, %ecx
; SSE-NEXT:    shll $23, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $32, %ecx
; SSE-NEXT:    shll $21, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $64, %ecx
; SSE-NEXT:    shll $19, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shll $17, %edx
; SSE-NEXT:    andl $16777216, %edx # imm = 0x1000000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shll $15, %ecx
; SSE-NEXT:    andl $8388608, %ecx # imm = 0x800000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shll $13, %edx
; SSE-NEXT:    andl $4194304, %edx # imm = 0x400000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shll $11, %ecx
; SSE-NEXT:    andl $2097152, %ecx # imm = 0x200000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shll $9, %edx
; SSE-NEXT:    andl $1048576, %edx # imm = 0x100000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shll $7, %ecx
; SSE-NEXT:    andl $524288, %ecx # imm = 0x80000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shll $5, %edx
; SSE-NEXT:    andl $262144, %edx # imm = 0x40000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    leal (,%rdi,8), %ecx
; SSE-NEXT:    andl $131072, %ecx # imm = 0x20000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    leal (%rdi,%rdi), %edx
; SSE-NEXT:    andl $65536, %edx # imm = 0x10000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl %ecx
; SSE-NEXT:    andl $32768, %ecx # imm = 0x8000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $3, %edx
; SSE-NEXT:    andl $16384, %edx # imm = 0x4000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $5, %ecx
; SSE-NEXT:    andl $8192, %ecx # imm = 0x2000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $7, %edx
; SSE-NEXT:    andl $4096, %edx # imm = 0x1000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $9, %ecx
; SSE-NEXT:    andl $2048, %ecx # imm = 0x800
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $11, %edx
; SSE-NEXT:    andl $1024, %edx # imm = 0x400
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $13, %ecx
; SSE-NEXT:    andl $512, %ecx # imm = 0x200
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $15, %edx
; SSE-NEXT:    andl $256, %edx # imm = 0x100
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $17, %ecx
; SSE-NEXT:    andl $128, %ecx
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $19, %edx
; SSE-NEXT:    andl $64, %edx
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $21, %ecx
; SSE-NEXT:    andl $32, %ecx
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $23, %edx
; SSE-NEXT:    andl $16, %edx
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $25, %ecx
; SSE-NEXT:    andl $8, %ecx
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $27, %edx
; SSE-NEXT:    andl $4, %edx
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $29, %ecx
; SSE-NEXT:    andl $2, %ecx
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    shrl $31, %edi
; SSE-NEXT:    orl %ecx, %edi
; SSE-NEXT:    orl %edi, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX:       # BB#0:
; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    shll $31, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $2, %ecx
; AVX-NEXT:    shll $29, %ecx
; AVX-NEXT:    leal (%rcx,%rax), %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $4, %ecx
; AVX-NEXT:    shll $27, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $8, %ecx
; AVX-NEXT:    shll $25, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $16, %ecx
; AVX-NEXT:    shll $23, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $32, %ecx
; AVX-NEXT:    shll $21, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $64, %ecx
; AVX-NEXT:    shll $19, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shll $17, %edx
; AVX-NEXT:    andl $16777216, %edx # imm = 0x1000000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shll $15, %ecx
; AVX-NEXT:    andl $8388608, %ecx # imm = 0x800000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shll $13, %edx
; AVX-NEXT:    andl $4194304, %edx # imm = 0x400000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shll $11, %ecx
; AVX-NEXT:    andl $2097152, %ecx # imm = 0x200000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shll $9, %edx
; AVX-NEXT:    andl $1048576, %edx # imm = 0x100000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shll $7, %ecx
; AVX-NEXT:    andl $524288, %ecx # imm = 0x80000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shll $5, %edx
; AVX-NEXT:    andl $262144, %edx # imm = 0x40000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    leal (,%rdi,8), %ecx
; AVX-NEXT:    andl $131072, %ecx # imm = 0x20000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    leal (%rdi,%rdi), %edx
; AVX-NEXT:    andl $65536, %edx # imm = 0x10000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrl %ecx
; AVX-NEXT:    andl $32768, %ecx # imm = 0x8000
426; AVX-NEXT: orl %edx, %ecx 427; AVX-NEXT: movl %edi, %edx 428; AVX-NEXT: shrl $3, %edx 429; AVX-NEXT: andl $16384, %edx # imm = 0x4000 430; AVX-NEXT: orl %ecx, %edx 431; AVX-NEXT: movl %edi, %ecx 432; AVX-NEXT: shrl $5, %ecx 433; AVX-NEXT: andl $8192, %ecx # imm = 0x2000 434; AVX-NEXT: orl %edx, %ecx 435; AVX-NEXT: movl %edi, %edx 436; AVX-NEXT: shrl $7, %edx 437; AVX-NEXT: andl $4096, %edx # imm = 0x1000 438; AVX-NEXT: orl %ecx, %edx 439; AVX-NEXT: movl %edi, %ecx 440; AVX-NEXT: shrl $9, %ecx 441; AVX-NEXT: andl $2048, %ecx # imm = 0x800 442; AVX-NEXT: orl %edx, %ecx 443; AVX-NEXT: movl %edi, %edx 444; AVX-NEXT: shrl $11, %edx 445; AVX-NEXT: andl $1024, %edx # imm = 0x400 446; AVX-NEXT: orl %ecx, %edx 447; AVX-NEXT: movl %edi, %ecx 448; AVX-NEXT: shrl $13, %ecx 449; AVX-NEXT: andl $512, %ecx # imm = 0x200 450; AVX-NEXT: orl %edx, %ecx 451; AVX-NEXT: movl %edi, %edx 452; AVX-NEXT: shrl $15, %edx 453; AVX-NEXT: andl $256, %edx # imm = 0x100 454; AVX-NEXT: orl %ecx, %edx 455; AVX-NEXT: movl %edi, %ecx 456; AVX-NEXT: shrl $17, %ecx 457; AVX-NEXT: andl $128, %ecx 458; AVX-NEXT: orl %edx, %ecx 459; AVX-NEXT: movl %edi, %edx 460; AVX-NEXT: shrl $19, %edx 461; AVX-NEXT: andl $64, %edx 462; AVX-NEXT: orl %ecx, %edx 463; AVX-NEXT: movl %edi, %ecx 464; AVX-NEXT: shrl $21, %ecx 465; AVX-NEXT: andl $32, %ecx 466; AVX-NEXT: orl %edx, %ecx 467; AVX-NEXT: movl %edi, %edx 468; AVX-NEXT: shrl $23, %edx 469; AVX-NEXT: andl $16, %edx 470; AVX-NEXT: orl %ecx, %edx 471; AVX-NEXT: movl %edi, %ecx 472; AVX-NEXT: shrl $25, %ecx 473; AVX-NEXT: andl $8, %ecx 474; AVX-NEXT: orl %edx, %ecx 475; AVX-NEXT: movl %edi, %edx 476; AVX-NEXT: shrl $27, %edx 477; AVX-NEXT: andl $4, %edx 478; AVX-NEXT: orl %ecx, %edx 479; AVX-NEXT: movl %edi, %ecx 480; AVX-NEXT: shrl $29, %ecx 481; AVX-NEXT: andl $2, %ecx 482; AVX-NEXT: orl %edx, %ecx 483; AVX-NEXT: shrl $31, %edi 484; AVX-NEXT: orl %ecx, %edi 485; AVX-NEXT: orl %edi, %eax 486; AVX-NEXT: retq 487; 488; XOP-LABEL: test_bitreverse_i32: 489; XOP: # BB#0: 490; XOP-NEXT: vmovd %edi, %xmm0 491; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 492; XOP-NEXT: vmovd %xmm0, %eax 493; XOP-NEXT: retq 494 %b = call i32 @llvm.bitreverse.i32(i32 %a) 495 ret i32 %b 496} 497 498define i64 @test_bitreverse_i64(i64 %a) nounwind { 499; SSE-LABEL: test_bitreverse_i64: 500; SSE: # BB#0: 501; SSE-NEXT: leaq (%rdi,%rdi), %rax 502; SSE-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 503; SSE-NEXT: andq %rax, %rcx 504; SSE-NEXT: movq %rdi, %rax 505; SSE-NEXT: shlq $63, %rax 506; SSE-NEXT: movq %rdi, %rdx 507; SSE-NEXT: andq $2, %rdx 508; SSE-NEXT: shlq $61, %rdx 509; SSE-NEXT: leaq (%rdx,%rax), %rax 510; SSE-NEXT: movq %rdi, %rdx 511; SSE-NEXT: andq $4, %rdx 512; SSE-NEXT: shlq $59, %rdx 513; SSE-NEXT: orq %rdx, %rax 514; SSE-NEXT: movq %rdi, %rdx 515; SSE-NEXT: andq $8, %rdx 516; SSE-NEXT: shlq $57, %rdx 517; SSE-NEXT: orq %rdx, %rax 518; SSE-NEXT: movq %rdi, %rdx 519; SSE-NEXT: andq $16, %rdx 520; SSE-NEXT: shlq $55, %rdx 521; SSE-NEXT: orq %rdx, %rax 522; SSE-NEXT: movq %rdi, %rdx 523; SSE-NEXT: andq $32, %rdx 524; SSE-NEXT: shlq $53, %rdx 525; SSE-NEXT: orq %rdx, %rax 526; SSE-NEXT: movq %rdi, %rdx 527; SSE-NEXT: andq $64, %rdx 528; SSE-NEXT: shlq $51, %rdx 529; SSE-NEXT: movq %rdi, %rsi 530; SSE-NEXT: andq $128, %rsi 531; SSE-NEXT: shlq $49, %rsi 532; SSE-NEXT: orq %rdx, %rsi 533; SSE-NEXT: movq %rdi, %rdx 534; SSE-NEXT: andq $256, %rdx # imm = 0x100 535; SSE-NEXT: shlq $47, %rdx 536; SSE-NEXT: orq %rsi, %rdx 537; SSE-NEXT: movq %rdi, %rsi 538; SSE-NEXT: andq $512, %rsi # imm = 0x200 539; 
SSE-NEXT: shlq $45, %rsi 540; SSE-NEXT: orq %rdx, %rsi 541; SSE-NEXT: movq %rdi, %rdx 542; SSE-NEXT: andq $1024, %rdx # imm = 0x400 543; SSE-NEXT: shlq $43, %rdx 544; SSE-NEXT: orq %rsi, %rdx 545; SSE-NEXT: movq %rdi, %rsi 546; SSE-NEXT: andq $2048, %rsi # imm = 0x800 547; SSE-NEXT: shlq $41, %rsi 548; SSE-NEXT: orq %rdx, %rsi 549; SSE-NEXT: movq %rdi, %rdx 550; SSE-NEXT: andq $4096, %rdx # imm = 0x1000 551; SSE-NEXT: shlq $39, %rdx 552; SSE-NEXT: orq %rsi, %rdx 553; SSE-NEXT: movq %rdi, %rsi 554; SSE-NEXT: andq $8192, %rsi # imm = 0x2000 555; SSE-NEXT: shlq $37, %rsi 556; SSE-NEXT: orq %rdx, %rsi 557; SSE-NEXT: movq %rdi, %rdx 558; SSE-NEXT: andq $16384, %rdx # imm = 0x4000 559; SSE-NEXT: shlq $35, %rdx 560; SSE-NEXT: orq %rsi, %rdx 561; SSE-NEXT: movq %rdi, %rsi 562; SSE-NEXT: andq $32768, %rsi # imm = 0x8000 563; SSE-NEXT: shlq $33, %rsi 564; SSE-NEXT: orq %rdx, %rsi 565; SSE-NEXT: movq %rdi, %rdx 566; SSE-NEXT: andq $65536, %rdx # imm = 0x10000 567; SSE-NEXT: shlq $31, %rdx 568; SSE-NEXT: orq %rsi, %rdx 569; SSE-NEXT: movq %rdi, %rsi 570; SSE-NEXT: andq $131072, %rsi # imm = 0x20000 571; SSE-NEXT: shlq $29, %rsi 572; SSE-NEXT: orq %rdx, %rsi 573; SSE-NEXT: movq %rdi, %rdx 574; SSE-NEXT: andq $262144, %rdx # imm = 0x40000 575; SSE-NEXT: shlq $27, %rdx 576; SSE-NEXT: orq %rsi, %rdx 577; SSE-NEXT: movq %rdi, %rsi 578; SSE-NEXT: andq $524288, %rsi # imm = 0x80000 579; SSE-NEXT: shlq $25, %rsi 580; SSE-NEXT: orq %rdx, %rsi 581; SSE-NEXT: movq %rdi, %rdx 582; SSE-NEXT: andq $1048576, %rdx # imm = 0x100000 583; SSE-NEXT: shlq $23, %rdx 584; SSE-NEXT: orq %rsi, %rdx 585; SSE-NEXT: movq %rdi, %rsi 586; SSE-NEXT: andq $2097152, %rsi # imm = 0x200000 587; SSE-NEXT: shlq $21, %rsi 588; SSE-NEXT: orq %rdx, %rsi 589; SSE-NEXT: movq %rdi, %rdx 590; SSE-NEXT: andq $4194304, %rdx # imm = 0x400000 591; SSE-NEXT: shlq $19, %rdx 592; SSE-NEXT: orq %rsi, %rdx 593; SSE-NEXT: movq %rdi, %rsi 594; SSE-NEXT: andq $8388608, %rsi # imm = 0x800000 595; SSE-NEXT: shlq $17, %rsi 596; SSE-NEXT: orq %rdx, %rsi 597; SSE-NEXT: movq %rdi, %rdx 598; SSE-NEXT: andq $16777216, %rdx # imm = 0x1000000 599; SSE-NEXT: shlq $15, %rdx 600; SSE-NEXT: orq %rsi, %rdx 601; SSE-NEXT: movq %rdi, %rsi 602; SSE-NEXT: andq $33554432, %rsi # imm = 0x2000000 603; SSE-NEXT: shlq $13, %rsi 604; SSE-NEXT: orq %rdx, %rsi 605; SSE-NEXT: movq %rdi, %rdx 606; SSE-NEXT: andq $67108864, %rdx # imm = 0x4000000 607; SSE-NEXT: shlq $11, %rdx 608; SSE-NEXT: orq %rsi, %rdx 609; SSE-NEXT: movq %rdi, %rsi 610; SSE-NEXT: andq $134217728, %rsi # imm = 0x8000000 611; SSE-NEXT: shlq $9, %rsi 612; SSE-NEXT: orq %rdx, %rsi 613; SSE-NEXT: movq %rdi, %rdx 614; SSE-NEXT: andq $268435456, %rdx # imm = 0x10000000 615; SSE-NEXT: shlq $7, %rdx 616; SSE-NEXT: orq %rsi, %rdx 617; SSE-NEXT: movq %rdi, %rsi 618; SSE-NEXT: andq $536870912, %rsi # imm = 0x20000000 619; SSE-NEXT: shlq $5, %rsi 620; SSE-NEXT: orq %rdx, %rsi 621; SSE-NEXT: movq %rdi, %rdx 622; SSE-NEXT: andq $1073741824, %rdx # imm = 0x40000000 623; SSE-NEXT: shlq $3, %rdx 624; SSE-NEXT: orq %rsi, %rdx 625; SSE-NEXT: orq %rcx, %rdx 626; SSE-NEXT: movq %rdi, %rcx 627; SSE-NEXT: shrq %rcx 628; SSE-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 629; SSE-NEXT: orq %rdx, %rcx 630; SSE-NEXT: movq %rdi, %rdx 631; SSE-NEXT: shrq $3, %rdx 632; SSE-NEXT: andl $1073741824, %edx # imm = 0x40000000 633; SSE-NEXT: orq %rcx, %rdx 634; SSE-NEXT: movq %rdi, %rcx 635; SSE-NEXT: shrq $5, %rcx 636; SSE-NEXT: andl $536870912, %ecx # imm = 0x20000000 637; SSE-NEXT: orq %rdx, %rcx 638; SSE-NEXT: movq %rdi, %rdx 639; SSE-NEXT: shrq 
$7, %rdx 640; SSE-NEXT: andl $268435456, %edx # imm = 0x10000000 641; SSE-NEXT: orq %rcx, %rdx 642; SSE-NEXT: movq %rdi, %rcx 643; SSE-NEXT: shrq $9, %rcx 644; SSE-NEXT: andl $134217728, %ecx # imm = 0x8000000 645; SSE-NEXT: orq %rdx, %rcx 646; SSE-NEXT: movq %rdi, %rdx 647; SSE-NEXT: shrq $11, %rdx 648; SSE-NEXT: andl $67108864, %edx # imm = 0x4000000 649; SSE-NEXT: orq %rcx, %rdx 650; SSE-NEXT: movq %rdi, %rcx 651; SSE-NEXT: shrq $13, %rcx 652; SSE-NEXT: andl $33554432, %ecx # imm = 0x2000000 653; SSE-NEXT: orq %rdx, %rcx 654; SSE-NEXT: movq %rdi, %rdx 655; SSE-NEXT: shrq $15, %rdx 656; SSE-NEXT: andl $16777216, %edx # imm = 0x1000000 657; SSE-NEXT: orq %rcx, %rdx 658; SSE-NEXT: movq %rdi, %rcx 659; SSE-NEXT: shrq $17, %rcx 660; SSE-NEXT: andl $8388608, %ecx # imm = 0x800000 661; SSE-NEXT: orq %rdx, %rcx 662; SSE-NEXT: movq %rdi, %rdx 663; SSE-NEXT: shrq $19, %rdx 664; SSE-NEXT: andl $4194304, %edx # imm = 0x400000 665; SSE-NEXT: orq %rcx, %rdx 666; SSE-NEXT: movq %rdi, %rcx 667; SSE-NEXT: shrq $21, %rcx 668; SSE-NEXT: andl $2097152, %ecx # imm = 0x200000 669; SSE-NEXT: orq %rdx, %rcx 670; SSE-NEXT: movq %rdi, %rdx 671; SSE-NEXT: shrq $23, %rdx 672; SSE-NEXT: andl $1048576, %edx # imm = 0x100000 673; SSE-NEXT: orq %rcx, %rdx 674; SSE-NEXT: movq %rdi, %rcx 675; SSE-NEXT: shrq $25, %rcx 676; SSE-NEXT: andl $524288, %ecx # imm = 0x80000 677; SSE-NEXT: orq %rdx, %rcx 678; SSE-NEXT: movq %rdi, %rdx 679; SSE-NEXT: shrq $27, %rdx 680; SSE-NEXT: andl $262144, %edx # imm = 0x40000 681; SSE-NEXT: orq %rcx, %rdx 682; SSE-NEXT: movq %rdi, %rcx 683; SSE-NEXT: shrq $29, %rcx 684; SSE-NEXT: andl $131072, %ecx # imm = 0x20000 685; SSE-NEXT: orq %rdx, %rcx 686; SSE-NEXT: movq %rdi, %rdx 687; SSE-NEXT: shrq $31, %rdx 688; SSE-NEXT: andl $65536, %edx # imm = 0x10000 689; SSE-NEXT: orq %rcx, %rdx 690; SSE-NEXT: movq %rdi, %rcx 691; SSE-NEXT: shrq $33, %rcx 692; SSE-NEXT: andl $32768, %ecx # imm = 0x8000 693; SSE-NEXT: orq %rdx, %rcx 694; SSE-NEXT: movq %rdi, %rdx 695; SSE-NEXT: shrq $35, %rdx 696; SSE-NEXT: andl $16384, %edx # imm = 0x4000 697; SSE-NEXT: orq %rcx, %rdx 698; SSE-NEXT: movq %rdi, %rcx 699; SSE-NEXT: shrq $37, %rcx 700; SSE-NEXT: andl $8192, %ecx # imm = 0x2000 701; SSE-NEXT: orq %rdx, %rcx 702; SSE-NEXT: movq %rdi, %rdx 703; SSE-NEXT: shrq $39, %rdx 704; SSE-NEXT: andl $4096, %edx # imm = 0x1000 705; SSE-NEXT: orq %rcx, %rdx 706; SSE-NEXT: movq %rdi, %rcx 707; SSE-NEXT: shrq $41, %rcx 708; SSE-NEXT: andl $2048, %ecx # imm = 0x800 709; SSE-NEXT: orq %rdx, %rcx 710; SSE-NEXT: movq %rdi, %rdx 711; SSE-NEXT: shrq $43, %rdx 712; SSE-NEXT: andl $1024, %edx # imm = 0x400 713; SSE-NEXT: orq %rcx, %rdx 714; SSE-NEXT: movq %rdi, %rcx 715; SSE-NEXT: shrq $45, %rcx 716; SSE-NEXT: andl $512, %ecx # imm = 0x200 717; SSE-NEXT: orq %rdx, %rcx 718; SSE-NEXT: movq %rdi, %rdx 719; SSE-NEXT: shrq $47, %rdx 720; SSE-NEXT: andl $256, %edx # imm = 0x100 721; SSE-NEXT: orq %rcx, %rdx 722; SSE-NEXT: movq %rdi, %rcx 723; SSE-NEXT: shrq $49, %rcx 724; SSE-NEXT: andl $128, %ecx 725; SSE-NEXT: orq %rdx, %rcx 726; SSE-NEXT: movq %rdi, %rdx 727; SSE-NEXT: shrq $51, %rdx 728; SSE-NEXT: andl $64, %edx 729; SSE-NEXT: orq %rcx, %rdx 730; SSE-NEXT: movq %rdi, %rcx 731; SSE-NEXT: shrq $53, %rcx 732; SSE-NEXT: andl $32, %ecx 733; SSE-NEXT: orq %rdx, %rcx 734; SSE-NEXT: movq %rdi, %rdx 735; SSE-NEXT: shrq $55, %rdx 736; SSE-NEXT: andl $16, %edx 737; SSE-NEXT: orq %rcx, %rdx 738; SSE-NEXT: movq %rdi, %rcx 739; SSE-NEXT: shrq $57, %rcx 740; SSE-NEXT: andl $8, %ecx 741; SSE-NEXT: orq %rdx, %rcx 742; SSE-NEXT: movq %rdi, %rdx 743; 
SSE-NEXT: shrq $59, %rdx 744; SSE-NEXT: andl $4, %edx 745; SSE-NEXT: orq %rcx, %rdx 746; SSE-NEXT: movq %rdi, %rcx 747; SSE-NEXT: shrq $61, %rcx 748; SSE-NEXT: andl $2, %ecx 749; SSE-NEXT: orq %rdx, %rcx 750; SSE-NEXT: shrq $63, %rdi 751; SSE-NEXT: orq %rcx, %rdi 752; SSE-NEXT: orq %rdi, %rax 753; SSE-NEXT: retq 754; 755; AVX-LABEL: test_bitreverse_i64: 756; AVX: # BB#0: 757; AVX-NEXT: leaq (%rdi,%rdi), %rax 758; AVX-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000 759; AVX-NEXT: andq %rax, %rcx 760; AVX-NEXT: movq %rdi, %rax 761; AVX-NEXT: shlq $63, %rax 762; AVX-NEXT: movq %rdi, %rdx 763; AVX-NEXT: andq $2, %rdx 764; AVX-NEXT: shlq $61, %rdx 765; AVX-NEXT: leaq (%rdx,%rax), %rax 766; AVX-NEXT: movq %rdi, %rdx 767; AVX-NEXT: andq $4, %rdx 768; AVX-NEXT: shlq $59, %rdx 769; AVX-NEXT: orq %rdx, %rax 770; AVX-NEXT: movq %rdi, %rdx 771; AVX-NEXT: andq $8, %rdx 772; AVX-NEXT: shlq $57, %rdx 773; AVX-NEXT: orq %rdx, %rax 774; AVX-NEXT: movq %rdi, %rdx 775; AVX-NEXT: andq $16, %rdx 776; AVX-NEXT: shlq $55, %rdx 777; AVX-NEXT: orq %rdx, %rax 778; AVX-NEXT: movq %rdi, %rdx 779; AVX-NEXT: andq $32, %rdx 780; AVX-NEXT: shlq $53, %rdx 781; AVX-NEXT: orq %rdx, %rax 782; AVX-NEXT: movq %rdi, %rdx 783; AVX-NEXT: andq $64, %rdx 784; AVX-NEXT: shlq $51, %rdx 785; AVX-NEXT: movq %rdi, %rsi 786; AVX-NEXT: andq $128, %rsi 787; AVX-NEXT: shlq $49, %rsi 788; AVX-NEXT: orq %rdx, %rsi 789; AVX-NEXT: movq %rdi, %rdx 790; AVX-NEXT: andq $256, %rdx # imm = 0x100 791; AVX-NEXT: shlq $47, %rdx 792; AVX-NEXT: orq %rsi, %rdx 793; AVX-NEXT: movq %rdi, %rsi 794; AVX-NEXT: andq $512, %rsi # imm = 0x200 795; AVX-NEXT: shlq $45, %rsi 796; AVX-NEXT: orq %rdx, %rsi 797; AVX-NEXT: movq %rdi, %rdx 798; AVX-NEXT: andq $1024, %rdx # imm = 0x400 799; AVX-NEXT: shlq $43, %rdx 800; AVX-NEXT: orq %rsi, %rdx 801; AVX-NEXT: movq %rdi, %rsi 802; AVX-NEXT: andq $2048, %rsi # imm = 0x800 803; AVX-NEXT: shlq $41, %rsi 804; AVX-NEXT: orq %rdx, %rsi 805; AVX-NEXT: movq %rdi, %rdx 806; AVX-NEXT: andq $4096, %rdx # imm = 0x1000 807; AVX-NEXT: shlq $39, %rdx 808; AVX-NEXT: orq %rsi, %rdx 809; AVX-NEXT: movq %rdi, %rsi 810; AVX-NEXT: andq $8192, %rsi # imm = 0x2000 811; AVX-NEXT: shlq $37, %rsi 812; AVX-NEXT: orq %rdx, %rsi 813; AVX-NEXT: movq %rdi, %rdx 814; AVX-NEXT: andq $16384, %rdx # imm = 0x4000 815; AVX-NEXT: shlq $35, %rdx 816; AVX-NEXT: orq %rsi, %rdx 817; AVX-NEXT: movq %rdi, %rsi 818; AVX-NEXT: andq $32768, %rsi # imm = 0x8000 819; AVX-NEXT: shlq $33, %rsi 820; AVX-NEXT: orq %rdx, %rsi 821; AVX-NEXT: movq %rdi, %rdx 822; AVX-NEXT: andq $65536, %rdx # imm = 0x10000 823; AVX-NEXT: shlq $31, %rdx 824; AVX-NEXT: orq %rsi, %rdx 825; AVX-NEXT: movq %rdi, %rsi 826; AVX-NEXT: andq $131072, %rsi # imm = 0x20000 827; AVX-NEXT: shlq $29, %rsi 828; AVX-NEXT: orq %rdx, %rsi 829; AVX-NEXT: movq %rdi, %rdx 830; AVX-NEXT: andq $262144, %rdx # imm = 0x40000 831; AVX-NEXT: shlq $27, %rdx 832; AVX-NEXT: orq %rsi, %rdx 833; AVX-NEXT: movq %rdi, %rsi 834; AVX-NEXT: andq $524288, %rsi # imm = 0x80000 835; AVX-NEXT: shlq $25, %rsi 836; AVX-NEXT: orq %rdx, %rsi 837; AVX-NEXT: movq %rdi, %rdx 838; AVX-NEXT: andq $1048576, %rdx # imm = 0x100000 839; AVX-NEXT: shlq $23, %rdx 840; AVX-NEXT: orq %rsi, %rdx 841; AVX-NEXT: movq %rdi, %rsi 842; AVX-NEXT: andq $2097152, %rsi # imm = 0x200000 843; AVX-NEXT: shlq $21, %rsi 844; AVX-NEXT: orq %rdx, %rsi 845; AVX-NEXT: movq %rdi, %rdx 846; AVX-NEXT: andq $4194304, %rdx # imm = 0x400000 847; AVX-NEXT: shlq $19, %rdx 848; AVX-NEXT: orq %rsi, %rdx 849; AVX-NEXT: movq %rdi, %rsi 850; AVX-NEXT: andq $8388608, %rsi # imm = 
0x800000 851; AVX-NEXT: shlq $17, %rsi 852; AVX-NEXT: orq %rdx, %rsi 853; AVX-NEXT: movq %rdi, %rdx 854; AVX-NEXT: andq $16777216, %rdx # imm = 0x1000000 855; AVX-NEXT: shlq $15, %rdx 856; AVX-NEXT: orq %rsi, %rdx 857; AVX-NEXT: movq %rdi, %rsi 858; AVX-NEXT: andq $33554432, %rsi # imm = 0x2000000 859; AVX-NEXT: shlq $13, %rsi 860; AVX-NEXT: orq %rdx, %rsi 861; AVX-NEXT: movq %rdi, %rdx 862; AVX-NEXT: andq $67108864, %rdx # imm = 0x4000000 863; AVX-NEXT: shlq $11, %rdx 864; AVX-NEXT: orq %rsi, %rdx 865; AVX-NEXT: movq %rdi, %rsi 866; AVX-NEXT: andq $134217728, %rsi # imm = 0x8000000 867; AVX-NEXT: shlq $9, %rsi 868; AVX-NEXT: orq %rdx, %rsi 869; AVX-NEXT: movq %rdi, %rdx 870; AVX-NEXT: andq $268435456, %rdx # imm = 0x10000000 871; AVX-NEXT: shlq $7, %rdx 872; AVX-NEXT: orq %rsi, %rdx 873; AVX-NEXT: movq %rdi, %rsi 874; AVX-NEXT: andq $536870912, %rsi # imm = 0x20000000 875; AVX-NEXT: shlq $5, %rsi 876; AVX-NEXT: orq %rdx, %rsi 877; AVX-NEXT: movq %rdi, %rdx 878; AVX-NEXT: andq $1073741824, %rdx # imm = 0x40000000 879; AVX-NEXT: shlq $3, %rdx 880; AVX-NEXT: orq %rsi, %rdx 881; AVX-NEXT: orq %rcx, %rdx 882; AVX-NEXT: movq %rdi, %rcx 883; AVX-NEXT: shrq %rcx 884; AVX-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 885; AVX-NEXT: orq %rdx, %rcx 886; AVX-NEXT: movq %rdi, %rdx 887; AVX-NEXT: shrq $3, %rdx 888; AVX-NEXT: andl $1073741824, %edx # imm = 0x40000000 889; AVX-NEXT: orq %rcx, %rdx 890; AVX-NEXT: movq %rdi, %rcx 891; AVX-NEXT: shrq $5, %rcx 892; AVX-NEXT: andl $536870912, %ecx # imm = 0x20000000 893; AVX-NEXT: orq %rdx, %rcx 894; AVX-NEXT: movq %rdi, %rdx 895; AVX-NEXT: shrq $7, %rdx 896; AVX-NEXT: andl $268435456, %edx # imm = 0x10000000 897; AVX-NEXT: orq %rcx, %rdx 898; AVX-NEXT: movq %rdi, %rcx 899; AVX-NEXT: shrq $9, %rcx 900; AVX-NEXT: andl $134217728, %ecx # imm = 0x8000000 901; AVX-NEXT: orq %rdx, %rcx 902; AVX-NEXT: movq %rdi, %rdx 903; AVX-NEXT: shrq $11, %rdx 904; AVX-NEXT: andl $67108864, %edx # imm = 0x4000000 905; AVX-NEXT: orq %rcx, %rdx 906; AVX-NEXT: movq %rdi, %rcx 907; AVX-NEXT: shrq $13, %rcx 908; AVX-NEXT: andl $33554432, %ecx # imm = 0x2000000 909; AVX-NEXT: orq %rdx, %rcx 910; AVX-NEXT: movq %rdi, %rdx 911; AVX-NEXT: shrq $15, %rdx 912; AVX-NEXT: andl $16777216, %edx # imm = 0x1000000 913; AVX-NEXT: orq %rcx, %rdx 914; AVX-NEXT: movq %rdi, %rcx 915; AVX-NEXT: shrq $17, %rcx 916; AVX-NEXT: andl $8388608, %ecx # imm = 0x800000 917; AVX-NEXT: orq %rdx, %rcx 918; AVX-NEXT: movq %rdi, %rdx 919; AVX-NEXT: shrq $19, %rdx 920; AVX-NEXT: andl $4194304, %edx # imm = 0x400000 921; AVX-NEXT: orq %rcx, %rdx 922; AVX-NEXT: movq %rdi, %rcx 923; AVX-NEXT: shrq $21, %rcx 924; AVX-NEXT: andl $2097152, %ecx # imm = 0x200000 925; AVX-NEXT: orq %rdx, %rcx 926; AVX-NEXT: movq %rdi, %rdx 927; AVX-NEXT: shrq $23, %rdx 928; AVX-NEXT: andl $1048576, %edx # imm = 0x100000 929; AVX-NEXT: orq %rcx, %rdx 930; AVX-NEXT: movq %rdi, %rcx 931; AVX-NEXT: shrq $25, %rcx 932; AVX-NEXT: andl $524288, %ecx # imm = 0x80000 933; AVX-NEXT: orq %rdx, %rcx 934; AVX-NEXT: movq %rdi, %rdx 935; AVX-NEXT: shrq $27, %rdx 936; AVX-NEXT: andl $262144, %edx # imm = 0x40000 937; AVX-NEXT: orq %rcx, %rdx 938; AVX-NEXT: movq %rdi, %rcx 939; AVX-NEXT: shrq $29, %rcx 940; AVX-NEXT: andl $131072, %ecx # imm = 0x20000 941; AVX-NEXT: orq %rdx, %rcx 942; AVX-NEXT: movq %rdi, %rdx 943; AVX-NEXT: shrq $31, %rdx 944; AVX-NEXT: andl $65536, %edx # imm = 0x10000 945; AVX-NEXT: orq %rcx, %rdx 946; AVX-NEXT: movq %rdi, %rcx 947; AVX-NEXT: shrq $33, %rcx 948; AVX-NEXT: andl $32768, %ecx # imm = 0x8000 949; AVX-NEXT: orq %rdx, %rcx 
950; AVX-NEXT: movq %rdi, %rdx 951; AVX-NEXT: shrq $35, %rdx 952; AVX-NEXT: andl $16384, %edx # imm = 0x4000 953; AVX-NEXT: orq %rcx, %rdx 954; AVX-NEXT: movq %rdi, %rcx 955; AVX-NEXT: shrq $37, %rcx 956; AVX-NEXT: andl $8192, %ecx # imm = 0x2000 957; AVX-NEXT: orq %rdx, %rcx 958; AVX-NEXT: movq %rdi, %rdx 959; AVX-NEXT: shrq $39, %rdx 960; AVX-NEXT: andl $4096, %edx # imm = 0x1000 961; AVX-NEXT: orq %rcx, %rdx 962; AVX-NEXT: movq %rdi, %rcx 963; AVX-NEXT: shrq $41, %rcx 964; AVX-NEXT: andl $2048, %ecx # imm = 0x800 965; AVX-NEXT: orq %rdx, %rcx 966; AVX-NEXT: movq %rdi, %rdx 967; AVX-NEXT: shrq $43, %rdx 968; AVX-NEXT: andl $1024, %edx # imm = 0x400 969; AVX-NEXT: orq %rcx, %rdx 970; AVX-NEXT: movq %rdi, %rcx 971; AVX-NEXT: shrq $45, %rcx 972; AVX-NEXT: andl $512, %ecx # imm = 0x200 973; AVX-NEXT: orq %rdx, %rcx 974; AVX-NEXT: movq %rdi, %rdx 975; AVX-NEXT: shrq $47, %rdx 976; AVX-NEXT: andl $256, %edx # imm = 0x100 977; AVX-NEXT: orq %rcx, %rdx 978; AVX-NEXT: movq %rdi, %rcx 979; AVX-NEXT: shrq $49, %rcx 980; AVX-NEXT: andl $128, %ecx 981; AVX-NEXT: orq %rdx, %rcx 982; AVX-NEXT: movq %rdi, %rdx 983; AVX-NEXT: shrq $51, %rdx 984; AVX-NEXT: andl $64, %edx 985; AVX-NEXT: orq %rcx, %rdx 986; AVX-NEXT: movq %rdi, %rcx 987; AVX-NEXT: shrq $53, %rcx 988; AVX-NEXT: andl $32, %ecx 989; AVX-NEXT: orq %rdx, %rcx 990; AVX-NEXT: movq %rdi, %rdx 991; AVX-NEXT: shrq $55, %rdx 992; AVX-NEXT: andl $16, %edx 993; AVX-NEXT: orq %rcx, %rdx 994; AVX-NEXT: movq %rdi, %rcx 995; AVX-NEXT: shrq $57, %rcx 996; AVX-NEXT: andl $8, %ecx 997; AVX-NEXT: orq %rdx, %rcx 998; AVX-NEXT: movq %rdi, %rdx 999; AVX-NEXT: shrq $59, %rdx 1000; AVX-NEXT: andl $4, %edx 1001; AVX-NEXT: orq %rcx, %rdx 1002; AVX-NEXT: movq %rdi, %rcx 1003; AVX-NEXT: shrq $61, %rcx 1004; AVX-NEXT: andl $2, %ecx 1005; AVX-NEXT: orq %rdx, %rcx 1006; AVX-NEXT: shrq $63, %rdi 1007; AVX-NEXT: orq %rcx, %rdi 1008; AVX-NEXT: orq %rdi, %rax 1009; AVX-NEXT: retq 1010; 1011; XOP-LABEL: test_bitreverse_i64: 1012; XOP: # BB#0: 1013; XOP-NEXT: vmovq %rdi, %xmm0 1014; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 1015; XOP-NEXT: vmovq %xmm0, %rax 1016; XOP-NEXT: retq 1017 %b = call i64 @llvm.bitreverse.i64(i64 %a) 1018 ret i64 %b 1019} 1020 1021define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { 1022; SSE2-LABEL: test_bitreverse_v16i8: 1023; SSE2: # BB#0: 1024; SSE2-NEXT: movdqa %xmm0, %xmm2 1025; SSE2-NEXT: psrlw $7, %xmm2 1026; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1027; SSE2-NEXT: pand %xmm1, %xmm1 1028; SSE2-NEXT: pand %xmm2, %xmm1 1029; SSE2-NEXT: movdqa %xmm0, %xmm2 1030; SSE2-NEXT: psllw $7, %xmm2 1031; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1032; SSE2-NEXT: pand %xmm3, %xmm3 1033; SSE2-NEXT: pand %xmm3, %xmm2 1034; SSE2-NEXT: movdqa %xmm0, %xmm3 1035; SSE2-NEXT: psllw $5, %xmm3 1036; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1037; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1038; SSE2-NEXT: movdqa %xmm0, %xmm4 1039; SSE2-NEXT: psllw $3, %xmm4 1040; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1041; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1042; SSE2-NEXT: por %xmm3, %xmm4 1043; SSE2-NEXT: movdqa %xmm0, %xmm3 1044; SSE2-NEXT: paddb %xmm3, %xmm3 1045; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1046; SSE2-NEXT: por %xmm4, %xmm3 1047; SSE2-NEXT: movdqa %xmm0, %xmm4 1048; SSE2-NEXT: psrlw $1, %xmm4 1049; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1050; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1051; SSE2-NEXT: por %xmm3, %xmm4 1052; SSE2-NEXT: movdqa %xmm0, %xmm3 1053; SSE2-NEXT: psrlw $3, %xmm3 1054; 
SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1055; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1056; SSE2-NEXT: por %xmm4, %xmm3 1057; SSE2-NEXT: psrlw $5, %xmm0 1058; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1059; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1060; SSE2-NEXT: por %xmm3, %xmm0 1061; SSE2-NEXT: por %xmm1, %xmm0 1062; SSE2-NEXT: por %xmm2, %xmm0 1063; SSE2-NEXT: retq 1064; 1065; SSSE3-LABEL: test_bitreverse_v16i8: 1066; SSSE3: # BB#0: 1067; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1068; SSSE3-NEXT: movdqa %xmm0, %xmm2 1069; SSSE3-NEXT: pand %xmm1, %xmm2 1070; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1071; SSSE3-NEXT: pshufb %xmm2, %xmm3 1072; SSSE3-NEXT: psrlw $4, %xmm0 1073; SSSE3-NEXT: pand %xmm1, %xmm0 1074; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1075; SSSE3-NEXT: pshufb %xmm0, %xmm1 1076; SSSE3-NEXT: por %xmm3, %xmm1 1077; SSSE3-NEXT: movdqa %xmm1, %xmm0 1078; SSSE3-NEXT: retq 1079; 1080; AVX-LABEL: test_bitreverse_v16i8: 1081; AVX: # BB#0: 1082; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1083; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1084; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1085; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1086; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1087; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1088; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1089; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1090; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 1091; AVX-NEXT: retq 1092; 1093; XOP-LABEL: test_bitreverse_v16i8: 1094; XOP: # BB#0: 1095; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 1096; XOP-NEXT: retq 1097 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) 1098 ret <16 x i8> %b 1099} 1100 1101define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { 1102; SSE2-LABEL: test_bitreverse_v8i16: 1103; SSE2: # BB#0: 1104; SSE2-NEXT: pxor %xmm1, %xmm1 1105; SSE2-NEXT: movdqa %xmm0, %xmm2 1106; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1107; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 1108; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 1109; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1110; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 1111; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] 1112; SSE2-NEXT: packuswb %xmm2, %xmm1 1113; SSE2-NEXT: movdqa %xmm1, %xmm0 1114; SSE2-NEXT: psllw $7, %xmm0 1115; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1116; SSE2-NEXT: pand %xmm2, %xmm2 1117; SSE2-NEXT: pand %xmm0, %xmm2 1118; SSE2-NEXT: movdqa %xmm1, %xmm0 1119; SSE2-NEXT: psllw $5, %xmm0 1120; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1121; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1122; SSE2-NEXT: movdqa %xmm1, %xmm3 1123; SSE2-NEXT: psllw $3, %xmm3 1124; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1125; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1126; SSE2-NEXT: por %xmm0, %xmm3 1127; SSE2-NEXT: movdqa %xmm1, %xmm0 1128; SSE2-NEXT: paddb %xmm0, %xmm0 1129; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1130; SSE2-NEXT: por %xmm3, %xmm0 1131; SSE2-NEXT: movdqa %xmm1, %xmm3 1132; SSE2-NEXT: psrlw $1, %xmm3 1133; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 
1134; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1135; SSE2-NEXT: por %xmm0, %xmm3 1136; SSE2-NEXT: movdqa %xmm1, %xmm0 1137; SSE2-NEXT: psrlw $3, %xmm0 1138; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1139; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1140; SSE2-NEXT: por %xmm3, %xmm0 1141; SSE2-NEXT: movdqa %xmm1, %xmm3 1142; SSE2-NEXT: psrlw $5, %xmm3 1143; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1144; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1145; SSE2-NEXT: por %xmm0, %xmm3 1146; SSE2-NEXT: psrlw $7, %xmm1 1147; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1148; SSE2-NEXT: pand %xmm0, %xmm0 1149; SSE2-NEXT: pand %xmm1, %xmm0 1150; SSE2-NEXT: por %xmm3, %xmm0 1151; SSE2-NEXT: por %xmm2, %xmm0 1152; SSE2-NEXT: retq 1153; 1154; SSSE3-LABEL: test_bitreverse_v8i16: 1155; SSSE3: # BB#0: 1156; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1157; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1158; SSSE3-NEXT: movdqa %xmm0, %xmm2 1159; SSSE3-NEXT: pand %xmm1, %xmm2 1160; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1161; SSSE3-NEXT: pshufb %xmm2, %xmm3 1162; SSSE3-NEXT: psrlw $4, %xmm0 1163; SSSE3-NEXT: pand %xmm1, %xmm0 1164; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1165; SSSE3-NEXT: pshufb %xmm0, %xmm1 1166; SSSE3-NEXT: por %xmm3, %xmm1 1167; SSSE3-NEXT: movdqa %xmm1, %xmm0 1168; SSSE3-NEXT: retq 1169; 1170; AVX-LABEL: test_bitreverse_v8i16: 1171; AVX: # BB#0: 1172; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1173; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1174; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1175; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1176; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1177; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1178; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1179; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1180; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1181; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 1182; AVX-NEXT: retq 1183; 1184; XOP-LABEL: test_bitreverse_v8i16: 1185; XOP: # BB#0: 1186; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 1187; XOP-NEXT: retq 1188 %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) 1189 ret <8 x i16> %b 1190} 1191 1192define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { 1193; SSE2-LABEL: test_bitreverse_v4i32: 1194; SSE2: # BB#0: 1195; SSE2-NEXT: pxor %xmm1, %xmm1 1196; SSE2-NEXT: movdqa %xmm0, %xmm2 1197; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1198; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1199; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1200; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1201; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1202; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4] 1203; SSE2-NEXT: packuswb %xmm2, %xmm1 1204; SSE2-NEXT: movdqa %xmm1, %xmm0 1205; SSE2-NEXT: psllw $7, %xmm0 1206; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1207; SSE2-NEXT: pand %xmm2, %xmm2 1208; SSE2-NEXT: pand %xmm0, %xmm2 1209; SSE2-NEXT: movdqa %xmm1, %xmm0 1210; SSE2-NEXT: psllw $5, %xmm0 
1211; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1212; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1213; SSE2-NEXT: movdqa %xmm1, %xmm3 1214; SSE2-NEXT: psllw $3, %xmm3 1215; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1216; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1217; SSE2-NEXT: por %xmm0, %xmm3 1218; SSE2-NEXT: movdqa %xmm1, %xmm0 1219; SSE2-NEXT: paddb %xmm0, %xmm0 1220; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1221; SSE2-NEXT: por %xmm3, %xmm0 1222; SSE2-NEXT: movdqa %xmm1, %xmm3 1223; SSE2-NEXT: psrlw $1, %xmm3 1224; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1225; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1226; SSE2-NEXT: por %xmm0, %xmm3 1227; SSE2-NEXT: movdqa %xmm1, %xmm0 1228; SSE2-NEXT: psrlw $3, %xmm0 1229; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1230; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1231; SSE2-NEXT: por %xmm3, %xmm0 1232; SSE2-NEXT: movdqa %xmm1, %xmm3 1233; SSE2-NEXT: psrlw $5, %xmm3 1234; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1235; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1236; SSE2-NEXT: por %xmm0, %xmm3 1237; SSE2-NEXT: psrlw $7, %xmm1 1238; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1239; SSE2-NEXT: pand %xmm0, %xmm0 1240; SSE2-NEXT: pand %xmm1, %xmm0 1241; SSE2-NEXT: por %xmm3, %xmm0 1242; SSE2-NEXT: por %xmm2, %xmm0 1243; SSE2-NEXT: retq 1244; 1245; SSSE3-LABEL: test_bitreverse_v4i32: 1246; SSSE3: # BB#0: 1247; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1248; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1249; SSSE3-NEXT: movdqa %xmm0, %xmm2 1250; SSSE3-NEXT: pand %xmm1, %xmm2 1251; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1252; SSSE3-NEXT: pshufb %xmm2, %xmm3 1253; SSSE3-NEXT: psrlw $4, %xmm0 1254; SSSE3-NEXT: pand %xmm1, %xmm0 1255; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1256; SSSE3-NEXT: pshufb %xmm0, %xmm1 1257; SSSE3-NEXT: por %xmm3, %xmm1 1258; SSSE3-NEXT: movdqa %xmm1, %xmm0 1259; SSSE3-NEXT: retq 1260; 1261; AVX-LABEL: test_bitreverse_v4i32: 1262; AVX: # BB#0: 1263; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1264; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1265; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1266; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1267; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1268; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1269; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1270; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1271; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1272; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 1273; AVX-NEXT: retq 1274; 1275; XOP-LABEL: test_bitreverse_v4i32: 1276; XOP: # BB#0: 1277; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 1278; XOP-NEXT: retq 1279 %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) 1280 ret <4 x i32> %b 1281} 1282 1283define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { 1284; SSE2-LABEL: test_bitreverse_v2i64: 1285; SSE2: # BB#0: 1286; SSE2-NEXT: pxor %xmm1, %xmm1 1287; SSE2-NEXT: movdqa %xmm0, %xmm2 1288; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1289; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1290; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1291; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1292; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1293; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1294; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1295; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4] 1296; SSE2-NEXT: packuswb %xmm2, %xmm1 1297; SSE2-NEXT: movdqa %xmm1, %xmm0 1298; SSE2-NEXT: psllw $7, %xmm0 1299; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1300; SSE2-NEXT: pand %xmm2, %xmm2 1301; SSE2-NEXT: pand %xmm0, %xmm2 1302; SSE2-NEXT: movdqa %xmm1, %xmm0 1303; SSE2-NEXT: psllw $5, %xmm0 1304; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1305; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1306; SSE2-NEXT: movdqa %xmm1, %xmm3 1307; SSE2-NEXT: psllw $3, %xmm3 1308; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1309; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1310; SSE2-NEXT: por %xmm0, %xmm3 1311; SSE2-NEXT: movdqa %xmm1, %xmm0 1312; SSE2-NEXT: paddb %xmm0, %xmm0 1313; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1314; SSE2-NEXT: por %xmm3, %xmm0 1315; SSE2-NEXT: movdqa %xmm1, %xmm3 1316; SSE2-NEXT: psrlw $1, %xmm3 1317; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1318; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1319; SSE2-NEXT: por %xmm0, %xmm3 1320; SSE2-NEXT: movdqa %xmm1, %xmm0 1321; SSE2-NEXT: psrlw $3, %xmm0 1322; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1323; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1324; SSE2-NEXT: por %xmm3, %xmm0 1325; SSE2-NEXT: movdqa %xmm1, %xmm3 1326; SSE2-NEXT: psrlw $5, %xmm3 1327; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1328; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1329; SSE2-NEXT: por %xmm0, %xmm3 1330; SSE2-NEXT: psrlw $7, %xmm1 1331; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1332; SSE2-NEXT: pand %xmm0, %xmm0 1333; SSE2-NEXT: pand %xmm1, %xmm0 1334; SSE2-NEXT: por %xmm3, %xmm0 1335; SSE2-NEXT: por %xmm2, %xmm0 1336; SSE2-NEXT: retq 1337; 1338; SSSE3-LABEL: test_bitreverse_v2i64: 1339; SSSE3: # BB#0: 1340; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1341; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1342; SSSE3-NEXT: movdqa %xmm0, %xmm2 1343; SSSE3-NEXT: pand %xmm1, %xmm2 1344; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1345; SSSE3-NEXT: pshufb %xmm2, %xmm3 1346; SSSE3-NEXT: psrlw $4, %xmm0 1347; SSSE3-NEXT: pand %xmm1, %xmm0 1348; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1349; SSSE3-NEXT: pshufb %xmm0, %xmm1 1350; SSSE3-NEXT: por %xmm3, %xmm1 1351; SSSE3-NEXT: movdqa %xmm1, %xmm0 1352; SSSE3-NEXT: retq 1353; 1354; AVX-LABEL: test_bitreverse_v2i64: 1355; AVX: # BB#0: 1356; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1357; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1358; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 1359; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1360; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1361; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 1362; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 1363; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1364; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 1365; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 1366; AVX-NEXT: retq 1367; 1368; XOP-LABEL: test_bitreverse_v2i64: 1369; XOP: # BB#0: 1370; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 1371; XOP-NEXT: retq 1372 %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) 
1373 ret <2 x i64> %b 1374} 1375 1376define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { 1377; SSE2-LABEL: test_bitreverse_v32i8: 1378; SSE2: # BB#0: 1379; SSE2-NEXT: movdqa %xmm0, %xmm2 1380; SSE2-NEXT: psllw $5, %xmm2 1381; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 1382; SSE2-NEXT: pand {{.*}}(%rip), %xmm9 1383; SSE2-NEXT: pand %xmm9, %xmm2 1384; SSE2-NEXT: movdqa %xmm0, %xmm5 1385; SSE2-NEXT: psllw $7, %xmm5 1386; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1387; SSE2-NEXT: pand %xmm10, %xmm10 1388; SSE2-NEXT: pand %xmm10, %xmm5 1389; SSE2-NEXT: movdqa %xmm0, %xmm3 1390; SSE2-NEXT: psllw $3, %xmm3 1391; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 1392; SSE2-NEXT: pand {{.*}}(%rip), %xmm11 1393; SSE2-NEXT: pand %xmm11, %xmm3 1394; SSE2-NEXT: por %xmm2, %xmm3 1395; SSE2-NEXT: movdqa %xmm0, %xmm2 1396; SSE2-NEXT: paddb %xmm2, %xmm2 1397; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1398; SSE2-NEXT: pand %xmm8, %xmm2 1399; SSE2-NEXT: por %xmm3, %xmm2 1400; SSE2-NEXT: movdqa %xmm0, %xmm3 1401; SSE2-NEXT: psrlw $1, %xmm3 1402; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1403; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 1404; SSE2-NEXT: pand %xmm12, %xmm3 1405; SSE2-NEXT: por %xmm2, %xmm3 1406; SSE2-NEXT: movdqa %xmm0, %xmm4 1407; SSE2-NEXT: psrlw $3, %xmm4 1408; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 1409; SSE2-NEXT: pand {{.*}}(%rip), %xmm6 1410; SSE2-NEXT: pand %xmm6, %xmm4 1411; SSE2-NEXT: por %xmm3, %xmm4 1412; SSE2-NEXT: movdqa %xmm0, %xmm7 1413; SSE2-NEXT: psrlw $5, %xmm7 1414; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 1415; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1416; SSE2-NEXT: pand %xmm2, %xmm7 1417; SSE2-NEXT: por %xmm4, %xmm7 1418; SSE2-NEXT: psrlw $7, %xmm0 1419; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1420; SSE2-NEXT: pand %xmm3, %xmm3 1421; SSE2-NEXT: pand %xmm3, %xmm0 1422; SSE2-NEXT: por %xmm7, %xmm0 1423; SSE2-NEXT: por %xmm5, %xmm0 1424; SSE2-NEXT: movdqa %xmm1, %xmm4 1425; SSE2-NEXT: psllw $5, %xmm4 1426; SSE2-NEXT: pand %xmm9, %xmm4 1427; SSE2-NEXT: movdqa %xmm1, %xmm5 1428; SSE2-NEXT: psllw $7, %xmm5 1429; SSE2-NEXT: pand %xmm10, %xmm5 1430; SSE2-NEXT: movdqa %xmm1, %xmm7 1431; SSE2-NEXT: psllw $3, %xmm7 1432; SSE2-NEXT: pand %xmm11, %xmm7 1433; SSE2-NEXT: por %xmm4, %xmm7 1434; SSE2-NEXT: movdqa %xmm1, %xmm4 1435; SSE2-NEXT: paddb %xmm4, %xmm4 1436; SSE2-NEXT: pand %xmm8, %xmm4 1437; SSE2-NEXT: por %xmm7, %xmm4 1438; SSE2-NEXT: movdqa %xmm1, %xmm7 1439; SSE2-NEXT: psrlw $1, %xmm7 1440; SSE2-NEXT: pand %xmm12, %xmm7 1441; SSE2-NEXT: por %xmm4, %xmm7 1442; SSE2-NEXT: movdqa %xmm1, %xmm4 1443; SSE2-NEXT: psrlw $3, %xmm4 1444; SSE2-NEXT: pand %xmm6, %xmm4 1445; SSE2-NEXT: por %xmm7, %xmm4 1446; SSE2-NEXT: movdqa %xmm1, %xmm6 1447; SSE2-NEXT: psrlw $5, %xmm6 1448; SSE2-NEXT: pand %xmm2, %xmm6 1449; SSE2-NEXT: por %xmm4, %xmm6 1450; SSE2-NEXT: psrlw $7, %xmm1 1451; SSE2-NEXT: pand %xmm3, %xmm1 1452; SSE2-NEXT: por %xmm6, %xmm1 1453; SSE2-NEXT: por %xmm5, %xmm1 1454; SSE2-NEXT: retq 1455; 1456; SSSE3-LABEL: test_bitreverse_v32i8: 1457; SSSE3: # BB#0: 1458; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1459; SSSE3-NEXT: movdqa %xmm0, %xmm2 1460; SSSE3-NEXT: pand %xmm4, %xmm2 1461; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1462; SSSE3-NEXT: movdqa %xmm5, %xmm6 1463; SSSE3-NEXT: pshufb %xmm2, %xmm6 1464; SSSE3-NEXT: psrlw $4, %xmm0 1465; SSSE3-NEXT: pand %xmm4, %xmm0 1466; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1467; SSSE3-NEXT: movdqa %xmm2, %xmm3 1468; SSSE3-NEXT: pshufb %xmm0, %xmm3 1469; SSSE3-NEXT: por %xmm6, %xmm3 1470; SSSE3-NEXT: movdqa %xmm1, %xmm0 1471; SSSE3-NEXT: pand %xmm4, %xmm0 1472; SSSE3-NEXT: pshufb %xmm0, %xmm5 1473; SSSE3-NEXT: psrlw $4, %xmm1 1474; SSSE3-NEXT: pand %xmm4, %xmm1 1475; SSSE3-NEXT: pshufb %xmm1, %xmm2 1476; SSSE3-NEXT: por %xmm5, %xmm2 1477; SSSE3-NEXT: movdqa %xmm3, %xmm0 1478; SSSE3-NEXT: movdqa %xmm2, %xmm1 1479; SSSE3-NEXT: retq 1480; 1481; AVX1-LABEL: test_bitreverse_v32i8: 1482; AVX1: # BB#0: 1483; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1484; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1485; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 1486; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1487; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1488; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1489; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1490; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1491; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 1492; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1493; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3 1494; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1495; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1496; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1497; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 1498; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 1499; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1500; AVX1-NEXT: retq 1501; 1502; AVX2-LABEL: test_bitreverse_v32i8: 1503; AVX2: # BB#0: 1504; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1505; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1506; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1507; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1508; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1509; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1510; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1511; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1512; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1513; AVX2-NEXT: retq 1514; 1515; AVX512-LABEL: test_bitreverse_v32i8: 1516; AVX512: # BB#0: 1517; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1518; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1519; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1520; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1521; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1522; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1523; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1524; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1525; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1526; AVX512-NEXT: retq 1527; 1528; XOPAVX1-LABEL: test_bitreverse_v32i8: 1529; XOPAVX1: # BB#0: 1530; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1531; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1532; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1533; 
XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1534; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1535; XOPAVX1-NEXT: retq 1536; 1537; XOPAVX2-LABEL: test_bitreverse_v32i8: 1538; XOPAVX2: # BB#0: 1539; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1540; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1541; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1542; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1543; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1544; XOPAVX2-NEXT: retq 1545 %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) 1546 ret <32 x i8> %b 1547} 1548 1549define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { 1550; SSE2-LABEL: test_bitreverse_v16i16: 1551; SSE2: # BB#0: 1552; SSE2-NEXT: pxor %xmm9, %xmm9 1553; SSE2-NEXT: movdqa %xmm0, %xmm2 1554; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 1555; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 1556; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 1557; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1558; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 1559; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 1560; SSE2-NEXT: packuswb %xmm2, %xmm0 1561; SSE2-NEXT: movdqa %xmm0, %xmm2 1562; SSE2-NEXT: psllw $5, %xmm2 1563; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 1564; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 1565; SSE2-NEXT: pand %xmm10, %xmm2 1566; SSE2-NEXT: movdqa %xmm0, %xmm3 1567; SSE2-NEXT: psllw $7, %xmm3 1568; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1569; SSE2-NEXT: pand %xmm11, %xmm11 1570; SSE2-NEXT: pand %xmm11, %xmm3 1571; SSE2-NEXT: movdqa %xmm0, %xmm4 1572; SSE2-NEXT: psllw $3, %xmm4 1573; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 1574; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 1575; SSE2-NEXT: pand %xmm12, %xmm4 1576; SSE2-NEXT: por %xmm2, %xmm4 1577; SSE2-NEXT: movdqa %xmm0, %xmm2 1578; SSE2-NEXT: paddb %xmm2, %xmm2 1579; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1580; SSE2-NEXT: pand %xmm8, %xmm2 1581; SSE2-NEXT: por %xmm4, %xmm2 1582; SSE2-NEXT: movdqa %xmm0, %xmm4 1583; SSE2-NEXT: psrlw $1, %xmm4 1584; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1585; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 1586; SSE2-NEXT: pand %xmm13, %xmm4 1587; SSE2-NEXT: por %xmm2, %xmm4 1588; SSE2-NEXT: movdqa %xmm0, %xmm5 1589; SSE2-NEXT: psrlw $3, %xmm5 1590; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 1591; SSE2-NEXT: pand {{.*}}(%rip), %xmm6 1592; SSE2-NEXT: pand %xmm6, %xmm5 1593; SSE2-NEXT: por %xmm4, %xmm5 1594; SSE2-NEXT: movdqa %xmm0, %xmm7 1595; SSE2-NEXT: psrlw $5, %xmm7 1596; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 1597; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1598; SSE2-NEXT: pand %xmm2, %xmm7 1599; SSE2-NEXT: por %xmm5, %xmm7 1600; SSE2-NEXT: psrlw $7, %xmm0 1601; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1602; SSE2-NEXT: pand %xmm4, %xmm4 1603; SSE2-NEXT: pand %xmm4, %xmm0 1604; SSE2-NEXT: por %xmm7, %xmm0 1605; SSE2-NEXT: por %xmm3, %xmm0 1606; SSE2-NEXT: movdqa %xmm1, %xmm3 1607; SSE2-NEXT: punpckhbw 
{{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] 1608; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] 1609; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6] 1610; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 1611; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1612; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 1613; SSE2-NEXT: packuswb %xmm3, %xmm1 1614; SSE2-NEXT: movdqa %xmm1, %xmm5 1615; SSE2-NEXT: psllw $5, %xmm5 1616; SSE2-NEXT: pand %xmm10, %xmm5 1617; SSE2-NEXT: movdqa %xmm1, %xmm3 1618; SSE2-NEXT: psllw $7, %xmm3 1619; SSE2-NEXT: pand %xmm11, %xmm3 1620; SSE2-NEXT: movdqa %xmm1, %xmm7 1621; SSE2-NEXT: psllw $3, %xmm7 1622; SSE2-NEXT: pand %xmm12, %xmm7 1623; SSE2-NEXT: por %xmm5, %xmm7 1624; SSE2-NEXT: movdqa %xmm1, %xmm5 1625; SSE2-NEXT: paddb %xmm5, %xmm5 1626; SSE2-NEXT: pand %xmm8, %xmm5 1627; SSE2-NEXT: por %xmm7, %xmm5 1628; SSE2-NEXT: movdqa %xmm1, %xmm7 1629; SSE2-NEXT: psrlw $1, %xmm7 1630; SSE2-NEXT: pand %xmm13, %xmm7 1631; SSE2-NEXT: por %xmm5, %xmm7 1632; SSE2-NEXT: movdqa %xmm1, %xmm5 1633; SSE2-NEXT: psrlw $3, %xmm5 1634; SSE2-NEXT: pand %xmm6, %xmm5 1635; SSE2-NEXT: por %xmm7, %xmm5 1636; SSE2-NEXT: movdqa %xmm1, %xmm6 1637; SSE2-NEXT: psrlw $5, %xmm6 1638; SSE2-NEXT: pand %xmm2, %xmm6 1639; SSE2-NEXT: por %xmm5, %xmm6 1640; SSE2-NEXT: psrlw $7, %xmm1 1641; SSE2-NEXT: pand %xmm4, %xmm1 1642; SSE2-NEXT: por %xmm6, %xmm1 1643; SSE2-NEXT: por %xmm3, %xmm1 1644; SSE2-NEXT: retq 1645; 1646; SSSE3-LABEL: test_bitreverse_v16i16: 1647; SSSE3: # BB#0: 1648; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1649; SSSE3-NEXT: pshufb %xmm4, %xmm0 1650; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1651; SSSE3-NEXT: movdqa %xmm0, %xmm2 1652; SSSE3-NEXT: pand %xmm5, %xmm2 1653; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1654; SSSE3-NEXT: movdqa %xmm6, %xmm7 1655; SSSE3-NEXT: pshufb %xmm2, %xmm7 1656; SSSE3-NEXT: psrlw $4, %xmm0 1657; SSSE3-NEXT: pand %xmm5, %xmm0 1658; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1659; SSSE3-NEXT: movdqa %xmm2, %xmm3 1660; SSSE3-NEXT: pshufb %xmm0, %xmm3 1661; SSSE3-NEXT: por %xmm7, %xmm3 1662; SSSE3-NEXT: pshufb %xmm4, %xmm1 1663; SSSE3-NEXT: movdqa %xmm1, %xmm0 1664; SSSE3-NEXT: pand %xmm5, %xmm0 1665; SSSE3-NEXT: pshufb %xmm0, %xmm6 1666; SSSE3-NEXT: psrlw $4, %xmm1 1667; SSSE3-NEXT: pand %xmm5, %xmm1 1668; SSSE3-NEXT: pshufb %xmm1, %xmm2 1669; SSSE3-NEXT: por %xmm6, %xmm2 1670; SSSE3-NEXT: movdqa %xmm3, %xmm0 1671; SSSE3-NEXT: movdqa %xmm2, %xmm1 1672; SSSE3-NEXT: retq 1673; 1674; AVX1-LABEL: test_bitreverse_v16i16: 1675; AVX1: # BB#0: 1676; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1677; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1678; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1679; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1680; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1681; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1682; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1683; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1684; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1685; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 
1686; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1687; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1688; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1689; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1690; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1691; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1692; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1693; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1694; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1695; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1696; AVX1-NEXT: retq 1697; 1698; AVX2-LABEL: test_bitreverse_v16i16: 1699; AVX2: # BB#0: 1700; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1701; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1702; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1703; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1704; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1705; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1706; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1707; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1708; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1709; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1710; AVX2-NEXT: retq 1711; 1712; AVX512-LABEL: test_bitreverse_v16i16: 1713; AVX512: # BB#0: 1714; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1715; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1716; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1717; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1718; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1719; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1720; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1721; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1722; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1723; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1724; AVX512-NEXT: retq 1725; 1726; XOPAVX1-LABEL: test_bitreverse_v16i16: 1727; XOPAVX1: # BB#0: 1728; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1729; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1730; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1731; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1732; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1733; XOPAVX1-NEXT: retq 1734; 1735; XOPAVX2-LABEL: test_bitreverse_v16i16: 1736; XOPAVX2: # BB#0: 1737; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1738; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1739; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1740; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1741; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1742; XOPAVX2-NEXT: retq 1743 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) 1744 ret <16 x i16> %b 1745} 1746 1747define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { 1748; SSE2-LABEL: test_bitreverse_v8i32: 1749; SSE2: # BB#0: 1750; SSE2-NEXT: pxor %xmm9, %xmm9 1751; SSE2-NEXT: movdqa %xmm0, %xmm2 1752; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 1753; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1754; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1755; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1756; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1757; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1758; SSE2-NEXT: packuswb %xmm2, %xmm0 1759; SSE2-NEXT: movdqa %xmm0, %xmm2 1760; SSE2-NEXT: psllw $5, %xmm2 1761; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 1762; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 1763; SSE2-NEXT: pand %xmm10, %xmm2 1764; SSE2-NEXT: movdqa %xmm0, %xmm3 1765; SSE2-NEXT: psllw $7, %xmm3 1766; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1767; SSE2-NEXT: pand %xmm11, %xmm11 1768; SSE2-NEXT: pand %xmm11, %xmm3 1769; SSE2-NEXT: movdqa %xmm0, %xmm4 1770; SSE2-NEXT: psllw $3, %xmm4 1771; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 1772; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 1773; SSE2-NEXT: pand %xmm12, %xmm4 1774; SSE2-NEXT: por %xmm2, %xmm4 1775; SSE2-NEXT: movdqa %xmm0, %xmm2 1776; SSE2-NEXT: paddb %xmm2, %xmm2 1777; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1778; SSE2-NEXT: pand %xmm8, %xmm2 1779; SSE2-NEXT: por %xmm4, %xmm2 1780; SSE2-NEXT: movdqa %xmm0, %xmm4 1781; SSE2-NEXT: psrlw $1, %xmm4 1782; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1783; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 1784; SSE2-NEXT: pand %xmm13, %xmm4 1785; SSE2-NEXT: por %xmm2, %xmm4 1786; SSE2-NEXT: movdqa %xmm0, %xmm5 1787; SSE2-NEXT: psrlw $3, %xmm5 1788; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 1789; SSE2-NEXT: pand {{.*}}(%rip), %xmm6 1790; SSE2-NEXT: pand %xmm6, %xmm5 1791; SSE2-NEXT: por %xmm4, %xmm5 1792; SSE2-NEXT: movdqa %xmm0, %xmm7 1793; SSE2-NEXT: psrlw $5, %xmm7 1794; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 1795; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1796; SSE2-NEXT: pand %xmm2, %xmm7 1797; SSE2-NEXT: por %xmm5, %xmm7 1798; SSE2-NEXT: psrlw $7, %xmm0 1799; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1800; SSE2-NEXT: pand %xmm4, %xmm4 1801; SSE2-NEXT: pand %xmm4, %xmm0 1802; SSE2-NEXT: por %xmm7, %xmm0 1803; SSE2-NEXT: por %xmm3, %xmm0 1804; SSE2-NEXT: movdqa %xmm1, %xmm3 1805; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] 1806; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 1807; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 1808; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 1809; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1810; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1811; SSE2-NEXT: packuswb %xmm3, %xmm1 1812; SSE2-NEXT: movdqa %xmm1, %xmm5 1813; SSE2-NEXT: psllw $5, %xmm5 1814; SSE2-NEXT: pand %xmm10, %xmm5 1815; SSE2-NEXT: movdqa %xmm1, %xmm3 1816; SSE2-NEXT: psllw $7, %xmm3 1817; SSE2-NEXT: pand %xmm11, %xmm3 1818; SSE2-NEXT: movdqa %xmm1, %xmm7 1819; 
SSE2-NEXT: psllw $3, %xmm7 1820; SSE2-NEXT: pand %xmm12, %xmm7 1821; SSE2-NEXT: por %xmm5, %xmm7 1822; SSE2-NEXT: movdqa %xmm1, %xmm5 1823; SSE2-NEXT: paddb %xmm5, %xmm5 1824; SSE2-NEXT: pand %xmm8, %xmm5 1825; SSE2-NEXT: por %xmm7, %xmm5 1826; SSE2-NEXT: movdqa %xmm1, %xmm7 1827; SSE2-NEXT: psrlw $1, %xmm7 1828; SSE2-NEXT: pand %xmm13, %xmm7 1829; SSE2-NEXT: por %xmm5, %xmm7 1830; SSE2-NEXT: movdqa %xmm1, %xmm5 1831; SSE2-NEXT: psrlw $3, %xmm5 1832; SSE2-NEXT: pand %xmm6, %xmm5 1833; SSE2-NEXT: por %xmm7, %xmm5 1834; SSE2-NEXT: movdqa %xmm1, %xmm6 1835; SSE2-NEXT: psrlw $5, %xmm6 1836; SSE2-NEXT: pand %xmm2, %xmm6 1837; SSE2-NEXT: por %xmm5, %xmm6 1838; SSE2-NEXT: psrlw $7, %xmm1 1839; SSE2-NEXT: pand %xmm4, %xmm1 1840; SSE2-NEXT: por %xmm6, %xmm1 1841; SSE2-NEXT: por %xmm3, %xmm1 1842; SSE2-NEXT: retq 1843; 1844; SSSE3-LABEL: test_bitreverse_v8i32: 1845; SSSE3: # BB#0: 1846; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1847; SSSE3-NEXT: pshufb %xmm4, %xmm0 1848; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1849; SSSE3-NEXT: movdqa %xmm0, %xmm2 1850; SSSE3-NEXT: pand %xmm5, %xmm2 1851; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1852; SSSE3-NEXT: movdqa %xmm6, %xmm7 1853; SSSE3-NEXT: pshufb %xmm2, %xmm7 1854; SSSE3-NEXT: psrlw $4, %xmm0 1855; SSSE3-NEXT: pand %xmm5, %xmm0 1856; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1857; SSSE3-NEXT: movdqa %xmm2, %xmm3 1858; SSSE3-NEXT: pshufb %xmm0, %xmm3 1859; SSSE3-NEXT: por %xmm7, %xmm3 1860; SSSE3-NEXT: pshufb %xmm4, %xmm1 1861; SSSE3-NEXT: movdqa %xmm1, %xmm0 1862; SSSE3-NEXT: pand %xmm5, %xmm0 1863; SSSE3-NEXT: pshufb %xmm0, %xmm6 1864; SSSE3-NEXT: psrlw $4, %xmm1 1865; SSSE3-NEXT: pand %xmm5, %xmm1 1866; SSSE3-NEXT: pshufb %xmm1, %xmm2 1867; SSSE3-NEXT: por %xmm6, %xmm2 1868; SSSE3-NEXT: movdqa %xmm3, %xmm0 1869; SSSE3-NEXT: movdqa %xmm2, %xmm1 1870; SSSE3-NEXT: retq 1871; 1872; AVX1-LABEL: test_bitreverse_v8i32: 1873; AVX1: # BB#0: 1874; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1875; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1876; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1877; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1878; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1879; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1880; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1881; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1882; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1883; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1884; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1885; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1886; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1887; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1888; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1889; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1890; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1891; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1892; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1893; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1894; AVX1-NEXT: retq 1895; 1896; AVX2-LABEL: test_bitreverse_v8i32: 1897; AVX2: # BB#0: 1898; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1899; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1900; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1901; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1902; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1903; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1904; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1905; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1906; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1907; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1908; AVX2-NEXT: retq 1909; 1910; AVX512-LABEL: test_bitreverse_v8i32: 1911; AVX512: # BB#0: 1912; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1913; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1914; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1915; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1916; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1917; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1918; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1919; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1920; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1921; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1922; AVX512-NEXT: retq 1923; 1924; XOPAVX1-LABEL: test_bitreverse_v8i32: 1925; XOPAVX1: # BB#0: 1926; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1927; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1928; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1929; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1930; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1931; XOPAVX1-NEXT: retq 1932; 1933; XOPAVX2-LABEL: test_bitreverse_v8i32: 1934; XOPAVX2: # BB#0: 1935; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1936; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1937; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1938; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1939; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1940; XOPAVX2-NEXT: retq 1941 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) 1942 ret <8 x i32> %b 1943} 1944 1945define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { 1946; SSE2-LABEL: test_bitreverse_v4i64: 1947; SSE2: # BB#0: 1948; SSE2-NEXT: pxor %xmm9, %xmm9 1949; SSE2-NEXT: movdqa %xmm0, %xmm2 1950; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 1951; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1952; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1953; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1954; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1955; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1956; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1957; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1958; SSE2-NEXT: packuswb %xmm2, %xmm0 1959; SSE2-NEXT: movdqa %xmm0, %xmm2 1960; SSE2-NEXT: psllw $5, %xmm2 1961; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 1962; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 1963; SSE2-NEXT: pand %xmm10, %xmm2 1964; SSE2-NEXT: movdqa %xmm0, %xmm4 1965; 
SSE2-NEXT: psllw $7, %xmm4 1966; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1967; SSE2-NEXT: pand %xmm11, %xmm11 1968; SSE2-NEXT: pand %xmm11, %xmm4 1969; SSE2-NEXT: movdqa %xmm0, %xmm3 1970; SSE2-NEXT: psllw $3, %xmm3 1971; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 1972; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 1973; SSE2-NEXT: pand %xmm12, %xmm3 1974; SSE2-NEXT: por %xmm2, %xmm3 1975; SSE2-NEXT: movdqa %xmm0, %xmm2 1976; SSE2-NEXT: paddb %xmm2, %xmm2 1977; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1978; SSE2-NEXT: pand %xmm8, %xmm2 1979; SSE2-NEXT: por %xmm3, %xmm2 1980; SSE2-NEXT: movdqa %xmm0, %xmm3 1981; SSE2-NEXT: psrlw $1, %xmm3 1982; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1983; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 1984; SSE2-NEXT: pand %xmm13, %xmm3 1985; SSE2-NEXT: por %xmm2, %xmm3 1986; SSE2-NEXT: movdqa %xmm0, %xmm5 1987; SSE2-NEXT: psrlw $3, %xmm5 1988; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 1989; SSE2-NEXT: pand {{.*}}(%rip), %xmm6 1990; SSE2-NEXT: pand %xmm6, %xmm5 1991; SSE2-NEXT: por %xmm3, %xmm5 1992; SSE2-NEXT: movdqa %xmm0, %xmm7 1993; SSE2-NEXT: psrlw $5, %xmm7 1994; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 1995; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1996; SSE2-NEXT: pand %xmm2, %xmm7 1997; SSE2-NEXT: por %xmm5, %xmm7 1998; SSE2-NEXT: psrlw $7, %xmm0 1999; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2000; SSE2-NEXT: pand %xmm3, %xmm3 2001; SSE2-NEXT: pand %xmm3, %xmm0 2002; SSE2-NEXT: por %xmm7, %xmm0 2003; SSE2-NEXT: por %xmm4, %xmm0 2004; SSE2-NEXT: movdqa %xmm1, %xmm4 2005; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2006; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2007; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2008; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2009; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 2010; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2011; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2012; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2013; SSE2-NEXT: packuswb %xmm4, %xmm1 2014; SSE2-NEXT: movdqa %xmm1, %xmm5 2015; SSE2-NEXT: psllw $5, %xmm5 2016; SSE2-NEXT: pand %xmm10, %xmm5 2017; SSE2-NEXT: movdqa %xmm1, %xmm4 2018; SSE2-NEXT: psllw $7, %xmm4 2019; SSE2-NEXT: pand %xmm11, %xmm4 2020; SSE2-NEXT: movdqa %xmm1, %xmm7 2021; SSE2-NEXT: psllw $3, %xmm7 2022; SSE2-NEXT: pand %xmm12, %xmm7 2023; SSE2-NEXT: por %xmm5, %xmm7 2024; SSE2-NEXT: movdqa %xmm1, %xmm5 2025; SSE2-NEXT: paddb %xmm5, %xmm5 2026; SSE2-NEXT: pand %xmm8, %xmm5 2027; SSE2-NEXT: por %xmm7, %xmm5 2028; SSE2-NEXT: movdqa %xmm1, %xmm7 2029; SSE2-NEXT: psrlw $1, %xmm7 2030; SSE2-NEXT: pand %xmm13, %xmm7 2031; SSE2-NEXT: por %xmm5, %xmm7 2032; SSE2-NEXT: movdqa %xmm1, %xmm5 2033; SSE2-NEXT: psrlw $3, %xmm5 2034; SSE2-NEXT: pand %xmm6, %xmm5 2035; SSE2-NEXT: por %xmm7, %xmm5 2036; SSE2-NEXT: movdqa %xmm1, %xmm6 2037; SSE2-NEXT: psrlw $5, %xmm6 2038; SSE2-NEXT: pand %xmm2, %xmm6 2039; SSE2-NEXT: por %xmm5, %xmm6 2040; SSE2-NEXT: psrlw $7, %xmm1 2041; SSE2-NEXT: pand %xmm3, %xmm1 2042; SSE2-NEXT: por %xmm6, %xmm1 2043; SSE2-NEXT: por 
%xmm4, %xmm1 2044; SSE2-NEXT: retq 2045; 2046; SSSE3-LABEL: test_bitreverse_v4i64: 2047; SSSE3: # BB#0: 2048; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2049; SSSE3-NEXT: pshufb %xmm4, %xmm0 2050; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2051; SSSE3-NEXT: movdqa %xmm0, %xmm2 2052; SSSE3-NEXT: pand %xmm5, %xmm2 2053; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2054; SSSE3-NEXT: movdqa %xmm6, %xmm7 2055; SSSE3-NEXT: pshufb %xmm2, %xmm7 2056; SSSE3-NEXT: psrlw $4, %xmm0 2057; SSSE3-NEXT: pand %xmm5, %xmm0 2058; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2059; SSSE3-NEXT: movdqa %xmm2, %xmm3 2060; SSSE3-NEXT: pshufb %xmm0, %xmm3 2061; SSSE3-NEXT: por %xmm7, %xmm3 2062; SSSE3-NEXT: pshufb %xmm4, %xmm1 2063; SSSE3-NEXT: movdqa %xmm1, %xmm0 2064; SSSE3-NEXT: pand %xmm5, %xmm0 2065; SSSE3-NEXT: pshufb %xmm0, %xmm6 2066; SSSE3-NEXT: psrlw $4, %xmm1 2067; SSSE3-NEXT: pand %xmm5, %xmm1 2068; SSSE3-NEXT: pshufb %xmm1, %xmm2 2069; SSSE3-NEXT: por %xmm6, %xmm2 2070; SSSE3-NEXT: movdqa %xmm3, %xmm0 2071; SSSE3-NEXT: movdqa %xmm2, %xmm1 2072; SSSE3-NEXT: retq 2073; 2074; AVX1-LABEL: test_bitreverse_v4i64: 2075; AVX1: # BB#0: 2076; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2077; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2078; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2079; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2080; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 2081; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2082; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2083; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2084; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2085; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2086; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 2087; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 2088; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2089; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 2090; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 2091; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2092; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 2093; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 2094; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 2095; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2096; AVX1-NEXT: retq 2097; 2098; AVX2-LABEL: test_bitreverse_v4i64: 2099; AVX2: # BB#0: 2100; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 2101; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2102; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 2103; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2104; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 2105; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2106; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2107; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2108; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 2109; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 2110; AVX2-NEXT: retq 2111; 2112; AVX512-LABEL: test_bitreverse_v4i64: 2113; AVX512: # BB#0: 2114; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 2115; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2116; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 2117; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2118; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 2119; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 2120; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2121; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2122; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 2123; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 2124; AVX512-NEXT: retq 2125; 2126; XOPAVX1-LABEL: test_bitreverse_v4i64: 2127; XOPAVX1: # BB#0: 2128; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2129; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2130; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 2131; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 2132; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2133; XOPAVX1-NEXT: retq 2134; 2135; XOPAVX2-LABEL: test_bitreverse_v4i64: 2136; XOPAVX2: # BB#0: 2137; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2138; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2139; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 2140; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 2141; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2142; XOPAVX2-NEXT: retq 2143 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) 2144 ret <4 x i64> %b 2145} 2146 2147define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 2148; SSE2-LABEL: test_bitreverse_v64i8: 2149; SSE2: # BB#0: 2150; SSE2-NEXT: movdqa %xmm0, %xmm4 2151; SSE2-NEXT: psllw $5, %xmm4 2152; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 2153; SSE2-NEXT: pand {{.*}}(%rip), %xmm9 2154; SSE2-NEXT: pand %xmm9, %xmm4 2155; SSE2-NEXT: movdqa %xmm0, %xmm7 2156; SSE2-NEXT: psllw $7, %xmm7 2157; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 2158; SSE2-NEXT: pand %xmm10, %xmm10 2159; SSE2-NEXT: pand %xmm10, %xmm7 2160; SSE2-NEXT: movdqa %xmm0, %xmm5 2161; SSE2-NEXT: psllw $3, %xmm5 2162; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 2163; SSE2-NEXT: pand {{.*}}(%rip), %xmm11 2164; SSE2-NEXT: pand %xmm11, %xmm5 2165; SSE2-NEXT: por %xmm4, %xmm5 2166; SSE2-NEXT: movdqa %xmm0, %xmm4 2167; SSE2-NEXT: paddb %xmm4, %xmm4 2168; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2169; SSE2-NEXT: pand %xmm8, %xmm4 2170; SSE2-NEXT: por %xmm5, %xmm4 2171; SSE2-NEXT: movdqa %xmm0, %xmm5 2172; SSE2-NEXT: psrlw $1, %xmm5 2173; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 2174; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 2175; SSE2-NEXT: pand %xmm12, %xmm5 2176; SSE2-NEXT: por %xmm4, %xmm5 2177; SSE2-NEXT: movdqa %xmm0, %xmm6 2178; SSE2-NEXT: psrlw $3, %xmm6 2179; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 2180; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 2181; SSE2-NEXT: pand %xmm13, %xmm6 2182; SSE2-NEXT: por %xmm5, %xmm6 2183; SSE2-NEXT: movdqa %xmm0, %xmm4 2184; SSE2-NEXT: psrlw $5, %xmm4 2185; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2186; SSE2-NEXT: pand {{.*}}(%rip), %xmm14 2187; SSE2-NEXT: pand %xmm14, %xmm4 2188; SSE2-NEXT: por %xmm6, %xmm4 2189; SSE2-NEXT: psrlw $7, %xmm0 2190; SSE2-NEXT: movdqa {{.*#+}} xmm6 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2191; SSE2-NEXT: pand %xmm6, %xmm6 2192; SSE2-NEXT: pand %xmm6, %xmm0 2193; SSE2-NEXT: por %xmm4, %xmm0 2194; SSE2-NEXT: por %xmm7, %xmm0 2195; SSE2-NEXT: movdqa %xmm1, %xmm4 2196; SSE2-NEXT: psllw $5, %xmm4 2197; SSE2-NEXT: pand %xmm9, %xmm4 2198; SSE2-NEXT: movdqa %xmm1, %xmm7 2199; SSE2-NEXT: psllw $7, %xmm7 2200; SSE2-NEXT: pand %xmm10, %xmm7 2201; SSE2-NEXT: movdqa %xmm1, %xmm5 2202; SSE2-NEXT: psllw $3, %xmm5 2203; SSE2-NEXT: pand %xmm11, %xmm5 2204; SSE2-NEXT: por %xmm4, %xmm5 2205; SSE2-NEXT: movdqa %xmm1, %xmm4 2206; SSE2-NEXT: paddb %xmm4, %xmm4 2207; SSE2-NEXT: pand %xmm8, %xmm4 2208; SSE2-NEXT: por %xmm5, %xmm4 2209; SSE2-NEXT: movdqa %xmm1, %xmm5 2210; SSE2-NEXT: psrlw $1, %xmm5 2211; SSE2-NEXT: pand %xmm12, %xmm5 2212; SSE2-NEXT: por %xmm4, %xmm5 2213; SSE2-NEXT: movdqa %xmm1, %xmm4 2214; SSE2-NEXT: psrlw $3, %xmm4 2215; SSE2-NEXT: pand %xmm13, %xmm4 2216; SSE2-NEXT: por %xmm5, %xmm4 2217; SSE2-NEXT: movdqa %xmm1, %xmm5 2218; SSE2-NEXT: psrlw $5, %xmm5 2219; SSE2-NEXT: pand %xmm14, %xmm5 2220; SSE2-NEXT: por %xmm4, %xmm5 2221; SSE2-NEXT: psrlw $7, %xmm1 2222; SSE2-NEXT: pand %xmm6, %xmm1 2223; SSE2-NEXT: por %xmm5, %xmm1 2224; SSE2-NEXT: por %xmm7, %xmm1 2225; SSE2-NEXT: movdqa %xmm2, %xmm4 2226; SSE2-NEXT: psllw $5, %xmm4 2227; SSE2-NEXT: pand %xmm9, %xmm4 2228; SSE2-NEXT: movdqa %xmm2, %xmm7 2229; SSE2-NEXT: psllw $7, %xmm7 2230; SSE2-NEXT: pand %xmm10, %xmm7 2231; SSE2-NEXT: movdqa %xmm2, %xmm5 2232; SSE2-NEXT: psllw $3, %xmm5 2233; SSE2-NEXT: pand %xmm11, %xmm5 2234; SSE2-NEXT: por %xmm4, %xmm5 2235; SSE2-NEXT: movdqa %xmm2, %xmm4 2236; SSE2-NEXT: paddb %xmm4, %xmm4 2237; SSE2-NEXT: pand %xmm8, %xmm4 2238; SSE2-NEXT: por %xmm5, %xmm4 2239; SSE2-NEXT: movdqa %xmm2, %xmm5 2240; SSE2-NEXT: psrlw $1, %xmm5 2241; SSE2-NEXT: pand %xmm12, %xmm5 2242; SSE2-NEXT: por %xmm4, %xmm5 2243; SSE2-NEXT: movdqa %xmm2, %xmm4 2244; SSE2-NEXT: psrlw $3, %xmm4 2245; SSE2-NEXT: pand %xmm13, %xmm4 2246; SSE2-NEXT: por %xmm5, %xmm4 2247; SSE2-NEXT: movdqa %xmm2, %xmm5 2248; SSE2-NEXT: psrlw $5, %xmm5 2249; SSE2-NEXT: pand %xmm14, %xmm5 2250; SSE2-NEXT: por %xmm4, %xmm5 2251; SSE2-NEXT: psrlw $7, %xmm2 2252; SSE2-NEXT: pand %xmm6, %xmm2 2253; SSE2-NEXT: por %xmm5, %xmm2 2254; SSE2-NEXT: por %xmm7, %xmm2 2255; SSE2-NEXT: movdqa %xmm3, %xmm4 2256; SSE2-NEXT: psllw $5, %xmm4 2257; SSE2-NEXT: pand %xmm9, %xmm4 2258; SSE2-NEXT: movdqa %xmm3, %xmm7 2259; SSE2-NEXT: psllw $7, %xmm7 2260; SSE2-NEXT: pand %xmm10, %xmm7 2261; SSE2-NEXT: movdqa %xmm3, %xmm5 2262; SSE2-NEXT: psllw $3, %xmm5 2263; SSE2-NEXT: pand %xmm11, %xmm5 2264; SSE2-NEXT: por %xmm4, %xmm5 2265; SSE2-NEXT: movdqa %xmm3, %xmm4 2266; SSE2-NEXT: paddb %xmm4, %xmm4 2267; SSE2-NEXT: pand %xmm8, %xmm4 2268; SSE2-NEXT: por %xmm5, %xmm4 2269; SSE2-NEXT: movdqa %xmm3, %xmm5 2270; SSE2-NEXT: psrlw $1, %xmm5 2271; SSE2-NEXT: pand %xmm12, %xmm5 2272; SSE2-NEXT: por %xmm4, %xmm5 2273; SSE2-NEXT: movdqa %xmm3, %xmm4 2274; SSE2-NEXT: psrlw $3, %xmm4 2275; SSE2-NEXT: pand %xmm13, %xmm4 2276; SSE2-NEXT: por %xmm5, %xmm4 2277; SSE2-NEXT: movdqa %xmm3, %xmm5 2278; SSE2-NEXT: psrlw $5, %xmm5 2279; SSE2-NEXT: pand %xmm14, %xmm5 2280; SSE2-NEXT: por %xmm4, %xmm5 2281; SSE2-NEXT: psrlw $7, %xmm3 2282; SSE2-NEXT: pand %xmm6, %xmm3 2283; SSE2-NEXT: por %xmm5, %xmm3 2284; SSE2-NEXT: por %xmm7, %xmm3 2285; SSE2-NEXT: retq 2286; 2287; SSSE3-LABEL: test_bitreverse_v64i8: 2288; SSSE3: # BB#0: 2289; SSSE3-NEXT: movdqa %xmm0, %xmm5 2290; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2291; 
SSSE3-NEXT: pand %xmm8, %xmm0 2292; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2293; SSSE3-NEXT: movdqa %xmm9, %xmm6 2294; SSSE3-NEXT: pshufb %xmm0, %xmm6 2295; SSSE3-NEXT: psrlw $4, %xmm5 2296; SSSE3-NEXT: pand %xmm8, %xmm5 2297; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2298; SSSE3-NEXT: movdqa %xmm4, %xmm0 2299; SSSE3-NEXT: pshufb %xmm5, %xmm0 2300; SSSE3-NEXT: por %xmm6, %xmm0 2301; SSSE3-NEXT: movdqa %xmm1, %xmm5 2302; SSSE3-NEXT: pand %xmm8, %xmm5 2303; SSSE3-NEXT: movdqa %xmm9, %xmm6 2304; SSSE3-NEXT: pshufb %xmm5, %xmm6 2305; SSSE3-NEXT: psrlw $4, %xmm1 2306; SSSE3-NEXT: pand %xmm8, %xmm1 2307; SSSE3-NEXT: movdqa %xmm4, %xmm5 2308; SSSE3-NEXT: pshufb %xmm1, %xmm5 2309; SSSE3-NEXT: por %xmm6, %xmm5 2310; SSSE3-NEXT: movdqa %xmm2, %xmm1 2311; SSSE3-NEXT: pand %xmm8, %xmm1 2312; SSSE3-NEXT: movdqa %xmm9, %xmm7 2313; SSSE3-NEXT: pshufb %xmm1, %xmm7 2314; SSSE3-NEXT: psrlw $4, %xmm2 2315; SSSE3-NEXT: pand %xmm8, %xmm2 2316; SSSE3-NEXT: movdqa %xmm4, %xmm6 2317; SSSE3-NEXT: pshufb %xmm2, %xmm6 2318; SSSE3-NEXT: por %xmm7, %xmm6 2319; SSSE3-NEXT: movdqa %xmm3, %xmm1 2320; SSSE3-NEXT: pand %xmm8, %xmm1 2321; SSSE3-NEXT: pshufb %xmm1, %xmm9 2322; SSSE3-NEXT: psrlw $4, %xmm3 2323; SSSE3-NEXT: pand %xmm8, %xmm3 2324; SSSE3-NEXT: pshufb %xmm3, %xmm4 2325; SSSE3-NEXT: por %xmm9, %xmm4 2326; SSSE3-NEXT: movdqa %xmm5, %xmm1 2327; SSSE3-NEXT: movdqa %xmm6, %xmm2 2328; SSSE3-NEXT: movdqa %xmm4, %xmm3 2329; SSSE3-NEXT: retq 2330; 2331; AVX1-LABEL: test_bitreverse_v64i8: 2332; AVX1: # BB#0: 2333; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2334; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2335; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4 2336; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2337; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2338; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2339; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2340; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2341; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 2342; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 2343; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm4 2344; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2345; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2346; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 2347; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 2348; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 2349; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2350; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2351; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4 2352; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2353; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2354; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2355; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 2356; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 2357; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm4 2358; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2359; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2360; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2361; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 2362; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 2363; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2364; AVX1-NEXT: retq 2365; 2366; AVX2-LABEL: test_bitreverse_v64i8: 2367; AVX2: # BB#0: 2368; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2369; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 2370; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2371; AVX2-NEXT: vpshufb 
%ymm3, %ymm4, %ymm3 2372; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2373; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2374; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2375; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 2376; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 2377; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 2378; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2379; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2380; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2381; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 2382; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 2383; AVX2-NEXT: retq 2384; 2385; AVX512F-LABEL: test_bitreverse_v64i8: 2386; AVX512F: # BB#0: 2387; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2388; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 2389; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2390; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2391; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2392; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 2393; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2394; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0 2395; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 2396; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 2397; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2398; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2399; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 2400; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 2401; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 2402; AVX512F-NEXT: retq 2403; 2404; AVX512BW-LABEL: test_bitreverse_v64i8: 2405; AVX512BW: # BB#0: 2406; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2407; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2408; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2409; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2410; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2411; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2412; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2413; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2414; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2415; AVX512BW-NEXT: retq 2416; 2417; XOPAVX1-LABEL: test_bitreverse_v64i8: 2418; XOPAVX1: # BB#0: 2419; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2420; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2421; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2422; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2423; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2424; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2425; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2426; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2427; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2428; XOPAVX1-NEXT: retq 2429; 2430; XOPAVX2-LABEL: test_bitreverse_v64i8: 2431; XOPAVX2: # BB#0: 2432; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2433; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2434; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2435; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2436; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2437; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2438; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2439; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2440; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2441; XOPAVX2-NEXT: retq 2442 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 2443 ret <64 x i8> %b 2444} 2445 2446define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 2447; SSE2-LABEL: test_bitreverse_v32i16: 2448; SSE2: # BB#0: 2449; SSE2-NEXT: pxor %xmm9, %xmm9 2450; SSE2-NEXT: movdqa %xmm0, %xmm4 2451; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2452; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 2453; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 2454; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 2455; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 2456; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 2457; SSE2-NEXT: packuswb %xmm4, %xmm0 2458; SSE2-NEXT: movdqa %xmm0, %xmm5 2459; SSE2-NEXT: psllw $5, %xmm5 2460; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 2461; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 2462; SSE2-NEXT: pand %xmm10, %xmm5 2463; SSE2-NEXT: movdqa %xmm0, %xmm4 2464; SSE2-NEXT: psllw $7, %xmm4 2465; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 2466; SSE2-NEXT: pand %xmm11, %xmm11 2467; SSE2-NEXT: pand %xmm11, %xmm4 2468; SSE2-NEXT: movdqa %xmm0, %xmm6 2469; SSE2-NEXT: psllw $3, %xmm6 2470; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 2471; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 2472; SSE2-NEXT: pand %xmm12, %xmm6 2473; SSE2-NEXT: por %xmm5, %xmm6 2474; SSE2-NEXT: movdqa %xmm0, %xmm5 2475; SSE2-NEXT: paddb %xmm5, %xmm5 2476; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2477; SSE2-NEXT: pand %xmm8, %xmm5 2478; SSE2-NEXT: por %xmm6, %xmm5 2479; SSE2-NEXT: movdqa %xmm0, %xmm6 2480; SSE2-NEXT: psrlw $1, %xmm6 2481; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 2482; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 2483; SSE2-NEXT: pand %xmm13, %xmm6 2484; SSE2-NEXT: por %xmm5, %xmm6 2485; SSE2-NEXT: movdqa %xmm0, %xmm7 2486; SSE2-NEXT: psrlw $3, %xmm7 2487; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 2488; SSE2-NEXT: pand {{.*}}(%rip), %xmm14 2489; SSE2-NEXT: pand %xmm14, %xmm7 2490; SSE2-NEXT: por %xmm6, %xmm7 2491; SSE2-NEXT: movdqa %xmm0, %xmm5 2492; SSE2-NEXT: psrlw $5, %xmm5 2493; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2494; SSE2-NEXT: pand {{.*}}(%rip), %xmm15 2495; SSE2-NEXT: pand %xmm15, %xmm5 2496; SSE2-NEXT: por %xmm7, %xmm5 2497; SSE2-NEXT: psrlw $7, %xmm0 2498; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2499; SSE2-NEXT: pand %xmm7, %xmm7 2500; SSE2-NEXT: pand %xmm7, %xmm0 2501; SSE2-NEXT: por %xmm5, %xmm0 2502; SSE2-NEXT: por %xmm4, %xmm0 2503; SSE2-NEXT: movdqa %xmm1, %xmm4 2504; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2505; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 2506; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 2507; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 2508; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 2509; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 2510; SSE2-NEXT: packuswb %xmm4, %xmm1 2511; SSE2-NEXT: movdqa %xmm1, %xmm5 2512; SSE2-NEXT: psllw $5, %xmm5 2513; SSE2-NEXT: pand %xmm10, %xmm5 2514; SSE2-NEXT: movdqa %xmm1, %xmm4 2515; SSE2-NEXT: psllw $7, %xmm4 2516; SSE2-NEXT: pand %xmm11, %xmm4 2517; SSE2-NEXT: movdqa %xmm1, %xmm6 2518; SSE2-NEXT: psllw $3, %xmm6 2519; SSE2-NEXT: pand %xmm12, %xmm6 2520; SSE2-NEXT: por %xmm5, %xmm6 2521; SSE2-NEXT: movdqa %xmm1, %xmm5 2522; SSE2-NEXT: paddb %xmm5, %xmm5 2523; SSE2-NEXT: pand %xmm8, %xmm5 2524; SSE2-NEXT: por %xmm6, %xmm5 2525; SSE2-NEXT: movdqa %xmm1, %xmm6 2526; SSE2-NEXT: psrlw $1, %xmm6 2527; SSE2-NEXT: pand %xmm13, %xmm6 2528; SSE2-NEXT: por %xmm5, %xmm6 2529; SSE2-NEXT: movdqa %xmm1, %xmm5 2530; SSE2-NEXT: psrlw $3, %xmm5 2531; SSE2-NEXT: pand %xmm14, %xmm5 2532; SSE2-NEXT: por %xmm6, %xmm5 2533; SSE2-NEXT: movdqa %xmm1, %xmm6 2534; SSE2-NEXT: psrlw $5, %xmm6 2535; SSE2-NEXT: pand %xmm15, %xmm6 2536; SSE2-NEXT: por %xmm5, %xmm6 2537; SSE2-NEXT: psrlw $7, %xmm1 2538; SSE2-NEXT: pand %xmm7, %xmm1 2539; SSE2-NEXT: por %xmm6, %xmm1 2540; SSE2-NEXT: por %xmm4, %xmm1 2541; SSE2-NEXT: movdqa %xmm2, %xmm4 2542; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2543; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 2544; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 2545; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 2546; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 2547; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 2548; SSE2-NEXT: packuswb %xmm4, %xmm2 2549; SSE2-NEXT: movdqa %xmm2, %xmm5 2550; SSE2-NEXT: psllw $5, %xmm5 2551; SSE2-NEXT: pand %xmm10, %xmm5 2552; SSE2-NEXT: movdqa %xmm2, %xmm4 2553; SSE2-NEXT: psllw $7, %xmm4 2554; SSE2-NEXT: pand %xmm11, %xmm4 2555; SSE2-NEXT: movdqa %xmm2, %xmm6 2556; SSE2-NEXT: psllw $3, %xmm6 2557; SSE2-NEXT: pand %xmm12, %xmm6 2558; SSE2-NEXT: por %xmm5, %xmm6 2559; SSE2-NEXT: movdqa %xmm2, %xmm5 2560; SSE2-NEXT: paddb %xmm5, %xmm5 2561; SSE2-NEXT: pand %xmm8, %xmm5 2562; SSE2-NEXT: por %xmm6, %xmm5 2563; SSE2-NEXT: movdqa %xmm2, %xmm6 2564; SSE2-NEXT: psrlw $1, %xmm6 2565; SSE2-NEXT: pand %xmm13, %xmm6 2566; SSE2-NEXT: por %xmm5, %xmm6 2567; SSE2-NEXT: movdqa %xmm2, %xmm5 2568; SSE2-NEXT: psrlw $3, %xmm5 2569; SSE2-NEXT: pand %xmm14, %xmm5 2570; SSE2-NEXT: por %xmm6, %xmm5 2571; SSE2-NEXT: movdqa %xmm2, %xmm6 2572; SSE2-NEXT: psrlw $5, %xmm6 2573; SSE2-NEXT: pand %xmm15, %xmm6 2574; SSE2-NEXT: por %xmm5, %xmm6 2575; SSE2-NEXT: psrlw $7, %xmm2 2576; SSE2-NEXT: pand %xmm7, %xmm2 2577; SSE2-NEXT: por %xmm6, %xmm2 2578; SSE2-NEXT: por %xmm4, %xmm2 2579; SSE2-NEXT: movdqa %xmm3, %xmm4 2580; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2581; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 2582; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 2583; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 2584; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] 2585; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6] 2586; SSE2-NEXT: packuswb %xmm4, %xmm3 2587; SSE2-NEXT: movdqa %xmm3, %xmm5 2588; SSE2-NEXT: psllw $5, %xmm5 2589; SSE2-NEXT: pand %xmm10, %xmm5 2590; SSE2-NEXT: movdqa %xmm3, %xmm4 2591; SSE2-NEXT: psllw $7, %xmm4 2592; SSE2-NEXT: pand %xmm11, %xmm4 2593; SSE2-NEXT: movdqa %xmm3, %xmm6 2594; SSE2-NEXT: psllw $3, %xmm6 2595; SSE2-NEXT: pand %xmm12, %xmm6 2596; SSE2-NEXT: por %xmm5, %xmm6 2597; SSE2-NEXT: movdqa %xmm3, %xmm5 2598; SSE2-NEXT: paddb %xmm5, %xmm5 2599; SSE2-NEXT: pand %xmm8, %xmm5 2600; SSE2-NEXT: por %xmm6, %xmm5 2601; SSE2-NEXT: movdqa %xmm3, %xmm6 2602; SSE2-NEXT: psrlw $1, %xmm6 2603; SSE2-NEXT: pand %xmm13, %xmm6 2604; SSE2-NEXT: por %xmm5, %xmm6 2605; SSE2-NEXT: movdqa %xmm3, %xmm5 2606; SSE2-NEXT: psrlw $3, %xmm5 2607; SSE2-NEXT: pand %xmm14, %xmm5 2608; SSE2-NEXT: por %xmm6, %xmm5 2609; SSE2-NEXT: movdqa %xmm3, %xmm6 2610; SSE2-NEXT: psrlw $5, %xmm6 2611; SSE2-NEXT: pand %xmm15, %xmm6 2612; SSE2-NEXT: por %xmm5, %xmm6 2613; SSE2-NEXT: psrlw $7, %xmm3 2614; SSE2-NEXT: pand %xmm7, %xmm3 2615; SSE2-NEXT: por %xmm6, %xmm3 2616; SSE2-NEXT: por %xmm4, %xmm3 2617; SSE2-NEXT: retq 2618; 2619; SSSE3-LABEL: test_bitreverse_v32i16: 2620; SSSE3: # BB#0: 2621; SSSE3-NEXT: movdqa %xmm1, %xmm5 2622; SSSE3-NEXT: movdqa %xmm0, %xmm1 2623; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2624; SSSE3-NEXT: pshufb %xmm8, %xmm1 2625; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2626; SSSE3-NEXT: movdqa %xmm1, %xmm0 2627; SSSE3-NEXT: pand %xmm9, %xmm0 2628; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2629; SSSE3-NEXT: movdqa %xmm7, %xmm6 2630; SSSE3-NEXT: pshufb %xmm0, %xmm6 2631; SSSE3-NEXT: psrlw $4, %xmm1 2632; SSSE3-NEXT: pand %xmm9, %xmm1 2633; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2634; SSSE3-NEXT: movdqa %xmm4, %xmm0 2635; SSSE3-NEXT: pshufb %xmm1, %xmm0 2636; SSSE3-NEXT: por %xmm6, %xmm0 2637; SSSE3-NEXT: pshufb %xmm8, %xmm5 2638; SSSE3-NEXT: movdqa %xmm5, %xmm1 2639; SSSE3-NEXT: pand %xmm9, %xmm1 2640; SSSE3-NEXT: movdqa %xmm7, %xmm6 2641; SSSE3-NEXT: pshufb %xmm1, %xmm6 2642; SSSE3-NEXT: psrlw $4, %xmm5 2643; SSSE3-NEXT: pand %xmm9, %xmm5 2644; SSSE3-NEXT: movdqa %xmm4, %xmm1 2645; SSSE3-NEXT: pshufb %xmm5, %xmm1 2646; SSSE3-NEXT: por %xmm6, %xmm1 2647; SSSE3-NEXT: pshufb %xmm8, %xmm2 2648; SSSE3-NEXT: movdqa %xmm2, %xmm5 2649; SSSE3-NEXT: pand %xmm9, %xmm5 2650; SSSE3-NEXT: movdqa %xmm7, %xmm6 2651; SSSE3-NEXT: pshufb %xmm5, %xmm6 2652; SSSE3-NEXT: psrlw $4, %xmm2 2653; SSSE3-NEXT: pand %xmm9, %xmm2 2654; SSSE3-NEXT: movdqa %xmm4, %xmm5 2655; SSSE3-NEXT: pshufb %xmm2, %xmm5 2656; SSSE3-NEXT: por %xmm6, %xmm5 2657; SSSE3-NEXT: pshufb %xmm8, %xmm3 2658; SSSE3-NEXT: movdqa %xmm3, %xmm2 2659; SSSE3-NEXT: pand %xmm9, %xmm2 2660; SSSE3-NEXT: pshufb %xmm2, %xmm7 2661; SSSE3-NEXT: psrlw $4, %xmm3 2662; SSSE3-NEXT: pand %xmm9, %xmm3 2663; SSSE3-NEXT: pshufb %xmm3, %xmm4 2664; SSSE3-NEXT: 
por %xmm7, %xmm4 2665; SSSE3-NEXT: movdqa %xmm5, %xmm2 2666; SSSE3-NEXT: movdqa %xmm4, %xmm3 2667; SSSE3-NEXT: retq 2668; 2669; AVX1-LABEL: test_bitreverse_v32i16: 2670; AVX1: # BB#0: 2671; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2672; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2673; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2674; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2675; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2676; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2677; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2678; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2679; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2680; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2681; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2682; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2683; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2684; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2685; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2686; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2687; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2688; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2689; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2690; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2691; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2692; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2693; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2694; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2695; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2696; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2697; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2698; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2699; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2700; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2701; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2702; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2703; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2704; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2705; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2706; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2707; AVX1-NEXT: retq 2708; 2709; AVX2-LABEL: test_bitreverse_v32i16: 2710; AVX2: # BB#0: 2711; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2712; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2713; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2714; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2715; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2716; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2717; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2718; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2719; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2720; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2721; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2722; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2723; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2724; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2725; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2726; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2727; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2728; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2729; AVX2-NEXT: retq 2730; 2731; AVX512F-LABEL: test_bitreverse_v32i16: 2732; AVX512F: # BB#0: 2733; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2734; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2735; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2736; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4 2737; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2738; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2739; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2740; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2741; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2742; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2743; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 2744; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2745; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2 2746; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2747; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2748; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2749; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2750; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 2751; AVX512F-NEXT: retq 2752; 2753; AVX512BW-LABEL: test_bitreverse_v32i16: 2754; AVX512BW: # BB#0: 2755; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2756; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2757; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2758; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2759; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2760; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2761; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2762; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2763; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2764; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2765; AVX512BW-NEXT: retq 2766; 2767; XOPAVX1-LABEL: test_bitreverse_v32i16: 2768; XOPAVX1: # BB#0: 2769; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2770; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2771; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2772; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2773; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2774; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2775; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2776; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2777; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2778; XOPAVX1-NEXT: retq 2779; 2780; XOPAVX2-LABEL: test_bitreverse_v32i16: 2781; XOPAVX2: # BB#0: 2782; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2783; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2784; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2785; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2786; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2787; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2788; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2789; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2790; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, 
%ymm1 2791; XOPAVX2-NEXT: retq 2792 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 2793 ret <32 x i16> %b 2794} 2795 2796define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 2797; SSE2-LABEL: test_bitreverse_v16i32: 2798; SSE2: # BB#0: 2799; SSE2-NEXT: pxor %xmm9, %xmm9 2800; SSE2-NEXT: movdqa %xmm0, %xmm4 2801; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2802; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2803; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2804; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 2805; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2806; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2807; SSE2-NEXT: packuswb %xmm4, %xmm0 2808; SSE2-NEXT: movdqa %xmm0, %xmm5 2809; SSE2-NEXT: psllw $5, %xmm5 2810; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 2811; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 2812; SSE2-NEXT: pand %xmm10, %xmm5 2813; SSE2-NEXT: movdqa %xmm0, %xmm4 2814; SSE2-NEXT: psllw $7, %xmm4 2815; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 2816; SSE2-NEXT: pand %xmm11, %xmm11 2817; SSE2-NEXT: pand %xmm11, %xmm4 2818; SSE2-NEXT: movdqa %xmm0, %xmm6 2819; SSE2-NEXT: psllw $3, %xmm6 2820; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 2821; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 2822; SSE2-NEXT: pand %xmm12, %xmm6 2823; SSE2-NEXT: por %xmm5, %xmm6 2824; SSE2-NEXT: movdqa %xmm0, %xmm5 2825; SSE2-NEXT: paddb %xmm5, %xmm5 2826; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2827; SSE2-NEXT: pand %xmm8, %xmm5 2828; SSE2-NEXT: por %xmm6, %xmm5 2829; SSE2-NEXT: movdqa %xmm0, %xmm6 2830; SSE2-NEXT: psrlw $1, %xmm6 2831; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 2832; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 2833; SSE2-NEXT: pand %xmm13, %xmm6 2834; SSE2-NEXT: por %xmm5, %xmm6 2835; SSE2-NEXT: movdqa %xmm0, %xmm7 2836; SSE2-NEXT: psrlw $3, %xmm7 2837; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 2838; SSE2-NEXT: pand {{.*}}(%rip), %xmm14 2839; SSE2-NEXT: pand %xmm14, %xmm7 2840; SSE2-NEXT: por %xmm6, %xmm7 2841; SSE2-NEXT: movdqa %xmm0, %xmm5 2842; SSE2-NEXT: psrlw $5, %xmm5 2843; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2844; SSE2-NEXT: pand {{.*}}(%rip), %xmm15 2845; SSE2-NEXT: pand %xmm15, %xmm5 2846; SSE2-NEXT: por %xmm7, %xmm5 2847; SSE2-NEXT: psrlw $7, %xmm0 2848; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2849; SSE2-NEXT: pand %xmm7, %xmm7 2850; SSE2-NEXT: pand %xmm7, %xmm0 2851; SSE2-NEXT: por %xmm5, %xmm0 2852; SSE2-NEXT: por %xmm4, %xmm0 2853; SSE2-NEXT: movdqa %xmm1, %xmm4 2854; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2855; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2856; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2857; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 2858; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[3,2,1,0,4,5,6,7] 2859; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2860; SSE2-NEXT: packuswb %xmm4, %xmm1 2861; SSE2-NEXT: movdqa %xmm1, %xmm5 2862; SSE2-NEXT: psllw $5, %xmm5 2863; SSE2-NEXT: pand %xmm10, %xmm5 2864; SSE2-NEXT: movdqa %xmm1, %xmm4 2865; SSE2-NEXT: psllw $7, %xmm4 2866; SSE2-NEXT: pand %xmm11, %xmm4 2867; SSE2-NEXT: movdqa %xmm1, %xmm6 2868; SSE2-NEXT: psllw $3, %xmm6 2869; SSE2-NEXT: pand %xmm12, %xmm6 2870; SSE2-NEXT: por %xmm5, %xmm6 2871; SSE2-NEXT: movdqa %xmm1, %xmm5 2872; SSE2-NEXT: paddb %xmm5, %xmm5 2873; SSE2-NEXT: pand %xmm8, %xmm5 2874; SSE2-NEXT: por %xmm6, %xmm5 2875; SSE2-NEXT: movdqa %xmm1, %xmm6 2876; SSE2-NEXT: psrlw $1, %xmm6 2877; SSE2-NEXT: pand %xmm13, %xmm6 2878; SSE2-NEXT: por %xmm5, %xmm6 2879; SSE2-NEXT: movdqa %xmm1, %xmm5 2880; SSE2-NEXT: psrlw $3, %xmm5 2881; SSE2-NEXT: pand %xmm14, %xmm5 2882; SSE2-NEXT: por %xmm6, %xmm5 2883; SSE2-NEXT: movdqa %xmm1, %xmm6 2884; SSE2-NEXT: psrlw $5, %xmm6 2885; SSE2-NEXT: pand %xmm15, %xmm6 2886; SSE2-NEXT: por %xmm5, %xmm6 2887; SSE2-NEXT: psrlw $7, %xmm1 2888; SSE2-NEXT: pand %xmm7, %xmm1 2889; SSE2-NEXT: por %xmm6, %xmm1 2890; SSE2-NEXT: por %xmm4, %xmm1 2891; SSE2-NEXT: movdqa %xmm2, %xmm4 2892; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2893; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2894; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2895; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 2896; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2897; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2898; SSE2-NEXT: packuswb %xmm4, %xmm2 2899; SSE2-NEXT: movdqa %xmm2, %xmm5 2900; SSE2-NEXT: psllw $5, %xmm5 2901; SSE2-NEXT: pand %xmm10, %xmm5 2902; SSE2-NEXT: movdqa %xmm2, %xmm4 2903; SSE2-NEXT: psllw $7, %xmm4 2904; SSE2-NEXT: pand %xmm11, %xmm4 2905; SSE2-NEXT: movdqa %xmm2, %xmm6 2906; SSE2-NEXT: psllw $3, %xmm6 2907; SSE2-NEXT: pand %xmm12, %xmm6 2908; SSE2-NEXT: por %xmm5, %xmm6 2909; SSE2-NEXT: movdqa %xmm2, %xmm5 2910; SSE2-NEXT: paddb %xmm5, %xmm5 2911; SSE2-NEXT: pand %xmm8, %xmm5 2912; SSE2-NEXT: por %xmm6, %xmm5 2913; SSE2-NEXT: movdqa %xmm2, %xmm6 2914; SSE2-NEXT: psrlw $1, %xmm6 2915; SSE2-NEXT: pand %xmm13, %xmm6 2916; SSE2-NEXT: por %xmm5, %xmm6 2917; SSE2-NEXT: movdqa %xmm2, %xmm5 2918; SSE2-NEXT: psrlw $3, %xmm5 2919; SSE2-NEXT: pand %xmm14, %xmm5 2920; SSE2-NEXT: por %xmm6, %xmm5 2921; SSE2-NEXT: movdqa %xmm2, %xmm6 2922; SSE2-NEXT: psrlw $5, %xmm6 2923; SSE2-NEXT: pand %xmm15, %xmm6 2924; SSE2-NEXT: por %xmm5, %xmm6 2925; SSE2-NEXT: psrlw $7, %xmm2 2926; SSE2-NEXT: pand %xmm7, %xmm2 2927; SSE2-NEXT: por %xmm6, %xmm2 2928; SSE2-NEXT: por %xmm4, %xmm2 2929; SSE2-NEXT: movdqa %xmm3, %xmm4 2930; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2931; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2932; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2933; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 2934; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2935; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2936; 
SSE2-NEXT: packuswb %xmm4, %xmm3 2937; SSE2-NEXT: movdqa %xmm3, %xmm5 2938; SSE2-NEXT: psllw $5, %xmm5 2939; SSE2-NEXT: pand %xmm10, %xmm5 2940; SSE2-NEXT: movdqa %xmm3, %xmm4 2941; SSE2-NEXT: psllw $7, %xmm4 2942; SSE2-NEXT: pand %xmm11, %xmm4 2943; SSE2-NEXT: movdqa %xmm3, %xmm6 2944; SSE2-NEXT: psllw $3, %xmm6 2945; SSE2-NEXT: pand %xmm12, %xmm6 2946; SSE2-NEXT: por %xmm5, %xmm6 2947; SSE2-NEXT: movdqa %xmm3, %xmm5 2948; SSE2-NEXT: paddb %xmm5, %xmm5 2949; SSE2-NEXT: pand %xmm8, %xmm5 2950; SSE2-NEXT: por %xmm6, %xmm5 2951; SSE2-NEXT: movdqa %xmm3, %xmm6 2952; SSE2-NEXT: psrlw $1, %xmm6 2953; SSE2-NEXT: pand %xmm13, %xmm6 2954; SSE2-NEXT: por %xmm5, %xmm6 2955; SSE2-NEXT: movdqa %xmm3, %xmm5 2956; SSE2-NEXT: psrlw $3, %xmm5 2957; SSE2-NEXT: pand %xmm14, %xmm5 2958; SSE2-NEXT: por %xmm6, %xmm5 2959; SSE2-NEXT: movdqa %xmm3, %xmm6 2960; SSE2-NEXT: psrlw $5, %xmm6 2961; SSE2-NEXT: pand %xmm15, %xmm6 2962; SSE2-NEXT: por %xmm5, %xmm6 2963; SSE2-NEXT: psrlw $7, %xmm3 2964; SSE2-NEXT: pand %xmm7, %xmm3 2965; SSE2-NEXT: por %xmm6, %xmm3 2966; SSE2-NEXT: por %xmm4, %xmm3 2967; SSE2-NEXT: retq 2968; 2969; SSSE3-LABEL: test_bitreverse_v16i32: 2970; SSSE3: # BB#0: 2971; SSSE3-NEXT: movdqa %xmm1, %xmm5 2972; SSSE3-NEXT: movdqa %xmm0, %xmm1 2973; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2974; SSSE3-NEXT: pshufb %xmm8, %xmm1 2975; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2976; SSSE3-NEXT: movdqa %xmm1, %xmm0 2977; SSSE3-NEXT: pand %xmm9, %xmm0 2978; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2979; SSSE3-NEXT: movdqa %xmm7, %xmm6 2980; SSSE3-NEXT: pshufb %xmm0, %xmm6 2981; SSSE3-NEXT: psrlw $4, %xmm1 2982; SSSE3-NEXT: pand %xmm9, %xmm1 2983; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2984; SSSE3-NEXT: movdqa %xmm4, %xmm0 2985; SSSE3-NEXT: pshufb %xmm1, %xmm0 2986; SSSE3-NEXT: por %xmm6, %xmm0 2987; SSSE3-NEXT: pshufb %xmm8, %xmm5 2988; SSSE3-NEXT: movdqa %xmm5, %xmm1 2989; SSSE3-NEXT: pand %xmm9, %xmm1 2990; SSSE3-NEXT: movdqa %xmm7, %xmm6 2991; SSSE3-NEXT: pshufb %xmm1, %xmm6 2992; SSSE3-NEXT: psrlw $4, %xmm5 2993; SSSE3-NEXT: pand %xmm9, %xmm5 2994; SSSE3-NEXT: movdqa %xmm4, %xmm1 2995; SSSE3-NEXT: pshufb %xmm5, %xmm1 2996; SSSE3-NEXT: por %xmm6, %xmm1 2997; SSSE3-NEXT: pshufb %xmm8, %xmm2 2998; SSSE3-NEXT: movdqa %xmm2, %xmm5 2999; SSSE3-NEXT: pand %xmm9, %xmm5 3000; SSSE3-NEXT: movdqa %xmm7, %xmm6 3001; SSSE3-NEXT: pshufb %xmm5, %xmm6 3002; SSSE3-NEXT: psrlw $4, %xmm2 3003; SSSE3-NEXT: pand %xmm9, %xmm2 3004; SSSE3-NEXT: movdqa %xmm4, %xmm5 3005; SSSE3-NEXT: pshufb %xmm2, %xmm5 3006; SSSE3-NEXT: por %xmm6, %xmm5 3007; SSSE3-NEXT: pshufb %xmm8, %xmm3 3008; SSSE3-NEXT: movdqa %xmm3, %xmm2 3009; SSSE3-NEXT: pand %xmm9, %xmm2 3010; SSSE3-NEXT: pshufb %xmm2, %xmm7 3011; SSSE3-NEXT: psrlw $4, %xmm3 3012; SSSE3-NEXT: pand %xmm9, %xmm3 3013; SSSE3-NEXT: pshufb %xmm3, %xmm4 3014; SSSE3-NEXT: por %xmm7, %xmm4 3015; SSSE3-NEXT: movdqa %xmm5, %xmm2 3016; SSSE3-NEXT: movdqa %xmm4, %xmm3 3017; SSSE3-NEXT: retq 3018; 3019; AVX1-LABEL: test_bitreverse_v16i32: 3020; AVX1: # BB#0: 3021; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3022; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 3023; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3024; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3025; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3026; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3027; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3028; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3029; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3030; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3031; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3032; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3033; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3034; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 3035; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3036; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 3037; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 3038; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 3039; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 3040; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3041; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3042; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3043; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3044; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3045; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3046; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3047; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3048; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3049; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3050; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 3051; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3052; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 3053; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 3054; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 3055; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 3056; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3057; AVX1-NEXT: retq 3058; 3059; AVX2-LABEL: test_bitreverse_v16i32: 3060; AVX2: # BB#0: 3061; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 3062; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3063; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3064; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 3065; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3066; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3067; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 3068; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 3069; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3070; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 3071; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 3072; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3073; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 3074; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3075; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 3076; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3077; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 3078; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 3079; AVX2-NEXT: retq 3080; 3081; AVX512F-LABEL: test_bitreverse_v16i32: 3082; AVX512F: # BB#0: 3083; AVX512F-NEXT: vpslld $29, %zmm0, %zmm1 3084; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1 3085; AVX512F-NEXT: vpslld $31, %zmm0, %zmm2 3086; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3087; AVX512F-NEXT: vpord %zmm1, %zmm2, %zmm1 3088; AVX512F-NEXT: vpslld $27, %zmm0, %zmm2 3089; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3090; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3091; AVX512F-NEXT: vpslld $25, %zmm0, %zmm2 3092; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3093; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3094; AVX512F-NEXT: vpslld $23, %zmm0, %zmm2 3095; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3096; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3097; AVX512F-NEXT: vpslld $21, %zmm0, %zmm2 3098; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, 
%zmm2, %zmm2 3099; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3100; AVX512F-NEXT: vpslld $19, %zmm0, %zmm2 3101; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3102; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3103; AVX512F-NEXT: vpslld $17, %zmm0, %zmm2 3104; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3105; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3106; AVX512F-NEXT: vpslld $15, %zmm0, %zmm2 3107; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3108; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3109; AVX512F-NEXT: vpslld $13, %zmm0, %zmm2 3110; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3111; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3112; AVX512F-NEXT: vpslld $11, %zmm0, %zmm2 3113; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3114; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3115; AVX512F-NEXT: vpslld $9, %zmm0, %zmm2 3116; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3117; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3118; AVX512F-NEXT: vpslld $7, %zmm0, %zmm2 3119; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3120; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3121; AVX512F-NEXT: vpslld $5, %zmm0, %zmm2 3122; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3123; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3124; AVX512F-NEXT: vpslld $3, %zmm0, %zmm2 3125; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3126; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3127; AVX512F-NEXT: vpslld $1, %zmm0, %zmm2 3128; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3129; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3130; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm2 3131; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3132; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3133; AVX512F-NEXT: vpsrld $3, %zmm0, %zmm2 3134; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3135; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3136; AVX512F-NEXT: vpsrld $5, %zmm0, %zmm2 3137; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3138; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3139; AVX512F-NEXT: vpsrld $7, %zmm0, %zmm2 3140; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3141; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3142; AVX512F-NEXT: vpsrld $9, %zmm0, %zmm2 3143; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3144; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3145; AVX512F-NEXT: vpsrld $11, %zmm0, %zmm2 3146; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3147; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3148; AVX512F-NEXT: vpsrld $13, %zmm0, %zmm2 3149; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3150; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3151; AVX512F-NEXT: vpsrld $15, %zmm0, %zmm2 3152; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3153; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3154; AVX512F-NEXT: vpsrld $17, %zmm0, %zmm2 3155; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3156; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3157; AVX512F-NEXT: vpsrld $19, %zmm0, %zmm2 3158; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3159; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3160; AVX512F-NEXT: vpsrld $21, %zmm0, %zmm2 3161; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3162; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3163; AVX512F-NEXT: vpsrld $23, %zmm0, %zmm2 3164; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3165; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3166; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm2 3167; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3168; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3169; 
AVX512F-NEXT: vpsrld $27, %zmm0, %zmm2 3170; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3171; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3172; AVX512F-NEXT: vpsrld $29, %zmm0, %zmm2 3173; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3174; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3175; AVX512F-NEXT: vpsrld $31, %zmm0, %zmm0 3176; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 3177; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0 3178; AVX512F-NEXT: retq 3179; 3180; AVX512BW-LABEL: test_bitreverse_v16i32: 3181; AVX512BW: # BB#0: 3182; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 3183; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3184; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 3185; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3186; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 3187; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 3188; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 3189; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3190; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 3191; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 3192; AVX512BW-NEXT: retq 3193; 3194; XOPAVX1-LABEL: test_bitreverse_v16i32: 3195; XOPAVX1: # BB#0: 3196; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3197; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 3198; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3199; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3200; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3201; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3202; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3203; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3204; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3205; XOPAVX1-NEXT: retq 3206; 3207; XOPAVX2-LABEL: test_bitreverse_v16i32: 3208; XOPAVX2: # BB#0: 3209; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 3210; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 3211; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3212; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3213; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 3214; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3215; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3216; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3217; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3218; XOPAVX2-NEXT: retq 3219 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 3220 ret <16 x i32> %b 3221} 3222 3223define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 3224; SSE2-LABEL: test_bitreverse_v8i64: 3225; SSE2: # BB#0: 3226; SSE2-NEXT: pxor %xmm9, %xmm9 3227; SSE2-NEXT: movdqa %xmm0, %xmm4 3228; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 3229; SSE2-NEXT: 
pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 3230; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 3231; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 3232; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 3233; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 3234; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 3235; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 3236; SSE2-NEXT: packuswb %xmm4, %xmm0 3237; SSE2-NEXT: movdqa %xmm0, %xmm5 3238; SSE2-NEXT: psllw $5, %xmm5 3239; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 3240; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 3241; SSE2-NEXT: pand %xmm10, %xmm5 3242; SSE2-NEXT: movdqa %xmm0, %xmm4 3243; SSE2-NEXT: psllw $7, %xmm4 3244; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 3245; SSE2-NEXT: pand %xmm11, %xmm11 3246; SSE2-NEXT: pand %xmm11, %xmm4 3247; SSE2-NEXT: movdqa %xmm0, %xmm6 3248; SSE2-NEXT: psllw $3, %xmm6 3249; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 3250; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 3251; SSE2-NEXT: pand %xmm12, %xmm6 3252; SSE2-NEXT: por %xmm5, %xmm6 3253; SSE2-NEXT: movdqa %xmm0, %xmm5 3254; SSE2-NEXT: paddb %xmm5, %xmm5 3255; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 3256; SSE2-NEXT: pand %xmm8, %xmm5 3257; SSE2-NEXT: por %xmm6, %xmm5 3258; SSE2-NEXT: movdqa %xmm0, %xmm6 3259; SSE2-NEXT: psrlw $1, %xmm6 3260; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 3261; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 3262; SSE2-NEXT: pand %xmm13, %xmm6 3263; SSE2-NEXT: por %xmm5, %xmm6 3264; SSE2-NEXT: movdqa %xmm0, %xmm7 3265; SSE2-NEXT: psrlw $3, %xmm7 3266; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 3267; SSE2-NEXT: pand {{.*}}(%rip), %xmm14 3268; SSE2-NEXT: pand %xmm14, %xmm7 3269; SSE2-NEXT: por %xmm6, %xmm7 3270; SSE2-NEXT: movdqa %xmm0, %xmm5 3271; SSE2-NEXT: psrlw $5, %xmm5 3272; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 3273; SSE2-NEXT: pand {{.*}}(%rip), %xmm15 3274; SSE2-NEXT: pand %xmm15, %xmm5 3275; SSE2-NEXT: por %xmm7, %xmm5 3276; SSE2-NEXT: psrlw $7, %xmm0 3277; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 3278; SSE2-NEXT: pand %xmm7, %xmm7 3279; SSE2-NEXT: pand %xmm7, %xmm0 3280; SSE2-NEXT: por %xmm5, %xmm0 3281; SSE2-NEXT: por %xmm4, %xmm0 3282; SSE2-NEXT: movdqa %xmm1, %xmm4 3283; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 3284; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 3285; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 3286; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 3287; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 3288; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 3289; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 3290; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 3291; SSE2-NEXT: packuswb %xmm4, %xmm1 3292; SSE2-NEXT: movdqa %xmm1, %xmm5 3293; SSE2-NEXT: psllw $5, %xmm5 3294; SSE2-NEXT: pand %xmm10, %xmm5 3295; SSE2-NEXT: movdqa %xmm1, %xmm4 3296; SSE2-NEXT: psllw $7, %xmm4 3297; SSE2-NEXT: 
pand %xmm11, %xmm4 3298; SSE2-NEXT: movdqa %xmm1, %xmm6 3299; SSE2-NEXT: psllw $3, %xmm6 3300; SSE2-NEXT: pand %xmm12, %xmm6 3301; SSE2-NEXT: por %xmm5, %xmm6 3302; SSE2-NEXT: movdqa %xmm1, %xmm5 3303; SSE2-NEXT: paddb %xmm5, %xmm5 3304; SSE2-NEXT: pand %xmm8, %xmm5 3305; SSE2-NEXT: por %xmm6, %xmm5 3306; SSE2-NEXT: movdqa %xmm1, %xmm6 3307; SSE2-NEXT: psrlw $1, %xmm6 3308; SSE2-NEXT: pand %xmm13, %xmm6 3309; SSE2-NEXT: por %xmm5, %xmm6 3310; SSE2-NEXT: movdqa %xmm1, %xmm5 3311; SSE2-NEXT: psrlw $3, %xmm5 3312; SSE2-NEXT: pand %xmm14, %xmm5 3313; SSE2-NEXT: por %xmm6, %xmm5 3314; SSE2-NEXT: movdqa %xmm1, %xmm6 3315; SSE2-NEXT: psrlw $5, %xmm6 3316; SSE2-NEXT: pand %xmm15, %xmm6 3317; SSE2-NEXT: por %xmm5, %xmm6 3318; SSE2-NEXT: psrlw $7, %xmm1 3319; SSE2-NEXT: pand %xmm7, %xmm1 3320; SSE2-NEXT: por %xmm6, %xmm1 3321; SSE2-NEXT: por %xmm4, %xmm1 3322; SSE2-NEXT: movdqa %xmm2, %xmm4 3323; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 3324; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 3325; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 3326; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 3327; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 3328; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 3329; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 3330; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 3331; SSE2-NEXT: packuswb %xmm4, %xmm2 3332; SSE2-NEXT: movdqa %xmm2, %xmm5 3333; SSE2-NEXT: psllw $5, %xmm5 3334; SSE2-NEXT: pand %xmm10, %xmm5 3335; SSE2-NEXT: movdqa %xmm2, %xmm4 3336; SSE2-NEXT: psllw $7, %xmm4 3337; SSE2-NEXT: pand %xmm11, %xmm4 3338; SSE2-NEXT: movdqa %xmm2, %xmm6 3339; SSE2-NEXT: psllw $3, %xmm6 3340; SSE2-NEXT: pand %xmm12, %xmm6 3341; SSE2-NEXT: por %xmm5, %xmm6 3342; SSE2-NEXT: movdqa %xmm2, %xmm5 3343; SSE2-NEXT: paddb %xmm5, %xmm5 3344; SSE2-NEXT: pand %xmm8, %xmm5 3345; SSE2-NEXT: por %xmm6, %xmm5 3346; SSE2-NEXT: movdqa %xmm2, %xmm6 3347; SSE2-NEXT: psrlw $1, %xmm6 3348; SSE2-NEXT: pand %xmm13, %xmm6 3349; SSE2-NEXT: por %xmm5, %xmm6 3350; SSE2-NEXT: movdqa %xmm2, %xmm5 3351; SSE2-NEXT: psrlw $3, %xmm5 3352; SSE2-NEXT: pand %xmm14, %xmm5 3353; SSE2-NEXT: por %xmm6, %xmm5 3354; SSE2-NEXT: movdqa %xmm2, %xmm6 3355; SSE2-NEXT: psrlw $5, %xmm6 3356; SSE2-NEXT: pand %xmm15, %xmm6 3357; SSE2-NEXT: por %xmm5, %xmm6 3358; SSE2-NEXT: psrlw $7, %xmm2 3359; SSE2-NEXT: pand %xmm7, %xmm2 3360; SSE2-NEXT: por %xmm6, %xmm2 3361; SSE2-NEXT: por %xmm4, %xmm2 3362; SSE2-NEXT: movdqa %xmm3, %xmm4 3363; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 3364; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 3365; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 3366; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 3367; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 3368; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 3369; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 3370; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 3371; SSE2-NEXT: packuswb %xmm4, %xmm3 3372; SSE2-NEXT: movdqa %xmm3, %xmm5 3373; SSE2-NEXT: psllw $5, 
%xmm5 3374; SSE2-NEXT: pand %xmm10, %xmm5 3375; SSE2-NEXT: movdqa %xmm3, %xmm4 3376; SSE2-NEXT: psllw $7, %xmm4 3377; SSE2-NEXT: pand %xmm11, %xmm4 3378; SSE2-NEXT: movdqa %xmm3, %xmm6 3379; SSE2-NEXT: psllw $3, %xmm6 3380; SSE2-NEXT: pand %xmm12, %xmm6 3381; SSE2-NEXT: por %xmm5, %xmm6 3382; SSE2-NEXT: movdqa %xmm3, %xmm5 3383; SSE2-NEXT: paddb %xmm5, %xmm5 3384; SSE2-NEXT: pand %xmm8, %xmm5 3385; SSE2-NEXT: por %xmm6, %xmm5 3386; SSE2-NEXT: movdqa %xmm3, %xmm6 3387; SSE2-NEXT: psrlw $1, %xmm6 3388; SSE2-NEXT: pand %xmm13, %xmm6 3389; SSE2-NEXT: por %xmm5, %xmm6 3390; SSE2-NEXT: movdqa %xmm3, %xmm5 3391; SSE2-NEXT: psrlw $3, %xmm5 3392; SSE2-NEXT: pand %xmm14, %xmm5 3393; SSE2-NEXT: por %xmm6, %xmm5 3394; SSE2-NEXT: movdqa %xmm3, %xmm6 3395; SSE2-NEXT: psrlw $5, %xmm6 3396; SSE2-NEXT: pand %xmm15, %xmm6 3397; SSE2-NEXT: por %xmm5, %xmm6 3398; SSE2-NEXT: psrlw $7, %xmm3 3399; SSE2-NEXT: pand %xmm7, %xmm3 3400; SSE2-NEXT: por %xmm6, %xmm3 3401; SSE2-NEXT: por %xmm4, %xmm3 3402; SSE2-NEXT: retq 3403; 3404; SSSE3-LABEL: test_bitreverse_v8i64: 3405; SSSE3: # BB#0: 3406; SSSE3-NEXT: movdqa %xmm1, %xmm5 3407; SSSE3-NEXT: movdqa %xmm0, %xmm1 3408; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3409; SSSE3-NEXT: pshufb %xmm8, %xmm1 3410; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3411; SSSE3-NEXT: movdqa %xmm1, %xmm0 3412; SSSE3-NEXT: pand %xmm9, %xmm0 3413; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3414; SSSE3-NEXT: movdqa %xmm7, %xmm6 3415; SSSE3-NEXT: pshufb %xmm0, %xmm6 3416; SSSE3-NEXT: psrlw $4, %xmm1 3417; SSSE3-NEXT: pand %xmm9, %xmm1 3418; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3419; SSSE3-NEXT: movdqa %xmm4, %xmm0 3420; SSSE3-NEXT: pshufb %xmm1, %xmm0 3421; SSSE3-NEXT: por %xmm6, %xmm0 3422; SSSE3-NEXT: pshufb %xmm8, %xmm5 3423; SSSE3-NEXT: movdqa %xmm5, %xmm1 3424; SSSE3-NEXT: pand %xmm9, %xmm1 3425; SSSE3-NEXT: movdqa %xmm7, %xmm6 3426; SSSE3-NEXT: pshufb %xmm1, %xmm6 3427; SSSE3-NEXT: psrlw $4, %xmm5 3428; SSSE3-NEXT: pand %xmm9, %xmm5 3429; SSSE3-NEXT: movdqa %xmm4, %xmm1 3430; SSSE3-NEXT: pshufb %xmm5, %xmm1 3431; SSSE3-NEXT: por %xmm6, %xmm1 3432; SSSE3-NEXT: pshufb %xmm8, %xmm2 3433; SSSE3-NEXT: movdqa %xmm2, %xmm5 3434; SSSE3-NEXT: pand %xmm9, %xmm5 3435; SSSE3-NEXT: movdqa %xmm7, %xmm6 3436; SSSE3-NEXT: pshufb %xmm5, %xmm6 3437; SSSE3-NEXT: psrlw $4, %xmm2 3438; SSSE3-NEXT: pand %xmm9, %xmm2 3439; SSSE3-NEXT: movdqa %xmm4, %xmm5 3440; SSSE3-NEXT: pshufb %xmm2, %xmm5 3441; SSSE3-NEXT: por %xmm6, %xmm5 3442; SSSE3-NEXT: pshufb %xmm8, %xmm3 3443; SSSE3-NEXT: movdqa %xmm3, %xmm2 3444; SSSE3-NEXT: pand %xmm9, %xmm2 3445; SSSE3-NEXT: pshufb %xmm2, %xmm7 3446; SSSE3-NEXT: psrlw $4, %xmm3 3447; SSSE3-NEXT: pand %xmm9, %xmm3 3448; SSSE3-NEXT: pshufb %xmm3, %xmm4 3449; SSSE3-NEXT: por %xmm7, %xmm4 3450; SSSE3-NEXT: movdqa %xmm5, %xmm2 3451; SSSE3-NEXT: movdqa %xmm4, %xmm3 3452; SSSE3-NEXT: retq 3453; 3454; AVX1-LABEL: test_bitreverse_v8i64: 3455; AVX1: # BB#0: 3456; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3457; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3458; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3459; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3460; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3461; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3462; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3463; AVX1-NEXT: vpsrlw $4, 
%xmm2, %xmm2 3464; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3465; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3466; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3467; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3468; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3469; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 3470; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3471; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 3472; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 3473; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 3474; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 3475; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3476; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3477; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3478; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3479; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3480; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3481; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3482; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3483; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3484; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3485; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 3486; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3487; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 3488; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 3489; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 3490; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 3491; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3492; AVX1-NEXT: retq 3493; 3494; AVX2-LABEL: test_bitreverse_v8i64: 3495; AVX2: # BB#0: 3496; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3497; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3498; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3499; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 3500; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3501; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3502; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 3503; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 3504; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3505; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 3506; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 3507; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3508; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 3509; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3510; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 3511; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3512; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 3513; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 3514; AVX2-NEXT: retq 3515; 3516; AVX512F-LABEL: test_bitreverse_v8i64: 3517; AVX512F: # BB#0: 3518; AVX512F-NEXT: vpsllq $61, %zmm0, %zmm1 3519; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 3520; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm2 3521; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3522; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 3523; AVX512F-NEXT: vpsllq $59, %zmm0, %zmm2 3524; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3525; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3526; AVX512F-NEXT: vpsllq $57, %zmm0, %zmm2 3527; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3528; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3529; AVX512F-NEXT: vpsllq $55, %zmm0, %zmm2 3530; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3531; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3532; AVX512F-NEXT: vpsllq $53, %zmm0, %zmm2 3533; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3534; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3535; AVX512F-NEXT: vpsllq $51, %zmm0, %zmm2 3536; AVX512F-NEXT: vpandq 
{{.*}}(%rip){1to8}, %zmm2, %zmm2 3537; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3538; AVX512F-NEXT: vpsllq $49, %zmm0, %zmm2 3539; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3540; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3541; AVX512F-NEXT: vpsllq $47, %zmm0, %zmm2 3542; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3543; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3544; AVX512F-NEXT: vpsllq $45, %zmm0, %zmm2 3545; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3546; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3547; AVX512F-NEXT: vpsllq $43, %zmm0, %zmm2 3548; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3549; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3550; AVX512F-NEXT: vpsllq $41, %zmm0, %zmm2 3551; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3552; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3553; AVX512F-NEXT: vpsllq $39, %zmm0, %zmm2 3554; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3555; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3556; AVX512F-NEXT: vpsllq $37, %zmm0, %zmm2 3557; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3558; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3559; AVX512F-NEXT: vpsllq $35, %zmm0, %zmm2 3560; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3561; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3562; AVX512F-NEXT: vpsllq $33, %zmm0, %zmm2 3563; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3564; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3565; AVX512F-NEXT: vpsllq $31, %zmm0, %zmm2 3566; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3567; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3568; AVX512F-NEXT: vpsllq $29, %zmm0, %zmm2 3569; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3570; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3571; AVX512F-NEXT: vpsllq $27, %zmm0, %zmm2 3572; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3573; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3574; AVX512F-NEXT: vpsllq $25, %zmm0, %zmm2 3575; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3576; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3577; AVX512F-NEXT: vpsllq $23, %zmm0, %zmm2 3578; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3579; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3580; AVX512F-NEXT: vpsllq $21, %zmm0, %zmm2 3581; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3582; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3583; AVX512F-NEXT: vpsllq $19, %zmm0, %zmm2 3584; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3585; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3586; AVX512F-NEXT: vpsllq $17, %zmm0, %zmm2 3587; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3588; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3589; AVX512F-NEXT: vpsllq $15, %zmm0, %zmm2 3590; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3591; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3592; AVX512F-NEXT: vpsllq $13, %zmm0, %zmm2 3593; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3594; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3595; AVX512F-NEXT: vpsllq $11, %zmm0, %zmm2 3596; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3597; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3598; AVX512F-NEXT: vpsllq $9, %zmm0, %zmm2 3599; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3600; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3601; AVX512F-NEXT: vpsllq $7, %zmm0, %zmm2 3602; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3603; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3604; AVX512F-NEXT: vpsllq $5, %zmm0, %zmm2 3605; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3606; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3607; 
AVX512F-NEXT: vpsllq $3, %zmm0, %zmm2 3608; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3609; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3610; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm2 3611; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3612; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3613; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm2 3614; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3615; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3616; AVX512F-NEXT: vpsrlq $3, %zmm0, %zmm2 3617; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3618; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3619; AVX512F-NEXT: vpsrlq $5, %zmm0, %zmm2 3620; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3621; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3622; AVX512F-NEXT: vpsrlq $7, %zmm0, %zmm2 3623; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3624; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3625; AVX512F-NEXT: vpsrlq $9, %zmm0, %zmm2 3626; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3627; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3628; AVX512F-NEXT: vpsrlq $11, %zmm0, %zmm2 3629; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3630; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3631; AVX512F-NEXT: vpsrlq $13, %zmm0, %zmm2 3632; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3633; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3634; AVX512F-NEXT: vpsrlq $15, %zmm0, %zmm2 3635; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3636; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3637; AVX512F-NEXT: vpsrlq $17, %zmm0, %zmm2 3638; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3639; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3640; AVX512F-NEXT: vpsrlq $19, %zmm0, %zmm2 3641; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3642; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3643; AVX512F-NEXT: vpsrlq $21, %zmm0, %zmm2 3644; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3645; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3646; AVX512F-NEXT: vpsrlq $23, %zmm0, %zmm2 3647; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3648; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3649; AVX512F-NEXT: vpsrlq $25, %zmm0, %zmm2 3650; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3651; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3652; AVX512F-NEXT: vpsrlq $27, %zmm0, %zmm2 3653; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3654; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3655; AVX512F-NEXT: vpsrlq $29, %zmm0, %zmm2 3656; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3657; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3658; AVX512F-NEXT: vpsrlq $31, %zmm0, %zmm2 3659; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3660; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3661; AVX512F-NEXT: vpsrlq $33, %zmm0, %zmm2 3662; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3663; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3664; AVX512F-NEXT: vpsrlq $35, %zmm0, %zmm2 3665; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3666; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3667; AVX512F-NEXT: vpsrlq $37, %zmm0, %zmm2 3668; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3669; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3670; AVX512F-NEXT: vpsrlq $39, %zmm0, %zmm2 3671; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3672; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3673; AVX512F-NEXT: vpsrlq $41, %zmm0, %zmm2 3674; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3675; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3676; AVX512F-NEXT: vpsrlq $43, %zmm0, %zmm2 3677; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, 
%zmm2, %zmm2 3678; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3679; AVX512F-NEXT: vpsrlq $45, %zmm0, %zmm2 3680; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3681; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3682; AVX512F-NEXT: vpsrlq $47, %zmm0, %zmm2 3683; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3684; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3685; AVX512F-NEXT: vpsrlq $49, %zmm0, %zmm2 3686; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3687; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3688; AVX512F-NEXT: vpsrlq $51, %zmm0, %zmm2 3689; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3690; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3691; AVX512F-NEXT: vpsrlq $53, %zmm0, %zmm2 3692; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3693; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3694; AVX512F-NEXT: vpsrlq $55, %zmm0, %zmm2 3695; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3696; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3697; AVX512F-NEXT: vpsrlq $57, %zmm0, %zmm2 3698; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3699; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3700; AVX512F-NEXT: vpsrlq $59, %zmm0, %zmm2 3701; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3702; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3703; AVX512F-NEXT: vpsrlq $61, %zmm0, %zmm2 3704; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3705; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3706; AVX512F-NEXT: vpsrlq $63, %zmm0, %zmm0 3707; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 3708; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0 3709; AVX512F-NEXT: retq 3710; 3711; AVX512BW-LABEL: test_bitreverse_v8i64: 3712; AVX512BW: # BB#0: 3713; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 3714; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3715; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 3716; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3717; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 3718; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 3719; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 3720; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3721; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 3722; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 3723; AVX512BW-NEXT: retq 3724; 3725; XOPAVX1-LABEL: test_bitreverse_v8i64: 3726; XOPAVX1: # BB#0: 3727; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3728; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 3729; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3730; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3731; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3732; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3733; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3734; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3735; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3736; XOPAVX1-NEXT: retq 3737; 
3738; XOPAVX2-LABEL: test_bitreverse_v8i64: 3739; XOPAVX2: # BB#0: 3740; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 3741; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 3742; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3743; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3744; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 3745; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3746; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3747; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3748; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3749; XOPAVX2-NEXT: retq 3750 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) 3751 ret <8 x i64> %b 3752} 3753 3754declare i8 @llvm.bitreverse.i8(i8) readnone 3755declare i16 @llvm.bitreverse.i16(i16) readnone 3756declare i32 @llvm.bitreverse.i32(i32) readnone 3757declare i64 @llvm.bitreverse.i64(i64) readnone 3758 3759declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone 3760declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone 3761declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone 3762declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone 3763 3764declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone 3765declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone 3766declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone 3767declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone 3768 3769declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone 3770declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone 3771declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone 3772declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone 3773