; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

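; The tests below cover the x86 lowering of the llvm.bitreverse.* intrinsics for
; scalar i8/i16/i32/i64 and for 128-bit and 256-bit vector types across the SSE,
; AVX, AVX-512 and XOP configurations above. Without XOP, scalar values are
; reversed bit by bit with shift/and/or sequences; XOP targets reverse the whole
; value with a single vpperm through a constant-pool selector.
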
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE:       # BB#0:
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    shlb $7, %al
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shlb $5, %cl
; SSE-NEXT:    andb $64, %cl
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shlb $3, %dl
; SSE-NEXT:    andb $32, %dl
; SSE-NEXT:    orb %cl, %dl
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    addb %cl, %cl
; SSE-NEXT:    andb $16, %cl
; SSE-NEXT:    orb %dl, %cl
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrb %dl
; SSE-NEXT:    andb $8, %dl
; SSE-NEXT:    orb %cl, %dl
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrb $3, %cl
; SSE-NEXT:    andb $4, %cl
; SSE-NEXT:    orb %dl, %cl
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrb $5, %dl
; SSE-NEXT:    andb $2, %dl
; SSE-NEXT:    orb %cl, %dl
; SSE-NEXT:    shrb $7, %dil
; SSE-NEXT:    orb %dl, %dil
; SSE-NEXT:    orb %al, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX:       # BB#0:
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    shlb $7, %al
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shlb $5, %cl
; AVX-NEXT:    andb $64, %cl
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shlb $3, %dl
; AVX-NEXT:    andb $32, %dl
; AVX-NEXT:    orb %cl, %dl
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    addb %cl, %cl
; AVX-NEXT:    andb $16, %cl
; AVX-NEXT:    orb %dl, %cl
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrb %dl
; AVX-NEXT:    andb $8, %dl
; AVX-NEXT:    orb %cl, %dl
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrb $3, %cl
; AVX-NEXT:    andb $4, %cl
; AVX-NEXT:    orb %dl, %cl
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrb $5, %dl
; AVX-NEXT:    andb $2, %dl
; AVX-NEXT:    orb %cl, %dl
; AVX-NEXT:    shrb $7, %dil
; AVX-NEXT:    orb %dl, %dil
; AVX-NEXT:    orb %al, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP:       # BB#0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vpextrb $0, %xmm0, %eax
; XOP-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; XOP-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE:       # BB#0:
; SSE-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $32768, %ecx # imm = 0x8000
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    shll $15, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $2, %edx
; SSE-NEXT:    shll $13, %edx
; SSE-NEXT:    leal (%rdx,%rax), %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $4, %edx
; SSE-NEXT:    shll $11, %edx
; SSE-NEXT:    orl %edx, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $8, %edx
; SSE-NEXT:    shll $9, %edx
; SSE-NEXT:    orl %edx, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $16, %edx
; SSE-NEXT:    shll $7, %edx
; SSE-NEXT:    orl %edx, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $32, %edx
; SSE-NEXT:    shll $5, %edx
; SSE-NEXT:    orl %edx, %eax
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    andl $64, %edx
; SSE-NEXT:    shll $3, %edx
; SSE-NEXT:    leal (%rdi,%rdi), %esi
; SSE-NEXT:    andl $256, %esi # imm = 0x100
; SSE-NEXT:    orl %edx, %esi
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl %edx
; SSE-NEXT:    andl $128, %edx
; SSE-NEXT:    orl %esi, %edx
; SSE-NEXT:    movl %edi, %esi
; SSE-NEXT:    shrl $3, %esi
; SSE-NEXT:    andl $64, %esi
; SSE-NEXT:    orl %edx, %esi
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $5, %edx
; SSE-NEXT:    andl $32, %edx
; SSE-NEXT:    orl %esi, %edx
; SSE-NEXT:    movl %edi, %esi
; SSE-NEXT:    shrl $7, %esi
; SSE-NEXT:    andl $16, %esi
; SSE-NEXT:    orl %edx, %esi
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $9, %edx
; SSE-NEXT:    andl $8, %edx
; SSE-NEXT:    orl %esi, %edx
; SSE-NEXT:    movl %edi, %esi
; SSE-NEXT:    shrl $11, %esi
; SSE-NEXT:    andl $4, %esi
; SSE-NEXT:    orl %edx, %esi
; SSE-NEXT:    shrl $13, %edi
; SSE-NEXT:    andl $2, %edi
; SSE-NEXT:    orl %esi, %edi
; SSE-NEXT:    shrl $15, %ecx
; SSE-NEXT:    orl %edi, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX:       # BB#0:
; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $32768, %ecx # imm = 0x8000
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    shll $15, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $2, %edx
; AVX-NEXT:    shll $13, %edx
; AVX-NEXT:    leal (%rdx,%rax), %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $4, %edx
; AVX-NEXT:    shll $11, %edx
; AVX-NEXT:    orl %edx, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $8, %edx
; AVX-NEXT:    shll $9, %edx
; AVX-NEXT:    orl %edx, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $16, %edx
; AVX-NEXT:    shll $7, %edx
; AVX-NEXT:    orl %edx, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $32, %edx
; AVX-NEXT:    shll $5, %edx
; AVX-NEXT:    orl %edx, %eax
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    andl $64, %edx
; AVX-NEXT:    shll $3, %edx
; AVX-NEXT:    leal (%rdi,%rdi), %esi
; AVX-NEXT:    andl $256, %esi # imm = 0x100
; AVX-NEXT:    orl %edx, %esi
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl %edx
; AVX-NEXT:    andl $128, %edx
; AVX-NEXT:    orl %esi, %edx
; AVX-NEXT:    movl %edi, %esi
; AVX-NEXT:    shrl $3, %esi
; AVX-NEXT:    andl $64, %esi
; AVX-NEXT:    orl %edx, %esi
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $5, %edx
; AVX-NEXT:    andl $32, %edx
; AVX-NEXT:    orl %esi, %edx
; AVX-NEXT:    movl %edi, %esi
; AVX-NEXT:    shrl $7, %esi
; AVX-NEXT:    andl $16, %esi
; AVX-NEXT:    orl %edx, %esi
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $9, %edx
; AVX-NEXT:    andl $8, %edx
; AVX-NEXT:    orl %esi, %edx
; AVX-NEXT:    movl %edi, %esi
; AVX-NEXT:    shrl $11, %esi
; AVX-NEXT:    andl $4, %esi
; AVX-NEXT:    orl %edx, %esi
; AVX-NEXT:    shrl $13, %edi
; AVX-NEXT:    andl $2, %edi
; AVX-NEXT:    orl %esi, %edi
; AVX-NEXT:    shrl $15, %ecx
; AVX-NEXT:    orl %edi, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP:       # BB#0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
; XOP-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE:       # BB#0:
; SSE-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    shll $31, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $2, %ecx
; SSE-NEXT:    shll $29, %ecx
; SSE-NEXT:    leal (%rcx,%rax), %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $4, %ecx
; SSE-NEXT:    shll $27, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $8, %ecx
; SSE-NEXT:    shll $25, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $16, %ecx
; SSE-NEXT:    shll $23, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $32, %ecx
; SSE-NEXT:    shll $21, %ecx
; SSE-NEXT:    orl %ecx, %eax
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    andl $64, %ecx
; SSE-NEXT:    shll $19, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shll $17, %edx
; SSE-NEXT:    andl $16777216, %edx # imm = 0x1000000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shll $15, %ecx
; SSE-NEXT:    andl $8388608, %ecx # imm = 0x800000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shll $13, %edx
; SSE-NEXT:    andl $4194304, %edx # imm = 0x400000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shll $11, %ecx
; SSE-NEXT:    andl $2097152, %ecx # imm = 0x200000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shll $9, %edx
; SSE-NEXT:    andl $1048576, %edx # imm = 0x100000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shll $7, %ecx
; SSE-NEXT:    andl $524288, %ecx # imm = 0x80000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shll $5, %edx
; SSE-NEXT:    andl $262144, %edx # imm = 0x40000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    leal (,%rdi,8), %ecx
; SSE-NEXT:    andl $131072, %ecx # imm = 0x20000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    leal (%rdi,%rdi), %edx
; SSE-NEXT:    andl $65536, %edx # imm = 0x10000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl %ecx
; SSE-NEXT:    andl $32768, %ecx # imm = 0x8000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $3, %edx
; SSE-NEXT:    andl $16384, %edx # imm = 0x4000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $5, %ecx
; SSE-NEXT:    andl $8192, %ecx # imm = 0x2000
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $7, %edx
; SSE-NEXT:    andl $4096, %edx # imm = 0x1000
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $9, %ecx
; SSE-NEXT:    andl $2048, %ecx # imm = 0x800
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $11, %edx
; SSE-NEXT:    andl $1024, %edx # imm = 0x400
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $13, %ecx
; SSE-NEXT:    andl $512, %ecx # imm = 0x200
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $15, %edx
; SSE-NEXT:    andl $256, %edx # imm = 0x100
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $17, %ecx
; SSE-NEXT:    andl $128, %ecx
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $19, %edx
; SSE-NEXT:    andl $64, %edx
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $21, %ecx
; SSE-NEXT:    andl $32, %ecx
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $23, %edx
; SSE-NEXT:    andl $16, %edx
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $25, %ecx
; SSE-NEXT:    andl $8, %ecx
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    movl %edi, %edx
; SSE-NEXT:    shrl $27, %edx
; SSE-NEXT:    andl $4, %edx
; SSE-NEXT:    orl %ecx, %edx
; SSE-NEXT:    movl %edi, %ecx
; SSE-NEXT:    shrl $29, %ecx
; SSE-NEXT:    andl $2, %ecx
; SSE-NEXT:    orl %edx, %ecx
; SSE-NEXT:    shrl $31, %edi
; SSE-NEXT:    orl %ecx, %edi
; SSE-NEXT:    orl %edi, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX:       # BB#0:
; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    shll $31, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $2, %ecx
; AVX-NEXT:    shll $29, %ecx
; AVX-NEXT:    leal (%rcx,%rax), %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $4, %ecx
; AVX-NEXT:    shll $27, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $8, %ecx
; AVX-NEXT:    shll $25, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $16, %ecx
; AVX-NEXT:    shll $23, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $32, %ecx
; AVX-NEXT:    shll $21, %ecx
; AVX-NEXT:    orl %ecx, %eax
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    andl $64, %ecx
; AVX-NEXT:    shll $19, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shll $17, %edx
; AVX-NEXT:    andl $16777216, %edx # imm = 0x1000000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shll $15, %ecx
; AVX-NEXT:    andl $8388608, %ecx # imm = 0x800000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shll $13, %edx
; AVX-NEXT:    andl $4194304, %edx # imm = 0x400000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shll $11, %ecx
; AVX-NEXT:    andl $2097152, %ecx # imm = 0x200000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shll $9, %edx
; AVX-NEXT:    andl $1048576, %edx # imm = 0x100000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shll $7, %ecx
; AVX-NEXT:    andl $524288, %ecx # imm = 0x80000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shll $5, %edx
; AVX-NEXT:    andl $262144, %edx # imm = 0x40000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    leal (,%rdi,8), %ecx
; AVX-NEXT:    andl $131072, %ecx # imm = 0x20000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    leal (%rdi,%rdi), %edx
; AVX-NEXT:    andl $65536, %edx # imm = 0x10000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrl %ecx
; AVX-NEXT:    andl $32768, %ecx # imm = 0x8000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $3, %edx
; AVX-NEXT:    andl $16384, %edx # imm = 0x4000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrl $5, %ecx
; AVX-NEXT:    andl $8192, %ecx # imm = 0x2000
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $7, %edx
; AVX-NEXT:    andl $4096, %edx # imm = 0x1000
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrl $9, %ecx
; AVX-NEXT:    andl $2048, %ecx # imm = 0x800
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $11, %edx
; AVX-NEXT:    andl $1024, %edx # imm = 0x400
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrl $13, %ecx
; AVX-NEXT:    andl $512, %ecx # imm = 0x200
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $15, %edx
; AVX-NEXT:    andl $256, %edx # imm = 0x100
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrl $17, %ecx
; AVX-NEXT:    andl $128, %ecx
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $19, %edx
; AVX-NEXT:    andl $64, %edx
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrl $21, %ecx
; AVX-NEXT:    andl $32, %ecx
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $23, %edx
; AVX-NEXT:    andl $16, %edx
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrl $25, %ecx
; AVX-NEXT:    andl $8, %ecx
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    movl %edi, %edx
; AVX-NEXT:    shrl $27, %edx
; AVX-NEXT:    andl $4, %edx
; AVX-NEXT:    orl %ecx, %edx
; AVX-NEXT:    movl %edi, %ecx
; AVX-NEXT:    shrl $29, %ecx
; AVX-NEXT:    andl $2, %ecx
; AVX-NEXT:    orl %edx, %ecx
; AVX-NEXT:    shrl $31, %edi
; AVX-NEXT:    orl %ecx, %edi
; AVX-NEXT:    orl %edi, %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP:       # BB#0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE:       # BB#0:
; SSE-NEXT:    leaq (%rdi,%rdi), %rax
; SSE-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
; SSE-NEXT:    andq %rax, %rcx
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    shlq $63, %rax
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $2, %rdx
; SSE-NEXT:    shlq $61, %rdx
; SSE-NEXT:    leaq (%rdx,%rax), %rax
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $4, %rdx
; SSE-NEXT:    shlq $59, %rdx
; SSE-NEXT:    orq %rdx, %rax
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $8, %rdx
; SSE-NEXT:    shlq $57, %rdx
; SSE-NEXT:    orq %rdx, %rax
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $16, %rdx
; SSE-NEXT:    shlq $55, %rdx
; SSE-NEXT:    orq %rdx, %rax
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $32, %rdx
; SSE-NEXT:    shlq $53, %rdx
; SSE-NEXT:    orq %rdx, %rax
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $64, %rdx
; SSE-NEXT:    shlq $51, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $128, %rsi
; SSE-NEXT:    shlq $49, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $256, %rdx # imm = 0x100
; SSE-NEXT:    shlq $47, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $512, %rsi # imm = 0x200
; SSE-NEXT:    shlq $45, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $1024, %rdx # imm = 0x400
; SSE-NEXT:    shlq $43, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $2048, %rsi # imm = 0x800
; SSE-NEXT:    shlq $41, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $4096, %rdx # imm = 0x1000
; SSE-NEXT:    shlq $39, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $8192, %rsi # imm = 0x2000
; SSE-NEXT:    shlq $37, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $16384, %rdx # imm = 0x4000
; SSE-NEXT:    shlq $35, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $32768, %rsi # imm = 0x8000
; SSE-NEXT:    shlq $33, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $65536, %rdx # imm = 0x10000
; SSE-NEXT:    shlq $31, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $131072, %rsi # imm = 0x20000
; SSE-NEXT:    shlq $29, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $262144, %rdx # imm = 0x40000
; SSE-NEXT:    shlq $27, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $524288, %rsi # imm = 0x80000
; SSE-NEXT:    shlq $25, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $1048576, %rdx # imm = 0x100000
; SSE-NEXT:    shlq $23, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $2097152, %rsi # imm = 0x200000
; SSE-NEXT:    shlq $21, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $4194304, %rdx # imm = 0x400000
; SSE-NEXT:    shlq $19, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $8388608, %rsi # imm = 0x800000
; SSE-NEXT:    shlq $17, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $16777216, %rdx # imm = 0x1000000
; SSE-NEXT:    shlq $15, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $33554432, %rsi # imm = 0x2000000
; SSE-NEXT:    shlq $13, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $67108864, %rdx # imm = 0x4000000
; SSE-NEXT:    shlq $11, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $134217728, %rsi # imm = 0x8000000
; SSE-NEXT:    shlq $9, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $268435456, %rdx # imm = 0x10000000
; SSE-NEXT:    shlq $7, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    movq %rdi, %rsi
; SSE-NEXT:    andq $536870912, %rsi # imm = 0x20000000
; SSE-NEXT:    shlq $5, %rsi
; SSE-NEXT:    orq %rdx, %rsi
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    andq $1073741824, %rdx # imm = 0x40000000
; SSE-NEXT:    shlq $3, %rdx
; SSE-NEXT:    orq %rsi, %rdx
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq %rcx
; SSE-NEXT:    andl $-2147483648, %ecx # imm = 0x80000000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $3, %rdx
; SSE-NEXT:    andl $1073741824, %edx # imm = 0x40000000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $5, %rcx
; SSE-NEXT:    andl $536870912, %ecx # imm = 0x20000000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $7, %rdx
; SSE-NEXT:    andl $268435456, %edx # imm = 0x10000000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $9, %rcx
; SSE-NEXT:    andl $134217728, %ecx # imm = 0x8000000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $11, %rdx
; SSE-NEXT:    andl $67108864, %edx # imm = 0x4000000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $13, %rcx
; SSE-NEXT:    andl $33554432, %ecx # imm = 0x2000000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $15, %rdx
; SSE-NEXT:    andl $16777216, %edx # imm = 0x1000000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $17, %rcx
; SSE-NEXT:    andl $8388608, %ecx # imm = 0x800000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $19, %rdx
; SSE-NEXT:    andl $4194304, %edx # imm = 0x400000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $21, %rcx
; SSE-NEXT:    andl $2097152, %ecx # imm = 0x200000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $23, %rdx
; SSE-NEXT:    andl $1048576, %edx # imm = 0x100000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $25, %rcx
; SSE-NEXT:    andl $524288, %ecx # imm = 0x80000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $27, %rdx
; SSE-NEXT:    andl $262144, %edx # imm = 0x40000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $29, %rcx
; SSE-NEXT:    andl $131072, %ecx # imm = 0x20000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $31, %rdx
; SSE-NEXT:    andl $65536, %edx # imm = 0x10000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $33, %rcx
; SSE-NEXT:    andl $32768, %ecx # imm = 0x8000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $35, %rdx
; SSE-NEXT:    andl $16384, %edx # imm = 0x4000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $37, %rcx
; SSE-NEXT:    andl $8192, %ecx # imm = 0x2000
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $39, %rdx
; SSE-NEXT:    andl $4096, %edx # imm = 0x1000
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $41, %rcx
; SSE-NEXT:    andl $2048, %ecx # imm = 0x800
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $43, %rdx
; SSE-NEXT:    andl $1024, %edx # imm = 0x400
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $45, %rcx
; SSE-NEXT:    andl $512, %ecx # imm = 0x200
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $47, %rdx
; SSE-NEXT:    andl $256, %edx # imm = 0x100
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $49, %rcx
; SSE-NEXT:    andl $128, %ecx
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $51, %rdx
; SSE-NEXT:    andl $64, %edx
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $53, %rcx
; SSE-NEXT:    andl $32, %ecx
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $55, %rdx
; SSE-NEXT:    andl $16, %edx
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $57, %rcx
; SSE-NEXT:    andl $8, %ecx
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    movq %rdi, %rdx
; SSE-NEXT:    shrq $59, %rdx
; SSE-NEXT:    andl $4, %edx
; SSE-NEXT:    orq %rcx, %rdx
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    shrq $61, %rcx
; SSE-NEXT:    andl $2, %ecx
; SSE-NEXT:    orq %rdx, %rcx
; SSE-NEXT:    shrq $63, %rdi
; SSE-NEXT:    orq %rcx, %rdi
; SSE-NEXT:    orq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX:       # BB#0:
; AVX-NEXT:    leaq (%rdi,%rdi), %rax
; AVX-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
; AVX-NEXT:    andq %rax, %rcx
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    shlq $63, %rax
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $2, %rdx
; AVX-NEXT:    shlq $61, %rdx
; AVX-NEXT:    leaq (%rdx,%rax), %rax
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $4, %rdx
; AVX-NEXT:    shlq $59, %rdx
; AVX-NEXT:    orq %rdx, %rax
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $8, %rdx
; AVX-NEXT:    shlq $57, %rdx
; AVX-NEXT:    orq %rdx, %rax
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $16, %rdx
; AVX-NEXT:    shlq $55, %rdx
; AVX-NEXT:    orq %rdx, %rax
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $32, %rdx
; AVX-NEXT:    shlq $53, %rdx
; AVX-NEXT:    orq %rdx, %rax
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $64, %rdx
; AVX-NEXT:    shlq $51, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $128, %rsi
; AVX-NEXT:    shlq $49, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $256, %rdx # imm = 0x100
; AVX-NEXT:    shlq $47, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $512, %rsi # imm = 0x200
; AVX-NEXT:    shlq $45, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $1024, %rdx # imm = 0x400
; AVX-NEXT:    shlq $43, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $2048, %rsi # imm = 0x800
; AVX-NEXT:    shlq $41, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $4096, %rdx # imm = 0x1000
; AVX-NEXT:    shlq $39, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $8192, %rsi # imm = 0x2000
; AVX-NEXT:    shlq $37, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $16384, %rdx # imm = 0x4000
; AVX-NEXT:    shlq $35, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $32768, %rsi # imm = 0x8000
; AVX-NEXT:    shlq $33, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $65536, %rdx # imm = 0x10000
; AVX-NEXT:    shlq $31, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $131072, %rsi # imm = 0x20000
; AVX-NEXT:    shlq $29, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $262144, %rdx # imm = 0x40000
; AVX-NEXT:    shlq $27, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $524288, %rsi # imm = 0x80000
; AVX-NEXT:    shlq $25, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $1048576, %rdx # imm = 0x100000
; AVX-NEXT:    shlq $23, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $2097152, %rsi # imm = 0x200000
; AVX-NEXT:    shlq $21, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $4194304, %rdx # imm = 0x400000
; AVX-NEXT:    shlq $19, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $8388608, %rsi # imm = 0x800000
; AVX-NEXT:    shlq $17, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $16777216, %rdx # imm = 0x1000000
; AVX-NEXT:    shlq $15, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $33554432, %rsi # imm = 0x2000000
; AVX-NEXT:    shlq $13, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $67108864, %rdx # imm = 0x4000000
; AVX-NEXT:    shlq $11, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $134217728, %rsi # imm = 0x8000000
; AVX-NEXT:    shlq $9, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $268435456, %rdx # imm = 0x10000000
; AVX-NEXT:    shlq $7, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    movq %rdi, %rsi
; AVX-NEXT:    andq $536870912, %rsi # imm = 0x20000000
; AVX-NEXT:    shlq $5, %rsi
; AVX-NEXT:    orq %rdx, %rsi
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    andq $1073741824, %rdx # imm = 0x40000000
; AVX-NEXT:    shlq $3, %rdx
; AVX-NEXT:    orq %rsi, %rdx
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq %rcx
; AVX-NEXT:    andl $-2147483648, %ecx # imm = 0x80000000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $3, %rdx
; AVX-NEXT:    andl $1073741824, %edx # imm = 0x40000000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $5, %rcx
; AVX-NEXT:    andl $536870912, %ecx # imm = 0x20000000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $7, %rdx
; AVX-NEXT:    andl $268435456, %edx # imm = 0x10000000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $9, %rcx
; AVX-NEXT:    andl $134217728, %ecx # imm = 0x8000000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $11, %rdx
; AVX-NEXT:    andl $67108864, %edx # imm = 0x4000000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $13, %rcx
; AVX-NEXT:    andl $33554432, %ecx # imm = 0x2000000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $15, %rdx
; AVX-NEXT:    andl $16777216, %edx # imm = 0x1000000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $17, %rcx
; AVX-NEXT:    andl $8388608, %ecx # imm = 0x800000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $19, %rdx
; AVX-NEXT:    andl $4194304, %edx # imm = 0x400000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $21, %rcx
; AVX-NEXT:    andl $2097152, %ecx # imm = 0x200000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $23, %rdx
; AVX-NEXT:    andl $1048576, %edx # imm = 0x100000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $25, %rcx
; AVX-NEXT:    andl $524288, %ecx # imm = 0x80000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $27, %rdx
; AVX-NEXT:    andl $262144, %edx # imm = 0x40000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $29, %rcx
; AVX-NEXT:    andl $131072, %ecx # imm = 0x20000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $31, %rdx
; AVX-NEXT:    andl $65536, %edx # imm = 0x10000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $33, %rcx
; AVX-NEXT:    andl $32768, %ecx # imm = 0x8000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $35, %rdx
; AVX-NEXT:    andl $16384, %edx # imm = 0x4000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $37, %rcx
; AVX-NEXT:    andl $8192, %ecx # imm = 0x2000
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $39, %rdx
; AVX-NEXT:    andl $4096, %edx # imm = 0x1000
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $41, %rcx
; AVX-NEXT:    andl $2048, %ecx # imm = 0x800
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $43, %rdx
; AVX-NEXT:    andl $1024, %edx # imm = 0x400
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $45, %rcx
; AVX-NEXT:    andl $512, %ecx # imm = 0x200
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $47, %rdx
; AVX-NEXT:    andl $256, %edx # imm = 0x100
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $49, %rcx
; AVX-NEXT:    andl $128, %ecx
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $51, %rdx
; AVX-NEXT:    andl $64, %edx
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $53, %rcx
; AVX-NEXT:    andl $32, %ecx
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $55, %rdx
; AVX-NEXT:    andl $16, %edx
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $57, %rcx
; AVX-NEXT:    andl $8, %ecx
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    movq %rdi, %rdx
; AVX-NEXT:    shrq $59, %rdx
; AVX-NEXT:    andl $4, %edx
; AVX-NEXT:    orq %rcx, %rdx
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    shrq $61, %rcx
; AVX-NEXT:    andl $2, %ecx
; AVX-NEXT:    orq %rdx, %rcx
; AVX-NEXT:    shrq $63, %rdi
; AVX-NEXT:    orq %rcx, %rdi
; AVX-NEXT:    orq %rdi, %rax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP:       # BB#0:
; XOP-NEXT:    vmovq %rdi, %xmm0
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, %rax
; XOP-NEXT:    retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

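; For the vector tests that follow, plain SSE2 reverses each byte with per-bit
; shift/mask/or sequences, SSSE3 and AVX split every byte into nibbles and
; reverse them through two pshufb lookup tables, and XOP again lowers the whole
; reversal to a single vpperm.
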
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrlw $7, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT:    pand %xmm1, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psllw $7, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT:    pand %xmm3, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psllw $5, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psllw $3, %xmm4
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
; SSE2-NEXT:    por %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
; SSE2-NEXT:    por %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $3, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    psrlw $5, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT:    pand %xmm2, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $5, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psllw $3, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $3, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrlw $5, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT:    pand %xmm0, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT:    pand %xmm2, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $5, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psllw $3, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $3, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrlw $5, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT:    pand %xmm0, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT:    pand %xmm2, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $5, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psllw $3, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $3, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrlw $5, %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT:    pand %xmm0, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psllw $5, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm9
; SSE2-NEXT:    pand %xmm9, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psllw $7, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT:    pand %xmm10, %xmm10
; SSE2-NEXT:    pand %xmm10, %xmm5
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psllw $3, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm11
; SSE2-NEXT:    pand %xmm11, %xmm3
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
; SSE2-NEXT:    pand %xmm12, %xmm3
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $3, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm6
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    por %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    psrlw $5, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE2-NEXT:    pand %xmm2, %xmm7
; SSE2-NEXT:    por %xmm4, %xmm7
; SSE2-NEXT:    psrlw $7, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT:    pand %xmm3, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psllw $5, %xmm4
; SSE2-NEXT:    pand %xmm9, %xmm4
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psllw $7, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm5
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    psllw $3, %xmm7
; SSE2-NEXT:    pand %xmm11, %xmm7
; SSE2-NEXT:    por %xmm4, %xmm7
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    paddb %xmm4, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    por %xmm7, %xmm4
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    psrlw $1, %xmm7
; SSE2-NEXT:    pand %xmm12, %xmm7
; SSE2-NEXT:    por %xmm4, %xmm7
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psrlw $3, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    por %xmm7, %xmm4
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    psrlw $5, %xmm6
; SSE2-NEXT:    pand %xmm2, %xmm6
; SSE2-NEXT:    por %xmm4, %xmm6
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm5, %xmm6
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm6, %xmm3
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm5
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1520; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1521; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1522; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1523; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1524; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1525; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1526; AVX512-NEXT:    retq
1527;
1528; XOPAVX1-LABEL: test_bitreverse_v32i8:
1529; XOPAVX1:       # BB#0:
1530; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1531; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1532; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1533; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1534; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1535; XOPAVX1-NEXT:    retq
1536;
1537; XOPAVX2-LABEL: test_bitreverse_v32i8:
1538; XOPAVX2:       # BB#0:
1539; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1540; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1541; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1542; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1543; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1544; XOPAVX2-NEXT:    retq
1545  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
1546  ret <32 x i8> %b
1547}
1548
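; i16 elements add a byte swap ahead of the per-byte reverse: SSE2 uses
; pshuflw/pshufhw on the unpacked halves, SSSE3/AVX use a [1,0,3,2,...]
; pshufb, and XOP folds the swap into its vpperm selector.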
1549define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
1550; SSE2-LABEL: test_bitreverse_v16i16:
1551; SSE2:       # BB#0:
1552; SSE2-NEXT:    pxor %xmm9, %xmm9
1553; SSE2-NEXT:    movdqa %xmm0, %xmm2
1554; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
1555; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
1556; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
1557; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1558; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
1559; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
1560; SSE2-NEXT:    packuswb %xmm2, %xmm0
1561; SSE2-NEXT:    movdqa %xmm0, %xmm2
1562; SSE2-NEXT:    psllw $5, %xmm2
1563; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
1564; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
1565; SSE2-NEXT:    pand %xmm10, %xmm2
1566; SSE2-NEXT:    movdqa %xmm0, %xmm3
1567; SSE2-NEXT:    psllw $7, %xmm3
1568; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1569; SSE2-NEXT:    pand %xmm11, %xmm11
1570; SSE2-NEXT:    pand %xmm11, %xmm3
1571; SSE2-NEXT:    movdqa %xmm0, %xmm4
1572; SSE2-NEXT:    psllw $3, %xmm4
1573; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
1574; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
1575; SSE2-NEXT:    pand %xmm12, %xmm4
1576; SSE2-NEXT:    por %xmm2, %xmm4
1577; SSE2-NEXT:    movdqa %xmm0, %xmm2
1578; SSE2-NEXT:    paddb %xmm2, %xmm2
1579; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1580; SSE2-NEXT:    pand %xmm8, %xmm2
1581; SSE2-NEXT:    por %xmm4, %xmm2
1582; SSE2-NEXT:    movdqa %xmm0, %xmm4
1583; SSE2-NEXT:    psrlw $1, %xmm4
1584; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1585; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
1586; SSE2-NEXT:    pand %xmm13, %xmm4
1587; SSE2-NEXT:    por %xmm2, %xmm4
1588; SSE2-NEXT:    movdqa %xmm0, %xmm5
1589; SSE2-NEXT:    psrlw $3, %xmm5
1590; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1591; SSE2-NEXT:    pand {{.*}}(%rip), %xmm6
1592; SSE2-NEXT:    pand %xmm6, %xmm5
1593; SSE2-NEXT:    por %xmm4, %xmm5
1594; SSE2-NEXT:    movdqa %xmm0, %xmm7
1595; SSE2-NEXT:    psrlw $5, %xmm7
1596; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1597; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
1598; SSE2-NEXT:    pand %xmm2, %xmm7
1599; SSE2-NEXT:    por %xmm5, %xmm7
1600; SSE2-NEXT:    psrlw $7, %xmm0
1601; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1602; SSE2-NEXT:    pand %xmm4, %xmm4
1603; SSE2-NEXT:    pand %xmm4, %xmm0
1604; SSE2-NEXT:    por %xmm7, %xmm0
1605; SSE2-NEXT:    por %xmm3, %xmm0
1606; SSE2-NEXT:    movdqa %xmm1, %xmm3
1607; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
1608; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
1609; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
1610; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
1611; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1612; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
1613; SSE2-NEXT:    packuswb %xmm3, %xmm1
1614; SSE2-NEXT:    movdqa %xmm1, %xmm5
1615; SSE2-NEXT:    psllw $5, %xmm5
1616; SSE2-NEXT:    pand %xmm10, %xmm5
1617; SSE2-NEXT:    movdqa %xmm1, %xmm3
1618; SSE2-NEXT:    psllw $7, %xmm3
1619; SSE2-NEXT:    pand %xmm11, %xmm3
1620; SSE2-NEXT:    movdqa %xmm1, %xmm7
1621; SSE2-NEXT:    psllw $3, %xmm7
1622; SSE2-NEXT:    pand %xmm12, %xmm7
1623; SSE2-NEXT:    por %xmm5, %xmm7
1624; SSE2-NEXT:    movdqa %xmm1, %xmm5
1625; SSE2-NEXT:    paddb %xmm5, %xmm5
1626; SSE2-NEXT:    pand %xmm8, %xmm5
1627; SSE2-NEXT:    por %xmm7, %xmm5
1628; SSE2-NEXT:    movdqa %xmm1, %xmm7
1629; SSE2-NEXT:    psrlw $1, %xmm7
1630; SSE2-NEXT:    pand %xmm13, %xmm7
1631; SSE2-NEXT:    por %xmm5, %xmm7
1632; SSE2-NEXT:    movdqa %xmm1, %xmm5
1633; SSE2-NEXT:    psrlw $3, %xmm5
1634; SSE2-NEXT:    pand %xmm6, %xmm5
1635; SSE2-NEXT:    por %xmm7, %xmm5
1636; SSE2-NEXT:    movdqa %xmm1, %xmm6
1637; SSE2-NEXT:    psrlw $5, %xmm6
1638; SSE2-NEXT:    pand %xmm2, %xmm6
1639; SSE2-NEXT:    por %xmm5, %xmm6
1640; SSE2-NEXT:    psrlw $7, %xmm1
1641; SSE2-NEXT:    pand %xmm4, %xmm1
1642; SSE2-NEXT:    por %xmm6, %xmm1
1643; SSE2-NEXT:    por %xmm3, %xmm1
1644; SSE2-NEXT:    retq
1645;
1646; SSSE3-LABEL: test_bitreverse_v16i16:
1647; SSSE3:       # BB#0:
1648; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1649; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1650; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1651; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1652; SSSE3-NEXT:    pand %xmm5, %xmm2
1653; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1654; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1655; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1656; SSSE3-NEXT:    psrlw $4, %xmm0
1657; SSSE3-NEXT:    pand %xmm5, %xmm0
1658; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1659; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1660; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1661; SSSE3-NEXT:    por %xmm7, %xmm3
1662; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1663; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1664; SSSE3-NEXT:    pand %xmm5, %xmm0
1665; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1666; SSSE3-NEXT:    psrlw $4, %xmm1
1667; SSSE3-NEXT:    pand %xmm5, %xmm1
1668; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1669; SSSE3-NEXT:    por %xmm6, %xmm2
1670; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1671; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1672; SSSE3-NEXT:    retq
1673;
1674; AVX1-LABEL: test_bitreverse_v16i16:
1675; AVX1:       # BB#0:
1676; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1677; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1678; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1679; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1680; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1681; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1682; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1683; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1684; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1685; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1686; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1687; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1688; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1689; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1690; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1691; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1692; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1693; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1694; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1695; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1696; AVX1-NEXT:    retq
1697;
1698; AVX2-LABEL: test_bitreverse_v16i16:
1699; AVX2:       # BB#0:
1700; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1701; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1702; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1703; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1704; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1705; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1706; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1707; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1708; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1709; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1710; AVX2-NEXT:    retq
1711;
1712; AVX512-LABEL: test_bitreverse_v16i16:
1713; AVX512:       # BB#0:
1714; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1715; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1716; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1717; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1718; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1719; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1720; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1721; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1722; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1723; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1724; AVX512-NEXT:    retq
1725;
1726; XOPAVX1-LABEL: test_bitreverse_v16i16:
1727; XOPAVX1:       # BB#0:
1728; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1729; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1730; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1731; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1732; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1733; XOPAVX1-NEXT:    retq
1734;
1735; XOPAVX2-LABEL: test_bitreverse_v16i16:
1736; XOPAVX2:       # BB#0:
1737; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1738; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1739; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1740; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1741; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1742; XOPAVX2-NEXT:    retq
1743  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
1744  ret <16 x i16> %b
1745}
1746
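; i32 elements reverse all four bytes ([3,2,1,0,...]) before the nibble
; lookup; otherwise the lowering matches the v16i16 case above.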
1747define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
1748; SSE2-LABEL: test_bitreverse_v8i32:
1749; SSE2:       # BB#0:
1750; SSE2-NEXT:    pxor %xmm9, %xmm9
1751; SSE2-NEXT:    movdqa %xmm0, %xmm2
1752; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
1753; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1754; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1755; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1756; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1757; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1758; SSE2-NEXT:    packuswb %xmm2, %xmm0
1759; SSE2-NEXT:    movdqa %xmm0, %xmm2
1760; SSE2-NEXT:    psllw $5, %xmm2
1761; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
1762; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
1763; SSE2-NEXT:    pand %xmm10, %xmm2
1764; SSE2-NEXT:    movdqa %xmm0, %xmm3
1765; SSE2-NEXT:    psllw $7, %xmm3
1766; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1767; SSE2-NEXT:    pand %xmm11, %xmm11
1768; SSE2-NEXT:    pand %xmm11, %xmm3
1769; SSE2-NEXT:    movdqa %xmm0, %xmm4
1770; SSE2-NEXT:    psllw $3, %xmm4
1771; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
1772; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
1773; SSE2-NEXT:    pand %xmm12, %xmm4
1774; SSE2-NEXT:    por %xmm2, %xmm4
1775; SSE2-NEXT:    movdqa %xmm0, %xmm2
1776; SSE2-NEXT:    paddb %xmm2, %xmm2
1777; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1778; SSE2-NEXT:    pand %xmm8, %xmm2
1779; SSE2-NEXT:    por %xmm4, %xmm2
1780; SSE2-NEXT:    movdqa %xmm0, %xmm4
1781; SSE2-NEXT:    psrlw $1, %xmm4
1782; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1783; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
1784; SSE2-NEXT:    pand %xmm13, %xmm4
1785; SSE2-NEXT:    por %xmm2, %xmm4
1786; SSE2-NEXT:    movdqa %xmm0, %xmm5
1787; SSE2-NEXT:    psrlw $3, %xmm5
1788; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1789; SSE2-NEXT:    pand {{.*}}(%rip), %xmm6
1790; SSE2-NEXT:    pand %xmm6, %xmm5
1791; SSE2-NEXT:    por %xmm4, %xmm5
1792; SSE2-NEXT:    movdqa %xmm0, %xmm7
1793; SSE2-NEXT:    psrlw $5, %xmm7
1794; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1795; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
1796; SSE2-NEXT:    pand %xmm2, %xmm7
1797; SSE2-NEXT:    por %xmm5, %xmm7
1798; SSE2-NEXT:    psrlw $7, %xmm0
1799; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1800; SSE2-NEXT:    pand %xmm4, %xmm4
1801; SSE2-NEXT:    pand %xmm4, %xmm0
1802; SSE2-NEXT:    por %xmm7, %xmm0
1803; SSE2-NEXT:    por %xmm3, %xmm0
1804; SSE2-NEXT:    movdqa %xmm1, %xmm3
1805; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
1806; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1807; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1808; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
1809; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1810; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1811; SSE2-NEXT:    packuswb %xmm3, %xmm1
1812; SSE2-NEXT:    movdqa %xmm1, %xmm5
1813; SSE2-NEXT:    psllw $5, %xmm5
1814; SSE2-NEXT:    pand %xmm10, %xmm5
1815; SSE2-NEXT:    movdqa %xmm1, %xmm3
1816; SSE2-NEXT:    psllw $7, %xmm3
1817; SSE2-NEXT:    pand %xmm11, %xmm3
1818; SSE2-NEXT:    movdqa %xmm1, %xmm7
1819; SSE2-NEXT:    psllw $3, %xmm7
1820; SSE2-NEXT:    pand %xmm12, %xmm7
1821; SSE2-NEXT:    por %xmm5, %xmm7
1822; SSE2-NEXT:    movdqa %xmm1, %xmm5
1823; SSE2-NEXT:    paddb %xmm5, %xmm5
1824; SSE2-NEXT:    pand %xmm8, %xmm5
1825; SSE2-NEXT:    por %xmm7, %xmm5
1826; SSE2-NEXT:    movdqa %xmm1, %xmm7
1827; SSE2-NEXT:    psrlw $1, %xmm7
1828; SSE2-NEXT:    pand %xmm13, %xmm7
1829; SSE2-NEXT:    por %xmm5, %xmm7
1830; SSE2-NEXT:    movdqa %xmm1, %xmm5
1831; SSE2-NEXT:    psrlw $3, %xmm5
1832; SSE2-NEXT:    pand %xmm6, %xmm5
1833; SSE2-NEXT:    por %xmm7, %xmm5
1834; SSE2-NEXT:    movdqa %xmm1, %xmm6
1835; SSE2-NEXT:    psrlw $5, %xmm6
1836; SSE2-NEXT:    pand %xmm2, %xmm6
1837; SSE2-NEXT:    por %xmm5, %xmm6
1838; SSE2-NEXT:    psrlw $7, %xmm1
1839; SSE2-NEXT:    pand %xmm4, %xmm1
1840; SSE2-NEXT:    por %xmm6, %xmm1
1841; SSE2-NEXT:    por %xmm3, %xmm1
1842; SSE2-NEXT:    retq
1843;
1844; SSSE3-LABEL: test_bitreverse_v8i32:
1845; SSSE3:       # BB#0:
1846; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1847; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1848; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1849; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1850; SSSE3-NEXT:    pand %xmm5, %xmm2
1851; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1852; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1853; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1854; SSSE3-NEXT:    psrlw $4, %xmm0
1855; SSSE3-NEXT:    pand %xmm5, %xmm0
1856; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1857; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1858; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1859; SSSE3-NEXT:    por %xmm7, %xmm3
1860; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1861; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1862; SSSE3-NEXT:    pand %xmm5, %xmm0
1863; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1864; SSSE3-NEXT:    psrlw $4, %xmm1
1865; SSSE3-NEXT:    pand %xmm5, %xmm1
1866; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1867; SSSE3-NEXT:    por %xmm6, %xmm2
1868; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1869; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1870; SSSE3-NEXT:    retq
1871;
1872; AVX1-LABEL: test_bitreverse_v8i32:
1873; AVX1:       # BB#0:
1874; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1875; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1876; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1877; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1878; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1879; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1880; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1881; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1882; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1883; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1884; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1885; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1886; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1887; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1888; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1889; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1890; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1891; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1892; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1893; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1894; AVX1-NEXT:    retq
1895;
1896; AVX2-LABEL: test_bitreverse_v8i32:
1897; AVX2:       # BB#0:
1898; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1899; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1900; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1901; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1902; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1903; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1904; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1905; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1906; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1907; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1908; AVX2-NEXT:    retq
1909;
1910; AVX512-LABEL: test_bitreverse_v8i32:
1911; AVX512:       # BB#0:
1912; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1913; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1914; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1915; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1916; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1917; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1918; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1919; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1920; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1921; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1922; AVX512-NEXT:    retq
1923;
1924; XOPAVX1-LABEL: test_bitreverse_v8i32:
1925; XOPAVX1:       # BB#0:
1926; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1927; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1928; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1929; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1930; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1931; XOPAVX1-NEXT:    retq
1932;
1933; XOPAVX2-LABEL: test_bitreverse_v8i32:
1934; XOPAVX2:       # BB#0:
1935; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1936; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1937; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1938; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1939; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1940; XOPAVX2-NEXT:    retq
1941  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
1942  ret <8 x i32> %b
1943}
1944
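; i64 elements reverse all eight bytes; on SSE2 this needs an extra pshufd
; to swap the dword halves of each unpacked quadword before pshuflw/pshufhw.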
1945define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
1946; SSE2-LABEL: test_bitreverse_v4i64:
1947; SSE2:       # BB#0:
1948; SSE2-NEXT:    pxor %xmm9, %xmm9
1949; SSE2-NEXT:    movdqa %xmm0, %xmm2
1950; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
1951; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1952; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1953; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1954; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1955; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1956; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1957; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1958; SSE2-NEXT:    packuswb %xmm2, %xmm0
1959; SSE2-NEXT:    movdqa %xmm0, %xmm2
1960; SSE2-NEXT:    psllw $5, %xmm2
1961; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
1962; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
1963; SSE2-NEXT:    pand %xmm10, %xmm2
1964; SSE2-NEXT:    movdqa %xmm0, %xmm4
1965; SSE2-NEXT:    psllw $7, %xmm4
1966; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1967; SSE2-NEXT:    pand %xmm11, %xmm11
1968; SSE2-NEXT:    pand %xmm11, %xmm4
1969; SSE2-NEXT:    movdqa %xmm0, %xmm3
1970; SSE2-NEXT:    psllw $3, %xmm3
1971; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
1972; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
1973; SSE2-NEXT:    pand %xmm12, %xmm3
1974; SSE2-NEXT:    por %xmm2, %xmm3
1975; SSE2-NEXT:    movdqa %xmm0, %xmm2
1976; SSE2-NEXT:    paddb %xmm2, %xmm2
1977; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1978; SSE2-NEXT:    pand %xmm8, %xmm2
1979; SSE2-NEXT:    por %xmm3, %xmm2
1980; SSE2-NEXT:    movdqa %xmm0, %xmm3
1981; SSE2-NEXT:    psrlw $1, %xmm3
1982; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1983; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
1984; SSE2-NEXT:    pand %xmm13, %xmm3
1985; SSE2-NEXT:    por %xmm2, %xmm3
1986; SSE2-NEXT:    movdqa %xmm0, %xmm5
1987; SSE2-NEXT:    psrlw $3, %xmm5
1988; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1989; SSE2-NEXT:    pand {{.*}}(%rip), %xmm6
1990; SSE2-NEXT:    pand %xmm6, %xmm5
1991; SSE2-NEXT:    por %xmm3, %xmm5
1992; SSE2-NEXT:    movdqa %xmm0, %xmm7
1993; SSE2-NEXT:    psrlw $5, %xmm7
1994; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1995; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
1996; SSE2-NEXT:    pand %xmm2, %xmm7
1997; SSE2-NEXT:    por %xmm5, %xmm7
1998; SSE2-NEXT:    psrlw $7, %xmm0
1999; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2000; SSE2-NEXT:    pand %xmm3, %xmm3
2001; SSE2-NEXT:    pand %xmm3, %xmm0
2002; SSE2-NEXT:    por %xmm7, %xmm0
2003; SSE2-NEXT:    por %xmm4, %xmm0
2004; SSE2-NEXT:    movdqa %xmm1, %xmm4
2005; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2006; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2007; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2008; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2009; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
2010; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2011; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2012; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2013; SSE2-NEXT:    packuswb %xmm4, %xmm1
2014; SSE2-NEXT:    movdqa %xmm1, %xmm5
2015; SSE2-NEXT:    psllw $5, %xmm5
2016; SSE2-NEXT:    pand %xmm10, %xmm5
2017; SSE2-NEXT:    movdqa %xmm1, %xmm4
2018; SSE2-NEXT:    psllw $7, %xmm4
2019; SSE2-NEXT:    pand %xmm11, %xmm4
2020; SSE2-NEXT:    movdqa %xmm1, %xmm7
2021; SSE2-NEXT:    psllw $3, %xmm7
2022; SSE2-NEXT:    pand %xmm12, %xmm7
2023; SSE2-NEXT:    por %xmm5, %xmm7
2024; SSE2-NEXT:    movdqa %xmm1, %xmm5
2025; SSE2-NEXT:    paddb %xmm5, %xmm5
2026; SSE2-NEXT:    pand %xmm8, %xmm5
2027; SSE2-NEXT:    por %xmm7, %xmm5
2028; SSE2-NEXT:    movdqa %xmm1, %xmm7
2029; SSE2-NEXT:    psrlw $1, %xmm7
2030; SSE2-NEXT:    pand %xmm13, %xmm7
2031; SSE2-NEXT:    por %xmm5, %xmm7
2032; SSE2-NEXT:    movdqa %xmm1, %xmm5
2033; SSE2-NEXT:    psrlw $3, %xmm5
2034; SSE2-NEXT:    pand %xmm6, %xmm5
2035; SSE2-NEXT:    por %xmm7, %xmm5
2036; SSE2-NEXT:    movdqa %xmm1, %xmm6
2037; SSE2-NEXT:    psrlw $5, %xmm6
2038; SSE2-NEXT:    pand %xmm2, %xmm6
2039; SSE2-NEXT:    por %xmm5, %xmm6
2040; SSE2-NEXT:    psrlw $7, %xmm1
2041; SSE2-NEXT:    pand %xmm3, %xmm1
2042; SSE2-NEXT:    por %xmm6, %xmm1
2043; SSE2-NEXT:    por %xmm4, %xmm1
2044; SSE2-NEXT:    retq
2045;
2046; SSSE3-LABEL: test_bitreverse_v4i64:
2047; SSSE3:       # BB#0:
2048; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2049; SSSE3-NEXT:    pshufb %xmm4, %xmm0
2050; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2051; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2052; SSSE3-NEXT:    pand %xmm5, %xmm2
2053; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2054; SSSE3-NEXT:    movdqa %xmm6, %xmm7
2055; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2056; SSSE3-NEXT:    psrlw $4, %xmm0
2057; SSSE3-NEXT:    pand %xmm5, %xmm0
2058; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2059; SSSE3-NEXT:    movdqa %xmm2, %xmm3
2060; SSSE3-NEXT:    pshufb %xmm0, %xmm3
2061; SSSE3-NEXT:    por %xmm7, %xmm3
2062; SSSE3-NEXT:    pshufb %xmm4, %xmm1
2063; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2064; SSSE3-NEXT:    pand %xmm5, %xmm0
2065; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2066; SSSE3-NEXT:    psrlw $4, %xmm1
2067; SSSE3-NEXT:    pand %xmm5, %xmm1
2068; SSSE3-NEXT:    pshufb %xmm1, %xmm2
2069; SSSE3-NEXT:    por %xmm6, %xmm2
2070; SSSE3-NEXT:    movdqa %xmm3, %xmm0
2071; SSSE3-NEXT:    movdqa %xmm2, %xmm1
2072; SSSE3-NEXT:    retq
2073;
2074; AVX1-LABEL: test_bitreverse_v4i64:
2075; AVX1:       # BB#0:
2076; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2077; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2078; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2079; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2080; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
2081; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2082; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2083; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2084; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
2085; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2086; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
2087; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
2088; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2089; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
2090; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
2091; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2092; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
2093; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
2094; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
2095; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2096; AVX1-NEXT:    retq
2097;
2098; AVX2-LABEL: test_bitreverse_v4i64:
2099; AVX2:       # BB#0:
2100; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
2101; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2102; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
2103; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2104; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
2105; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2106; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2107; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2108; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
2109; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
2110; AVX2-NEXT:    retq
2111;
2112; AVX512-LABEL: test_bitreverse_v4i64:
2113; AVX512:       # BB#0:
2114; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
2115; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2116; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
2117; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2118; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
2119; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
2120; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2121; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2122; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
2123; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
2124; AVX512-NEXT:    retq
2125;
2126; XOPAVX1-LABEL: test_bitreverse_v4i64:
2127; XOPAVX1:       # BB#0:
2128; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2129; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2130; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
2131; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
2132; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2133; XOPAVX1-NEXT:    retq
2134;
2135; XOPAVX2-LABEL: test_bitreverse_v4i64:
2136; XOPAVX2:       # BB#0:
2137; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2138; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2139; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
2140; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
2141; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2142; XOPAVX2-NEXT:    retq
2143  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
2144  ret <4 x i64> %b
2145}
2146
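; 512-bit per-byte reverse: targets without AVX-512BW split the value into
; 128-bit or 256-bit pieces, while AVX512BW keeps it in a single zmm using
; vpandq/vpshufb/vporq with 64-byte lookup tables.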
2147define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
2148; SSE2-LABEL: test_bitreverse_v64i8:
2149; SSE2:       # BB#0:
2150; SSE2-NEXT:    movdqa %xmm0, %xmm4
2151; SSE2-NEXT:    psllw $5, %xmm4
2152; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
2153; SSE2-NEXT:    pand {{.*}}(%rip), %xmm9
2154; SSE2-NEXT:    pand %xmm9, %xmm4
2155; SSE2-NEXT:    movdqa %xmm0, %xmm7
2156; SSE2-NEXT:    psllw $7, %xmm7
2157; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
2158; SSE2-NEXT:    pand %xmm10, %xmm10
2159; SSE2-NEXT:    pand %xmm10, %xmm7
2160; SSE2-NEXT:    movdqa %xmm0, %xmm5
2161; SSE2-NEXT:    psllw $3, %xmm5
2162; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
2163; SSE2-NEXT:    pand {{.*}}(%rip), %xmm11
2164; SSE2-NEXT:    pand %xmm11, %xmm5
2165; SSE2-NEXT:    por %xmm4, %xmm5
2166; SSE2-NEXT:    movdqa %xmm0, %xmm4
2167; SSE2-NEXT:    paddb %xmm4, %xmm4
2168; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2169; SSE2-NEXT:    pand %xmm8, %xmm4
2170; SSE2-NEXT:    por %xmm5, %xmm4
2171; SSE2-NEXT:    movdqa %xmm0, %xmm5
2172; SSE2-NEXT:    psrlw $1, %xmm5
2173; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2174; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
2175; SSE2-NEXT:    pand %xmm12, %xmm5
2176; SSE2-NEXT:    por %xmm4, %xmm5
2177; SSE2-NEXT:    movdqa %xmm0, %xmm6
2178; SSE2-NEXT:    psrlw $3, %xmm6
2179; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
2180; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
2181; SSE2-NEXT:    pand %xmm13, %xmm6
2182; SSE2-NEXT:    por %xmm5, %xmm6
2183; SSE2-NEXT:    movdqa %xmm0, %xmm4
2184; SSE2-NEXT:    psrlw $5, %xmm4
2185; SSE2-NEXT:    movdqa {{.*#+}} xmm14 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2186; SSE2-NEXT:    pand {{.*}}(%rip), %xmm14
2187; SSE2-NEXT:    pand %xmm14, %xmm4
2188; SSE2-NEXT:    por %xmm6, %xmm4
2189; SSE2-NEXT:    psrlw $7, %xmm0
2190; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2191; SSE2-NEXT:    pand %xmm6, %xmm6
2192; SSE2-NEXT:    pand %xmm6, %xmm0
2193; SSE2-NEXT:    por %xmm4, %xmm0
2194; SSE2-NEXT:    por %xmm7, %xmm0
2195; SSE2-NEXT:    movdqa %xmm1, %xmm4
2196; SSE2-NEXT:    psllw $5, %xmm4
2197; SSE2-NEXT:    pand %xmm9, %xmm4
2198; SSE2-NEXT:    movdqa %xmm1, %xmm7
2199; SSE2-NEXT:    psllw $7, %xmm7
2200; SSE2-NEXT:    pand %xmm10, %xmm7
2201; SSE2-NEXT:    movdqa %xmm1, %xmm5
2202; SSE2-NEXT:    psllw $3, %xmm5
2203; SSE2-NEXT:    pand %xmm11, %xmm5
2204; SSE2-NEXT:    por %xmm4, %xmm5
2205; SSE2-NEXT:    movdqa %xmm1, %xmm4
2206; SSE2-NEXT:    paddb %xmm4, %xmm4
2207; SSE2-NEXT:    pand %xmm8, %xmm4
2208; SSE2-NEXT:    por %xmm5, %xmm4
2209; SSE2-NEXT:    movdqa %xmm1, %xmm5
2210; SSE2-NEXT:    psrlw $1, %xmm5
2211; SSE2-NEXT:    pand %xmm12, %xmm5
2212; SSE2-NEXT:    por %xmm4, %xmm5
2213; SSE2-NEXT:    movdqa %xmm1, %xmm4
2214; SSE2-NEXT:    psrlw $3, %xmm4
2215; SSE2-NEXT:    pand %xmm13, %xmm4
2216; SSE2-NEXT:    por %xmm5, %xmm4
2217; SSE2-NEXT:    movdqa %xmm1, %xmm5
2218; SSE2-NEXT:    psrlw $5, %xmm5
2219; SSE2-NEXT:    pand %xmm14, %xmm5
2220; SSE2-NEXT:    por %xmm4, %xmm5
2221; SSE2-NEXT:    psrlw $7, %xmm1
2222; SSE2-NEXT:    pand %xmm6, %xmm1
2223; SSE2-NEXT:    por %xmm5, %xmm1
2224; SSE2-NEXT:    por %xmm7, %xmm1
2225; SSE2-NEXT:    movdqa %xmm2, %xmm4
2226; SSE2-NEXT:    psllw $5, %xmm4
2227; SSE2-NEXT:    pand %xmm9, %xmm4
2228; SSE2-NEXT:    movdqa %xmm2, %xmm7
2229; SSE2-NEXT:    psllw $7, %xmm7
2230; SSE2-NEXT:    pand %xmm10, %xmm7
2231; SSE2-NEXT:    movdqa %xmm2, %xmm5
2232; SSE2-NEXT:    psllw $3, %xmm5
2233; SSE2-NEXT:    pand %xmm11, %xmm5
2234; SSE2-NEXT:    por %xmm4, %xmm5
2235; SSE2-NEXT:    movdqa %xmm2, %xmm4
2236; SSE2-NEXT:    paddb %xmm4, %xmm4
2237; SSE2-NEXT:    pand %xmm8, %xmm4
2238; SSE2-NEXT:    por %xmm5, %xmm4
2239; SSE2-NEXT:    movdqa %xmm2, %xmm5
2240; SSE2-NEXT:    psrlw $1, %xmm5
2241; SSE2-NEXT:    pand %xmm12, %xmm5
2242; SSE2-NEXT:    por %xmm4, %xmm5
2243; SSE2-NEXT:    movdqa %xmm2, %xmm4
2244; SSE2-NEXT:    psrlw $3, %xmm4
2245; SSE2-NEXT:    pand %xmm13, %xmm4
2246; SSE2-NEXT:    por %xmm5, %xmm4
2247; SSE2-NEXT:    movdqa %xmm2, %xmm5
2248; SSE2-NEXT:    psrlw $5, %xmm5
2249; SSE2-NEXT:    pand %xmm14, %xmm5
2250; SSE2-NEXT:    por %xmm4, %xmm5
2251; SSE2-NEXT:    psrlw $7, %xmm2
2252; SSE2-NEXT:    pand %xmm6, %xmm2
2253; SSE2-NEXT:    por %xmm5, %xmm2
2254; SSE2-NEXT:    por %xmm7, %xmm2
2255; SSE2-NEXT:    movdqa %xmm3, %xmm4
2256; SSE2-NEXT:    psllw $5, %xmm4
2257; SSE2-NEXT:    pand %xmm9, %xmm4
2258; SSE2-NEXT:    movdqa %xmm3, %xmm7
2259; SSE2-NEXT:    psllw $7, %xmm7
2260; SSE2-NEXT:    pand %xmm10, %xmm7
2261; SSE2-NEXT:    movdqa %xmm3, %xmm5
2262; SSE2-NEXT:    psllw $3, %xmm5
2263; SSE2-NEXT:    pand %xmm11, %xmm5
2264; SSE2-NEXT:    por %xmm4, %xmm5
2265; SSE2-NEXT:    movdqa %xmm3, %xmm4
2266; SSE2-NEXT:    paddb %xmm4, %xmm4
2267; SSE2-NEXT:    pand %xmm8, %xmm4
2268; SSE2-NEXT:    por %xmm5, %xmm4
2269; SSE2-NEXT:    movdqa %xmm3, %xmm5
2270; SSE2-NEXT:    psrlw $1, %xmm5
2271; SSE2-NEXT:    pand %xmm12, %xmm5
2272; SSE2-NEXT:    por %xmm4, %xmm5
2273; SSE2-NEXT:    movdqa %xmm3, %xmm4
2274; SSE2-NEXT:    psrlw $3, %xmm4
2275; SSE2-NEXT:    pand %xmm13, %xmm4
2276; SSE2-NEXT:    por %xmm5, %xmm4
2277; SSE2-NEXT:    movdqa %xmm3, %xmm5
2278; SSE2-NEXT:    psrlw $5, %xmm5
2279; SSE2-NEXT:    pand %xmm14, %xmm5
2280; SSE2-NEXT:    por %xmm4, %xmm5
2281; SSE2-NEXT:    psrlw $7, %xmm3
2282; SSE2-NEXT:    pand %xmm6, %xmm3
2283; SSE2-NEXT:    por %xmm5, %xmm3
2284; SSE2-NEXT:    por %xmm7, %xmm3
2285; SSE2-NEXT:    retq
2286;
2287; SSSE3-LABEL: test_bitreverse_v64i8:
2288; SSSE3:       # BB#0:
2289; SSSE3-NEXT:    movdqa %xmm0, %xmm5
2290; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2291; SSSE3-NEXT:    pand %xmm8, %xmm0
2292; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2293; SSSE3-NEXT:    movdqa %xmm9, %xmm6
2294; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2295; SSSE3-NEXT:    psrlw $4, %xmm5
2296; SSSE3-NEXT:    pand %xmm8, %xmm5
2297; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2298; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2299; SSSE3-NEXT:    pshufb %xmm5, %xmm0
2300; SSSE3-NEXT:    por %xmm6, %xmm0
2301; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2302; SSSE3-NEXT:    pand %xmm8, %xmm5
2303; SSSE3-NEXT:    movdqa %xmm9, %xmm6
2304; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2305; SSSE3-NEXT:    psrlw $4, %xmm1
2306; SSSE3-NEXT:    pand %xmm8, %xmm1
2307; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2308; SSSE3-NEXT:    pshufb %xmm1, %xmm5
2309; SSSE3-NEXT:    por %xmm6, %xmm5
2310; SSSE3-NEXT:    movdqa %xmm2, %xmm1
2311; SSSE3-NEXT:    pand %xmm8, %xmm1
2312; SSSE3-NEXT:    movdqa %xmm9, %xmm7
2313; SSSE3-NEXT:    pshufb %xmm1, %xmm7
2314; SSSE3-NEXT:    psrlw $4, %xmm2
2315; SSSE3-NEXT:    pand %xmm8, %xmm2
2316; SSSE3-NEXT:    movdqa %xmm4, %xmm6
2317; SSSE3-NEXT:    pshufb %xmm2, %xmm6
2318; SSSE3-NEXT:    por %xmm7, %xmm6
2319; SSSE3-NEXT:    movdqa %xmm3, %xmm1
2320; SSSE3-NEXT:    pand %xmm8, %xmm1
2321; SSSE3-NEXT:    pshufb %xmm1, %xmm9
2322; SSSE3-NEXT:    psrlw $4, %xmm3
2323; SSSE3-NEXT:    pand %xmm8, %xmm3
2324; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2325; SSSE3-NEXT:    por %xmm9, %xmm4
2326; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2327; SSSE3-NEXT:    movdqa %xmm6, %xmm2
2328; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2329; SSSE3-NEXT:    retq
2330;
2331; AVX1-LABEL: test_bitreverse_v64i8:
2332; AVX1:       # BB#0:
2333; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2334; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2335; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm4
2336; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2337; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2338; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2339; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
2340; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2341; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
2342; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
2343; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm4
2344; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2345; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2346; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
2347; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
2348; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
2349; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2350; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2351; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm4
2352; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2353; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2354; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
2355; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
2356; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
2357; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm4
2358; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
2359; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2360; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
2361; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
2362; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
2363; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2364; AVX1-NEXT:    retq
2365;
2366; AVX2-LABEL: test_bitreverse_v64i8:
2367; AVX2:       # BB#0:
2368; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2369; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
2370; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2371; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2372; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2373; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2374; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2375; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
2376; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
2377; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
2378; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2379; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2380; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2381; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
2382; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
2383; AVX2-NEXT:    retq
2384;
2385; AVX512F-LABEL: test_bitreverse_v64i8:
2386; AVX512F:       # BB#0:
2387; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2388; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm3
2389; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2390; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2391; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2392; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
2393; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2394; AVX512F-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
2395; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
2396; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
2397; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
2398; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2399; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
2400; AVX512F-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
2401; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
2402; AVX512F-NEXT:    retq
2403;
2404; AVX512BW-LABEL: test_bitreverse_v64i8:
2405; AVX512BW:       # BB#0:
2406; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2407; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2408; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2409; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2410; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2411; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2412; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2413; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2414; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2415; AVX512BW-NEXT:    retq
2416;
2417; XOPAVX1-LABEL: test_bitreverse_v64i8:
2418; XOPAVX1:       # BB#0:
2419; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2420; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2421; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2422; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2423; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2424; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2425; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2426; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2427; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2428; XOPAVX1-NEXT:    retq
2429;
2430; XOPAVX2-LABEL: test_bitreverse_v64i8:
2431; XOPAVX2:       # BB#0:
2432; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2433; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2434; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2435; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2436; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2437; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2438; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2439; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2440; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2441; XOPAVX2-NEXT:    retq
2442  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
2443  ret <64 x i8> %b
2444}
2445
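; 512-bit i16 case: each 128-bit piece gets the i16 byte swap followed by
; the per-byte reverse, reusing the same mask constants across all pieces.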
2446define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
2447; SSE2-LABEL: test_bitreverse_v32i16:
2448; SSE2:       # BB#0:
2449; SSE2-NEXT:    pxor %xmm9, %xmm9
2450; SSE2-NEXT:    movdqa %xmm0, %xmm4
2451; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2452; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
2453; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
2454; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
2455; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
2456; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
2457; SSE2-NEXT:    packuswb %xmm4, %xmm0
2458; SSE2-NEXT:    movdqa %xmm0, %xmm5
2459; SSE2-NEXT:    psllw $5, %xmm5
2460; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
2461; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
2462; SSE2-NEXT:    pand %xmm10, %xmm5
2463; SSE2-NEXT:    movdqa %xmm0, %xmm4
2464; SSE2-NEXT:    psllw $7, %xmm4
2465; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
2466; SSE2-NEXT:    pand %xmm11, %xmm11
2467; SSE2-NEXT:    pand %xmm11, %xmm4
2468; SSE2-NEXT:    movdqa %xmm0, %xmm6
2469; SSE2-NEXT:    psllw $3, %xmm6
2470; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
2471; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
2472; SSE2-NEXT:    pand %xmm12, %xmm6
2473; SSE2-NEXT:    por %xmm5, %xmm6
2474; SSE2-NEXT:    movdqa %xmm0, %xmm5
2475; SSE2-NEXT:    paddb %xmm5, %xmm5
2476; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2477; SSE2-NEXT:    pand %xmm8, %xmm5
2478; SSE2-NEXT:    por %xmm6, %xmm5
2479; SSE2-NEXT:    movdqa %xmm0, %xmm6
2480; SSE2-NEXT:    psrlw $1, %xmm6
2481; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2482; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
2483; SSE2-NEXT:    pand %xmm13, %xmm6
2484; SSE2-NEXT:    por %xmm5, %xmm6
2485; SSE2-NEXT:    movdqa %xmm0, %xmm7
2486; SSE2-NEXT:    psrlw $3, %xmm7
2487; SSE2-NEXT:    movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
2488; SSE2-NEXT:    pand {{.*}}(%rip), %xmm14
2489; SSE2-NEXT:    pand %xmm14, %xmm7
2490; SSE2-NEXT:    por %xmm6, %xmm7
2491; SSE2-NEXT:    movdqa %xmm0, %xmm5
2492; SSE2-NEXT:    psrlw $5, %xmm5
2493; SSE2-NEXT:    movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2494; SSE2-NEXT:    pand {{.*}}(%rip), %xmm15
2495; SSE2-NEXT:    pand %xmm15, %xmm5
2496; SSE2-NEXT:    por %xmm7, %xmm5
2497; SSE2-NEXT:    psrlw $7, %xmm0
2498; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2499; SSE2-NEXT:    pand %xmm7, %xmm7
2500; SSE2-NEXT:    pand %xmm7, %xmm0
2501; SSE2-NEXT:    por %xmm5, %xmm0
2502; SSE2-NEXT:    por %xmm4, %xmm0
2503; SSE2-NEXT:    movdqa %xmm1, %xmm4
2504; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2505; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
2506; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
2507; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
2508; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
2509; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
2510; SSE2-NEXT:    packuswb %xmm4, %xmm1
2511; SSE2-NEXT:    movdqa %xmm1, %xmm5
2512; SSE2-NEXT:    psllw $5, %xmm5
2513; SSE2-NEXT:    pand %xmm10, %xmm5
2514; SSE2-NEXT:    movdqa %xmm1, %xmm4
2515; SSE2-NEXT:    psllw $7, %xmm4
2516; SSE2-NEXT:    pand %xmm11, %xmm4
2517; SSE2-NEXT:    movdqa %xmm1, %xmm6
2518; SSE2-NEXT:    psllw $3, %xmm6
2519; SSE2-NEXT:    pand %xmm12, %xmm6
2520; SSE2-NEXT:    por %xmm5, %xmm6
2521; SSE2-NEXT:    movdqa %xmm1, %xmm5
2522; SSE2-NEXT:    paddb %xmm5, %xmm5
2523; SSE2-NEXT:    pand %xmm8, %xmm5
2524; SSE2-NEXT:    por %xmm6, %xmm5
2525; SSE2-NEXT:    movdqa %xmm1, %xmm6
2526; SSE2-NEXT:    psrlw $1, %xmm6
2527; SSE2-NEXT:    pand %xmm13, %xmm6
2528; SSE2-NEXT:    por %xmm5, %xmm6
2529; SSE2-NEXT:    movdqa %xmm1, %xmm5
2530; SSE2-NEXT:    psrlw $3, %xmm5
2531; SSE2-NEXT:    pand %xmm14, %xmm5
2532; SSE2-NEXT:    por %xmm6, %xmm5
2533; SSE2-NEXT:    movdqa %xmm1, %xmm6
2534; SSE2-NEXT:    psrlw $5, %xmm6
2535; SSE2-NEXT:    pand %xmm15, %xmm6
2536; SSE2-NEXT:    por %xmm5, %xmm6
2537; SSE2-NEXT:    psrlw $7, %xmm1
2538; SSE2-NEXT:    pand %xmm7, %xmm1
2539; SSE2-NEXT:    por %xmm6, %xmm1
2540; SSE2-NEXT:    por %xmm4, %xmm1
2541; SSE2-NEXT:    movdqa %xmm2, %xmm4
2542; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2543; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
2544; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
2545; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
2546; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
2547; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
2548; SSE2-NEXT:    packuswb %xmm4, %xmm2
2549; SSE2-NEXT:    movdqa %xmm2, %xmm5
2550; SSE2-NEXT:    psllw $5, %xmm5
2551; SSE2-NEXT:    pand %xmm10, %xmm5
2552; SSE2-NEXT:    movdqa %xmm2, %xmm4
2553; SSE2-NEXT:    psllw $7, %xmm4
2554; SSE2-NEXT:    pand %xmm11, %xmm4
2555; SSE2-NEXT:    movdqa %xmm2, %xmm6
2556; SSE2-NEXT:    psllw $3, %xmm6
2557; SSE2-NEXT:    pand %xmm12, %xmm6
2558; SSE2-NEXT:    por %xmm5, %xmm6
2559; SSE2-NEXT:    movdqa %xmm2, %xmm5
2560; SSE2-NEXT:    paddb %xmm5, %xmm5
2561; SSE2-NEXT:    pand %xmm8, %xmm5
2562; SSE2-NEXT:    por %xmm6, %xmm5
2563; SSE2-NEXT:    movdqa %xmm2, %xmm6
2564; SSE2-NEXT:    psrlw $1, %xmm6
2565; SSE2-NEXT:    pand %xmm13, %xmm6
2566; SSE2-NEXT:    por %xmm5, %xmm6
2567; SSE2-NEXT:    movdqa %xmm2, %xmm5
2568; SSE2-NEXT:    psrlw $3, %xmm5
2569; SSE2-NEXT:    pand %xmm14, %xmm5
2570; SSE2-NEXT:    por %xmm6, %xmm5
2571; SSE2-NEXT:    movdqa %xmm2, %xmm6
2572; SSE2-NEXT:    psrlw $5, %xmm6
2573; SSE2-NEXT:    pand %xmm15, %xmm6
2574; SSE2-NEXT:    por %xmm5, %xmm6
2575; SSE2-NEXT:    psrlw $7, %xmm2
2576; SSE2-NEXT:    pand %xmm7, %xmm2
2577; SSE2-NEXT:    por %xmm6, %xmm2
2578; SSE2-NEXT:    por %xmm4, %xmm2
2579; SSE2-NEXT:    movdqa %xmm3, %xmm4
2580; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2581; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
2582; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
2583; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2584; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
2585; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
2586; SSE2-NEXT:    packuswb %xmm4, %xmm3
2587; SSE2-NEXT:    movdqa %xmm3, %xmm5
2588; SSE2-NEXT:    psllw $5, %xmm5
2589; SSE2-NEXT:    pand %xmm10, %xmm5
2590; SSE2-NEXT:    movdqa %xmm3, %xmm4
2591; SSE2-NEXT:    psllw $7, %xmm4
2592; SSE2-NEXT:    pand %xmm11, %xmm4
2593; SSE2-NEXT:    movdqa %xmm3, %xmm6
2594; SSE2-NEXT:    psllw $3, %xmm6
2595; SSE2-NEXT:    pand %xmm12, %xmm6
2596; SSE2-NEXT:    por %xmm5, %xmm6
2597; SSE2-NEXT:    movdqa %xmm3, %xmm5
2598; SSE2-NEXT:    paddb %xmm5, %xmm5
2599; SSE2-NEXT:    pand %xmm8, %xmm5
2600; SSE2-NEXT:    por %xmm6, %xmm5
2601; SSE2-NEXT:    movdqa %xmm3, %xmm6
2602; SSE2-NEXT:    psrlw $1, %xmm6
2603; SSE2-NEXT:    pand %xmm13, %xmm6
2604; SSE2-NEXT:    por %xmm5, %xmm6
2605; SSE2-NEXT:    movdqa %xmm3, %xmm5
2606; SSE2-NEXT:    psrlw $3, %xmm5
2607; SSE2-NEXT:    pand %xmm14, %xmm5
2608; SSE2-NEXT:    por %xmm6, %xmm5
2609; SSE2-NEXT:    movdqa %xmm3, %xmm6
2610; SSE2-NEXT:    psrlw $5, %xmm6
2611; SSE2-NEXT:    pand %xmm15, %xmm6
2612; SSE2-NEXT:    por %xmm5, %xmm6
2613; SSE2-NEXT:    psrlw $7, %xmm3
2614; SSE2-NEXT:    pand %xmm7, %xmm3
2615; SSE2-NEXT:    por %xmm6, %xmm3
2616; SSE2-NEXT:    por %xmm4, %xmm3
2617; SSE2-NEXT:    retq
2618;
2619; SSSE3-LABEL: test_bitreverse_v32i16:
2620; SSSE3:       # BB#0:
2621; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2622; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2623; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2624; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2625; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2626; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2627; SSSE3-NEXT:    pand %xmm9, %xmm0
2628; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2629; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2630; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2631; SSSE3-NEXT:    psrlw $4, %xmm1
2632; SSSE3-NEXT:    pand %xmm9, %xmm1
2633; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2634; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2635; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2636; SSSE3-NEXT:    por %xmm6, %xmm0
2637; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2638; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2639; SSSE3-NEXT:    pand %xmm9, %xmm1
2640; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2641; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2642; SSSE3-NEXT:    psrlw $4, %xmm5
2643; SSSE3-NEXT:    pand %xmm9, %xmm5
2644; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2645; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2646; SSSE3-NEXT:    por %xmm6, %xmm1
2647; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2648; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2649; SSSE3-NEXT:    pand %xmm9, %xmm5
2650; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2651; SSSE3-NEXT:    pshufb %xmm5, %xmm6
2652; SSSE3-NEXT:    psrlw $4, %xmm2
2653; SSSE3-NEXT:    pand %xmm9, %xmm2
2654; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2655; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2656; SSSE3-NEXT:    por %xmm6, %xmm5
2657; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2658; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2659; SSSE3-NEXT:    pand %xmm9, %xmm2
2660; SSSE3-NEXT:    pshufb %xmm2, %xmm7
2661; SSSE3-NEXT:    psrlw $4, %xmm3
2662; SSSE3-NEXT:    pand %xmm9, %xmm3
2663; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2664; SSSE3-NEXT:    por %xmm7, %xmm4
2665; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2666; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2667; SSSE3-NEXT:    retq
2668;
2669; AVX1-LABEL: test_bitreverse_v32i16:
2670; AVX1:       # BB#0:
2671; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2672; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2673; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2674; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2675; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2676; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2677; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2678; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2679; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2680; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2681; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2682; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2683; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2684; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
2685; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2686; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
2687; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2688; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
2689; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
2690; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2691; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2692; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2693; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2694; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2695; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
2696; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2697; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
2698; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
2699; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2700; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
2701; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2702; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
2703; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2704; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
2705; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
2706; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2707; AVX1-NEXT:    retq
2708;
2709; AVX2-LABEL: test_bitreverse_v32i16:
2710; AVX2:       # BB#0:
2711; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2712; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2713; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2714; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
2715; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2716; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2717; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
2718; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2719; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2720; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2721; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
2722; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2723; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
2724; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2725; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
2726; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2727; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2728; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2729; AVX2-NEXT:    retq
2730;
2731; AVX512F-LABEL: test_bitreverse_v32i16:
2732; AVX512F:       # BB#0:
2733; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2734; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2735; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2736; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm4
2737; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2738; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
2739; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
2740; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
2741; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2742; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
2743; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
2744; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2745; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm2
2746; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
2747; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
2748; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
2749; AVX512F-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
2750; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
2751; AVX512F-NEXT:    retq
2752;
2753; AVX512BW-LABEL: test_bitreverse_v32i16:
2754; AVX512BW:       # BB#0:
2755; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2756; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2757; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2758; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2759; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2760; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2761; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2762; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2763; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2764; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2765; AVX512BW-NEXT:    retq
2766;
2767; XOPAVX1-LABEL: test_bitreverse_v32i16:
2768; XOPAVX1:       # BB#0:
2769; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2770; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2771; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2772; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2773; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2774; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2775; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2776; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2777; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2778; XOPAVX1-NEXT:    retq
2779;
2780; XOPAVX2-LABEL: test_bitreverse_v32i16:
2781; XOPAVX2:       # BB#0:
2782; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2783; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2784; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2785; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2786; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2787; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2788; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2789; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2790; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2791; XOPAVX2-NEXT:    retq
2792  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
2793  ret <32 x i16> %b
2794}
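; NOTE (manual annotation, not autogenerated): the checks above suggest three
; lowering strategies for the element-wise bitreverse. The SSE2 path expands to
; per-bit shifts, masks and ORs on xmm registers; the SSSE3/AVX/AVX2/AVX512BW
; paths byte-swap each element with a shuffle and then combine two 16-entry
; nibble lookup tables via pshufb/vpshufb ([0,128,64,192,...] for the low
; nibble moved high, [0,8,4,12,...] for the high nibble moved low); the XOP
; paths appear to use VPPERM selector bytes whose control bits request a
; bit-reversed copy of the selected source byte.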
2795
2796define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
2797; SSE2-LABEL: test_bitreverse_v16i32:
2798; SSE2:       # BB#0:
2799; SSE2-NEXT:    pxor %xmm9, %xmm9
2800; SSE2-NEXT:    movdqa %xmm0, %xmm4
2801; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2802; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2803; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2804; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
2805; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2806; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2807; SSE2-NEXT:    packuswb %xmm4, %xmm0
2808; SSE2-NEXT:    movdqa %xmm0, %xmm5
2809; SSE2-NEXT:    psllw $5, %xmm5
2810; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
2811; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
2812; SSE2-NEXT:    pand %xmm10, %xmm5
2813; SSE2-NEXT:    movdqa %xmm0, %xmm4
2814; SSE2-NEXT:    psllw $7, %xmm4
2815; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
2816; SSE2-NEXT:    pand %xmm11, %xmm11
2817; SSE2-NEXT:    pand %xmm11, %xmm4
2818; SSE2-NEXT:    movdqa %xmm0, %xmm6
2819; SSE2-NEXT:    psllw $3, %xmm6
2820; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
2821; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
2822; SSE2-NEXT:    pand %xmm12, %xmm6
2823; SSE2-NEXT:    por %xmm5, %xmm6
2824; SSE2-NEXT:    movdqa %xmm0, %xmm5
2825; SSE2-NEXT:    paddb %xmm5, %xmm5
2826; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2827; SSE2-NEXT:    pand %xmm8, %xmm5
2828; SSE2-NEXT:    por %xmm6, %xmm5
2829; SSE2-NEXT:    movdqa %xmm0, %xmm6
2830; SSE2-NEXT:    psrlw $1, %xmm6
2831; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2832; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
2833; SSE2-NEXT:    pand %xmm13, %xmm6
2834; SSE2-NEXT:    por %xmm5, %xmm6
2835; SSE2-NEXT:    movdqa %xmm0, %xmm7
2836; SSE2-NEXT:    psrlw $3, %xmm7
2837; SSE2-NEXT:    movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
2838; SSE2-NEXT:    pand {{.*}}(%rip), %xmm14
2839; SSE2-NEXT:    pand %xmm14, %xmm7
2840; SSE2-NEXT:    por %xmm6, %xmm7
2841; SSE2-NEXT:    movdqa %xmm0, %xmm5
2842; SSE2-NEXT:    psrlw $5, %xmm5
2843; SSE2-NEXT:    movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2844; SSE2-NEXT:    pand {{.*}}(%rip), %xmm15
2845; SSE2-NEXT:    pand %xmm15, %xmm5
2846; SSE2-NEXT:    por %xmm7, %xmm5
2847; SSE2-NEXT:    psrlw $7, %xmm0
2848; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2849; SSE2-NEXT:    pand %xmm7, %xmm7
2850; SSE2-NEXT:    pand %xmm7, %xmm0
2851; SSE2-NEXT:    por %xmm5, %xmm0
2852; SSE2-NEXT:    por %xmm4, %xmm0
2853; SSE2-NEXT:    movdqa %xmm1, %xmm4
2854; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2855; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2856; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2857; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
2858; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2859; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2860; SSE2-NEXT:    packuswb %xmm4, %xmm1
2861; SSE2-NEXT:    movdqa %xmm1, %xmm5
2862; SSE2-NEXT:    psllw $5, %xmm5
2863; SSE2-NEXT:    pand %xmm10, %xmm5
2864; SSE2-NEXT:    movdqa %xmm1, %xmm4
2865; SSE2-NEXT:    psllw $7, %xmm4
2866; SSE2-NEXT:    pand %xmm11, %xmm4
2867; SSE2-NEXT:    movdqa %xmm1, %xmm6
2868; SSE2-NEXT:    psllw $3, %xmm6
2869; SSE2-NEXT:    pand %xmm12, %xmm6
2870; SSE2-NEXT:    por %xmm5, %xmm6
2871; SSE2-NEXT:    movdqa %xmm1, %xmm5
2872; SSE2-NEXT:    paddb %xmm5, %xmm5
2873; SSE2-NEXT:    pand %xmm8, %xmm5
2874; SSE2-NEXT:    por %xmm6, %xmm5
2875; SSE2-NEXT:    movdqa %xmm1, %xmm6
2876; SSE2-NEXT:    psrlw $1, %xmm6
2877; SSE2-NEXT:    pand %xmm13, %xmm6
2878; SSE2-NEXT:    por %xmm5, %xmm6
2879; SSE2-NEXT:    movdqa %xmm1, %xmm5
2880; SSE2-NEXT:    psrlw $3, %xmm5
2881; SSE2-NEXT:    pand %xmm14, %xmm5
2882; SSE2-NEXT:    por %xmm6, %xmm5
2883; SSE2-NEXT:    movdqa %xmm1, %xmm6
2884; SSE2-NEXT:    psrlw $5, %xmm6
2885; SSE2-NEXT:    pand %xmm15, %xmm6
2886; SSE2-NEXT:    por %xmm5, %xmm6
2887; SSE2-NEXT:    psrlw $7, %xmm1
2888; SSE2-NEXT:    pand %xmm7, %xmm1
2889; SSE2-NEXT:    por %xmm6, %xmm1
2890; SSE2-NEXT:    por %xmm4, %xmm1
2891; SSE2-NEXT:    movdqa %xmm2, %xmm4
2892; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2893; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2894; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2895; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
2896; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2897; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2898; SSE2-NEXT:    packuswb %xmm4, %xmm2
2899; SSE2-NEXT:    movdqa %xmm2, %xmm5
2900; SSE2-NEXT:    psllw $5, %xmm5
2901; SSE2-NEXT:    pand %xmm10, %xmm5
2902; SSE2-NEXT:    movdqa %xmm2, %xmm4
2903; SSE2-NEXT:    psllw $7, %xmm4
2904; SSE2-NEXT:    pand %xmm11, %xmm4
2905; SSE2-NEXT:    movdqa %xmm2, %xmm6
2906; SSE2-NEXT:    psllw $3, %xmm6
2907; SSE2-NEXT:    pand %xmm12, %xmm6
2908; SSE2-NEXT:    por %xmm5, %xmm6
2909; SSE2-NEXT:    movdqa %xmm2, %xmm5
2910; SSE2-NEXT:    paddb %xmm5, %xmm5
2911; SSE2-NEXT:    pand %xmm8, %xmm5
2912; SSE2-NEXT:    por %xmm6, %xmm5
2913; SSE2-NEXT:    movdqa %xmm2, %xmm6
2914; SSE2-NEXT:    psrlw $1, %xmm6
2915; SSE2-NEXT:    pand %xmm13, %xmm6
2916; SSE2-NEXT:    por %xmm5, %xmm6
2917; SSE2-NEXT:    movdqa %xmm2, %xmm5
2918; SSE2-NEXT:    psrlw $3, %xmm5
2919; SSE2-NEXT:    pand %xmm14, %xmm5
2920; SSE2-NEXT:    por %xmm6, %xmm5
2921; SSE2-NEXT:    movdqa %xmm2, %xmm6
2922; SSE2-NEXT:    psrlw $5, %xmm6
2923; SSE2-NEXT:    pand %xmm15, %xmm6
2924; SSE2-NEXT:    por %xmm5, %xmm6
2925; SSE2-NEXT:    psrlw $7, %xmm2
2926; SSE2-NEXT:    pand %xmm7, %xmm2
2927; SSE2-NEXT:    por %xmm6, %xmm2
2928; SSE2-NEXT:    por %xmm4, %xmm2
2929; SSE2-NEXT:    movdqa %xmm3, %xmm4
2930; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2931; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2932; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2933; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2934; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2935; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2936; SSE2-NEXT:    packuswb %xmm4, %xmm3
2937; SSE2-NEXT:    movdqa %xmm3, %xmm5
2938; SSE2-NEXT:    psllw $5, %xmm5
2939; SSE2-NEXT:    pand %xmm10, %xmm5
2940; SSE2-NEXT:    movdqa %xmm3, %xmm4
2941; SSE2-NEXT:    psllw $7, %xmm4
2942; SSE2-NEXT:    pand %xmm11, %xmm4
2943; SSE2-NEXT:    movdqa %xmm3, %xmm6
2944; SSE2-NEXT:    psllw $3, %xmm6
2945; SSE2-NEXT:    pand %xmm12, %xmm6
2946; SSE2-NEXT:    por %xmm5, %xmm6
2947; SSE2-NEXT:    movdqa %xmm3, %xmm5
2948; SSE2-NEXT:    paddb %xmm5, %xmm5
2949; SSE2-NEXT:    pand %xmm8, %xmm5
2950; SSE2-NEXT:    por %xmm6, %xmm5
2951; SSE2-NEXT:    movdqa %xmm3, %xmm6
2952; SSE2-NEXT:    psrlw $1, %xmm6
2953; SSE2-NEXT:    pand %xmm13, %xmm6
2954; SSE2-NEXT:    por %xmm5, %xmm6
2955; SSE2-NEXT:    movdqa %xmm3, %xmm5
2956; SSE2-NEXT:    psrlw $3, %xmm5
2957; SSE2-NEXT:    pand %xmm14, %xmm5
2958; SSE2-NEXT:    por %xmm6, %xmm5
2959; SSE2-NEXT:    movdqa %xmm3, %xmm6
2960; SSE2-NEXT:    psrlw $5, %xmm6
2961; SSE2-NEXT:    pand %xmm15, %xmm6
2962; SSE2-NEXT:    por %xmm5, %xmm6
2963; SSE2-NEXT:    psrlw $7, %xmm3
2964; SSE2-NEXT:    pand %xmm7, %xmm3
2965; SSE2-NEXT:    por %xmm6, %xmm3
2966; SSE2-NEXT:    por %xmm4, %xmm3
2967; SSE2-NEXT:    retq
2968;
2969; SSSE3-LABEL: test_bitreverse_v16i32:
2970; SSSE3:       # BB#0:
2971; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2972; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2973; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2974; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2975; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2976; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2977; SSSE3-NEXT:    pand %xmm9, %xmm0
2978; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2979; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2980; SSSE3-NEXT:    pshufb %xmm0, %xmm6
2981; SSSE3-NEXT:    psrlw $4, %xmm1
2982; SSSE3-NEXT:    pand %xmm9, %xmm1
2983; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2984; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2985; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2986; SSSE3-NEXT:    por %xmm6, %xmm0
2987; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2988; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2989; SSSE3-NEXT:    pand %xmm9, %xmm1
2990; SSSE3-NEXT:    movdqa %xmm7, %xmm6
2991; SSSE3-NEXT:    pshufb %xmm1, %xmm6
2992; SSSE3-NEXT:    psrlw $4, %xmm5
2993; SSSE3-NEXT:    pand %xmm9, %xmm5
2994; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2995; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2996; SSSE3-NEXT:    por %xmm6, %xmm1
2997; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2998; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2999; SSSE3-NEXT:    pand %xmm9, %xmm5
3000; SSSE3-NEXT:    movdqa %xmm7, %xmm6
3001; SSSE3-NEXT:    pshufb %xmm5, %xmm6
3002; SSSE3-NEXT:    psrlw $4, %xmm2
3003; SSSE3-NEXT:    pand %xmm9, %xmm2
3004; SSSE3-NEXT:    movdqa %xmm4, %xmm5
3005; SSSE3-NEXT:    pshufb %xmm2, %xmm5
3006; SSSE3-NEXT:    por %xmm6, %xmm5
3007; SSSE3-NEXT:    pshufb %xmm8, %xmm3
3008; SSSE3-NEXT:    movdqa %xmm3, %xmm2
3009; SSSE3-NEXT:    pand %xmm9, %xmm2
3010; SSSE3-NEXT:    pshufb %xmm2, %xmm7
3011; SSSE3-NEXT:    psrlw $4, %xmm3
3012; SSSE3-NEXT:    pand %xmm9, %xmm3
3013; SSSE3-NEXT:    pshufb %xmm3, %xmm4
3014; SSSE3-NEXT:    por %xmm7, %xmm4
3015; SSSE3-NEXT:    movdqa %xmm5, %xmm2
3016; SSSE3-NEXT:    movdqa %xmm4, %xmm3
3017; SSSE3-NEXT:    retq
3018;
3019; AVX1-LABEL: test_bitreverse_v16i32:
3020; AVX1:       # BB#0:
3021; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3022; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
3023; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3024; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3025; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
3026; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3027; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3028; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
3029; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
3030; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3031; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
3032; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
3033; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3034; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
3035; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3036; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
3037; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
3038; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
3039; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
3040; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3041; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3042; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3043; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
3044; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3045; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
3046; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
3047; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
3048; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
3049; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3050; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
3051; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3052; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
3053; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
3054; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
3055; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
3056; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3057; AVX1-NEXT:    retq
3058;
3059; AVX2-LABEL: test_bitreverse_v16i32:
3060; AVX2:       # BB#0:
3061; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
3062; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3063; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3064; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
3065; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3066; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
3067; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
3068; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
3069; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3070; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
3071; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
3072; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3073; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
3074; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
3075; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
3076; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
3077; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
3078; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
3079; AVX2-NEXT:    retq
3080;
3081; AVX512F-LABEL: test_bitreverse_v16i32:
3082; AVX512F:       # BB#0:
3083; AVX512F-NEXT:    vpslld $29, %zmm0, %zmm1
3084; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
3085; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm2
3086; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3087; AVX512F-NEXT:    vpord %zmm1, %zmm2, %zmm1
3088; AVX512F-NEXT:    vpslld $27, %zmm0, %zmm2
3089; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3090; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3091; AVX512F-NEXT:    vpslld $25, %zmm0, %zmm2
3092; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3093; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3094; AVX512F-NEXT:    vpslld $23, %zmm0, %zmm2
3095; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3096; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3097; AVX512F-NEXT:    vpslld $21, %zmm0, %zmm2
3098; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3099; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3100; AVX512F-NEXT:    vpslld $19, %zmm0, %zmm2
3101; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3102; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3103; AVX512F-NEXT:    vpslld $17, %zmm0, %zmm2
3104; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3105; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3106; AVX512F-NEXT:    vpslld $15, %zmm0, %zmm2
3107; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3108; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3109; AVX512F-NEXT:    vpslld $13, %zmm0, %zmm2
3110; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3111; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3112; AVX512F-NEXT:    vpslld $11, %zmm0, %zmm2
3113; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3114; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3115; AVX512F-NEXT:    vpslld $9, %zmm0, %zmm2
3116; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3117; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3118; AVX512F-NEXT:    vpslld $7, %zmm0, %zmm2
3119; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3120; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3121; AVX512F-NEXT:    vpslld $5, %zmm0, %zmm2
3122; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3123; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3124; AVX512F-NEXT:    vpslld $3, %zmm0, %zmm2
3125; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3126; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3127; AVX512F-NEXT:    vpslld $1, %zmm0, %zmm2
3128; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3129; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3130; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm2
3131; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3132; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3133; AVX512F-NEXT:    vpsrld $3, %zmm0, %zmm2
3134; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3135; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3136; AVX512F-NEXT:    vpsrld $5, %zmm0, %zmm2
3137; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3138; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3139; AVX512F-NEXT:    vpsrld $7, %zmm0, %zmm2
3140; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3141; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3142; AVX512F-NEXT:    vpsrld $9, %zmm0, %zmm2
3143; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3144; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3145; AVX512F-NEXT:    vpsrld $11, %zmm0, %zmm2
3146; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3147; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3148; AVX512F-NEXT:    vpsrld $13, %zmm0, %zmm2
3149; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3150; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3151; AVX512F-NEXT:    vpsrld $15, %zmm0, %zmm2
3152; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3153; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3154; AVX512F-NEXT:    vpsrld $17, %zmm0, %zmm2
3155; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3156; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3157; AVX512F-NEXT:    vpsrld $19, %zmm0, %zmm2
3158; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3159; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3160; AVX512F-NEXT:    vpsrld $21, %zmm0, %zmm2
3161; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3162; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3163; AVX512F-NEXT:    vpsrld $23, %zmm0, %zmm2
3164; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3165; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3166; AVX512F-NEXT:    vpsrld $25, %zmm0, %zmm2
3167; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3168; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3169; AVX512F-NEXT:    vpsrld $27, %zmm0, %zmm2
3170; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3171; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3172; AVX512F-NEXT:    vpsrld $29, %zmm0, %zmm2
3173; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3174; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
3175; AVX512F-NEXT:    vpsrld $31, %zmm0, %zmm0
3176; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
3177; AVX512F-NEXT:    vpord %zmm0, %zmm1, %zmm0
3178; AVX512F-NEXT:    retq
3179;
3180; AVX512BW-LABEL: test_bitreverse_v16i32:
3181; AVX512BW:       # BB#0:
3182; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
3183; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3184; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
3185; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3186; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
3187; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
3188; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
3189; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3190; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
3191; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
3192; AVX512BW-NEXT:    retq
3193;
3194; XOPAVX1-LABEL: test_bitreverse_v16i32:
3195; XOPAVX1:       # BB#0:
3196; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3197; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
3198; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3199; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
3200; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3201; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3202; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3203; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
3204; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3205; XOPAVX1-NEXT:    retq
3206;
3207; XOPAVX2-LABEL: test_bitreverse_v16i32:
3208; XOPAVX2:       # BB#0:
3209; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
3210; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
3211; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3212; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
3213; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
3214; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
3215; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
3216; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
3217; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
3218; XOPAVX2-NEXT:    retq
3219  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
3220  ret <16 x i32> %b
3221}
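; NOTE (manual annotation, not autogenerated): without AVX512BW, the v16i32
; case above (and the v8i64 case below) appears to fall back to a per-bit
; expansion on zmm registers: one vpslld/vpsrld (or vpsllq/vpsrlq) per bit
; position, each masked with a broadcast constant via vpandd/vpandq and
; accumulated with vpord/vporq, which is why the AVX512F blocks are so much
; longer than the AVX512BW nibble-LUT sequence.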
3222
3223define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
3224; SSE2-LABEL: test_bitreverse_v8i64:
3225; SSE2:       # BB#0:
3226; SSE2-NEXT:    pxor %xmm9, %xmm9
3227; SSE2-NEXT:    movdqa %xmm0, %xmm4
3228; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
3229; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
3230; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
3231; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
3232; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
3233; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
3234; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
3235; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
3236; SSE2-NEXT:    packuswb %xmm4, %xmm0
3237; SSE2-NEXT:    movdqa %xmm0, %xmm5
3238; SSE2-NEXT:    psllw $5, %xmm5
3239; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
3240; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
3241; SSE2-NEXT:    pand %xmm10, %xmm5
3242; SSE2-NEXT:    movdqa %xmm0, %xmm4
3243; SSE2-NEXT:    psllw $7, %xmm4
3244; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
3245; SSE2-NEXT:    pand %xmm11, %xmm11
3246; SSE2-NEXT:    pand %xmm11, %xmm4
3247; SSE2-NEXT:    movdqa %xmm0, %xmm6
3248; SSE2-NEXT:    psllw $3, %xmm6
3249; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
3250; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
3251; SSE2-NEXT:    pand %xmm12, %xmm6
3252; SSE2-NEXT:    por %xmm5, %xmm6
3253; SSE2-NEXT:    movdqa %xmm0, %xmm5
3254; SSE2-NEXT:    paddb %xmm5, %xmm5
3255; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
3256; SSE2-NEXT:    pand %xmm8, %xmm5
3257; SSE2-NEXT:    por %xmm6, %xmm5
3258; SSE2-NEXT:    movdqa %xmm0, %xmm6
3259; SSE2-NEXT:    psrlw $1, %xmm6
3260; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
3261; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
3262; SSE2-NEXT:    pand %xmm13, %xmm6
3263; SSE2-NEXT:    por %xmm5, %xmm6
3264; SSE2-NEXT:    movdqa %xmm0, %xmm7
3265; SSE2-NEXT:    psrlw $3, %xmm7
3266; SSE2-NEXT:    movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
3267; SSE2-NEXT:    pand {{.*}}(%rip), %xmm14
3268; SSE2-NEXT:    pand %xmm14, %xmm7
3269; SSE2-NEXT:    por %xmm6, %xmm7
3270; SSE2-NEXT:    movdqa %xmm0, %xmm5
3271; SSE2-NEXT:    psrlw $5, %xmm5
3272; SSE2-NEXT:    movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
3273; SSE2-NEXT:    pand {{.*}}(%rip), %xmm15
3274; SSE2-NEXT:    pand %xmm15, %xmm5
3275; SSE2-NEXT:    por %xmm7, %xmm5
3276; SSE2-NEXT:    psrlw $7, %xmm0
3277; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3278; SSE2-NEXT:    pand %xmm7, %xmm7
3279; SSE2-NEXT:    pand %xmm7, %xmm0
3280; SSE2-NEXT:    por %xmm5, %xmm0
3281; SSE2-NEXT:    por %xmm4, %xmm0
3282; SSE2-NEXT:    movdqa %xmm1, %xmm4
3283; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
3284; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
3285; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
3286; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
3287; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
3288; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
3289; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
3290; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
3291; SSE2-NEXT:    packuswb %xmm4, %xmm1
3292; SSE2-NEXT:    movdqa %xmm1, %xmm5
3293; SSE2-NEXT:    psllw $5, %xmm5
3294; SSE2-NEXT:    pand %xmm10, %xmm5
3295; SSE2-NEXT:    movdqa %xmm1, %xmm4
3296; SSE2-NEXT:    psllw $7, %xmm4
3297; SSE2-NEXT:    pand %xmm11, %xmm4
3298; SSE2-NEXT:    movdqa %xmm1, %xmm6
3299; SSE2-NEXT:    psllw $3, %xmm6
3300; SSE2-NEXT:    pand %xmm12, %xmm6
3301; SSE2-NEXT:    por %xmm5, %xmm6
3302; SSE2-NEXT:    movdqa %xmm1, %xmm5
3303; SSE2-NEXT:    paddb %xmm5, %xmm5
3304; SSE2-NEXT:    pand %xmm8, %xmm5
3305; SSE2-NEXT:    por %xmm6, %xmm5
3306; SSE2-NEXT:    movdqa %xmm1, %xmm6
3307; SSE2-NEXT:    psrlw $1, %xmm6
3308; SSE2-NEXT:    pand %xmm13, %xmm6
3309; SSE2-NEXT:    por %xmm5, %xmm6
3310; SSE2-NEXT:    movdqa %xmm1, %xmm5
3311; SSE2-NEXT:    psrlw $3, %xmm5
3312; SSE2-NEXT:    pand %xmm14, %xmm5
3313; SSE2-NEXT:    por %xmm6, %xmm5
3314; SSE2-NEXT:    movdqa %xmm1, %xmm6
3315; SSE2-NEXT:    psrlw $5, %xmm6
3316; SSE2-NEXT:    pand %xmm15, %xmm6
3317; SSE2-NEXT:    por %xmm5, %xmm6
3318; SSE2-NEXT:    psrlw $7, %xmm1
3319; SSE2-NEXT:    pand %xmm7, %xmm1
3320; SSE2-NEXT:    por %xmm6, %xmm1
3321; SSE2-NEXT:    por %xmm4, %xmm1
3322; SSE2-NEXT:    movdqa %xmm2, %xmm4
3323; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
3324; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
3325; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
3326; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
3327; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
3328; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
3329; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
3330; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
3331; SSE2-NEXT:    packuswb %xmm4, %xmm2
3332; SSE2-NEXT:    movdqa %xmm2, %xmm5
3333; SSE2-NEXT:    psllw $5, %xmm5
3334; SSE2-NEXT:    pand %xmm10, %xmm5
3335; SSE2-NEXT:    movdqa %xmm2, %xmm4
3336; SSE2-NEXT:    psllw $7, %xmm4
3337; SSE2-NEXT:    pand %xmm11, %xmm4
3338; SSE2-NEXT:    movdqa %xmm2, %xmm6
3339; SSE2-NEXT:    psllw $3, %xmm6
3340; SSE2-NEXT:    pand %xmm12, %xmm6
3341; SSE2-NEXT:    por %xmm5, %xmm6
3342; SSE2-NEXT:    movdqa %xmm2, %xmm5
3343; SSE2-NEXT:    paddb %xmm5, %xmm5
3344; SSE2-NEXT:    pand %xmm8, %xmm5
3345; SSE2-NEXT:    por %xmm6, %xmm5
3346; SSE2-NEXT:    movdqa %xmm2, %xmm6
3347; SSE2-NEXT:    psrlw $1, %xmm6
3348; SSE2-NEXT:    pand %xmm13, %xmm6
3349; SSE2-NEXT:    por %xmm5, %xmm6
3350; SSE2-NEXT:    movdqa %xmm2, %xmm5
3351; SSE2-NEXT:    psrlw $3, %xmm5
3352; SSE2-NEXT:    pand %xmm14, %xmm5
3353; SSE2-NEXT:    por %xmm6, %xmm5
3354; SSE2-NEXT:    movdqa %xmm2, %xmm6
3355; SSE2-NEXT:    psrlw $5, %xmm6
3356; SSE2-NEXT:    pand %xmm15, %xmm6
3357; SSE2-NEXT:    por %xmm5, %xmm6
3358; SSE2-NEXT:    psrlw $7, %xmm2
3359; SSE2-NEXT:    pand %xmm7, %xmm2
3360; SSE2-NEXT:    por %xmm6, %xmm2
3361; SSE2-NEXT:    por %xmm4, %xmm2
3362; SSE2-NEXT:    movdqa %xmm3, %xmm4
3363; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
3364; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
3365; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
3366; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
3367; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
3368; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
3369; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
3370; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
3371; SSE2-NEXT:    packuswb %xmm4, %xmm3
3372; SSE2-NEXT:    movdqa %xmm3, %xmm5
3373; SSE2-NEXT:    psllw $5, %xmm5
3374; SSE2-NEXT:    pand %xmm10, %xmm5
3375; SSE2-NEXT:    movdqa %xmm3, %xmm4
3376; SSE2-NEXT:    psllw $7, %xmm4
3377; SSE2-NEXT:    pand %xmm11, %xmm4
3378; SSE2-NEXT:    movdqa %xmm3, %xmm6
3379; SSE2-NEXT:    psllw $3, %xmm6
3380; SSE2-NEXT:    pand %xmm12, %xmm6
3381; SSE2-NEXT:    por %xmm5, %xmm6
3382; SSE2-NEXT:    movdqa %xmm3, %xmm5
3383; SSE2-NEXT:    paddb %xmm5, %xmm5
3384; SSE2-NEXT:    pand %xmm8, %xmm5
3385; SSE2-NEXT:    por %xmm6, %xmm5
3386; SSE2-NEXT:    movdqa %xmm3, %xmm6
3387; SSE2-NEXT:    psrlw $1, %xmm6
3388; SSE2-NEXT:    pand %xmm13, %xmm6
3389; SSE2-NEXT:    por %xmm5, %xmm6
3390; SSE2-NEXT:    movdqa %xmm3, %xmm5
3391; SSE2-NEXT:    psrlw $3, %xmm5
3392; SSE2-NEXT:    pand %xmm14, %xmm5
3393; SSE2-NEXT:    por %xmm6, %xmm5
3394; SSE2-NEXT:    movdqa %xmm3, %xmm6
3395; SSE2-NEXT:    psrlw $5, %xmm6
3396; SSE2-NEXT:    pand %xmm15, %xmm6
3397; SSE2-NEXT:    por %xmm5, %xmm6
3398; SSE2-NEXT:    psrlw $7, %xmm3
3399; SSE2-NEXT:    pand %xmm7, %xmm3
3400; SSE2-NEXT:    por %xmm6, %xmm3
3401; SSE2-NEXT:    por %xmm4, %xmm3
3402; SSE2-NEXT:    retq
3403;
3404; SSSE3-LABEL: test_bitreverse_v8i64:
3405; SSSE3:       # BB#0:
3406; SSSE3-NEXT:    movdqa %xmm1, %xmm5
3407; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3408; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3409; SSSE3-NEXT:    pshufb %xmm8, %xmm1
3410; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3411; SSSE3-NEXT:    movdqa %xmm1, %xmm0
3412; SSSE3-NEXT:    pand %xmm9, %xmm0
3413; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3414; SSSE3-NEXT:    movdqa %xmm7, %xmm6
3415; SSSE3-NEXT:    pshufb %xmm0, %xmm6
3416; SSSE3-NEXT:    psrlw $4, %xmm1
3417; SSSE3-NEXT:    pand %xmm9, %xmm1
3418; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3419; SSSE3-NEXT:    movdqa %xmm4, %xmm0
3420; SSSE3-NEXT:    pshufb %xmm1, %xmm0
3421; SSSE3-NEXT:    por %xmm6, %xmm0
3422; SSSE3-NEXT:    pshufb %xmm8, %xmm5
3423; SSSE3-NEXT:    movdqa %xmm5, %xmm1
3424; SSSE3-NEXT:    pand %xmm9, %xmm1
3425; SSSE3-NEXT:    movdqa %xmm7, %xmm6
3426; SSSE3-NEXT:    pshufb %xmm1, %xmm6
3427; SSSE3-NEXT:    psrlw $4, %xmm5
3428; SSSE3-NEXT:    pand %xmm9, %xmm5
3429; SSSE3-NEXT:    movdqa %xmm4, %xmm1
3430; SSSE3-NEXT:    pshufb %xmm5, %xmm1
3431; SSSE3-NEXT:    por %xmm6, %xmm1
3432; SSSE3-NEXT:    pshufb %xmm8, %xmm2
3433; SSSE3-NEXT:    movdqa %xmm2, %xmm5
3434; SSSE3-NEXT:    pand %xmm9, %xmm5
3435; SSSE3-NEXT:    movdqa %xmm7, %xmm6
3436; SSSE3-NEXT:    pshufb %xmm5, %xmm6
3437; SSSE3-NEXT:    psrlw $4, %xmm2
3438; SSSE3-NEXT:    pand %xmm9, %xmm2
3439; SSSE3-NEXT:    movdqa %xmm4, %xmm5
3440; SSSE3-NEXT:    pshufb %xmm2, %xmm5
3441; SSSE3-NEXT:    por %xmm6, %xmm5
3442; SSSE3-NEXT:    pshufb %xmm8, %xmm3
3443; SSSE3-NEXT:    movdqa %xmm3, %xmm2
3444; SSSE3-NEXT:    pand %xmm9, %xmm2
3445; SSSE3-NEXT:    pshufb %xmm2, %xmm7
3446; SSSE3-NEXT:    psrlw $4, %xmm3
3447; SSSE3-NEXT:    pand %xmm9, %xmm3
3448; SSSE3-NEXT:    pshufb %xmm3, %xmm4
3449; SSSE3-NEXT:    por %xmm7, %xmm4
3450; SSSE3-NEXT:    movdqa %xmm5, %xmm2
3451; SSSE3-NEXT:    movdqa %xmm4, %xmm3
3452; SSSE3-NEXT:    retq
3453;
3454; AVX1-LABEL: test_bitreverse_v8i64:
3455; AVX1:       # BB#0:
3456; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3457; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3458; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3459; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3460; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
3461; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3462; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3463; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
3464; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
3465; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3466; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
3467; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
3468; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3469; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
3470; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3471; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
3472; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
3473; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
3474; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
3475; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3476; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3477; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3478; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
3479; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
3480; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
3481; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
3482; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
3483; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
3484; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3485; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
3486; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3487; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
3488; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
3489; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
3490; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
3491; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3492; AVX1-NEXT:    retq
3493;
3494; AVX2-LABEL: test_bitreverse_v8i64:
3495; AVX2:       # BB#0:
3496; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3497; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3498; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3499; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
3500; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3501; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
3502; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
3503; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
3504; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3505; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
3506; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
3507; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3508; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
3509; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
3510; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
3511; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
3512; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
3513; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
3514; AVX2-NEXT:    retq
3515;
3516; AVX512F-LABEL: test_bitreverse_v8i64:
3517; AVX512F:       # BB#0:
3518; AVX512F-NEXT:    vpsllq $61, %zmm0, %zmm1
3519; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
3520; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm2
3521; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3522; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
3523; AVX512F-NEXT:    vpsllq $59, %zmm0, %zmm2
3524; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3525; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3526; AVX512F-NEXT:    vpsllq $57, %zmm0, %zmm2
3527; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3528; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3529; AVX512F-NEXT:    vpsllq $55, %zmm0, %zmm2
3530; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3531; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3532; AVX512F-NEXT:    vpsllq $53, %zmm0, %zmm2
3533; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3534; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3535; AVX512F-NEXT:    vpsllq $51, %zmm0, %zmm2
3536; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3537; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3538; AVX512F-NEXT:    vpsllq $49, %zmm0, %zmm2
3539; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3540; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3541; AVX512F-NEXT:    vpsllq $47, %zmm0, %zmm2
3542; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3543; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3544; AVX512F-NEXT:    vpsllq $45, %zmm0, %zmm2
3545; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3546; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3547; AVX512F-NEXT:    vpsllq $43, %zmm0, %zmm2
3548; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3549; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3550; AVX512F-NEXT:    vpsllq $41, %zmm0, %zmm2
3551; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3552; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3553; AVX512F-NEXT:    vpsllq $39, %zmm0, %zmm2
3554; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3555; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3556; AVX512F-NEXT:    vpsllq $37, %zmm0, %zmm2
3557; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3558; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3559; AVX512F-NEXT:    vpsllq $35, %zmm0, %zmm2
3560; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3561; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3562; AVX512F-NEXT:    vpsllq $33, %zmm0, %zmm2
3563; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3564; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3565; AVX512F-NEXT:    vpsllq $31, %zmm0, %zmm2
3566; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3567; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3568; AVX512F-NEXT:    vpsllq $29, %zmm0, %zmm2
3569; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3570; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3571; AVX512F-NEXT:    vpsllq $27, %zmm0, %zmm2
3572; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3573; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3574; AVX512F-NEXT:    vpsllq $25, %zmm0, %zmm2
3575; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3576; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3577; AVX512F-NEXT:    vpsllq $23, %zmm0, %zmm2
3578; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3579; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3580; AVX512F-NEXT:    vpsllq $21, %zmm0, %zmm2
3581; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3582; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3583; AVX512F-NEXT:    vpsllq $19, %zmm0, %zmm2
3584; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3585; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3586; AVX512F-NEXT:    vpsllq $17, %zmm0, %zmm2
3587; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3588; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3589; AVX512F-NEXT:    vpsllq $15, %zmm0, %zmm2
3590; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3591; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3592; AVX512F-NEXT:    vpsllq $13, %zmm0, %zmm2
3593; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3594; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3595; AVX512F-NEXT:    vpsllq $11, %zmm0, %zmm2
3596; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3597; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3598; AVX512F-NEXT:    vpsllq $9, %zmm0, %zmm2
3599; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3600; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3601; AVX512F-NEXT:    vpsllq $7, %zmm0, %zmm2
3602; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3603; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3604; AVX512F-NEXT:    vpsllq $5, %zmm0, %zmm2
3605; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3606; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3607; AVX512F-NEXT:    vpsllq $3, %zmm0, %zmm2
3608; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3609; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3610; AVX512F-NEXT:    vpsllq $1, %zmm0, %zmm2
3611; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3612; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3613; AVX512F-NEXT:    vpsrlq $1, %zmm0, %zmm2
3614; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3615; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3616; AVX512F-NEXT:    vpsrlq $3, %zmm0, %zmm2
3617; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3618; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3619; AVX512F-NEXT:    vpsrlq $5, %zmm0, %zmm2
3620; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3621; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
3622; AVX512F-NEXT:    vpsrlq $7, %zmm0, %zmm2
3623; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $9, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $11, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $13, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $15, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $17, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $19, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $21, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $23, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $25, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $27, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $29, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $31, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $33, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $35, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $37, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $39, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $41, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $43, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $45, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $47, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $49, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $51, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $53, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $55, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $57, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $59, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $61, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
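; NOTE: Without AVX512BW, the v8i64 reverse above is expanded one bit at a
; time: each bit position is moved to its mirrored position with a vector
; shift, isolated with a broadcast mask (vpandq ...{1to8}), and OR'd into the
; accumulator. This appears to be because the nibble-LUT approach needs a
; 512-bit vpshufb, which requires AVX512BW.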
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
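; NOTE: With AVX512BW, vpshufb first reverses the byte order within each i64;
; the low and high nibbles of every byte are then bit-reversed through two
; 16-entry vpshufb lookup tables and recombined with vporq.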
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
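; NOTE: The XOP lowerings use vpperm, whose control bytes can bit-reverse the
; selected source byte, so one vpperm per 128-bit lane both swaps the byte
; order within each i64 element and reverses the bits of every byte.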
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone