# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text


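# gcm_gmult_4bit: one GHASH multiplication, Xi = Xi * H in GF(2^128), using
# the 4-bit lookup table in Htable and the L$rem_4bit reduction constants.
# Assumed C prototype (per BoringSSL's GCM internals; for reference only):
#   void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]);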
.globl	_gcm_gmult_4bit
.private_extern _gcm_gmult_4bit

.p2align	4
_gcm_gmult_4bit:

	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

	subq	$280,%rsp

L$gmult_prologue:

	movzbq	15(%rdi),%r8
	leaq	L$rem_4bit(%rip),%r11
	xorq	%rax,%rax
	xorq	%rbx,%rbx
	movb	%r8b,%al
	movb	%r8b,%bl
	shlb	$4,%al
	movq	$14,%rcx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	movq	%r8,%rdx
	jmp	L$oop1

.p2align	4
L$oop1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	movb	(%rdi,%rcx,1),%al
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	movb	%al,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	shlb	$4,%al
	xorq	%r10,%r8
	decq	%rcx
	js	L$break1

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8
	jmp	L$oop1

.p2align	4
L$break1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	xorq	%r10,%r8
	xorq	(%r11,%rdx,8),%r9

	bswapq	%r8
	bswapq	%r9
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280+48(%rsp),%rsi

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$gmult_epilogue:
	.byte	0xf3,0xc3


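# gcm_ghash_4bit: GHASH over a whole buffer; each L$outer_loop iteration
# folds one 16-byte block of input into Xi, using an 8-bit reduction table
# (L$rem_8bit). Assumed C prototype (for reference only):
#   void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16],
#                       const uint8_t *inp, size_t len);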
.globl	_gcm_ghash_4bit
.private_extern _gcm_ghash_4bit

.p2align	4
_gcm_ghash_4bit:

	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

	subq	$280,%rsp

L$ghash_prologue:
	movq	%rdx,%r14
	movq	%rcx,%r15
	subq	$-128,%rsi
	leaq	16+128(%rsp),%rbp
	xorl	%edx,%edx
	movq	0+0-128(%rsi),%r8
	movq	0+8-128(%rsi),%rax
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	16+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	16+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,0(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,0(%rbp)
	movq	32+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,0-128(%rbp)
	movq	32+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,1(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,8(%rbp)
	movq	48+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,8-128(%rbp)
	movq	48+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,2(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,16(%rbp)
	movq	64+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,16-128(%rbp)
	movq	64+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,3(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,24(%rbp)
	movq	80+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,24-128(%rbp)
	movq	80+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,4(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,32(%rbp)
	movq	96+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,32-128(%rbp)
	movq	96+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,5(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,40(%rbp)
	movq	112+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,40-128(%rbp)
	movq	112+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,6(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,48(%rbp)
	movq	128+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,48-128(%rbp)
	movq	128+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,7(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,56(%rbp)
	movq	144+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,56-128(%rbp)
	movq	144+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,8(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,64(%rbp)
	movq	160+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,64-128(%rbp)
	movq	160+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,9(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,72(%rbp)
	movq	176+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,72-128(%rbp)
	movq	176+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,10(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,80(%rbp)
	movq	192+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,80-128(%rbp)
	movq	192+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,11(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,88(%rbp)
	movq	208+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,88-128(%rbp)
	movq	208+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,12(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,96(%rbp)
	movq	224+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,96-128(%rbp)
	movq	224+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,13(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,104(%rbp)
	movq	240+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,104-128(%rbp)
	movq	240+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,14(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,112(%rbp)
	shlb	$4,%dl
	movq	%rax,112-128(%rbp)
	shlq	$60,%r10
	movb	%dl,15(%rsp)
	orq	%r10,%rbx
	movq	%r9,120(%rbp)
	movq	%rbx,120-128(%rbp)
	addq	$-128,%rsi
	movq	8(%rdi),%r8
	movq	0(%rdi),%r9
	addq	%r14,%r15
	leaq	L$rem_8bit(%rip),%r11
	jmp	L$outer_loop
.p2align	4
L$outer_loop:
	xorq	(%r14),%r9
	movq	8(%r14),%rdx
	leaq	16(%r14),%r14
	xorq	%r8,%rdx
	movq	%r9,(%rdi)
	movq	%rdx,8(%rdi)
	shrq	$32,%rdx
	xorq	%rax,%rax
	roll	$8,%edx
	movb	%dl,%al
	movzbl	%dl,%ebx
	shlb	$4,%al
	shrl	$4,%ebx
	roll	$8,%edx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	xorq	%r8,%r12
	movq	%r9,%r10
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	8(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	0(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	andl	$240,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	-4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	movzwq	(%r11,%r12,2),%r12
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	shlq	$48,%r12
	xorq	%r10,%r8
	xorq	%r12,%r9
	movzbq	%r8b,%r13
	shrq	$4,%r8
	movq	%r9,%r10
	shlb	$4,%r13b
	shrq	$4,%r9
	xorq	8(%rsi,%rcx,1),%r8
	movzwq	(%r11,%r13,2),%r13
	shlq	$60,%r10
	xorq	(%rsi,%rcx,1),%r9
	xorq	%r10,%r8
	shlq	$48,%r13
	bswapq	%r8
	xorq	%r13,%r9
	bswapq	%r9
	cmpq	%r15,%r14
	jb	L$outer_loop
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280+48(%rsp),%rsi

	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	0(%rsi),%rsp

L$ghash_epilogue:
	.byte	0xf3,0xc3


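# gcm_init_clmul: PCLMULQDQ key setup; derives powers of the hash subkey H
# (with precomputed Karatsuba halves) and stores them in Htable.
# Assumed C prototype (for reference only):
#   void gcm_init_clmul(u128 Htable[16], const uint64_t H[2]);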
.globl	_gcm_init_clmul
.private_extern _gcm_init_clmul

.p2align	4
_gcm_init_clmul:

L$_init_clmul:
	movdqu	(%rsi),%xmm2
	pshufd	$78,%xmm2,%xmm2


	pshufd	$255,%xmm2,%xmm4
	movdqa	%xmm2,%xmm3
	psllq	$1,%xmm2
	pxor	%xmm5,%xmm5
	psrlq	$63,%xmm3
	pcmpgtd	%xmm4,%xmm5
	pslldq	$8,%xmm3
	por	%xmm3,%xmm2


	pand	L$0x1c2_polynomial(%rip),%xmm5
	pxor	%xmm5,%xmm2


	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm2,%xmm0
	pxor	%xmm2,%xmm6
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm2,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm2,%xmm3
	movdqu	%xmm2,0(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,16(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,32(%rdi)
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm5,%xmm3
	movdqu	%xmm5,48(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,64(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,80(%rdi)
	.byte	0xf3,0xc3


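# gcm_gmult_clmul: single-block GHASH multiplication via PCLMULQDQ
# (carry-less multiply plus reduction modulo the 0x1c2 polynomial).
# Assumed C prototype (for reference only):
#   void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);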
.globl	_gcm_gmult_clmul
.private_extern _gcm_gmult_clmul

.p2align	4
_gcm_gmult_clmul:

L$_gmult_clmul:
	movdqu	(%rdi),%xmm0
	movdqa	L$bswap_mask(%rip),%xmm5
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm4
.byte	102,15,56,0,197
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.byte	102,15,56,0,197
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3


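# gcm_ghash_clmul: PCLMULQDQ GHASH over a buffer; processes up to four
# blocks per L$mod4_loop iteration when the CPU-capability check permits,
# with L$even_tail/L$odd_tail handling the remainder. Assumed C prototype
# (for reference only):
#   void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16],
#                        const uint8_t *inp, size_t len);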
.globl	_gcm_ghash_clmul
.private_extern _gcm_ghash_clmul

.p2align	5
_gcm_ghash_clmul:

L$_ghash_clmul:
	movdqa	L$bswap_mask(%rip),%xmm10

	movdqu	(%rdi),%xmm0
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm7
.byte	102,65,15,56,0,194

	subq	$0x10,%rcx
	jz	L$odd_tail

	movdqu	16(%rsi),%xmm6
	leaq	_OPENSSL_ia32cap_P(%rip),%rax
	movl	4(%rax),%eax
	cmpq	$0x30,%rcx
	jb	L$skip4x

	andl	$71303168,%eax
	cmpl	$4194304,%eax
	je	L$skip4x

	subq	$0x30,%rcx
	movq	$0xA040608020C0E000,%rax
	movdqu	48(%rsi),%xmm14
	movdqu	64(%rsi),%xmm15




	movdqu	48(%rdx),%xmm3
	movdqu	32(%rdx),%xmm11
.byte	102,65,15,56,0,218
.byte	102,69,15,56,0,218
	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm11,%xmm12
.byte	102,68,15,58,68,222,0
.byte	102,68,15,58,68,238,17
.byte	102,68,15,58,68,231,16
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
	xorps	%xmm12,%xmm4

	movdqu	16(%rdx),%xmm11
	movdqu	0(%rdx),%xmm8
.byte	102,69,15,56,0,218
.byte	102,69,15,56,0,194
	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm8,%xmm0
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8
.byte	102,69,15,58,68,238,17
.byte	102,68,15,58,68,231,0
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jc	L$tail4x

	jmp	L$mod4_loop
.p2align	5
L$mod4_loop:
.byte	102,65,15,58,68,199,0
	xorps	%xmm12,%xmm4
	movdqu	48(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,65,15,58,68,207,17
	xorps	%xmm3,%xmm0
	movdqu	32(%rdx),%xmm3
	movdqa	%xmm11,%xmm13
.byte	102,68,15,58,68,199,16
	pshufd	$78,%xmm11,%xmm12
	xorps	%xmm5,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,65,15,56,0,218
	movups	32(%rsi),%xmm7
	xorps	%xmm4,%xmm8
.byte	102,68,15,58,68,218,0
	pshufd	$78,%xmm3,%xmm4

	pxor	%xmm0,%xmm8
	movdqa	%xmm3,%xmm5
	pxor	%xmm1,%xmm8
	pxor	%xmm3,%xmm4
	movdqa	%xmm8,%xmm9
.byte	102,68,15,58,68,234,17
	pslldq	$8,%xmm8
	psrldq	$8,%xmm9
	pxor	%xmm8,%xmm0
	movdqa	L$7_mask(%rip),%xmm8
	pxor	%xmm9,%xmm1
.byte	102,76,15,110,200

	pand	%xmm0,%xmm8
.byte	102,69,15,56,0,200
	pxor	%xmm0,%xmm9
.byte	102,68,15,58,68,231,0
	psllq	$57,%xmm9
	movdqa	%xmm9,%xmm8
	pslldq	$8,%xmm9
.byte	102,15,58,68,222,0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	movdqu	0(%rdx),%xmm8

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,238,17
	xorps	%xmm11,%xmm3
	movdqu	16(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,15,58,68,231,16
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
.byte	102,69,15,56,0,194
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0

	movdqa	%xmm11,%xmm13
	pxor	%xmm12,%xmm4
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
.byte	102,69,15,58,68,238,17
	xorps	%xmm11,%xmm3
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8

.byte	102,68,15,58,68,231,0
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jnc	L$mod4_loop

L$tail4x:
.byte	102,65,15,58,68,199,0
.byte	102,65,15,58,68,207,17
.byte	102,68,15,58,68,199,16
	xorps	%xmm12,%xmm4
	xorps	%xmm3,%xmm0
	xorps	%xmm5,%xmm1
	pxor	%xmm0,%xmm1
	pxor	%xmm4,%xmm8

	pxor	%xmm1,%xmm8
	pxor	%xmm0,%xmm1

	movdqa	%xmm8,%xmm9
	psrldq	$8,%xmm8
	pslldq	$8,%xmm9
	pxor	%xmm8,%xmm1
	pxor	%xmm9,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	addq	$0x40,%rcx
	jz	L$done
	movdqu	32(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$odd_tail
L$skip4x:





	movdqu	(%rdx),%xmm8
	movdqu	16(%rdx),%xmm3
.byte	102,69,15,56,0,194
.byte	102,65,15,56,0,218
	pxor	%xmm8,%xmm0

	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	leaq	32(%rdx),%rdx
	nop
	subq	$0x20,%rcx
	jbe	L$even_tail
	nop
	jmp	L$mod_loop

.p2align	5
L$mod_loop:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	movdqu	(%rdx),%xmm9
	pxor	%xmm0,%xmm8
.byte	102,69,15,56,0,202
	movdqu	16(%rdx),%xmm3

	pxor	%xmm1,%xmm8
	pxor	%xmm9,%xmm1
	pxor	%xmm8,%xmm4
.byte	102,65,15,56,0,218
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm3,%xmm5

	movdqa	%xmm0,%xmm9
	movdqa	%xmm0,%xmm8
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm8
.byte	102,15,58,68,218,0
	psllq	$1,%xmm0
	pxor	%xmm8,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm8
	pslldq	$8,%xmm0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pshufd	$78,%xmm5,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm5,%xmm4

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,234,17
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0
	pxor	%xmm9,%xmm0
	leaq	32(%rdx),%rdx
	psrlq	$1,%xmm0
.byte	102,15,58,68,231,0
	pxor	%xmm1,%xmm0

	subq	$0x20,%rcx
	ja	L$mod_loop

L$even_tail:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm1,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	testq	%rcx,%rcx
	jnz	L$done

L$odd_tail:
	movdqu	(%rdx),%xmm8
.byte	102,69,15,56,0,194
	pxor	%xmm8,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,223,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
L$done:
.byte	102,65,15,56,0,194
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3


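# gcm_init_avx: AVX variant of the key setup; L$init_loop_avx builds a
# larger table of powers of H for the 8-blocks-at-a-time ghash loop below.
# Assumed C prototype (for reference only):
#   void gcm_init_avx(u128 Htable[16], const uint64_t H[2]);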
.globl	_gcm_init_avx
.private_extern _gcm_init_avx

.p2align	5
_gcm_init_avx:

	vzeroupper

	vmovdqu	(%rsi),%xmm2
	vpshufd	$78,%xmm2,%xmm2


	vpshufd	$255,%xmm2,%xmm4
	vpsrlq	$63,%xmm2,%xmm3
	vpsllq	$1,%xmm2,%xmm2
	vpxor	%xmm5,%xmm5,%xmm5
	vpcmpgtd	%xmm4,%xmm5,%xmm5
	vpslldq	$8,%xmm3,%xmm3
	vpor	%xmm3,%xmm2,%xmm2


	vpand	L$0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor	%xmm5,%xmm2,%xmm2

	vpunpckhqdq	%xmm2,%xmm2,%xmm6
	vmovdqa	%xmm2,%xmm0
	vpxor	%xmm2,%xmm6,%xmm6
	movq	$4,%r10
	jmp	L$init_start_avx
.p2align	5
L$init_loop_avx:
	vpalignr	$8,%xmm3,%xmm4,%xmm5
	vmovdqu	%xmm5,-16(%rdi)
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
L$init_start_avx:
	vmovdqa	%xmm0,%xmm5
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
	vpshufd	$78,%xmm5,%xmm3
	vpshufd	$78,%xmm0,%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqu	%xmm5,0(%rdi)
	vpxor	%xmm0,%xmm4,%xmm4
	vmovdqu	%xmm0,16(%rdi)
	leaq	48(%rdi),%rdi
	subq	$1,%r10
	jnz	L$init_loop_avx

	vpalignr	$8,%xmm4,%xmm3,%xmm5
	vmovdqu	%xmm5,-16(%rdi)

	vzeroupper
	.byte	0xf3,0xc3


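# gcm_gmult_avx: for a single block the AVX path has no advantage, so this
# simply tail-jumps into the CLMUL implementation above (L$_gmult_clmul).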
.globl	_gcm_gmult_avx
.private_extern _gcm_gmult_avx

.p2align	5
_gcm_gmult_avx:

	jmp	L$_gmult_clmul


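# gcm_ghash_avx: AVX GHASH over a buffer; L$oop8x_avx aggregates eight
# blocks per iteration, while L$short_avx/L$tail_avx mop up shorter inputs.
# Assumed C prototype (for reference only):
#   void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16],
#                      const uint8_t *inp, size_t len);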
.globl	_gcm_ghash_avx
.private_extern _gcm_ghash_avx

.p2align	5
_gcm_ghash_avx:

	vzeroupper

	vmovdqu	(%rdi),%xmm10
	leaq	L$0x1c2_polynomial(%rip),%r10
	leaq	64(%rsi),%rsi
	vmovdqu	L$bswap_mask(%rip),%xmm13
	vpshufb	%xmm13,%xmm10,%xmm10
	cmpq	$0x80,%rcx
	jb	L$short_avx
	subq	$0x80,%rcx

	vmovdqu	112(%rdx),%xmm14
	vmovdqu	0-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vmovdqu	32-64(%rsi),%xmm7

	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm14,%xmm9,%xmm9
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	80(%rdx),%xmm14
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	48-64(%rsi),%xmm6
	vpxor	%xmm14,%xmm9,%xmm9
	vmovdqu	64(%rdx),%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7

	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	48(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	32(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	16(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2

	leaq	128(%rdx),%rdx
	cmpq	$0x80,%rcx
	jb	L$tail_avx

	vpxor	%xmm10,%xmm15,%xmm15
	subq	$0x80,%rcx
	jmp	L$oop8x_avx

.p2align	5
L$oop8x_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	112(%rdx),%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpxor	%xmm15,%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
	vmovdqu	0-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
	vmovdqu	32-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm3,%xmm10,%xmm10
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vxorps	%xmm4,%xmm11,%xmm11
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm5,%xmm12,%xmm12
	vxorps	%xmm15,%xmm8,%xmm8

	vmovdqu	80(%rdx),%xmm14
	vpxor	%xmm10,%xmm12,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm11,%xmm12,%xmm12
	vpslldq	$8,%xmm12,%xmm9
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vpsrldq	$8,%xmm12,%xmm12
	vpxor	%xmm9,%xmm10,%xmm10
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vxorps	%xmm12,%xmm11,%xmm11
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	64(%rdx),%xmm15
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vxorps	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2

	vmovdqu	48(%rdx),%xmm14
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	32(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2
	vxorps	%xmm12,%xmm10,%xmm10

	vmovdqu	16(%rdx),%xmm14
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vxorps	%xmm11,%xmm12,%xmm12
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm12,%xmm15,%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
	vpxor	%xmm10,%xmm15,%xmm15

	leaq	128(%rdx),%rdx
	subq	$0x80,%rcx
	jnc	L$oop8x_avx

	addq	$0x80,%rcx
	jmp	L$tail_no_xor_avx

.p2align	5
L$short_avx:
	vmovdqu	-16(%rdx,%rcx,1),%xmm14
	leaq	(%rdx,%rcx,1),%rdx
	vmovdqu	0-64(%rsi),%xmm6
	vmovdqu	32-64(%rsi),%xmm7
	vpshufb	%xmm13,%xmm14,%xmm15

	vmovdqa	%xmm0,%xmm3
	vmovdqa	%xmm1,%xmm4
	vmovdqa	%xmm2,%xmm5
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-32(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-48(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	80-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-64(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-80(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	96-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	128-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-96(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-112(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	144-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovq	184-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jmp	L$tail_avx

.p2align	5
L$tail_avx:
	vpxor	%xmm10,%xmm15,%xmm15
L$tail_no_xor_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2

	vmovdqu	(%r10),%xmm12

	vpxor	%xmm0,%xmm3,%xmm10
	vpxor	%xmm1,%xmm4,%xmm11
	vpxor	%xmm2,%xmm5,%xmm5

	vpxor	%xmm10,%xmm5,%xmm5
	vpxor	%xmm11,%xmm5,%xmm5
	vpslldq	$8,%xmm5,%xmm9
	vpsrldq	$8,%xmm5,%xmm5
	vpxor	%xmm9,%xmm10,%xmm10
	vpxor	%xmm5,%xmm11,%xmm11

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm11,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	cmpq	$0,%rcx
	jne	L$short_avx

	vpshufb	%xmm13,%xmm10,%xmm10
	vmovdqu	%xmm10,(%rdi)
	vzeroupper
	.byte	0xf3,0xc3


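# Constant data used by the routines above: the byte-swap mask, the GHASH
# reduction polynomial (the 0xc2 constant), and the 4-bit/8-bit remainder
# tables referenced as L$rem_4bit and L$rem_8bit.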
.p2align	6
L$bswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
L$0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
L$7_mask:
.long	7,0,7,0
L$7_mask_poly:
.long	7,0,450,0
.p2align	6

L$rem_4bit:
.long	0,0,0,471859200,0,943718400,0,610271232
.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160

L$rem_8bit:
.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	6
#endif