#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text
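/*
 * GHASH (the GF(2^128) multiply used by AES-GCM) for x86-64, in the
 * style of the CRYPTOGAMS/OpenSSL ghash-x86_64 module (Mach-O
 * flavour, hence the leading-underscore symbols and L$ labels).
 * Three implementations follow: a 4-bit table-driven one
 * (gcm_gmult_4bit/gcm_ghash_4bit), a PCLMULQDQ one (*_clmul) and an
 * AVX one (*_avx). All entry points use the SysV AMD64 convention:
 * %rdi = Xi (the 16-byte hash value), %rsi = Htable (precomputed
 * powers of H) and, for the ghash routines, %rdx = input pointer and
 * %rcx = length in bytes. ".byte 0xf3,0xc3" at each return encodes
 * "rep ret".
 */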


.globl	_gcm_gmult_4bit
.private_extern _gcm_gmult_4bit

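/*
 * void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 *
 * Single multiplication Xi = Xi * H using the 4-bit (Shoup) table
 * method: Htable[n] holds n*H for each 4-bit nibble n, and the
 * L$rem_4bit table folds the bits shifted out at the bottom back
 * into the top of the 128-bit value.
 */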
.p2align	4
_gcm_gmult_4bit:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$280,%rsp
L$gmult_prologue:

	movzbq	15(%rdi),%r8
	leaq	L$rem_4bit(%rip),%r11
	xorq	%rax,%rax
	xorq	%rbx,%rbx
	movb	%r8b,%al
	movb	%r8b,%bl
	shlb	$4,%al
	movq	$14,%rcx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	movq	%r8,%rdx
	jmp	L$oop1

.p2align	4
L$oop1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	movb	(%rdi,%rcx,1),%al
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	movb	%al,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	shlb	$4,%al
	xorq	%r10,%r8
	decq	%rcx
	js	L$break1

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8
	jmp	L$oop1

.p2align	4
L$break1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	xorq	%r10,%r8
	xorq	(%r11,%rdx,8),%r9

	bswapq	%r8
	bswapq	%r9
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280+48(%rsp),%rsi
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$gmult_epilogue:
	.byte	0xf3,0xc3

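/*
 * void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
 *                     const u8 *inp, size_t len)
 *
 * Hashes len bytes (a multiple of 16) from inp into Xi. The long
 * straight-line sequence below first unpacks Htable into two
 * 128-byte tables on the stack (nibble bytes at (%rsp), pre-shifted
 * entries around (%rbp)) so the main loop can reduce through the
 * larger L$rem_8bit table, one byte at a time.
 */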
.globl	_gcm_ghash_4bit
.private_extern _gcm_ghash_4bit

.p2align	4
_gcm_ghash_4bit:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$280,%rsp
L$ghash_prologue:
	movq	%rdx,%r14
	movq	%rcx,%r15
	subq	$-128,%rsi
	leaq	16+128(%rsp),%rbp
	xorl	%edx,%edx
	movq	0+0-128(%rsi),%r8
	movq	0+8-128(%rsi),%rax
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	16+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	16+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,0(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,0(%rbp)
	movq	32+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,0-128(%rbp)
	movq	32+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,1(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,8(%rbp)
	movq	48+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,8-128(%rbp)
	movq	48+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,2(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,16(%rbp)
	movq	64+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,16-128(%rbp)
	movq	64+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,3(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,24(%rbp)
	movq	80+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,24-128(%rbp)
	movq	80+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,4(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,32(%rbp)
	movq	96+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,32-128(%rbp)
	movq	96+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,5(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,40(%rbp)
	movq	112+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,40-128(%rbp)
	movq	112+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,6(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,48(%rbp)
	movq	128+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,48-128(%rbp)
	movq	128+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,7(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,56(%rbp)
	movq	144+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,56-128(%rbp)
	movq	144+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,8(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,64(%rbp)
	movq	160+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,64-128(%rbp)
	movq	160+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,9(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,72(%rbp)
	movq	176+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,72-128(%rbp)
	movq	176+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,10(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,80(%rbp)
	movq	192+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,80-128(%rbp)
	movq	192+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,11(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,88(%rbp)
	movq	208+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,88-128(%rbp)
	movq	208+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,12(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,96(%rbp)
	movq	224+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,96-128(%rbp)
	movq	224+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,13(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,104(%rbp)
	movq	240+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,104-128(%rbp)
	movq	240+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,14(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,112(%rbp)
	shlb	$4,%dl
	movq	%rax,112-128(%rbp)
	shlq	$60,%r10
	movb	%dl,15(%rsp)
	orq	%r10,%rbx
	movq	%r9,120(%rbp)
	movq	%rbx,120-128(%rbp)
	addq	$-128,%rsi
	movq	8(%rdi),%r8
	movq	0(%rdi),%r9
	addq	%r14,%r15
	leaq	L$rem_8bit(%rip),%r11
	jmp	L$outer_loop
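/*
 * Main loop: one 16-byte block is XORed into Xi per iteration, then
 * multiplied by H byte by byte (two nibble lookups per byte), with
 * the reduction handled via the 16-bit entries of L$rem_8bit.
 */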
.p2align	4
L$outer_loop:
	xorq	(%r14),%r9
	movq	8(%r14),%rdx
	leaq	16(%r14),%r14
	xorq	%r8,%rdx
	movq	%r9,(%rdi)
	movq	%rdx,8(%rdi)
	shrq	$32,%rdx
	xorq	%rax,%rax
	roll	$8,%edx
	movb	%dl,%al
	movzbl	%dl,%ebx
	shlb	$4,%al
	shrl	$4,%ebx
	roll	$8,%edx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	xorq	%r8,%r12
	movq	%r9,%r10
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	8(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	0(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	andl	$240,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	-4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	movzwq	(%r11,%r12,2),%r12
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	shlq	$48,%r12
	xorq	%r10,%r8
	xorq	%r12,%r9
	movzbq	%r8b,%r13
	shrq	$4,%r8
	movq	%r9,%r10
	shlb	$4,%r13b
	shrq	$4,%r9
	xorq	8(%rsi,%rcx,1),%r8
	movzwq	(%r11,%r13,2),%r13
	shlq	$60,%r10
	xorq	(%rsi,%rcx,1),%r9
	xorq	%r10,%r8
	shlq	$48,%r13
	bswapq	%r8
	xorq	%r13,%r9
	bswapq	%r9
	cmpq	%r15,%r14
	jb	L$outer_loop
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280+48(%rsp),%rsi
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	0(%rsi),%rsp
L$ghash_epilogue:
	.byte	0xf3,0xc3

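/*
 * void gcm_init_clmul(u128 Htable[16], const u64 Xi[2])
 *
 * Setup for the PCLMULQDQ paths: H is converted to the bit-reflected
 * representation (the psllq $1 / L$0x1c2_polynomial sequence below),
 * then repeatedly multiplied by itself. The stores place H, H^2,
 * H^3 and H^4 in Htable together with precomputed xor-halves used
 * by the Karatsuba multiplications.
 */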
.globl	_gcm_init_clmul
.private_extern _gcm_init_clmul

.p2align	4
_gcm_init_clmul:
L$_init_clmul:
	movdqu	(%rsi),%xmm2
	pshufd	$78,%xmm2,%xmm2


	pshufd	$255,%xmm2,%xmm4
	movdqa	%xmm2,%xmm3
	psllq	$1,%xmm2
	pxor	%xmm5,%xmm5
	psrlq	$63,%xmm3
	pcmpgtd	%xmm4,%xmm5
	pslldq	$8,%xmm3
	por	%xmm3,%xmm2


	pand	L$0x1c2_polynomial(%rip),%xmm5
	pxor	%xmm5,%xmm2


	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm2,%xmm0
	pxor	%xmm2,%xmm6
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
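/*
 * The .byte runs below are SSE instructions emitted as raw opcodes
 * (for the benefit of assemblers that predate them): mostly
 * pclmulqdq (102,15,58,68,...), pshufb (102,15,56,0,...) and
 * palignr (102,15,58,15,...), with REX prefixes interleaved where
 * high registers are involved.
 */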
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm2,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm2,%xmm3
	movdqu	%xmm2,0(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,16(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,32(%rdi)
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm5,%xmm3
	movdqu	%xmm5,48(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,64(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,80(%rdi)
	.byte	0xf3,0xc3

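/*
 * void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16])
 *
 * One GHASH multiplication via carry-less multiply: byte-swap Xi,
 * Karatsuba-multiply by H (three pclmulqdq), reduce modulo the GCM
 * polynomial, swap back and store.
 */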
.globl	_gcm_gmult_clmul
.private_extern _gcm_gmult_clmul

.p2align	4
_gcm_gmult_clmul:
L$_gmult_clmul:
	movdqu	(%rdi),%xmm0
	movdqa	L$bswap_mask(%rip),%xmm5
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm4
.byte	102,15,56,0,197
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.byte	102,15,56,0,197
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3

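/*
 * void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16],
 *                      const u8 *inp, size_t len)
 *
 * Bulk GHASH via PCLMULQDQ. When at least four blocks remain it
 * aggregates four multiplications per reduction (L$mod4_loop); the
 * OPENSSL_ia32cap_P test below appears to exclude CPUs where the 4x
 * aggregation does not pay off. Shorter inputs go through the
 * two-block L$mod_loop and the single-block L$odd_tail.
 */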
.globl	_gcm_ghash_clmul
.private_extern _gcm_ghash_clmul

.p2align	5
_gcm_ghash_clmul:
L$_ghash_clmul:
	movdqa	L$bswap_mask(%rip),%xmm10

	movdqu	(%rdi),%xmm0
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm7
.byte	102,65,15,56,0,194

	subq	$0x10,%rcx
	jz	L$odd_tail

	movdqu	16(%rsi),%xmm6
	leaq	_OPENSSL_ia32cap_P(%rip),%rax
	movl	4(%rax),%eax
	cmpq	$0x30,%rcx
	jb	L$skip4x

	andl	$71303168,%eax
	cmpl	$4194304,%eax
	je	L$skip4x

	subq	$0x30,%rcx
	movq	$0xA040608020C0E000,%rax
	movdqu	48(%rsi),%xmm14
	movdqu	64(%rsi),%xmm15




	movdqu	48(%rdx),%xmm3
	movdqu	32(%rdx),%xmm11
.byte	102,65,15,56,0,218
.byte	102,69,15,56,0,218
	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm11,%xmm12
.byte	102,68,15,58,68,222,0
.byte	102,68,15,58,68,238,17
.byte	102,68,15,58,68,231,16
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
	xorps	%xmm12,%xmm4

	movdqu	16(%rdx),%xmm11
	movdqu	0(%rdx),%xmm8
.byte	102,69,15,56,0,218
.byte	102,69,15,56,0,194
	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm8,%xmm0
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8
.byte	102,69,15,58,68,238,17
.byte	102,68,15,58,68,231,0
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jc	L$tail4x

	jmp	L$mod4_loop
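/*
 * 4x aggregated loop: the next four blocks are multiplied by
 * H^4..H^1 while the previous accumulator is reduced in the gaps,
 * using L$7_mask and the 0x1c2 polynomial constant.
 */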
.p2align	5
L$mod4_loop:
.byte	102,65,15,58,68,199,0
	xorps	%xmm12,%xmm4
	movdqu	48(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,65,15,58,68,207,17
	xorps	%xmm3,%xmm0
	movdqu	32(%rdx),%xmm3
	movdqa	%xmm11,%xmm13
.byte	102,68,15,58,68,199,16
	pshufd	$78,%xmm11,%xmm12
	xorps	%xmm5,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,65,15,56,0,218
	movups	32(%rsi),%xmm7
	xorps	%xmm4,%xmm8
.byte	102,68,15,58,68,218,0
	pshufd	$78,%xmm3,%xmm4

	pxor	%xmm0,%xmm8
	movdqa	%xmm3,%xmm5
	pxor	%xmm1,%xmm8
	pxor	%xmm3,%xmm4
	movdqa	%xmm8,%xmm9
.byte	102,68,15,58,68,234,17
	pslldq	$8,%xmm8
	psrldq	$8,%xmm9
	pxor	%xmm8,%xmm0
	movdqa	L$7_mask(%rip),%xmm8
	pxor	%xmm9,%xmm1
.byte	102,76,15,110,200

	pand	%xmm0,%xmm8
.byte	102,69,15,56,0,200
	pxor	%xmm0,%xmm9
.byte	102,68,15,58,68,231,0
	psllq	$57,%xmm9
	movdqa	%xmm9,%xmm8
	pslldq	$8,%xmm9
.byte	102,15,58,68,222,0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	movdqu	0(%rdx),%xmm8

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,238,17
	xorps	%xmm11,%xmm3
	movdqu	16(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,15,58,68,231,16
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
.byte	102,69,15,56,0,194
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0

	movdqa	%xmm11,%xmm13
	pxor	%xmm12,%xmm4
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
.byte	102,69,15,58,68,238,17
	xorps	%xmm11,%xmm3
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8

.byte	102,68,15,58,68,231,0
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jnc	L$mod4_loop

L$tail4x:
.byte	102,65,15,58,68,199,0
.byte	102,65,15,58,68,207,17
.byte	102,68,15,58,68,199,16
	xorps	%xmm12,%xmm4
	xorps	%xmm3,%xmm0
	xorps	%xmm5,%xmm1
	pxor	%xmm0,%xmm1
	pxor	%xmm4,%xmm8

	pxor	%xmm1,%xmm8
	pxor	%xmm0,%xmm1

	movdqa	%xmm8,%xmm9
	psrldq	$8,%xmm8
	pslldq	$8,%xmm9
	pxor	%xmm8,%xmm1
	pxor	%xmm9,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	addq	$0x40,%rcx
	jz	L$done
	movdqu	32(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$odd_tail
L$skip4x:




	movdqu	(%rdx),%xmm8
	movdqu	16(%rdx),%xmm3
.byte	102,69,15,56,0,194
.byte	102,65,15,56,0,218
	pxor	%xmm8,%xmm0

	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	leaq	32(%rdx),%rdx
	nop
	subq	$0x20,%rcx
	jbe	L$even_tail
	nop
	jmp	L$mod_loop

.p2align	5
L$mod_loop:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	movdqu	(%rdx),%xmm9
	pxor	%xmm0,%xmm8
.byte	102,69,15,56,0,202
	movdqu	16(%rdx),%xmm3

	pxor	%xmm1,%xmm8
	pxor	%xmm9,%xmm1
	pxor	%xmm8,%xmm4
.byte	102,65,15,56,0,218
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm3,%xmm5

	movdqa	%xmm0,%xmm9
	movdqa	%xmm0,%xmm8
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm8
.byte	102,15,58,68,218,0
	psllq	$1,%xmm0
	pxor	%xmm8,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm8
	pslldq	$8,%xmm0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pshufd	$78,%xmm5,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm5,%xmm4

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,234,17
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0
	pxor	%xmm9,%xmm0
	leaq	32(%rdx),%rdx
	psrlq	$1,%xmm0
.byte	102,15,58,68,231,0
	pxor	%xmm1,%xmm0

	subq	$0x20,%rcx
	ja	L$mod_loop

L$even_tail:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm1,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	testq	%rcx,%rcx
	jnz	L$done

L$odd_tail:
	movdqu	(%rdx),%xmm8
.byte	102,69,15,56,0,194
	pxor	%xmm8,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,223,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
L$done:
.byte	102,65,15,56,0,194
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3

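/*
 * void gcm_init_avx(u128 Htable[16], const u64 Xi[2])
 *
 * AVX counterpart of gcm_init_clmul: the same reflected-H
 * conversion, then a four-iteration loop that fills Htable with
 * H^1..H^8 plus the xor-halves needed by the 8x ghash_avx loop.
 */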
.globl	_gcm_init_avx
.private_extern _gcm_init_avx

.p2align	5
_gcm_init_avx:
	vzeroupper

	vmovdqu	(%rsi),%xmm2
	vpshufd	$78,%xmm2,%xmm2


	vpshufd	$255,%xmm2,%xmm4
	vpsrlq	$63,%xmm2,%xmm3
	vpsllq	$1,%xmm2,%xmm2
	vpxor	%xmm5,%xmm5,%xmm5
	vpcmpgtd	%xmm4,%xmm5,%xmm5
	vpslldq	$8,%xmm3,%xmm3
	vpor	%xmm3,%xmm2,%xmm2


	vpand	L$0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor	%xmm5,%xmm2,%xmm2

	vpunpckhqdq	%xmm2,%xmm2,%xmm6
	vmovdqa	%xmm2,%xmm0
	vpxor	%xmm2,%xmm6,%xmm6
	movq	$4,%r10
	jmp	L$init_start_avx
.p2align	5
L$init_loop_avx:
	vpalignr	$8,%xmm3,%xmm4,%xmm5
	vmovdqu	%xmm5,-16(%rdi)
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
L$init_start_avx:
	vmovdqa	%xmm0,%xmm5
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
	vpshufd	$78,%xmm5,%xmm3
	vpshufd	$78,%xmm0,%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqu	%xmm5,0(%rdi)
	vpxor	%xmm0,%xmm4,%xmm4
	vmovdqu	%xmm0,16(%rdi)
	leaq	48(%rdi),%rdi
	subq	$1,%r10
	jnz	L$init_loop_avx

	vpalignr	$8,%xmm4,%xmm3,%xmm5
	vmovdqu	%xmm5,-16(%rdi)

	vzeroupper
	.byte	0xf3,0xc3

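/*
 * gcm_gmult_avx simply tail-calls the CLMUL version; for a single
 * block there is nothing for AVX to add.
 */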
.globl	_gcm_gmult_avx
.private_extern _gcm_gmult_avx

.p2align	5
_gcm_gmult_avx:
	jmp	L$_gmult_clmul

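/*
 * void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16],
 *                    const u8 *inp, size_t len)
 *
 * AVX bulk GHASH: an 8x aggregated main loop (L$oop8x_avx) over the
 * H^1..H^8 table built by gcm_init_avx, with L$short_avx/L$tail_avx
 * handling inputs or tails shorter than 128 bytes.
 */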
.globl	_gcm_ghash_avx
.private_extern _gcm_ghash_avx

.p2align	5
_gcm_ghash_avx:
	vzeroupper

	vmovdqu	(%rdi),%xmm10
	leaq	L$0x1c2_polynomial(%rip),%r10
	leaq	64(%rsi),%rsi
	vmovdqu	L$bswap_mask(%rip),%xmm13
	vpshufb	%xmm13,%xmm10,%xmm10
	cmpq	$0x80,%rcx
	jb	L$short_avx
	subq	$0x80,%rcx

	vmovdqu	112(%rdx),%xmm14
	vmovdqu	0-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vmovdqu	32-64(%rsi),%xmm7

	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm14,%xmm9,%xmm9
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	80(%rdx),%xmm14
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	48-64(%rsi),%xmm6
	vpxor	%xmm14,%xmm9,%xmm9
	vmovdqu	64(%rdx),%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7

	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	48(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	32(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	16(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2

	leaq	128(%rdx),%rdx
	cmpq	$0x80,%rcx
	jb	L$tail_avx

	vpxor	%xmm10,%xmm15,%xmm15
	subq	$0x80,%rcx
	jmp	L$oop8x_avx

.p2align	5
L$oop8x_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	112(%rdx),%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpxor	%xmm15,%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
	vmovdqu	0-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
	vmovdqu	32-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm3,%xmm10,%xmm10
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vxorps	%xmm4,%xmm11,%xmm11
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm5,%xmm12,%xmm12
	vxorps	%xmm15,%xmm8,%xmm8

	vmovdqu	80(%rdx),%xmm14
	vpxor	%xmm10,%xmm12,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm11,%xmm12,%xmm12
	vpslldq	$8,%xmm12,%xmm9
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vpsrldq	$8,%xmm12,%xmm12
	vpxor	%xmm9,%xmm10,%xmm10
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vxorps	%xmm12,%xmm11,%xmm11
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	64(%rdx),%xmm15
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vxorps	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2

	vmovdqu	48(%rdx),%xmm14
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	32(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2
	vxorps	%xmm12,%xmm10,%xmm10

	vmovdqu	16(%rdx),%xmm14
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vxorps	%xmm11,%xmm12,%xmm12
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm12,%xmm15,%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
	vpxor	%xmm10,%xmm15,%xmm15

	leaq	128(%rdx),%rdx
	subq	$0x80,%rcx
	jnc	L$oop8x_avx

	addq	$0x80,%rcx
	jmp	L$tail_no_xor_avx

.p2align	5
L$short_avx:
	vmovdqu	-16(%rdx,%rcx,1),%xmm14
	leaq	(%rdx,%rcx,1),%rdx
	vmovdqu	0-64(%rsi),%xmm6
	vmovdqu	32-64(%rsi),%xmm7
	vpshufb	%xmm13,%xmm14,%xmm15

	vmovdqa	%xmm0,%xmm3
	vmovdqa	%xmm1,%xmm4
	vmovdqa	%xmm2,%xmm5
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-32(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-48(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	80-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-64(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-80(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	96-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	128-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-96(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-112(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	144-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovq	184-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jmp	L$tail_avx

.p2align	5
L$tail_avx:
	vpxor	%xmm10,%xmm15,%xmm15
L$tail_no_xor_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2

	vmovdqu	(%r10),%xmm12

	vpxor	%xmm0,%xmm3,%xmm10
	vpxor	%xmm1,%xmm4,%xmm11
	vpxor	%xmm2,%xmm5,%xmm5

	vpxor	%xmm10,%xmm5,%xmm5
	vpxor	%xmm11,%xmm5,%xmm5
	vpslldq	$8,%xmm5,%xmm9
	vpsrldq	$8,%xmm5,%xmm5
	vpxor	%xmm9,%xmm10,%xmm10
	vpxor	%xmm5,%xmm11,%xmm11

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm11,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	cmpq	$0,%rcx
	jne	L$short_avx

	vpshufb	%xmm13,%xmm10,%xmm10
	vmovdqu	%xmm10,(%rdi)
	vzeroupper
	.byte	0xf3,0xc3

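/*
 * Constants: L$bswap_mask converts between memory byte order and the
 * bit-reflected GHASH representation; L$0x1c2_polynomial is the GCM
 * reduction constant; L$7_mask (with its companion L$7_mask_poly) is
 * used by the reduction in the 4x clmul loop.
 */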
.p2align	6
L$bswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
L$0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
L$7_mask:
.long	7,0,7,0
L$7_mask_poly:
.long	7,0,450,0
.p2align	6

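/* Reduction table for the 4-bit method: sixteen 64-bit entries
   (stored as .long pairs) giving the polynomial correction for each
   possible nibble shifted out at the bottom. */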
L$rem_4bit:
.long	0,0,0,471859200,0,943718400,0,610271232
.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160

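/* The analogous 256-entry, 16-bit-per-entry reduction table for the
   byte-at-a-time loop in gcm_ghash_4bit. */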
L$rem_8bit:
.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

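/* ASCII credit string: "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>" */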
.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	6
#endif
