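/*
 * AES-NI accelerated AES routines for x86_64, AT&T syntax.
 * The underscore-prefixed symbols and .private_extern directives suggest a
 * Mach-O (macOS) target.
 */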
/* Assemble this file only for x86_64 builds that have not defined OPENSSL_NO_ASM. */
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

/*
 * _aesni_encrypt: encrypt a single 16-byte block.
 *   In:  %rdi = input block, %rsi = output block,
 *        %rdx = key schedule (16-byte round keys starting at offset 0,
 *               32-bit round count n at offset 240).
 *   n aesenc rounds are applied, followed by one aesenclast.
 *   Clobbers: %eax, %rdx, flags; %xmm0-%xmm2 are zeroed before returning.
 * AES instructions are hand-encoded:
 *   .byte 102,15,56,220,modrm = aesenc, .byte 102,15,56,221,modrm = aesenclast.
 */
.globl	_aesni_encrypt
.private_extern _aesni_encrypt

.p2align	4
_aesni_encrypt:
	movups	(%rdi),%xmm2		/* xmm2 = input block */
	movl	240(%rdx),%eax		/* eax = round count */
	movups	(%rdx),%xmm0		/* round key 0 */
	movups	16(%rdx),%xmm1		/* round key 1 */
	leaq	32(%rdx),%rdx		/* rdx -> round key 2 */
	xorps	%xmm0,%xmm2		/* block ^= round key 0 */
L$oop_enc1_1:
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
	decl	%eax
	movups	(%rdx),%xmm1		/* next round key; movups/leaq keep ZF from decl */
	leaq	16(%rdx),%rdx
	jnz	L$oop_enc1_1
.byte	102,15,56,221,209		/* aesenclast %xmm1,%xmm2 */
	pxor	%xmm0,%xmm0		/* zero the registers that held round keys */
	pxor	%xmm1,%xmm1
	movups	%xmm2,(%rsi)		/* store result */
	pxor	%xmm2,%xmm2
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_decrypt: decrypt a single 16-byte block.
 *   In:  %rdi = input block, %rsi = output block,
 *        %rdx = key schedule (16-byte round keys starting at offset 0,
 *               32-bit round count n at offset 240).
 *   n aesdec rounds are applied, followed by one aesdeclast.
 *   Clobbers: %eax, %rdx, flags; %xmm0-%xmm2 are zeroed before returning.
 * AES instructions are hand-encoded:
 *   .byte 102,15,56,222,modrm = aesdec, .byte 102,15,56,223,modrm = aesdeclast.
 */
.globl	_aesni_decrypt
.private_extern _aesni_decrypt

.p2align	4
_aesni_decrypt:
	movups	(%rdi),%xmm2		/* xmm2 = input block */
	movl	240(%rdx),%eax		/* eax = round count */
	movups	(%rdx),%xmm0		/* round key 0 */
	movups	16(%rdx),%xmm1		/* round key 1 */
	leaq	32(%rdx),%rdx		/* rdx -> round key 2 */
	xorps	%xmm0,%xmm2		/* block ^= round key 0 */
L$oop_dec1_2:
.byte	102,15,56,222,209		/* aesdec %xmm1,%xmm2 */
	decl	%eax
	movups	(%rdx),%xmm1		/* next round key; movups/leaq keep ZF from decl */
	leaq	16(%rdx),%rdx
	jnz	L$oop_dec1_2
.byte	102,15,56,223,209		/* aesdeclast %xmm1,%xmm2 */
	pxor	%xmm0,%xmm0		/* zero the registers that held round keys */
	pxor	%xmm1,%xmm1
	movups	%xmm2,(%rsi)		/* store result */
	pxor	%xmm2,%xmm2
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_encrypt2: helper (no .globl) that encrypts two blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2,%xmm3 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2,%xmm3 = results (n aesenc rounds plus one aesenclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * %rcx is moved to key+32+16*n and %rax is a negative offset that steps
 * from 16-16*n to 0 by 32, so the loop only terminates for odd n >= 3.
 * .byte 102,15,56,220,modrm = aesenc; 102,15,56,221,modrm = aesenclast.
 */
.p2align	4
_aesni_encrypt2:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2		/* blocks ^= round key 0 */
	xorps	%xmm0,%xmm3
	movups	32(%rcx),%xmm0		/* round key 2 */
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
	addq	$16,%rax

L$enc_loop2:
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
.byte	102,15,56,220,217		/* aesenc %xmm1,%xmm3 */
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
.byte	102,15,56,220,208		/* aesenc %xmm0,%xmm2 */
.byte	102,15,56,220,216		/* aesenc %xmm0,%xmm3 */
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$enc_loop2

.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
.byte	102,15,56,220,217		/* aesenc %xmm1,%xmm3 */
.byte	102,15,56,221,208		/* aesenclast %xmm0,%xmm2 */
.byte	102,15,56,221,216		/* aesenclast %xmm0,%xmm3 */
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_decrypt2: helper (no .globl) that decrypts two blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2,%xmm3 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2,%xmm3 = results (n aesdec rounds plus one aesdeclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * %rcx is moved to key+32+16*n and %rax is a negative offset that steps
 * from 16-16*n to 0 by 32, so the loop only terminates for odd n >= 3.
 * .byte 102,15,56,222,modrm = aesdec; 102,15,56,223,modrm = aesdeclast.
 */
.p2align	4
_aesni_decrypt2:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2		/* blocks ^= round key 0 */
	xorps	%xmm0,%xmm3
	movups	32(%rcx),%xmm0		/* round key 2 */
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
	addq	$16,%rax

L$dec_loop2:
.byte	102,15,56,222,209		/* aesdec %xmm1,%xmm2 */
.byte	102,15,56,222,217		/* aesdec %xmm1,%xmm3 */
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
.byte	102,15,56,222,208		/* aesdec %xmm0,%xmm2 */
.byte	102,15,56,222,216		/* aesdec %xmm0,%xmm3 */
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$dec_loop2

.byte	102,15,56,222,209		/* aesdec %xmm1,%xmm2 */
.byte	102,15,56,222,217		/* aesdec %xmm1,%xmm3 */
.byte	102,15,56,223,208		/* aesdeclast %xmm0,%xmm2 */
.byte	102,15,56,223,216		/* aesdeclast %xmm0,%xmm3 */
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_encrypt3: helper (no .globl) that encrypts three blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2-%xmm4 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2-%xmm4 = results (n aesenc rounds plus one aesenclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * %rax is a negative offset from key+32+16*n that steps from 16-16*n to 0
 * by 32, so the loop only terminates for odd n >= 3.
 * .byte 102,15,56,220,modrm = aesenc; 102,15,56,221,modrm = aesenclast.
 */
.p2align	4
_aesni_encrypt3:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2		/* blocks ^= round key 0 */
	xorps	%xmm0,%xmm3
	xorps	%xmm0,%xmm4
	movups	32(%rcx),%xmm0		/* round key 2 */
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
	addq	$16,%rax

L$enc_loop3:
	/* aesenc %xmm1 into %xmm2..%xmm4 */
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
	/* aesenc %xmm0 into %xmm2..%xmm4 */
.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$enc_loop3

	/* last aesenc with %xmm1, then aesenclast with %xmm0 */
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,221,208
.byte	102,15,56,221,216
.byte	102,15,56,221,224
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_decrypt3: helper (no .globl) that decrypts three blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2-%xmm4 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2-%xmm4 = results (n aesdec rounds plus one aesdeclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * %rax is a negative offset from key+32+16*n that steps from 16-16*n to 0
 * by 32, so the loop only terminates for odd n >= 3.
 * .byte 102,15,56,222,modrm = aesdec; 102,15,56,223,modrm = aesdeclast.
 */
.p2align	4
_aesni_decrypt3:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2		/* blocks ^= round key 0 */
	xorps	%xmm0,%xmm3
	xorps	%xmm0,%xmm4
	movups	32(%rcx),%xmm0		/* round key 2 */
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
	addq	$16,%rax

L$dec_loop3:
	/* aesdec %xmm1 into %xmm2..%xmm4 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
	/* aesdec %xmm0 into %xmm2..%xmm4 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$dec_loop3

	/* last aesdec with %xmm1, then aesdeclast with %xmm0 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,223,208
.byte	102,15,56,223,216
.byte	102,15,56,223,224
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_encrypt4: helper (no .globl) that encrypts four blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2-%xmm5 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2-%xmm5 = results (n aesenc rounds plus one aesenclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * %rax is a negative offset from key+32+16*n that steps from 16-16*n to 0
 * by 32, so the loop only terminates for odd n >= 3.
 * .byte 102,15,56,220,modrm = aesenc; 102,15,56,221,modrm = aesenclast.
 */
.p2align	4
_aesni_encrypt4:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2		/* blocks ^= round key 0 */
	xorps	%xmm0,%xmm3
	xorps	%xmm0,%xmm4
	xorps	%xmm0,%xmm5
	movups	32(%rcx),%xmm0		/* round key 2 */
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
.byte	0x0f,0x1f,0x00			/* 3-byte nop: nopl (%rax) */
	addq	$16,%rax

L$enc_loop4:
	/* aesenc %xmm1 into %xmm2..%xmm5 */
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
	/* aesenc %xmm0 into %xmm2..%xmm5 */
.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$enc_loop4

	/* last aesenc with %xmm1, then aesenclast with %xmm0 */
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
.byte	102,15,56,221,208
.byte	102,15,56,221,216
.byte	102,15,56,221,224
.byte	102,15,56,221,232
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_decrypt4: helper (no .globl) that decrypts four blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2-%xmm5 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2-%xmm5 = results (n aesdec rounds plus one aesdeclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * %rax is a negative offset from key+32+16*n that steps from 16-16*n to 0
 * by 32, so the loop only terminates for odd n >= 3.
 * .byte 102,15,56,222,modrm = aesdec; 102,15,56,223,modrm = aesdeclast.
 */
.p2align	4
_aesni_decrypt4:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2		/* blocks ^= round key 0 */
	xorps	%xmm0,%xmm3
	xorps	%xmm0,%xmm4
	xorps	%xmm0,%xmm5
	movups	32(%rcx),%xmm0		/* round key 2 */
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
.byte	0x0f,0x1f,0x00			/* 3-byte nop: nopl (%rax) */
	addq	$16,%rax

L$dec_loop4:
	/* aesdec %xmm1 into %xmm2..%xmm5 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
	/* aesdec %xmm0 into %xmm2..%xmm5 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$dec_loop4

	/* last aesdec with %xmm1, then aesdeclast with %xmm0 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,223,208
.byte	102,15,56,223,216
.byte	102,15,56,223,224
.byte	102,15,56,223,232
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_encrypt6: helper (no .globl) that encrypts six blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2-%xmm7 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2-%xmm7 = results (n aesenc rounds plus one aesenclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * The first round for %xmm2..%xmm4 is interleaved with the whitening of the
 * remaining blocks, then the loop is entered at L$enc_loop6_enter.
 * %rax is a negative offset from key+32+16*n that reaches 0 in steps of 32,
 * so the loop only terminates for odd n >= 3.
 * L$enc_loop6 is also used as a call target from elsewhere in this file.
 * .byte 102,15,56,220,modrm = aesenc; 102,15,56,221,modrm = aesenclast.
 */
.p2align	4
_aesni_encrypt6:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2
	pxor	%xmm0,%xmm3
	pxor	%xmm0,%xmm4
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
.byte	102,15,56,220,217		/* aesenc %xmm1,%xmm3 */
	pxor	%xmm0,%xmm5
	pxor	%xmm0,%xmm6
.byte	102,15,56,220,225		/* aesenc %xmm1,%xmm4 */
	pxor	%xmm0,%xmm7
	movups	(%rcx,%rax,1),%xmm0	/* round key 2 */
	addq	$16,%rax
	jmp	L$enc_loop6_enter
.p2align	4
L$enc_loop6:
	/* aesenc %xmm1 into %xmm2..%xmm4 */
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
L$enc_loop6_enter:
	/* aesenc %xmm1 into %xmm5..%xmm7 */
.byte	102,15,56,220,233
.byte	102,15,56,220,241
.byte	102,15,56,220,249
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
	/* aesenc %xmm0 into %xmm2..%xmm7 */
.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
.byte	102,15,56,220,240
.byte	102,15,56,220,248
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$enc_loop6

	/* last aesenc with %xmm1, then aesenclast with %xmm0 */
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
.byte	102,15,56,220,241
.byte	102,15,56,220,249
.byte	102,15,56,221,208
.byte	102,15,56,221,216
.byte	102,15,56,221,224
.byte	102,15,56,221,232
.byte	102,15,56,221,240
.byte	102,15,56,221,248
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_decrypt6: helper (no .globl) that decrypts six blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2-%xmm7 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2-%xmm7 = results (n aesdec rounds plus one aesdeclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * The first round for %xmm2..%xmm4 is interleaved with the whitening of the
 * remaining blocks, then the loop is entered at L$dec_loop6_enter.
 * %rax is a negative offset from key+32+16*n that reaches 0 in steps of 32,
 * so the loop only terminates for odd n >= 3.
 * .byte 102,15,56,222,modrm = aesdec; 102,15,56,223,modrm = aesdeclast.
 */
.p2align	4
_aesni_decrypt6:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2
	pxor	%xmm0,%xmm3
	pxor	%xmm0,%xmm4
.byte	102,15,56,222,209		/* aesdec %xmm1,%xmm2 */
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
.byte	102,15,56,222,217		/* aesdec %xmm1,%xmm3 */
	pxor	%xmm0,%xmm5
	pxor	%xmm0,%xmm6
.byte	102,15,56,222,225		/* aesdec %xmm1,%xmm4 */
	pxor	%xmm0,%xmm7
	movups	(%rcx,%rax,1),%xmm0	/* round key 2 */
	addq	$16,%rax
	jmp	L$dec_loop6_enter
.p2align	4
L$dec_loop6:
	/* aesdec %xmm1 into %xmm2..%xmm4 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
L$dec_loop6_enter:
	/* aesdec %xmm1 into %xmm5..%xmm7 */
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
	/* aesdec %xmm0 into %xmm2..%xmm7 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$dec_loop6

	/* last aesdec with %xmm1, then aesdeclast with %xmm0 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
.byte	102,15,56,223,208
.byte	102,15,56,223,216
.byte	102,15,56,223,224
.byte	102,15,56,223,232
.byte	102,15,56,223,240
.byte	102,15,56,223,248
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_encrypt8: helper (no .globl) that encrypts eight blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2-%xmm9 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2-%xmm9 = results (n aesenc rounds plus one aesenclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * The first round for %xmm2,%xmm3 is interleaved with the whitening of the
 * remaining blocks, then the loop is entered at L$enc_loop8_inner.
 * %rax is a negative offset from key+32+16*n that reaches 0 in steps of 32,
 * so the loop only terminates for odd n >= 3.
 * L$enc_loop8_enter is also used as a call target from elsewhere in this file.
 * .byte 102,15,56,220,modrm = aesenc; 102,15,56,221,modrm = aesenclast;
 * the 6-byte forms with 68 (REX.R) target %xmm8/%xmm9.
 */
.p2align	4
_aesni_encrypt8:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2
	xorps	%xmm0,%xmm3
	pxor	%xmm0,%xmm4
	pxor	%xmm0,%xmm5
	pxor	%xmm0,%xmm6
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
	pxor	%xmm0,%xmm7
	pxor	%xmm0,%xmm8
.byte	102,15,56,220,217		/* aesenc %xmm1,%xmm3 */
	pxor	%xmm0,%xmm9
	movups	(%rcx,%rax,1),%xmm0	/* round key 2 */
	addq	$16,%rax
	jmp	L$enc_loop8_inner
.p2align	4
L$enc_loop8:
	/* aesenc %xmm1 into %xmm2,%xmm3 */
.byte	102,15,56,220,209
.byte	102,15,56,220,217
L$enc_loop8_inner:
	/* aesenc %xmm1 into %xmm4..%xmm9 */
.byte	102,15,56,220,225
.byte	102,15,56,220,233
.byte	102,15,56,220,241
.byte	102,15,56,220,249
.byte	102,68,15,56,220,193
.byte	102,68,15,56,220,201
L$enc_loop8_enter:
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
	/* aesenc %xmm0 into %xmm2..%xmm9 */
.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
.byte	102,15,56,220,240
.byte	102,15,56,220,248
.byte	102,68,15,56,220,192
.byte	102,68,15,56,220,200
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$enc_loop8

	/* last aesenc with %xmm1, then aesenclast with %xmm0 */
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
.byte	102,15,56,220,241
.byte	102,15,56,220,249
.byte	102,68,15,56,220,193
.byte	102,68,15,56,220,201
.byte	102,15,56,221,208
.byte	102,15,56,221,216
.byte	102,15,56,221,224
.byte	102,15,56,221,232
.byte	102,15,56,221,240
.byte	102,15,56,221,248
.byte	102,68,15,56,221,192
.byte	102,68,15,56,221,200
	.byte	0xf3,0xc3		/* rep ret */


/*
 * _aesni_decrypt8: helper (no .globl) that decrypts eight blocks in parallel.
 * Register interface, not a C calling convention:
 *   In:  %xmm2-%xmm9 = blocks; %rcx = key schedule; %eax = round count n.
 *   Out: %xmm2-%xmm9 = results (n aesdec rounds plus one aesdeclast each).
 *   Clobbers: %rax, %rcx, %xmm0, %xmm1, flags.
 * The first round for %xmm2,%xmm3 is interleaved with the whitening of the
 * remaining blocks, then the loop is entered at L$dec_loop8_inner.
 * %rax is a negative offset from key+32+16*n that reaches 0 in steps of 32,
 * so the loop only terminates for odd n >= 3.
 * .byte 102,15,56,222,modrm = aesdec; 102,15,56,223,modrm = aesdeclast;
 * the 6-byte forms with 68 (REX.R) target %xmm8/%xmm9.
 */
.p2align	4
_aesni_decrypt8:
	movups	(%rcx),%xmm0		/* round key 0 */
	shll	$4,%eax			/* eax = 16*n */
	movups	16(%rcx),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm2
	xorps	%xmm0,%xmm3
	pxor	%xmm0,%xmm4
	pxor	%xmm0,%xmm5
	pxor	%xmm0,%xmm6
	leaq	32(%rcx,%rax,1),%rcx
	negq	%rax
.byte	102,15,56,222,209		/* aesdec %xmm1,%xmm2 */
	pxor	%xmm0,%xmm7
	pxor	%xmm0,%xmm8
.byte	102,15,56,222,217		/* aesdec %xmm1,%xmm3 */
	pxor	%xmm0,%xmm9
	movups	(%rcx,%rax,1),%xmm0	/* round key 2 */
	addq	$16,%rax
	jmp	L$dec_loop8_inner
.p2align	4
L$dec_loop8:
	/* aesdec %xmm1 into %xmm2,%xmm3 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
L$dec_loop8_inner:
	/* aesdec %xmm1 into %xmm4..%xmm9 */
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
.byte	102,68,15,56,222,193
.byte	102,68,15,56,222,201
L$dec_loop8_enter:
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
	/* aesdec %xmm0 into %xmm2..%xmm9 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
.byte	102,68,15,56,222,192
.byte	102,68,15,56,222,200
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$dec_loop8

	/* last aesdec with %xmm1, then aesdeclast with %xmm0 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
.byte	102,68,15,56,222,193
.byte	102,68,15,56,222,201
.byte	102,15,56,223,208
.byte	102,15,56,223,216
.byte	102,15,56,223,224
.byte	102,15,56,223,232
.byte	102,15,56,223,240
.byte	102,15,56,223,248
.byte	102,68,15,56,223,192
.byte	102,68,15,56,223,200
	.byte	0xf3,0xc3		/* rep ret */

/*
 * _aesni_ecb_encrypt: ECB-mode encryption or decryption of a buffer.
 *   In:  %rdi = input, %rsi = output,
 *        %rdx = length in bytes (rounded down to a multiple of 16;
 *               nothing is done if the result is zero),
 *        %rcx = key schedule (round count read from offset 240),
 *        %r8d = nonzero to encrypt, zero to decrypt.
 *   Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0-%xmm9, flags.
 * Processing: 128-byte (8-block) chunks through _aesni_{en,de}crypt8, then a
 * tail of 1..7 blocks dispatched to the 1/2/3/4/6/8-block routines.
 * %r11/%r10d keep the key pointer and round count because the helpers
 * modify %rcx and %eax. The decrypt path zeroes each data register after
 * storing it; the encrypt path does not.
 */
.globl	_aesni_ecb_encrypt
.private_extern _aesni_ecb_encrypt

.p2align	4
_aesni_ecb_encrypt:
	andq	$-16,%rdx		/* whole blocks only */
	jz	L$ecb_ret

	movl	240(%rcx),%eax		/* eax = round count */
	movups	(%rcx),%xmm0		/* round key 0 */
	movq	%rcx,%r11		/* r11 = saved key pointer */
	movl	%eax,%r10d		/* r10d = saved round count */
	testl	%r8d,%r8d
	jz	L$ecb_decrypt

	cmpq	$0x80,%rdx
	jb	L$ecb_enc_tail		/* fewer than 8 blocks */

	/* load the first eight blocks */
	movdqu	(%rdi),%xmm2
	movdqu	16(%rdi),%xmm3
	movdqu	32(%rdi),%xmm4
	movdqu	48(%rdi),%xmm5
	movdqu	64(%rdi),%xmm6
	movdqu	80(%rdi),%xmm7
	movdqu	96(%rdi),%xmm8
	movdqu	112(%rdi),%xmm9
	leaq	128(%rdi),%rdi
	subq	$0x80,%rdx
	jmp	L$ecb_enc_loop8_enter
.p2align	4
L$ecb_enc_loop8:
	/* store the previous eight results while loading the next eight
	   blocks; restore %rcx/%eax for the next helper call */
	movups	%xmm2,(%rsi)
	movq	%r11,%rcx
	movdqu	(%rdi),%xmm2
	movl	%r10d,%eax
	movups	%xmm3,16(%rsi)
	movdqu	16(%rdi),%xmm3
	movups	%xmm4,32(%rsi)
	movdqu	32(%rdi),%xmm4
	movups	%xmm5,48(%rsi)
	movdqu	48(%rdi),%xmm5
	movups	%xmm6,64(%rsi)
	movdqu	64(%rdi),%xmm6
	movups	%xmm7,80(%rsi)
	movdqu	80(%rdi),%xmm7
	movups	%xmm8,96(%rsi)
	movdqu	96(%rdi),%xmm8
	movups	%xmm9,112(%rsi)
	leaq	128(%rsi),%rsi
	movdqu	112(%rdi),%xmm9
	leaq	128(%rdi),%rdi
L$ecb_enc_loop8_enter:

	call	_aesni_encrypt8

	subq	$0x80,%rdx
	jnc	L$ecb_enc_loop8		/* no borrow: another full chunk remains */

	/* store the last full chunk */
	movups	%xmm2,(%rsi)
	movq	%r11,%rcx
	movups	%xmm3,16(%rsi)
	movl	%r10d,%eax
	movups	%xmm4,32(%rsi)
	movups	%xmm5,48(%rsi)
	movups	%xmm6,64(%rsi)
	movups	%xmm7,80(%rsi)
	movups	%xmm8,96(%rsi)
	movups	%xmm9,112(%rsi)
	leaq	128(%rsi),%rsi
	addq	$0x80,%rdx		/* undo the borrow: rdx = bytes left */
	jz	L$ecb_ret

L$ecb_enc_tail:
	/* 1..7 blocks left; the movups loads do not alter flags, so each
	   je tests the preceding cmpq */
	movups	(%rdi),%xmm2
	cmpq	$0x20,%rdx
	jb	L$ecb_enc_one
	movups	16(%rdi),%xmm3
	je	L$ecb_enc_two
	movups	32(%rdi),%xmm4
	cmpq	$0x40,%rdx
	jb	L$ecb_enc_three
	movups	48(%rdi),%xmm5
	je	L$ecb_enc_four
	movups	64(%rdi),%xmm6
	cmpq	$0x60,%rdx
	jb	L$ecb_enc_five
	movups	80(%rdi),%xmm7
	je	L$ecb_enc_six
	movdqu	96(%rdi),%xmm8
	xorps	%xmm9,%xmm9		/* seven blocks: eighth lane is a zero block */
	call	_aesni_encrypt8
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	movups	%xmm4,32(%rsi)
	movups	%xmm5,48(%rsi)
	movups	%xmm6,64(%rsi)
	movups	%xmm7,80(%rsi)
	movups	%xmm8,96(%rsi)
	jmp	L$ecb_ret
.p2align	4
L$ecb_enc_one:
	/* inline single-block encryption */
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_enc1_3:
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_enc1_3
.byte	102,15,56,221,209		/* aesenclast %xmm1,%xmm2 */
	movups	%xmm2,(%rsi)
	jmp	L$ecb_ret
.p2align	4
L$ecb_enc_two:
	call	_aesni_encrypt2
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	jmp	L$ecb_ret
.p2align	4
L$ecb_enc_three:
	call	_aesni_encrypt3
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	movups	%xmm4,32(%rsi)
	jmp	L$ecb_ret
.p2align	4
L$ecb_enc_four:
	call	_aesni_encrypt4
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	movups	%xmm4,32(%rsi)
	movups	%xmm5,48(%rsi)
	jmp	L$ecb_ret
.p2align	4
L$ecb_enc_five:
	xorps	%xmm7,%xmm7		/* five blocks: sixth lane is a zero block */
	call	_aesni_encrypt6
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	movups	%xmm4,32(%rsi)
	movups	%xmm5,48(%rsi)
	movups	%xmm6,64(%rsi)
	jmp	L$ecb_ret
.p2align	4
L$ecb_enc_six:
	call	_aesni_encrypt6
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	movups	%xmm4,32(%rsi)
	movups	%xmm5,48(%rsi)
	movups	%xmm6,64(%rsi)
	movups	%xmm7,80(%rsi)
	jmp	L$ecb_ret

.p2align	4
L$ecb_decrypt:
	cmpq	$0x80,%rdx
	jb	L$ecb_dec_tail		/* fewer than 8 blocks */

	/* load the first eight blocks */
	movdqu	(%rdi),%xmm2
	movdqu	16(%rdi),%xmm3
	movdqu	32(%rdi),%xmm4
	movdqu	48(%rdi),%xmm5
	movdqu	64(%rdi),%xmm6
	movdqu	80(%rdi),%xmm7
	movdqu	96(%rdi),%xmm8
	movdqu	112(%rdi),%xmm9
	leaq	128(%rdi),%rdi
	subq	$0x80,%rdx
	jmp	L$ecb_dec_loop8_enter
.p2align	4
L$ecb_dec_loop8:
	/* store the previous eight results while loading the next eight
	   blocks; restore %rcx/%eax for the next helper call */
	movups	%xmm2,(%rsi)
	movq	%r11,%rcx
	movdqu	(%rdi),%xmm2
	movl	%r10d,%eax
	movups	%xmm3,16(%rsi)
	movdqu	16(%rdi),%xmm3
	movups	%xmm4,32(%rsi)
	movdqu	32(%rdi),%xmm4
	movups	%xmm5,48(%rsi)
	movdqu	48(%rdi),%xmm5
	movups	%xmm6,64(%rsi)
	movdqu	64(%rdi),%xmm6
	movups	%xmm7,80(%rsi)
	movdqu	80(%rdi),%xmm7
	movups	%xmm8,96(%rsi)
	movdqu	96(%rdi),%xmm8
	movups	%xmm9,112(%rsi)
	leaq	128(%rsi),%rsi
	movdqu	112(%rdi),%xmm9
	leaq	128(%rdi),%rdi
L$ecb_dec_loop8_enter:

	call	_aesni_decrypt8

	movups	(%r11),%xmm0		/* reload round key 0 */
	subq	$0x80,%rdx
	jnc	L$ecb_dec_loop8		/* no borrow: another full chunk remains */

	/* store the last full chunk, zeroing each register after use */
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	movq	%r11,%rcx
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	movl	%r10d,%eax
	movups	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	movups	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm5
	movups	%xmm6,64(%rsi)
	pxor	%xmm6,%xmm6
	movups	%xmm7,80(%rsi)
	pxor	%xmm7,%xmm7
	movups	%xmm8,96(%rsi)
	pxor	%xmm8,%xmm8
	movups	%xmm9,112(%rsi)
	pxor	%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	addq	$0x80,%rdx		/* undo the borrow: rdx = bytes left */
	jz	L$ecb_ret

L$ecb_dec_tail:
	/* 1..7 blocks left; the movups loads do not alter flags, so each
	   je tests the preceding cmpq */
	movups	(%rdi),%xmm2
	cmpq	$0x20,%rdx
	jb	L$ecb_dec_one
	movups	16(%rdi),%xmm3
	je	L$ecb_dec_two
	movups	32(%rdi),%xmm4
	cmpq	$0x40,%rdx
	jb	L$ecb_dec_three
	movups	48(%rdi),%xmm5
	je	L$ecb_dec_four
	movups	64(%rdi),%xmm6
	cmpq	$0x60,%rdx
	jb	L$ecb_dec_five
	movups	80(%rdi),%xmm7
	je	L$ecb_dec_six
	movups	96(%rdi),%xmm8
	movups	(%rcx),%xmm0
	xorps	%xmm9,%xmm9		/* seven blocks: eighth lane is a zero block */
	call	_aesni_decrypt8
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	movups	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	movups	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm5
	movups	%xmm6,64(%rsi)
	pxor	%xmm6,%xmm6
	movups	%xmm7,80(%rsi)
	pxor	%xmm7,%xmm7
	movups	%xmm8,96(%rsi)
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	jmp	L$ecb_ret
.p2align	4
L$ecb_dec_one:
	/* inline single-block decryption */
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_dec1_4:
.byte	102,15,56,222,209		/* aesdec %xmm1,%xmm2 */
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_dec1_4
.byte	102,15,56,223,209		/* aesdeclast %xmm1,%xmm2 */
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	jmp	L$ecb_ret
.p2align	4
L$ecb_dec_two:
	call	_aesni_decrypt2
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	jmp	L$ecb_ret
.p2align	4
L$ecb_dec_three:
	call	_aesni_decrypt3
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	movups	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	jmp	L$ecb_ret
.p2align	4
L$ecb_dec_four:
	call	_aesni_decrypt4
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	movups	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	movups	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm5
	jmp	L$ecb_ret
.p2align	4
L$ecb_dec_five:
	xorps	%xmm7,%xmm7		/* five blocks: sixth lane is a zero block */
	call	_aesni_decrypt6
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	movups	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	movups	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm5
	movups	%xmm6,64(%rsi)
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	jmp	L$ecb_ret
.p2align	4
L$ecb_dec_six:
	call	_aesni_decrypt6
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	movups	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	movups	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm5
	movups	%xmm6,64(%rsi)
	pxor	%xmm6,%xmm6
	movups	%xmm7,80(%rsi)
	pxor	%xmm7,%xmm7
	/* falls through to the common exit */

L$ecb_ret:
	xorps	%xmm0,%xmm0		/* zero the round-key registers */
	pxor	%xmm1,%xmm1
	.byte	0xf3,0xc3		/* rep ret */

/*
 * _aesni_ccm64_encrypt_blocks: per block, encrypts a counter block and a
 * running 16-byte MAC value in parallel (two AES lanes), XORs the encrypted
 * counter into the input to produce the output, and folds the input block
 * into the MAC value.
 *   In:  %rdi = input, %rsi = output, %rdx = number of 16-byte blocks,
 *        %rcx = key schedule (round count n at offset 240),
 *        %r8  = 16-byte counter block (read only),
 *        %r9  = 16-byte MAC value (read, updated, written back).
 *   The loop is do-while shaped, so %rdx = 0 on entry is not handled here.
 *   Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0-%xmm3,
 *             %xmm6-%xmm9, flags.
 * L$increment64 and L$bswap_mask are constants defined outside this view;
 * the counter is kept byte-shuffled in %xmm6 and incremented with paddq.
 * .byte 102,15,56,220,modrm = aesenc; 102,15,56,221,modrm = aesenclast;
 * .byte 102,15,56,0,modrm = pshufb.
 */
.globl	_aesni_ccm64_encrypt_blocks
.private_extern _aesni_ccm64_encrypt_blocks

.p2align	4
_aesni_ccm64_encrypt_blocks:
	movl	240(%rcx),%eax		/* eax = round count n */
	movdqu	(%r8),%xmm6		/* xmm6 = counter block */
	movdqa	L$increment64(%rip),%xmm9
	movdqa	L$bswap_mask(%rip),%xmm7

	shll	$4,%eax			/* eax = 16*n */
	movl	$16,%r10d
	leaq	0(%rcx),%r11		/* r11 = key schedule base */
	movdqu	(%r9),%xmm3		/* xmm3 = MAC value */
	movdqa	%xmm6,%xmm2		/* xmm2 = counter block to encrypt */
	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key+32+16*n */
.byte	102,15,56,0,247			/* pshufb %xmm7,%xmm6 */
	subq	%rax,%r10		/* r10 = 16-16*n, initial loop offset */
	jmp	L$ccm64_enc_outer
.p2align	4
L$ccm64_enc_outer:
	movups	(%r11),%xmm0		/* round key 0 */
	movq	%r10,%rax
	movups	(%rdi),%xmm8		/* xmm8 = input block */

	xorps	%xmm0,%xmm2		/* counter ^= round key 0 */
	movups	16(%r11),%xmm1		/* round key 1 */
	xorps	%xmm8,%xmm0
	xorps	%xmm0,%xmm3		/* MAC ^= input ^ round key 0 */
	movups	32(%r11),%xmm0		/* round key 2 */

L$ccm64_enc2_loop:
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
.byte	102,15,56,220,217		/* aesenc %xmm1,%xmm3 */
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
.byte	102,15,56,220,208		/* aesenc %xmm0,%xmm2 */
.byte	102,15,56,220,216		/* aesenc %xmm0,%xmm3 */
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$ccm64_enc2_loop
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
.byte	102,15,56,220,217		/* aesenc %xmm1,%xmm3 */
	paddq	%xmm9,%xmm6		/* advance the counter */
	decq	%rdx			/* ZF is consumed by the jnz at the loop end */
.byte	102,15,56,221,208		/* aesenclast %xmm0,%xmm2 */
.byte	102,15,56,221,216		/* aesenclast %xmm0,%xmm3 */

	leaq	16(%rdi),%rdi
	xorps	%xmm2,%xmm8		/* output = input ^ encrypted counter */
	movdqa	%xmm6,%xmm2
	movups	%xmm8,(%rsi)
.byte	102,15,56,0,215			/* pshufb %xmm7,%xmm2 */
	leaq	16(%rsi),%rsi
	jnz	L$ccm64_enc_outer

	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	movups	%xmm3,(%r9)		/* write back the MAC value */
	pxor	%xmm3,%xmm3
	pxor	%xmm8,%xmm8
	pxor	%xmm6,%xmm6
	.byte	0xf3,0xc3		/* rep ret */

/*
 * _aesni_ccm64_decrypt_blocks: counterpart of the ccm64 encrypt routine.
 * The first counter block is encrypted on its own; afterwards each pass
 * XORs the encrypted counter into the input to produce the output, then
 * encrypts the next counter block and the MAC value (with the just-produced
 * output block folded in) in parallel. The MAC for the final block is
 * finished by a single-lane loop at L$ccm64_dec_break.
 * Only aesenc/aesenclast are used in this routine.
 *   In:  %rdi = input, %rsi = output, %rdx = number of 16-byte blocks,
 *        %rcx = key schedule (round count n at offset 240),
 *        %r8  = 16-byte counter block (read only),
 *        %r9  = 16-byte MAC value (read, updated, written back).
 *   The block count is tested only after the first block has been handled,
 *   so %rdx = 0 on entry is not handled here.
 *   Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0-%xmm3,
 *             %xmm6-%xmm9, flags.
 * L$increment64 and L$bswap_mask are constants defined outside this view.
 * .byte 102,15,56,220,modrm = aesenc; 102,15,56,221,modrm = aesenclast;
 * .byte 102,15,56,0,modrm = pshufb.
 */
.globl	_aesni_ccm64_decrypt_blocks
.private_extern _aesni_ccm64_decrypt_blocks

.p2align	4
_aesni_ccm64_decrypt_blocks:
	movl	240(%rcx),%eax		/* eax = round count n */
	movups	(%r8),%xmm6		/* xmm6 = counter block */
	movdqu	(%r9),%xmm3		/* xmm3 = MAC value */
	movdqa	L$increment64(%rip),%xmm9
	movdqa	L$bswap_mask(%rip),%xmm7

	movaps	%xmm6,%xmm2		/* xmm2 = counter block to encrypt */
	movl	%eax,%r10d		/* r10d = n */
	movq	%rcx,%r11		/* r11 = key schedule base */
.byte	102,15,56,0,247			/* pshufb %xmm7,%xmm6 */
	/* single-lane encryption of the first counter block */
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_enc1_5:
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_enc1_5
.byte	102,15,56,221,209		/* aesenclast %xmm1,%xmm2 */
	shll	$4,%r10d		/* r10 = 16*n */
	movl	$16,%eax
	movups	(%rdi),%xmm8		/* xmm8 = first input block */
	paddq	%xmm9,%xmm6		/* advance the counter */
	leaq	16(%rdi),%rdi
	subq	%r10,%rax		/* rax = 16-16*n */
	leaq	32(%r11,%r10,1),%rcx	/* rcx = key+32+16*n */
	movq	%rax,%r10		/* r10 = initial loop offset */
	jmp	L$ccm64_dec_outer
.p2align	4
L$ccm64_dec_outer:
	xorps	%xmm2,%xmm8		/* output = input ^ encrypted counter */
	movdqa	%xmm6,%xmm2
	movups	%xmm8,(%rsi)
	leaq	16(%rsi),%rsi
.byte	102,15,56,0,215			/* pshufb %xmm7,%xmm2 */

	subq	$1,%rdx
	jz	L$ccm64_dec_break	/* last block: finish the MAC alone */

	movups	(%r11),%xmm0		/* round key 0 */
	movq	%r10,%rax
	movups	16(%r11),%xmm1		/* round key 1 */
	xorps	%xmm0,%xmm8
	xorps	%xmm0,%xmm2		/* counter ^= round key 0 */
	xorps	%xmm8,%xmm3		/* MAC ^= output ^ round key 0 */
	movups	32(%r11),%xmm0		/* round key 2 */
	jmp	L$ccm64_dec2_loop
.p2align	4
L$ccm64_dec2_loop:
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
.byte	102,15,56,220,217		/* aesenc %xmm1,%xmm3 */
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax		/* sets ZF for the jnz below */
.byte	102,15,56,220,208		/* aesenc %xmm0,%xmm2 */
.byte	102,15,56,220,216		/* aesenc %xmm0,%xmm3 */
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$ccm64_dec2_loop
	movups	(%rdi),%xmm8		/* next input block */
	paddq	%xmm9,%xmm6		/* advance the counter */
.byte	102,15,56,220,209		/* aesenc %xmm1,%xmm2 */
.byte	102,15,56,220,217		/* aesenc %xmm1,%xmm3 */
.byte	102,15,56,221,208		/* aesenclast %xmm0,%xmm2 */
.byte	102,15,56,221,216		/* aesenclast %xmm0,%xmm3 */
	leaq	16(%rdi),%rdi
	jmp	L$ccm64_dec_outer

.p2align	4
L$ccm64_dec_break:

	/* single-lane encryption of the MAC with the last output folded in */
	movl	240(%r11),%eax
	movups	(%r11),%xmm0
	movups	16(%r11),%xmm1
	xorps	%xmm0,%xmm8
	leaq	32(%r11),%r11
	xorps	%xmm8,%xmm3
L$oop_enc1_6:
.byte	102,15,56,220,217		/* aesenc %xmm1,%xmm3 */
	decl	%eax
	movups	(%r11),%xmm1
	leaq	16(%r11),%r11
	jnz	L$oop_enc1_6
.byte	102,15,56,221,217		/* aesenclast %xmm1,%xmm3 */
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	movups	%xmm3,(%r9)		/* write back the MAC value */
	pxor	%xmm3,%xmm3
	pxor	%xmm8,%xmm8
	pxor	%xmm6,%xmm6
	.byte	0xf3,0xc3		/* rep ret */

1000.globl	_aesni_ctr32_encrypt_blocks
1001.private_extern _aesni_ctr32_encrypt_blocks
1002
1003.p2align	4
1004_aesni_ctr32_encrypt_blocks:
1005	cmpq	$1,%rdx
1006	jne	L$ctr32_bulk
1007
1008
1009
1010	movups	(%r8),%xmm2
1011	movups	(%rdi),%xmm3
1012	movl	240(%rcx),%edx
1013	movups	(%rcx),%xmm0
1014	movups	16(%rcx),%xmm1
1015	leaq	32(%rcx),%rcx
1016	xorps	%xmm0,%xmm2
1017L$oop_enc1_7:
1018.byte	102,15,56,220,209
1019	decl	%edx
1020	movups	(%rcx),%xmm1
1021	leaq	16(%rcx),%rcx
1022	jnz	L$oop_enc1_7
1023.byte	102,15,56,221,209
1024	pxor	%xmm0,%xmm0
1025	pxor	%xmm1,%xmm1
1026	xorps	%xmm3,%xmm2
1027	pxor	%xmm3,%xmm3
1028	movups	%xmm2,(%rsi)
1029	xorps	%xmm2,%xmm2
1030	jmp	L$ctr32_epilogue
1031
1032.p2align	4
1033L$ctr32_bulk:
1034	leaq	(%rsp),%r11
1035	pushq	%rbp
1036	subq	$128,%rsp
1037	andq	$-16,%rsp
1038
1039
1040
1041
1042	movdqu	(%r8),%xmm2
1043	movdqu	(%rcx),%xmm0
1044	movl	12(%r8),%r8d
1045	pxor	%xmm0,%xmm2
1046	movl	12(%rcx),%ebp
1047	movdqa	%xmm2,0(%rsp)
1048	bswapl	%r8d
1049	movdqa	%xmm2,%xmm3
1050	movdqa	%xmm2,%xmm4
1051	movdqa	%xmm2,%xmm5
1052	movdqa	%xmm2,64(%rsp)
1053	movdqa	%xmm2,80(%rsp)
1054	movdqa	%xmm2,96(%rsp)
1055	movq	%rdx,%r10
1056	movdqa	%xmm2,112(%rsp)
1057
1058	leaq	1(%r8),%rax
1059	leaq	2(%r8),%rdx
1060	bswapl	%eax
1061	bswapl	%edx
1062	xorl	%ebp,%eax
1063	xorl	%ebp,%edx
1064.byte	102,15,58,34,216,3
1065	leaq	3(%r8),%rax
1066	movdqa	%xmm3,16(%rsp)
1067.byte	102,15,58,34,226,3
1068	bswapl	%eax
1069	movq	%r10,%rdx
1070	leaq	4(%r8),%r10
1071	movdqa	%xmm4,32(%rsp)
1072	xorl	%ebp,%eax
1073	bswapl	%r10d
1074.byte	102,15,58,34,232,3
1075	xorl	%ebp,%r10d
1076	movdqa	%xmm5,48(%rsp)
1077	leaq	5(%r8),%r9
1078	movl	%r10d,64+12(%rsp)
1079	bswapl	%r9d
1080	leaq	6(%r8),%r10
1081	movl	240(%rcx),%eax
1082	xorl	%ebp,%r9d
1083	bswapl	%r10d
1084	movl	%r9d,80+12(%rsp)
1085	xorl	%ebp,%r10d
1086	leaq	7(%r8),%r9
1087	movl	%r10d,96+12(%rsp)
1088	bswapl	%r9d
1089	leaq	_OPENSSL_ia32cap_P(%rip),%r10
1090	movl	4(%r10),%r10d
1091	xorl	%ebp,%r9d
1092	andl	$71303168,%r10d
1093	movl	%r9d,112+12(%rsp)
1094
1095	movups	16(%rcx),%xmm1
1096
1097	movdqa	64(%rsp),%xmm6
1098	movdqa	80(%rsp),%xmm7
1099
1100	cmpq	$8,%rdx
1101	jb	L$ctr32_tail
1102
1103	subq	$6,%rdx
1104	cmpl	$4194304,%r10d
1105	je	L$ctr32_6x
1106
1107	leaq	128(%rcx),%rcx
1108	subq	$2,%rdx
1109	jmp	L$ctr32_loop8
1110
1111.p2align	4
1112L$ctr32_6x:
1113	shll	$4,%eax
1114	movl	$48,%r10d
1115	bswapl	%ebp
1116	leaq	32(%rcx,%rax,1),%rcx
1117	subq	%rax,%r10
1118	jmp	L$ctr32_loop6
1119
1120.p2align	4
1121L$ctr32_loop6:
1122	addl	$6,%r8d
1123	movups	-48(%rcx,%r10,1),%xmm0
1124.byte	102,15,56,220,209
1125	movl	%r8d,%eax
1126	xorl	%ebp,%eax
1127.byte	102,15,56,220,217
1128.byte	0x0f,0x38,0xf1,0x44,0x24,12
1129	leal	1(%r8),%eax
1130.byte	102,15,56,220,225
1131	xorl	%ebp,%eax
1132.byte	0x0f,0x38,0xf1,0x44,0x24,28
1133.byte	102,15,56,220,233
1134	leal	2(%r8),%eax
1135	xorl	%ebp,%eax
1136.byte	102,15,56,220,241
1137.byte	0x0f,0x38,0xf1,0x44,0x24,44
1138	leal	3(%r8),%eax
1139.byte	102,15,56,220,249
1140	movups	-32(%rcx,%r10,1),%xmm1
1141	xorl	%ebp,%eax
1142
1143.byte	102,15,56,220,208
1144.byte	0x0f,0x38,0xf1,0x44,0x24,60
1145	leal	4(%r8),%eax
1146.byte	102,15,56,220,216
1147	xorl	%ebp,%eax
1148.byte	0x0f,0x38,0xf1,0x44,0x24,76
1149.byte	102,15,56,220,224
1150	leal	5(%r8),%eax
1151	xorl	%ebp,%eax
1152.byte	102,15,56,220,232
1153.byte	0x0f,0x38,0xf1,0x44,0x24,92
1154	movq	%r10,%rax
1155.byte	102,15,56,220,240
1156.byte	102,15,56,220,248
1157	movups	-16(%rcx,%r10,1),%xmm0
1158
1159	call	L$enc_loop6
1160
1161	movdqu	(%rdi),%xmm8
1162	movdqu	16(%rdi),%xmm9
1163	movdqu	32(%rdi),%xmm10
1164	movdqu	48(%rdi),%xmm11
1165	movdqu	64(%rdi),%xmm12
1166	movdqu	80(%rdi),%xmm13
1167	leaq	96(%rdi),%rdi
1168	movups	-64(%rcx,%r10,1),%xmm1
1169	pxor	%xmm2,%xmm8
1170	movaps	0(%rsp),%xmm2
1171	pxor	%xmm3,%xmm9
1172	movaps	16(%rsp),%xmm3
1173	pxor	%xmm4,%xmm10
1174	movaps	32(%rsp),%xmm4
1175	pxor	%xmm5,%xmm11
1176	movaps	48(%rsp),%xmm5
1177	pxor	%xmm6,%xmm12
1178	movaps	64(%rsp),%xmm6
1179	pxor	%xmm7,%xmm13
1180	movaps	80(%rsp),%xmm7
1181	movdqu	%xmm8,(%rsi)
1182	movdqu	%xmm9,16(%rsi)
1183	movdqu	%xmm10,32(%rsi)
1184	movdqu	%xmm11,48(%rsi)
1185	movdqu	%xmm12,64(%rsi)
1186	movdqu	%xmm13,80(%rsi)
1187	leaq	96(%rsi),%rsi
1188
1189	subq	$6,%rdx
1190	jnc	L$ctr32_loop6
1191
1192	addq	$6,%rdx
1193	jz	L$ctr32_done
1194
1195	leal	-48(%r10),%eax
1196	leaq	-80(%rcx,%r10,1),%rcx
1197	negl	%eax
1198	shrl	$4,%eax
1199	jmp	L$ctr32_tail
1200
1201.p2align	5
1202L$ctr32_loop8:
1203	addl	$8,%r8d
1204	movdqa	96(%rsp),%xmm8
1205.byte	102,15,56,220,209
1206	movl	%r8d,%r9d
1207	movdqa	112(%rsp),%xmm9
1208.byte	102,15,56,220,217
1209	bswapl	%r9d
1210	movups	32-128(%rcx),%xmm0
1211.byte	102,15,56,220,225
1212	xorl	%ebp,%r9d
1213	nop
1214.byte	102,15,56,220,233
1215	movl	%r9d,0+12(%rsp)
1216	leaq	1(%r8),%r9
1217.byte	102,15,56,220,241
1218.byte	102,15,56,220,249
1219.byte	102,68,15,56,220,193
1220.byte	102,68,15,56,220,201
1221	movups	48-128(%rcx),%xmm1
1222	bswapl	%r9d
1223.byte	102,15,56,220,208
1224.byte	102,15,56,220,216
1225	xorl	%ebp,%r9d
1226.byte	0x66,0x90
1227.byte	102,15,56,220,224
1228.byte	102,15,56,220,232
1229	movl	%r9d,16+12(%rsp)
1230	leaq	2(%r8),%r9
1231.byte	102,15,56,220,240
1232.byte	102,15,56,220,248
1233.byte	102,68,15,56,220,192
1234.byte	102,68,15,56,220,200
1235	movups	64-128(%rcx),%xmm0
1236	bswapl	%r9d
1237.byte	102,15,56,220,209
1238.byte	102,15,56,220,217
1239	xorl	%ebp,%r9d
1240.byte	0x66,0x90
1241.byte	102,15,56,220,225
1242.byte	102,15,56,220,233
1243	movl	%r9d,32+12(%rsp)
1244	leaq	3(%r8),%r9
1245.byte	102,15,56,220,241
1246.byte	102,15,56,220,249
1247.byte	102,68,15,56,220,193
1248.byte	102,68,15,56,220,201
1249	movups	80-128(%rcx),%xmm1
1250	bswapl	%r9d
1251.byte	102,15,56,220,208
1252.byte	102,15,56,220,216
1253	xorl	%ebp,%r9d
1254.byte	0x66,0x90
1255.byte	102,15,56,220,224
1256.byte	102,15,56,220,232
1257	movl	%r9d,48+12(%rsp)
1258	leaq	4(%r8),%r9
1259.byte	102,15,56,220,240
1260.byte	102,15,56,220,248
1261.byte	102,68,15,56,220,192
1262.byte	102,68,15,56,220,200
1263	movups	96-128(%rcx),%xmm0
1264	bswapl	%r9d
1265.byte	102,15,56,220,209
1266.byte	102,15,56,220,217
1267	xorl	%ebp,%r9d
1268.byte	0x66,0x90
1269.byte	102,15,56,220,225
1270.byte	102,15,56,220,233
1271	movl	%r9d,64+12(%rsp)
1272	leaq	5(%r8),%r9
1273.byte	102,15,56,220,241
1274.byte	102,15,56,220,249
1275.byte	102,68,15,56,220,193
1276.byte	102,68,15,56,220,201
1277	movups	112-128(%rcx),%xmm1
1278	bswapl	%r9d
1279.byte	102,15,56,220,208
1280.byte	102,15,56,220,216
1281	xorl	%ebp,%r9d
1282.byte	0x66,0x90
1283.byte	102,15,56,220,224
1284.byte	102,15,56,220,232
1285	movl	%r9d,80+12(%rsp)
1286	leaq	6(%r8),%r9
1287.byte	102,15,56,220,240
1288.byte	102,15,56,220,248
1289.byte	102,68,15,56,220,192
1290.byte	102,68,15,56,220,200
1291	movups	128-128(%rcx),%xmm0
1292	bswapl	%r9d
1293.byte	102,15,56,220,209
1294.byte	102,15,56,220,217
1295	xorl	%ebp,%r9d
1296.byte	0x66,0x90
1297.byte	102,15,56,220,225
1298.byte	102,15,56,220,233
1299	movl	%r9d,96+12(%rsp)
1300	leaq	7(%r8),%r9
1301.byte	102,15,56,220,241
1302.byte	102,15,56,220,249
1303.byte	102,68,15,56,220,193
1304.byte	102,68,15,56,220,201
1305	movups	144-128(%rcx),%xmm1
1306	bswapl	%r9d
1307.byte	102,15,56,220,208
1308.byte	102,15,56,220,216
1309.byte	102,15,56,220,224
1310	xorl	%ebp,%r9d
1311	movdqu	0(%rdi),%xmm10
1312.byte	102,15,56,220,232
1313	movl	%r9d,112+12(%rsp)
1314	cmpl	$11,%eax
1315.byte	102,15,56,220,240
1316.byte	102,15,56,220,248
1317.byte	102,68,15,56,220,192
1318.byte	102,68,15,56,220,200
1319	movups	160-128(%rcx),%xmm0
1320
1321	jb	L$ctr32_enc_done
1322
1323.byte	102,15,56,220,209
1324.byte	102,15,56,220,217
1325.byte	102,15,56,220,225
1326.byte	102,15,56,220,233
1327.byte	102,15,56,220,241
1328.byte	102,15,56,220,249
1329.byte	102,68,15,56,220,193
1330.byte	102,68,15,56,220,201
1331	movups	176-128(%rcx),%xmm1
1332
1333.byte	102,15,56,220,208
1334.byte	102,15,56,220,216
1335.byte	102,15,56,220,224
1336.byte	102,15,56,220,232
1337.byte	102,15,56,220,240
1338.byte	102,15,56,220,248
1339.byte	102,68,15,56,220,192
1340.byte	102,68,15,56,220,200
1341	movups	192-128(%rcx),%xmm0
1342	je	L$ctr32_enc_done
1343
1344.byte	102,15,56,220,209
1345.byte	102,15,56,220,217
1346.byte	102,15,56,220,225
1347.byte	102,15,56,220,233
1348.byte	102,15,56,220,241
1349.byte	102,15,56,220,249
1350.byte	102,68,15,56,220,193
1351.byte	102,68,15,56,220,201
1352	movups	208-128(%rcx),%xmm1
1353
1354.byte	102,15,56,220,208
1355.byte	102,15,56,220,216
1356.byte	102,15,56,220,224
1357.byte	102,15,56,220,232
1358.byte	102,15,56,220,240
1359.byte	102,15,56,220,248
1360.byte	102,68,15,56,220,192
1361.byte	102,68,15,56,220,200
1362	movups	224-128(%rcx),%xmm0
1363	jmp	L$ctr32_enc_done
1364
1365.p2align	4
1366L$ctr32_enc_done:
1367	movdqu	16(%rdi),%xmm11
1368	pxor	%xmm0,%xmm10
1369	movdqu	32(%rdi),%xmm12
1370	pxor	%xmm0,%xmm11
1371	movdqu	48(%rdi),%xmm13
1372	pxor	%xmm0,%xmm12
1373	movdqu	64(%rdi),%xmm14
1374	pxor	%xmm0,%xmm13
1375	movdqu	80(%rdi),%xmm15
1376	pxor	%xmm0,%xmm14
1377	pxor	%xmm0,%xmm15
1378.byte	102,15,56,220,209
1379.byte	102,15,56,220,217
1380.byte	102,15,56,220,225
1381.byte	102,15,56,220,233
1382.byte	102,15,56,220,241
1383.byte	102,15,56,220,249
1384.byte	102,68,15,56,220,193
1385.byte	102,68,15,56,220,201
1386	movdqu	96(%rdi),%xmm1
1387	leaq	128(%rdi),%rdi
1388
1389.byte	102,65,15,56,221,210
1390	pxor	%xmm0,%xmm1
1391	movdqu	112-128(%rdi),%xmm10
1392.byte	102,65,15,56,221,219
1393	pxor	%xmm0,%xmm10
1394	movdqa	0(%rsp),%xmm11
1395.byte	102,65,15,56,221,228
1396.byte	102,65,15,56,221,237
1397	movdqa	16(%rsp),%xmm12
1398	movdqa	32(%rsp),%xmm13
1399.byte	102,65,15,56,221,246
1400.byte	102,65,15,56,221,255
1401	movdqa	48(%rsp),%xmm14
1402	movdqa	64(%rsp),%xmm15
1403.byte	102,68,15,56,221,193
1404	movdqa	80(%rsp),%xmm0
1405	movups	16-128(%rcx),%xmm1
1406.byte	102,69,15,56,221,202
1407
1408	movups	%xmm2,(%rsi)
1409	movdqa	%xmm11,%xmm2
1410	movups	%xmm3,16(%rsi)
1411	movdqa	%xmm12,%xmm3
1412	movups	%xmm4,32(%rsi)
1413	movdqa	%xmm13,%xmm4
1414	movups	%xmm5,48(%rsi)
1415	movdqa	%xmm14,%xmm5
1416	movups	%xmm6,64(%rsi)
1417	movdqa	%xmm15,%xmm6
1418	movups	%xmm7,80(%rsi)
1419	movdqa	%xmm0,%xmm7
1420	movups	%xmm8,96(%rsi)
1421	movups	%xmm9,112(%rsi)
1422	leaq	128(%rsi),%rsi
1423
1424	subq	$8,%rdx
1425	jnc	L$ctr32_loop8
1426
1427	addq	$8,%rdx
1428	jz	L$ctr32_done
1429	leaq	-128(%rcx),%rcx
1430
1431L$ctr32_tail:
1432
1433
1434	leaq	16(%rcx),%rcx
1435	cmpq	$4,%rdx
1436	jb	L$ctr32_loop3
1437	je	L$ctr32_loop4
1438
1439
1440	shll	$4,%eax
1441	movdqa	96(%rsp),%xmm8
1442	pxor	%xmm9,%xmm9
1443
1444	movups	16(%rcx),%xmm0
1445.byte	102,15,56,220,209
1446.byte	102,15,56,220,217
1447	leaq	32-16(%rcx,%rax,1),%rcx
1448	negq	%rax
1449.byte	102,15,56,220,225
1450	addq	$16,%rax
1451	movups	(%rdi),%xmm10
1452.byte	102,15,56,220,233
1453.byte	102,15,56,220,241
1454	movups	16(%rdi),%xmm11
1455	movups	32(%rdi),%xmm12
1456.byte	102,15,56,220,249
1457.byte	102,68,15,56,220,193
1458
1459	call	L$enc_loop8_enter
1460
1461	movdqu	48(%rdi),%xmm13
1462	pxor	%xmm10,%xmm2
1463	movdqu	64(%rdi),%xmm10
1464	pxor	%xmm11,%xmm3
1465	movdqu	%xmm2,(%rsi)
1466	pxor	%xmm12,%xmm4
1467	movdqu	%xmm3,16(%rsi)
1468	pxor	%xmm13,%xmm5
1469	movdqu	%xmm4,32(%rsi)
1470	pxor	%xmm10,%xmm6
1471	movdqu	%xmm5,48(%rsi)
1472	movdqu	%xmm6,64(%rsi)
1473	cmpq	$6,%rdx
1474	jb	L$ctr32_done
1475
1476	movups	80(%rdi),%xmm11
1477	xorps	%xmm11,%xmm7
1478	movups	%xmm7,80(%rsi)
1479	je	L$ctr32_done
1480
1481	movups	96(%rdi),%xmm12
1482	xorps	%xmm12,%xmm8
1483	movups	%xmm8,96(%rsi)
1484	jmp	L$ctr32_done
1485
1486.p2align	5
1487L$ctr32_loop4:
1488.byte	102,15,56,220,209
1489	leaq	16(%rcx),%rcx
1490	decl	%eax
1491.byte	102,15,56,220,217
1492.byte	102,15,56,220,225
1493.byte	102,15,56,220,233
1494	movups	(%rcx),%xmm1
1495	jnz	L$ctr32_loop4
1496.byte	102,15,56,221,209
1497.byte	102,15,56,221,217
1498	movups	(%rdi),%xmm10
1499	movups	16(%rdi),%xmm11
1500.byte	102,15,56,221,225
1501.byte	102,15,56,221,233
1502	movups	32(%rdi),%xmm12
1503	movups	48(%rdi),%xmm13
1504
1505	xorps	%xmm10,%xmm2
1506	movups	%xmm2,(%rsi)
1507	xorps	%xmm11,%xmm3
1508	movups	%xmm3,16(%rsi)
1509	pxor	%xmm12,%xmm4
1510	movdqu	%xmm4,32(%rsi)
1511	pxor	%xmm13,%xmm5
1512	movdqu	%xmm5,48(%rsi)
1513	jmp	L$ctr32_done
1514
1515.p2align	5
1516L$ctr32_loop3:
1517.byte	102,15,56,220,209
1518	leaq	16(%rcx),%rcx
1519	decl	%eax
1520.byte	102,15,56,220,217
1521.byte	102,15,56,220,225
1522	movups	(%rcx),%xmm1
1523	jnz	L$ctr32_loop3
1524.byte	102,15,56,221,209
1525.byte	102,15,56,221,217
1526.byte	102,15,56,221,225
1527
1528	movups	(%rdi),%xmm10
1529	xorps	%xmm10,%xmm2
1530	movups	%xmm2,(%rsi)
1531	cmpq	$2,%rdx
1532	jb	L$ctr32_done
1533
1534	movups	16(%rdi),%xmm11
1535	xorps	%xmm11,%xmm3
1536	movups	%xmm3,16(%rsi)
1537	je	L$ctr32_done
1538
1539	movups	32(%rdi),%xmm12
1540	xorps	%xmm12,%xmm4
1541	movups	%xmm4,32(%rsi)
1542
1543L$ctr32_done:
1544	xorps	%xmm0,%xmm0
1545	xorl	%ebp,%ebp
1546	pxor	%xmm1,%xmm1
1547	pxor	%xmm2,%xmm2
1548	pxor	%xmm3,%xmm3
1549	pxor	%xmm4,%xmm4
1550	pxor	%xmm5,%xmm5
1551	pxor	%xmm6,%xmm6
1552	pxor	%xmm7,%xmm7
1553	movaps	%xmm0,0(%rsp)
1554	pxor	%xmm8,%xmm8
1555	movaps	%xmm0,16(%rsp)
1556	pxor	%xmm9,%xmm9
1557	movaps	%xmm0,32(%rsp)
1558	pxor	%xmm10,%xmm10
1559	movaps	%xmm0,48(%rsp)
1560	pxor	%xmm11,%xmm11
1561	movaps	%xmm0,64(%rsp)
1562	pxor	%xmm12,%xmm12
1563	movaps	%xmm0,80(%rsp)
1564	pxor	%xmm13,%xmm13
1565	movaps	%xmm0,96(%rsp)
1566	pxor	%xmm14,%xmm14
1567	movaps	%xmm0,112(%rsp)
1568	pxor	%xmm15,%xmm15
1569	movq	-8(%r11),%rbp
1570	leaq	(%r11),%rsp
1571L$ctr32_epilogue:
1572	.byte	0xf3,0xc3
1573
/*
 * _aesni_xts_encrypt: entry and tweak setup.
 * Registers as read by the code below:
 *   rdi = input pointer        rsi = output pointer
 *   rdx = length in bytes      rcx = key schedule used for the data
 *   r8  = key schedule used to encrypt the initial tweak
 *   r9  = pointer to the 16-byte tweak input
 * The round counter is read from offset 240 of each key schedule.
 * r11 keeps the entry rsp for the epilogue; rbp is pushed and then reused
 * as the data key pointer.  The stack is re-aligned to 16 bytes and at
 * least 112 bytes of scratch are reserved at (%rsp).
 * The .byte sequences are hand-encoded AES-NI instructions:
 *   102,15,56,220,209 = aesenc %xmm1,%xmm2
 *   102,15,56,221,209 = aesenclast %xmm1,%xmm2
 */
1574.globl	_aesni_xts_encrypt
1575.private_extern _aesni_xts_encrypt
1576
1577.p2align	4
1578_aesni_xts_encrypt:
1579	leaq	(%rsp),%r11
1580	pushq	%rbp
1581	subq	$112,%rsp
1582	andq	$-16,%rsp
/* Encrypt the 16 bytes at (%r9) with the schedule at r8; result in xmm2. */
1583	movups	(%r9),%xmm2
1584	movl	240(%r8),%eax
1585	movl	240(%rcx),%r10d
1586	movups	(%r8),%xmm0
1587	movups	16(%r8),%xmm1
1588	leaq	32(%r8),%r8
1589	xorps	%xmm0,%xmm2
1590L$oop_enc1_8:
1591.byte	102,15,56,220,209
1592	decl	%eax
1593	movups	(%r8),%xmm1
1594	leaq	16(%r8),%r8
1595	jnz	L$oop_enc1_8
1596.byte	102,15,56,221,209
/*
 * xmm0 = round key 0 of the data key, rbp = data key pointer,
 * eax = round counter, r10 = 16 * round counter,
 * r9 = original byte length, rdx = length rounded down to whole blocks.
 */
1597	movups	(%rcx),%xmm0
1598	movq	%rcx,%rbp
1599	movl	%r10d,%eax
1600	shll	$4,%r10d
1601	movq	%rdx,%r9
1602	andq	$-16,%rdx
1603
/* xmm1 = last round key (the key the single-block loops end on). */
1604	movups	16(%rcx,%r10,1),%xmm1
1605
/*
 * Derive six consecutive tweaks from the encrypted tweak in xmm2.
 * Each step copies the current tweak (xmm15) out, doubles both 64-bit
 * halves with paddq, and XORs in a correction built from sign masks of
 * xmm9 ANDed with L$xts_magic (constant defined outside this view;
 * presumably the GF(2^128) reduction/carry pattern -- confirm there).
 * xmm10..xmm14 end up as tweak ^ round key 0; xmm15 is the sixth tweak,
 * left unmasked.
 */
1606	movdqa	L$xts_magic(%rip),%xmm8
1607	movdqa	%xmm2,%xmm15
1608	pshufd	$0x5f,%xmm2,%xmm9
/* xmm1 = round key 0 ^ last round key, parked at 96(%rsp) further down. */
1609	pxor	%xmm0,%xmm1
1610	movdqa	%xmm9,%xmm14
1611	paddd	%xmm9,%xmm9
1612	movdqa	%xmm15,%xmm10
1613	psrad	$31,%xmm14
1614	paddq	%xmm15,%xmm15
1615	pand	%xmm8,%xmm14
1616	pxor	%xmm0,%xmm10
1617	pxor	%xmm14,%xmm15
1618	movdqa	%xmm9,%xmm14
1619	paddd	%xmm9,%xmm9
1620	movdqa	%xmm15,%xmm11
1621	psrad	$31,%xmm14
1622	paddq	%xmm15,%xmm15
1623	pand	%xmm8,%xmm14
1624	pxor	%xmm0,%xmm11
1625	pxor	%xmm14,%xmm15
1626	movdqa	%xmm9,%xmm14
1627	paddd	%xmm9,%xmm9
1628	movdqa	%xmm15,%xmm12
1629	psrad	$31,%xmm14
1630	paddq	%xmm15,%xmm15
1631	pand	%xmm8,%xmm14
1632	pxor	%xmm0,%xmm12
1633	pxor	%xmm14,%xmm15
1634	movdqa	%xmm9,%xmm14
1635	paddd	%xmm9,%xmm9
1636	movdqa	%xmm15,%xmm13
1637	psrad	$31,%xmm14
1638	paddq	%xmm15,%xmm15
1639	pand	%xmm8,%xmm14
1640	pxor	%xmm0,%xmm13
1641	pxor	%xmm14,%xmm15
/* Last step uses xmm9 itself as the mask register. */
1642	movdqa	%xmm15,%xmm14
1643	psrad	$31,%xmm9
1644	paddq	%xmm15,%xmm15
1645	pand	%xmm8,%xmm9
1646	pxor	%xmm0,%xmm14
1647	pxor	%xmm9,%xmm15
1648	movaps	%xmm1,96(%rsp)
1649
/* Fewer than 96 bytes (six blocks) of whole blocks: take the short path. */
1650	subq	$96,%rdx
1651	jc	L$xts_enc_short
1652
/*
 * Main-loop setup: rcx = key + 32 + 16*rounds, rax = r10 = 112 - 16*rounds
 * (a negative offset that counts up to zero inside the round loop),
 * xmm1 = round key 1, r8 = address of L$xts_magic.
 */
1653	movl	$16+96,%eax
1654	leaq	32(%rbp,%r10,1),%rcx
1655	subq	%r10,%rax
1656	movups	16(%rbp),%xmm1
1657	movq	%rax,%r10
1658	leaq	L$xts_magic(%rip),%r8
1659	jmp	L$xts_enc_grandloop
1660
/*
 * L$xts_enc_grandloop: encrypts 96 bytes (six blocks) per iteration.
 * State at the top of each iteration:
 *   xmm10..xmm14 = tweaks 1..5, each already XORed with round key 0
 *   xmm15        = tweak 6 (not masked)
 *   xmm0 = round key 0, xmm1 = round key 1
 *   rax = r10 = negative round-key offset, rcx = end-relative key pointer
 *   96(%rsp) = round key 0 ^ last round key
 *   r8 = address of the tweak-update constant, rbp = key schedule
 * Hand-encoded opcodes: 102,15,56,220,x = aesenc, 102,15,56,221,x =
 * aesenclast.  Last byte 209..249 selects %xmm1 as source and
 * %xmm2..%xmm7 as destination; 208..248 selects %xmm0 as source.
 */
1661.p2align	5
1662L$xts_enc_grandloop:
/* Load six input blocks and XOR in tweak ^ round key 0 (round 0). */
1663	movdqu	0(%rdi),%xmm2
1664	movdqa	%xmm0,%xmm8
1665	movdqu	16(%rdi),%xmm3
1666	pxor	%xmm10,%xmm2
1667	movdqu	32(%rdi),%xmm4
1668	pxor	%xmm11,%xmm3
1669.byte	102,15,56,220,209
1670	movdqu	48(%rdi),%xmm5
1671	pxor	%xmm12,%xmm4
1672.byte	102,15,56,220,217
1673	movdqu	64(%rdi),%xmm6
1674	pxor	%xmm13,%xmm5
1675.byte	102,15,56,220,225
1676	movdqu	80(%rdi),%xmm7
/* xmm8 = round key 0 ^ tweak 6. */
1677	pxor	%xmm15,%xmm8
1678	movdqa	96(%rsp),%xmm9
1679	pxor	%xmm14,%xmm6
1680.byte	102,15,56,220,233
1681	movups	32(%rbp),%xmm0
1682	leaq	96(%rdi),%rdi
1683	pxor	%xmm8,%xmm7
1684
/*
 * XOR each masked tweak with (round key 0 ^ last round key); this leaves
 * tweak ^ last round key, which is stored at 0..80(%rsp) and later used
 * as the memory operand of aesenclast.
 */
1685	pxor	%xmm9,%xmm10
1686.byte	102,15,56,220,241
1687	pxor	%xmm9,%xmm11
1688	movdqa	%xmm10,0(%rsp)
1689.byte	102,15,56,220,249
1690	movups	48(%rbp),%xmm1
1691	pxor	%xmm9,%xmm12
1692
1693.byte	102,15,56,220,208
1694	pxor	%xmm9,%xmm13
1695	movdqa	%xmm11,16(%rsp)
1696.byte	102,15,56,220,216
1697	pxor	%xmm9,%xmm14
1698	movdqa	%xmm12,32(%rsp)
1699.byte	102,15,56,220,224
1700.byte	102,15,56,220,232
1701	pxor	%xmm9,%xmm8
1702	movdqa	%xmm14,64(%rsp)
1703.byte	102,15,56,220,240
1704.byte	102,15,56,220,248
1705	movups	64(%rbp),%xmm0
1706	movdqa	%xmm8,80(%rsp)
/* Seed the sign-mask register for the next round of tweak updates. */
1707	pshufd	$0x5f,%xmm15,%xmm9
1708	jmp	L$xts_enc_loop6
1709.p2align	5
/*
 * Two AES rounds on all six blocks per iteration.  rax advances by 32 and
 * the loop ends when it reaches zero; the jnz tests the flags of the addq,
 * which the intervening aesenc/movups instructions do not modify.
 */
1710L$xts_enc_loop6:
1711.byte	102,15,56,220,209
1712.byte	102,15,56,220,217
1713.byte	102,15,56,220,225
1714.byte	102,15,56,220,233
1715.byte	102,15,56,220,241
1716.byte	102,15,56,220,249
1717	movups	-64(%rcx,%rax,1),%xmm1
1718	addq	$32,%rax
1719
1720.byte	102,15,56,220,208
1721.byte	102,15,56,220,216
1722.byte	102,15,56,220,224
1723.byte	102,15,56,220,232
1724.byte	102,15,56,220,240
1725.byte	102,15,56,220,248
1726	movups	-80(%rcx,%rax,1),%xmm0
1727	jnz	L$xts_enc_loop6
1728
/*
 * Remaining rounds, interleaved with computing the next six tweaks.
 * xmm8 is reloaded with the constant at (%r8); xmm10..xmm14 are rebuilt
 * as round key 0 ^ next tweak, and xmm15 keeps advancing.
 */
1729	movdqa	(%r8),%xmm8
1730	movdqa	%xmm9,%xmm14
1731	paddd	%xmm9,%xmm9
1732.byte	102,15,56,220,209
1733	paddq	%xmm15,%xmm15
1734	psrad	$31,%xmm14
1735.byte	102,15,56,220,217
1736	pand	%xmm8,%xmm14
1737	movups	(%rbp),%xmm10
1738.byte	102,15,56,220,225
1739.byte	102,15,56,220,233
1740.byte	102,15,56,220,241
1741	pxor	%xmm14,%xmm15
1742	movaps	%xmm10,%xmm11
1743.byte	102,15,56,220,249
1744	movups	-64(%rcx),%xmm1
1745
1746	movdqa	%xmm9,%xmm14
1747.byte	102,15,56,220,208
1748	paddd	%xmm9,%xmm9
1749	pxor	%xmm15,%xmm10
1750.byte	102,15,56,220,216
1751	psrad	$31,%xmm14
1752	paddq	%xmm15,%xmm15
1753.byte	102,15,56,220,224
1754.byte	102,15,56,220,232
1755	pand	%xmm8,%xmm14
1756	movaps	%xmm11,%xmm12
1757.byte	102,15,56,220,240
1758	pxor	%xmm14,%xmm15
1759	movdqa	%xmm9,%xmm14
1760.byte	102,15,56,220,248
1761	movups	-48(%rcx),%xmm0
1762
1763	paddd	%xmm9,%xmm9
1764.byte	102,15,56,220,209
1765	pxor	%xmm15,%xmm11
1766	psrad	$31,%xmm14
1767.byte	102,15,56,220,217
1768	paddq	%xmm15,%xmm15
1769	pand	%xmm8,%xmm14
1770.byte	102,15,56,220,225
1771.byte	102,15,56,220,233
/* Fourth final-round mask (tweak 4 ^ last round key) is parked only now, */
/* just before xmm13 is recycled for the next batch of tweaks. */
1772	movdqa	%xmm13,48(%rsp)
1773	pxor	%xmm14,%xmm15
1774.byte	102,15,56,220,241
1775	movaps	%xmm12,%xmm13
1776	movdqa	%xmm9,%xmm14
1777.byte	102,15,56,220,249
1778	movups	-32(%rcx),%xmm1
1779
1780	paddd	%xmm9,%xmm9
1781.byte	102,15,56,220,208
1782	pxor	%xmm15,%xmm12
1783	psrad	$31,%xmm14
1784.byte	102,15,56,220,216
1785	paddq	%xmm15,%xmm15
1786	pand	%xmm8,%xmm14
1787.byte	102,15,56,220,224
1788.byte	102,15,56,220,232
1789.byte	102,15,56,220,240
1790	pxor	%xmm14,%xmm15
1791	movaps	%xmm13,%xmm14
1792.byte	102,15,56,220,248
1793
/* xmm0 is borrowed as the mask temporary, then reloaded with round key 0. */
1794	movdqa	%xmm9,%xmm0
1795	paddd	%xmm9,%xmm9
1796.byte	102,15,56,220,209
1797	pxor	%xmm15,%xmm13
1798	psrad	$31,%xmm0
1799.byte	102,15,56,220,217
1800	paddq	%xmm15,%xmm15
1801	pand	%xmm8,%xmm0
1802.byte	102,15,56,220,225
1803.byte	102,15,56,220,233
1804	pxor	%xmm0,%xmm15
1805	movups	(%rbp),%xmm0
1806.byte	102,15,56,220,241
1807.byte	102,15,56,220,249
1808	movups	16(%rbp),%xmm1
1809
/*
 * Final round: 102,15,56,221,<modrm>,36,<disp> = aesenclast disp(%rsp),
 * %xmm2..%xmm7, using the tweak ^ last-round-key values saved on the stack.
 * rax is reset from r10 for the next iteration.
 */
1810	pxor	%xmm15,%xmm14
1811.byte	102,15,56,221,84,36,0
1812	psrad	$31,%xmm9
1813	paddq	%xmm15,%xmm15
1814.byte	102,15,56,221,92,36,16
1815.byte	102,15,56,221,100,36,32
1816	pand	%xmm8,%xmm9
1817	movq	%r10,%rax
1818.byte	102,15,56,221,108,36,48
1819.byte	102,15,56,221,116,36,64
1820.byte	102,15,56,221,124,36,80
1821	pxor	%xmm9,%xmm15
1822
/* Store six output blocks; loop while at least 96 more bytes remain. */
1823	leaq	96(%rsi),%rsi
1824	movups	%xmm2,-96(%rsi)
1825	movups	%xmm3,-80(%rsi)
1826	movups	%xmm4,-64(%rsi)
1827	movups	%xmm5,-48(%rsi)
1828	movups	%xmm6,-32(%rsi)
1829	movups	%xmm7,-16(%rsi)
1830	subq	$96,%rdx
1831	jnc	L$xts_enc_grandloop
1832
/* Recover the round counter: (112 - r10) / 16; rcx = key schedule start. */
1833	movl	$16+96,%eax
1834	subl	%r10d,%eax
1835	movq	%rbp,%rcx
1836	shrl	$4,%eax
1837
/*
 * L$xts_enc_short: handles the 0..5 whole blocks left after the main loop.
 * Here eax = round counter (saved to r10d), rcx = key schedule, xmm0 =
 * round key 0, and xmm10..xmm14 still carry tweak ^ round key 0, so each
 * tweak is un-masked with pxor %xmm0 just before it may be needed.
 * pxor does not modify flags, so every je below tests the preceding cmpq.
 */
1838L$xts_enc_short:
1839
1840	movl	%eax,%r10d
1841	pxor	%xmm0,%xmm10
/* Undo the earlier subtraction of 96; zero means no whole blocks remain. */
1842	addq	$96,%rdx
1843	jz	L$xts_enc_done
1844
1845	pxor	%xmm0,%xmm11
1846	cmpq	$0x20,%rdx
1847	jb	L$xts_enc_one
1848	pxor	%xmm0,%xmm12
1849	je	L$xts_enc_two
1850
1851	pxor	%xmm0,%xmm13
1852	cmpq	$0x40,%rdx
1853	jb	L$xts_enc_three
1854	pxor	%xmm0,%xmm14
1855	je	L$xts_enc_four
1856
/* Five blocks: XOR with tweaks, run the six-block helper with a zeroed */
/* sixth lane (xmm7), XOR the tweaks again and store 80 bytes. */
1857	movdqu	(%rdi),%xmm2
1858	movdqu	16(%rdi),%xmm3
1859	movdqu	32(%rdi),%xmm4
1860	pxor	%xmm10,%xmm2
1861	movdqu	48(%rdi),%xmm5
1862	pxor	%xmm11,%xmm3
1863	movdqu	64(%rdi),%xmm6
1864	leaq	80(%rdi),%rdi
1865	pxor	%xmm12,%xmm4
1866	pxor	%xmm13,%xmm5
1867	pxor	%xmm14,%xmm6
1868	pxor	%xmm7,%xmm7
1869
/* _aesni_encrypt6 is defined outside this view; presumably it takes the */
/* key in rcx and the round counter in eax like _aesni_encrypt2 -- confirm. */
1870	call	_aesni_encrypt6
1871
1872	xorps	%xmm10,%xmm2
/* xmm10 = next unused tweak (xmm15), for the stealing step in xts_enc_done. */
1873	movdqa	%xmm15,%xmm10
1874	xorps	%xmm11,%xmm3
1875	xorps	%xmm12,%xmm4
1876	movdqu	%xmm2,(%rsi)
1877	xorps	%xmm13,%xmm5
1878	movdqu	%xmm3,16(%rsi)
1879	xorps	%xmm14,%xmm6
1880	movdqu	%xmm4,32(%rsi)
1881	movdqu	%xmm5,48(%rsi)
1882	movdqu	%xmm6,64(%rsi)
1883	leaq	80(%rsi),%rsi
1884	jmp	L$xts_enc_done
1885
/*
 * L$xts_enc_one: one remaining whole block.
 * XOR the block with the tweak in xmm10, encrypt it inline (rcx = key
 * schedule, eax = round counter), XOR with the tweak again and store.
 * 102,15,56,220,209 = aesenc %xmm1,%xmm2; ...,221,209 = aesenclast.
 */
1886.p2align	4
1887L$xts_enc_one:
1888	movups	(%rdi),%xmm2
1889	leaq	16(%rdi),%rdi
1890	xorps	%xmm10,%xmm2
1891	movups	(%rcx),%xmm0
1892	movups	16(%rcx),%xmm1
1893	leaq	32(%rcx),%rcx
1894	xorps	%xmm0,%xmm2
1895L$oop_enc1_9:
1896.byte	102,15,56,220,209
1897	decl	%eax
1898	movups	(%rcx),%xmm1
1899	leaq	16(%rcx),%rcx
1900	jnz	L$oop_enc1_9
1901.byte	102,15,56,221,209
1902	xorps	%xmm10,%xmm2
/* Move the following tweak (xmm11) into xmm10 for the stealing step. */
1903	movdqa	%xmm11,%xmm10
1904	movups	%xmm2,(%rsi)
1905	leaq	16(%rsi),%rsi
1906	jmp	L$xts_enc_done
1907
/*
 * L$xts_enc_two: two remaining whole blocks.
 * Blocks are XORed with the tweaks in xmm10/xmm11 before and after
 * _aesni_encrypt2, which works on xmm2/xmm3 with the key in rcx and the
 * round counter in eax.
 */
1908.p2align	4
1909L$xts_enc_two:
1910	movups	(%rdi),%xmm2
1911	movups	16(%rdi),%xmm3
1912	leaq	32(%rdi),%rdi
1913	xorps	%xmm10,%xmm2
1914	xorps	%xmm11,%xmm3
1915
1916	call	_aesni_encrypt2
1917
1918	xorps	%xmm10,%xmm2
/* xmm10 = following tweak (xmm12), used by the stealing step. */
1919	movdqa	%xmm12,%xmm10
1920	xorps	%xmm11,%xmm3
1921	movups	%xmm2,(%rsi)
1922	movups	%xmm3,16(%rsi)
1923	leaq	32(%rsi),%rsi
1924	jmp	L$xts_enc_done
1925
/*
 * L$xts_enc_three: three remaining whole blocks.
 * Blocks are XORed with the tweaks in xmm10..xmm12 before and after the
 * call.  _aesni_encrypt3 is defined outside this view; presumably it works
 * on xmm2..xmm4 with the key in rcx and rounds in eax -- confirm there.
 */
1926.p2align	4
1927L$xts_enc_three:
1928	movups	(%rdi),%xmm2
1929	movups	16(%rdi),%xmm3
1930	movups	32(%rdi),%xmm4
1931	leaq	48(%rdi),%rdi
1932	xorps	%xmm10,%xmm2
1933	xorps	%xmm11,%xmm3
1934	xorps	%xmm12,%xmm4
1935
1936	call	_aesni_encrypt3
1937
1938	xorps	%xmm10,%xmm2
/* xmm10 = following tweak (xmm13), used by the stealing step. */
1939	movdqa	%xmm13,%xmm10
1940	xorps	%xmm11,%xmm3
1941	xorps	%xmm12,%xmm4
1942	movups	%xmm2,(%rsi)
1943	movups	%xmm3,16(%rsi)
1944	movups	%xmm4,32(%rsi)
1945	leaq	48(%rsi),%rsi
1946	jmp	L$xts_enc_done
1947
/*
 * L$xts_enc_four: four remaining whole blocks.
 * Blocks are XORed with the tweaks in xmm10..xmm13 before and after the
 * call.  _aesni_encrypt4 is defined outside this view; presumably it works
 * on xmm2..xmm5 with the key in rcx and rounds in eax -- confirm there.
 */
1948.p2align	4
1949L$xts_enc_four:
1950	movups	(%rdi),%xmm2
1951	movups	16(%rdi),%xmm3
1952	movups	32(%rdi),%xmm4
1953	xorps	%xmm10,%xmm2
1954	movups	48(%rdi),%xmm5
1955	leaq	64(%rdi),%rdi
1956	xorps	%xmm11,%xmm3
1957	xorps	%xmm12,%xmm4
1958	xorps	%xmm13,%xmm5
1959
1960	call	_aesni_encrypt4
1961
1962	pxor	%xmm10,%xmm2
/* xmm10 = following tweak (xmm14), used by the stealing step. */
1963	movdqa	%xmm14,%xmm10
1964	pxor	%xmm11,%xmm3
1965	pxor	%xmm12,%xmm4
1966	movdqu	%xmm2,(%rsi)
1967	pxor	%xmm13,%xmm5
1968	movdqu	%xmm3,16(%rsi)
1969	movdqu	%xmm4,32(%rsi)
1970	movdqu	%xmm5,48(%rsi)
1971	leaq	64(%rsi),%rsi
1972	jmp	L$xts_enc_done
1973
/*
 * L$xts_enc_done: handle a trailing partial block (ciphertext stealing).
 * r9 holds the original byte length; r9 & 15 is the number of leftover
 * bytes.  xmm10 holds the tweak for the extra block, rbp the key schedule
 * and r10d the round counter.
 */
1974.p2align	4
1975L$xts_enc_done:
1976	andq	$15,%r9
1977	jz	L$xts_enc_ret
1978	movq	%r9,%rdx
1979
/*
 * For each leftover byte: the byte already written in the previous output
 * block moves to the current output position, and the input byte takes
 * its place in the previous output block.
 */
1980L$xts_enc_steal:
1981	movzbl	(%rdi),%eax
1982	movzbl	-16(%rsi),%ecx
1983	leaq	1(%rdi),%rdi
1984	movb	%al,-16(%rsi)
1985	movb	%cl,0(%rsi)
1986	leaq	1(%rsi),%rsi
1987	subq	$1,%rdx
1988	jnz	L$xts_enc_steal
1989
/* Rewind rsi and re-encrypt the patched block at -16(%rsi) in place. */
1990	subq	%r9,%rsi
1991	movq	%rbp,%rcx
1992	movl	%r10d,%eax
1993
1994	movups	-16(%rsi),%xmm2
1995	xorps	%xmm10,%xmm2
1996	movups	(%rcx),%xmm0
1997	movups	16(%rcx),%xmm1
1998	leaq	32(%rcx),%rcx
1999	xorps	%xmm0,%xmm2
/* 102,15,56,220,209 = aesenc %xmm1,%xmm2; ...,221,209 = aesenclast. */
2000L$oop_enc1_10:
2001.byte	102,15,56,220,209
2002	decl	%eax
2003	movups	(%rcx),%xmm1
2004	leaq	16(%rcx),%rcx
2005	jnz	L$oop_enc1_10
2006.byte	102,15,56,221,209
2007	xorps	%xmm10,%xmm2
2008	movups	%xmm2,-16(%rsi)
2009
/*
 * L$xts_enc_ret: common exit.
 * Zeroes xmm0..xmm15 and the 112 bytes of stack scratch at 0..96(%rsp)
 * so no key or tweak material is left behind, restores rbp from the slot
 * below the entry rsp (kept in r11) and restores rsp.
 */
2010L$xts_enc_ret:
2011	xorps	%xmm0,%xmm0
2012	pxor	%xmm1,%xmm1
2013	pxor	%xmm2,%xmm2
2014	pxor	%xmm3,%xmm3
2015	pxor	%xmm4,%xmm4
2016	pxor	%xmm5,%xmm5
2017	pxor	%xmm6,%xmm6
2018	pxor	%xmm7,%xmm7
2019	movaps	%xmm0,0(%rsp)
2020	pxor	%xmm8,%xmm8
2021	movaps	%xmm0,16(%rsp)
2022	pxor	%xmm9,%xmm9
2023	movaps	%xmm0,32(%rsp)
2024	pxor	%xmm10,%xmm10
2025	movaps	%xmm0,48(%rsp)
2026	pxor	%xmm11,%xmm11
2027	movaps	%xmm0,64(%rsp)
2028	pxor	%xmm12,%xmm12
2029	movaps	%xmm0,80(%rsp)
2030	pxor	%xmm13,%xmm13
2031	movaps	%xmm0,96(%rsp)
2032	pxor	%xmm14,%xmm14
2033	pxor	%xmm15,%xmm15
2034	movq	-8(%r11),%rbp
2035	leaq	(%r11),%rsp
2036L$xts_enc_epilogue:
/* 0xf3,0xc3 = rep ret. */
2037	.byte	0xf3,0xc3
2038
/*
 * _aesni_xts_decrypt: entry and tweak setup.
 * Registers as read by the code below:
 *   rdi = input pointer        rsi = output pointer
 *   rdx = length in bytes      rcx = key schedule used for the data
 *   r8  = key schedule used to encrypt the initial tweak
 *   r9  = pointer to the 16-byte tweak input
 * The round counter is read from offset 240 of each key schedule.
 * r11 keeps the entry rsp for the epilogue; rbp is pushed and then reused
 * as the data key pointer.  The stack is re-aligned to 16 bytes and at
 * least 112 bytes of scratch are reserved at (%rsp).
 * Note the tweak is processed with the encrypt opcodes even here:
 *   102,15,56,220,209 = aesenc %xmm1,%xmm2
 *   102,15,56,221,209 = aesenclast %xmm1,%xmm2
 */
2039.globl	_aesni_xts_decrypt
2040.private_extern _aesni_xts_decrypt
2041
2042.p2align	4
2043_aesni_xts_decrypt:
2044	leaq	(%rsp),%r11
2045	pushq	%rbp
2046	subq	$112,%rsp
2047	andq	$-16,%rsp
/* Encrypt the 16 bytes at (%r9) with the schedule at r8; result in xmm2. */
2048	movups	(%r9),%xmm2
2049	movl	240(%r8),%eax
2050	movl	240(%rcx),%r10d
2051	movups	(%r8),%xmm0
2052	movups	16(%r8),%xmm1
2053	leaq	32(%r8),%r8
2054	xorps	%xmm0,%xmm2
2055L$oop_enc1_11:
2056.byte	102,15,56,220,209
2057	decl	%eax
2058	movups	(%r8),%xmm1
2059	leaq	16(%r8),%r8
2060	jnz	L$oop_enc1_11
2061.byte	102,15,56,221,209
/*
 * When the length is not a multiple of 16, hold back one extra full block
 * (rdx -= 16): that block is handled together with the partial tail by
 * the stealing code.
 */
2062	xorl	%eax,%eax
2063	testq	$15,%rdx
2064	setnz	%al
2065	shlq	$4,%rax
2066	subq	%rax,%rdx
2067
/*
 * xmm0 = round key 0 of the data key, rbp = data key pointer,
 * eax = round counter, r10 = 16 * round counter,
 * r9 = adjusted byte length, rdx = that length rounded down to blocks.
 */
2068	movups	(%rcx),%xmm0
2069	movq	%rcx,%rbp
2070	movl	%r10d,%eax
2071	shll	$4,%r10d
2072	movq	%rdx,%r9
2073	andq	$-16,%rdx
2074
/* xmm1 = last round key (the key the single-block loops end on). */
2075	movups	16(%rcx,%r10,1),%xmm1
2076
/*
 * Derive six consecutive tweaks from the encrypted tweak in xmm2.
 * Each step copies the current tweak (xmm15) out, doubles both 64-bit
 * halves with paddq, and XORs in a correction built from sign masks of
 * xmm9 ANDed with L$xts_magic (constant defined outside this view;
 * presumably the GF(2^128) reduction/carry pattern -- confirm there).
 * xmm10..xmm14 end up as tweak ^ round key 0; xmm15 is the sixth tweak,
 * left unmasked.
 */
2077	movdqa	L$xts_magic(%rip),%xmm8
2078	movdqa	%xmm2,%xmm15
2079	pshufd	$0x5f,%xmm2,%xmm9
/* xmm1 = round key 0 ^ last round key, parked at 96(%rsp) further down. */
2080	pxor	%xmm0,%xmm1
2081	movdqa	%xmm9,%xmm14
2082	paddd	%xmm9,%xmm9
2083	movdqa	%xmm15,%xmm10
2084	psrad	$31,%xmm14
2085	paddq	%xmm15,%xmm15
2086	pand	%xmm8,%xmm14
2087	pxor	%xmm0,%xmm10
2088	pxor	%xmm14,%xmm15
2089	movdqa	%xmm9,%xmm14
2090	paddd	%xmm9,%xmm9
2091	movdqa	%xmm15,%xmm11
2092	psrad	$31,%xmm14
2093	paddq	%xmm15,%xmm15
2094	pand	%xmm8,%xmm14
2095	pxor	%xmm0,%xmm11
2096	pxor	%xmm14,%xmm15
2097	movdqa	%xmm9,%xmm14
2098	paddd	%xmm9,%xmm9
2099	movdqa	%xmm15,%xmm12
2100	psrad	$31,%xmm14
2101	paddq	%xmm15,%xmm15
2102	pand	%xmm8,%xmm14
2103	pxor	%xmm0,%xmm12
2104	pxor	%xmm14,%xmm15
2105	movdqa	%xmm9,%xmm14
2106	paddd	%xmm9,%xmm9
2107	movdqa	%xmm15,%xmm13
2108	psrad	$31,%xmm14
2109	paddq	%xmm15,%xmm15
2110	pand	%xmm8,%xmm14
2111	pxor	%xmm0,%xmm13
2112	pxor	%xmm14,%xmm15
/* Last step uses xmm9 itself as the mask register. */
2113	movdqa	%xmm15,%xmm14
2114	psrad	$31,%xmm9
2115	paddq	%xmm15,%xmm15
2116	pand	%xmm8,%xmm9
2117	pxor	%xmm0,%xmm14
2118	pxor	%xmm9,%xmm15
2119	movaps	%xmm1,96(%rsp)
2120
/* Fewer than 96 bytes (six blocks) of whole blocks: take the short path. */
2121	subq	$96,%rdx
2122	jc	L$xts_dec_short
2123
/*
 * Main-loop setup: rcx = key + 32 + 16*rounds, rax = r10 = 112 - 16*rounds
 * (a negative offset that counts up to zero inside the round loop),
 * xmm1 = round key 1, r8 = address of L$xts_magic.
 */
2124	movl	$16+96,%eax
2125	leaq	32(%rbp,%r10,1),%rcx
2126	subq	%r10,%rax
2127	movups	16(%rbp),%xmm1
2128	movq	%rax,%r10
2129	leaq	L$xts_magic(%rip),%r8
2130	jmp	L$xts_dec_grandloop
2131
/*
 * L$xts_dec_grandloop: decrypts 96 bytes (six blocks) per iteration.
 * State at the top of each iteration:
 *   xmm10..xmm14 = tweaks 1..5, each already XORed with round key 0
 *   xmm15        = tweak 6 (not masked)
 *   xmm0 = round key 0, xmm1 = round key 1
 *   rax = r10 = negative round-key offset, rcx = end-relative key pointer
 *   96(%rsp) = round key 0 ^ last round key
 *   r8 = address of the tweak-update constant, rbp = key schedule
 * Hand-encoded opcodes: 102,15,56,222,x = aesdec, 102,15,56,223,x =
 * aesdeclast.  Last byte 209..249 selects %xmm1 as source and
 * %xmm2..%xmm7 as destination; 208..248 selects %xmm0 as source.
 */
2132.p2align	5
2133L$xts_dec_grandloop:
/* Load six input blocks and XOR in tweak ^ round key 0 (round 0). */
2134	movdqu	0(%rdi),%xmm2
2135	movdqa	%xmm0,%xmm8
2136	movdqu	16(%rdi),%xmm3
2137	pxor	%xmm10,%xmm2
2138	movdqu	32(%rdi),%xmm4
2139	pxor	%xmm11,%xmm3
2140.byte	102,15,56,222,209
2141	movdqu	48(%rdi),%xmm5
2142	pxor	%xmm12,%xmm4
2143.byte	102,15,56,222,217
2144	movdqu	64(%rdi),%xmm6
2145	pxor	%xmm13,%xmm5
2146.byte	102,15,56,222,225
2147	movdqu	80(%rdi),%xmm7
/* xmm8 = round key 0 ^ tweak 6. */
2148	pxor	%xmm15,%xmm8
2149	movdqa	96(%rsp),%xmm9
2150	pxor	%xmm14,%xmm6
2151.byte	102,15,56,222,233
2152	movups	32(%rbp),%xmm0
2153	leaq	96(%rdi),%rdi
2154	pxor	%xmm8,%xmm7
2155
/*
 * XOR each masked tweak with (round key 0 ^ last round key); this leaves
 * tweak ^ last round key, which is stored at 0..80(%rsp) and later used
 * as the memory operand of aesdeclast.
 */
2156	pxor	%xmm9,%xmm10
2157.byte	102,15,56,222,241
2158	pxor	%xmm9,%xmm11
2159	movdqa	%xmm10,0(%rsp)
2160.byte	102,15,56,222,249
2161	movups	48(%rbp),%xmm1
2162	pxor	%xmm9,%xmm12
2163
2164.byte	102,15,56,222,208
2165	pxor	%xmm9,%xmm13
2166	movdqa	%xmm11,16(%rsp)
2167.byte	102,15,56,222,216
2168	pxor	%xmm9,%xmm14
2169	movdqa	%xmm12,32(%rsp)
2170.byte	102,15,56,222,224
2171.byte	102,15,56,222,232
2172	pxor	%xmm9,%xmm8
2173	movdqa	%xmm14,64(%rsp)
2174.byte	102,15,56,222,240
2175.byte	102,15,56,222,248
2176	movups	64(%rbp),%xmm0
2177	movdqa	%xmm8,80(%rsp)
/* Seed the sign-mask register for the next round of tweak updates. */
2178	pshufd	$0x5f,%xmm15,%xmm9
2179	jmp	L$xts_dec_loop6
2180.p2align	5
/*
 * Two AES rounds on all six blocks per iteration.  rax advances by 32 and
 * the loop ends when it reaches zero; the jnz tests the flags of the addq,
 * which the intervening aesdec/movups instructions do not modify.
 */
2181L$xts_dec_loop6:
2182.byte	102,15,56,222,209
2183.byte	102,15,56,222,217
2184.byte	102,15,56,222,225
2185.byte	102,15,56,222,233
2186.byte	102,15,56,222,241
2187.byte	102,15,56,222,249
2188	movups	-64(%rcx,%rax,1),%xmm1
2189	addq	$32,%rax
2190
2191.byte	102,15,56,222,208
2192.byte	102,15,56,222,216
2193.byte	102,15,56,222,224
2194.byte	102,15,56,222,232
2195.byte	102,15,56,222,240
2196.byte	102,15,56,222,248
2197	movups	-80(%rcx,%rax,1),%xmm0
2198	jnz	L$xts_dec_loop6
2199
/*
 * Remaining rounds, interleaved with computing the next six tweaks.
 * xmm8 is reloaded with the constant at (%r8); xmm10..xmm14 are rebuilt
 * as round key 0 ^ next tweak, and xmm15 keeps advancing.
 */
2200	movdqa	(%r8),%xmm8
2201	movdqa	%xmm9,%xmm14
2202	paddd	%xmm9,%xmm9
2203.byte	102,15,56,222,209
2204	paddq	%xmm15,%xmm15
2205	psrad	$31,%xmm14
2206.byte	102,15,56,222,217
2207	pand	%xmm8,%xmm14
2208	movups	(%rbp),%xmm10
2209.byte	102,15,56,222,225
2210.byte	102,15,56,222,233
2211.byte	102,15,56,222,241
2212	pxor	%xmm14,%xmm15
2213	movaps	%xmm10,%xmm11
2214.byte	102,15,56,222,249
2215	movups	-64(%rcx),%xmm1
2216
2217	movdqa	%xmm9,%xmm14
2218.byte	102,15,56,222,208
2219	paddd	%xmm9,%xmm9
2220	pxor	%xmm15,%xmm10
2221.byte	102,15,56,222,216
2222	psrad	$31,%xmm14
2223	paddq	%xmm15,%xmm15
2224.byte	102,15,56,222,224
2225.byte	102,15,56,222,232
2226	pand	%xmm8,%xmm14
2227	movaps	%xmm11,%xmm12
2228.byte	102,15,56,222,240
2229	pxor	%xmm14,%xmm15
2230	movdqa	%xmm9,%xmm14
2231.byte	102,15,56,222,248
2232	movups	-48(%rcx),%xmm0
2233
2234	paddd	%xmm9,%xmm9
2235.byte	102,15,56,222,209
2236	pxor	%xmm15,%xmm11
2237	psrad	$31,%xmm14
2238.byte	102,15,56,222,217
2239	paddq	%xmm15,%xmm15
2240	pand	%xmm8,%xmm14
2241.byte	102,15,56,222,225
2242.byte	102,15,56,222,233
/* Fourth final-round mask (tweak 4 ^ last round key) is parked only now, */
/* just before xmm13 is recycled for the next batch of tweaks. */
2243	movdqa	%xmm13,48(%rsp)
2244	pxor	%xmm14,%xmm15
2245.byte	102,15,56,222,241
2246	movaps	%xmm12,%xmm13
2247	movdqa	%xmm9,%xmm14
2248.byte	102,15,56,222,249
2249	movups	-32(%rcx),%xmm1
2250
2251	paddd	%xmm9,%xmm9
2252.byte	102,15,56,222,208
2253	pxor	%xmm15,%xmm12
2254	psrad	$31,%xmm14
2255.byte	102,15,56,222,216
2256	paddq	%xmm15,%xmm15
2257	pand	%xmm8,%xmm14
2258.byte	102,15,56,222,224
2259.byte	102,15,56,222,232
2260.byte	102,15,56,222,240
2261	pxor	%xmm14,%xmm15
2262	movaps	%xmm13,%xmm14
2263.byte	102,15,56,222,248
2264
/* xmm0 is borrowed as the mask temporary, then reloaded with round key 0. */
2265	movdqa	%xmm9,%xmm0
2266	paddd	%xmm9,%xmm9
2267.byte	102,15,56,222,209
2268	pxor	%xmm15,%xmm13
2269	psrad	$31,%xmm0
2270.byte	102,15,56,222,217
2271	paddq	%xmm15,%xmm15
2272	pand	%xmm8,%xmm0
2273.byte	102,15,56,222,225
2274.byte	102,15,56,222,233
2275	pxor	%xmm0,%xmm15
2276	movups	(%rbp),%xmm0
2277.byte	102,15,56,222,241
2278.byte	102,15,56,222,249
2279	movups	16(%rbp),%xmm1
2280
/*
 * Final round: 102,15,56,223,<modrm>,36,<disp> = aesdeclast disp(%rsp),
 * %xmm2..%xmm7, using the tweak ^ last-round-key values saved on the stack.
 * rax is reset from r10 for the next iteration.
 */
2281	pxor	%xmm15,%xmm14
2282.byte	102,15,56,223,84,36,0
2283	psrad	$31,%xmm9
2284	paddq	%xmm15,%xmm15
2285.byte	102,15,56,223,92,36,16
2286.byte	102,15,56,223,100,36,32
2287	pand	%xmm8,%xmm9
2288	movq	%r10,%rax
2289.byte	102,15,56,223,108,36,48
2290.byte	102,15,56,223,116,36,64
2291.byte	102,15,56,223,124,36,80
2292	pxor	%xmm9,%xmm15
2293
/* Store six output blocks; loop while at least 96 more bytes remain. */
2294	leaq	96(%rsi),%rsi
2295	movups	%xmm2,-96(%rsi)
2296	movups	%xmm3,-80(%rsi)
2297	movups	%xmm4,-64(%rsi)
2298	movups	%xmm5,-48(%rsi)
2299	movups	%xmm6,-32(%rsi)
2300	movups	%xmm7,-16(%rsi)
2301	subq	$96,%rdx
2302	jnc	L$xts_dec_grandloop
2303
/* Recover the round counter: (112 - r10) / 16; rcx = key schedule start. */
2304	movl	$16+96,%eax
2305	subl	%r10d,%eax
2306	movq	%rbp,%rcx
2307	shrl	$4,%eax
2308
/*
 * L$xts_dec_short: handles the 0..5 whole blocks left after the main loop.
 * Here eax = round counter (saved to r10d), rcx = key schedule, xmm0 =
 * round key 0, and xmm10..xmm14 still carry tweak ^ round key 0.  The
 * first two tweaks are un-masked immediately because the stealing code
 * in xts_dec_done needs both xmm10 and xmm11 even when no whole block
 * remains.  pxor does not modify flags, so each je tests the cmpq before it.
 */
2309L$xts_dec_short:
2310
2311	movl	%eax,%r10d
2312	pxor	%xmm0,%xmm10
2313	pxor	%xmm0,%xmm11
/* Undo the earlier subtraction of 96; zero means no whole blocks remain. */
2314	addq	$96,%rdx
2315	jz	L$xts_dec_done
2316
2317	pxor	%xmm0,%xmm12
2318	cmpq	$0x20,%rdx
2319	jb	L$xts_dec_one
2320	pxor	%xmm0,%xmm13
2321	je	L$xts_dec_two
2322
2323	pxor	%xmm0,%xmm14
2324	cmpq	$0x40,%rdx
2325	jb	L$xts_dec_three
2326	je	L$xts_dec_four
2327
/* Five blocks: XOR with tweaks, call the six-block helper, XOR again. */
2328	movdqu	(%rdi),%xmm2
2329	movdqu	16(%rdi),%xmm3
2330	movdqu	32(%rdi),%xmm4
2331	pxor	%xmm10,%xmm2
2332	movdqu	48(%rdi),%xmm5
2333	pxor	%xmm11,%xmm3
2334	movdqu	64(%rdi),%xmm6
2335	leaq	80(%rdi),%rdi
2336	pxor	%xmm12,%xmm4
2337	pxor	%xmm13,%xmm5
2338	pxor	%xmm14,%xmm6
2339
/* _aesni_decrypt6 is defined outside this view; presumably it takes the */
/* key in rcx and the round counter in eax like _aesni_decrypt2 -- confirm. */
2340	call	_aesni_decrypt6
2341
2342	xorps	%xmm10,%xmm2
2343	xorps	%xmm11,%xmm3
2344	xorps	%xmm12,%xmm4
2345	movdqu	%xmm2,(%rsi)
2346	xorps	%xmm13,%xmm5
2347	movdqu	%xmm3,16(%rsi)
2348	xorps	%xmm14,%xmm6
2349	movdqu	%xmm4,32(%rsi)
/* xmm14 = per-dword sign mask of the sixth tweak (0 > xmm15). */
2350	pxor	%xmm14,%xmm14
2351	movdqu	%xmm5,48(%rsi)
2352	pcmpgtd	%xmm15,%xmm14
2353	movdqu	%xmm6,64(%rsi)
2354	leaq	80(%rsi),%rsi
2355	pshufd	$0x13,%xmm14,%xmm11
/* No partial tail: done.  The andq sets the flags tested by jz. */
2356	andq	$15,%r9
2357	jz	L$xts_dec_ret
2358
/*
 * Partial tail: xmm10 = sixth tweak, xmm11 = the tweak after it (doubled
 * xmm15 corrected with the mask ANDed with xmm8; xmm8 is expected to
 * still hold the tweak-update constant here -- assumes _aesni_decrypt6
 * leaves xmm8 untouched, confirm in its definition).
 */
2359	movdqa	%xmm15,%xmm10
2360	paddq	%xmm15,%xmm15
2361	pand	%xmm8,%xmm11
2362	pxor	%xmm15,%xmm11
2363	jmp	L$xts_dec_done2
2364
/*
 * L$xts_dec_one: one remaining whole block.
 * XOR the block with the tweak in xmm10, decrypt it inline (rcx = key
 * schedule, eax = round counter), XOR with the tweak again and store.
 * 102,15,56,222,209 = aesdec %xmm1,%xmm2; ...,223,209 = aesdeclast.
 */
2365.p2align	4
2366L$xts_dec_one:
2367	movups	(%rdi),%xmm2
2368	leaq	16(%rdi),%rdi
2369	xorps	%xmm10,%xmm2
2370	movups	(%rcx),%xmm0
2371	movups	16(%rcx),%xmm1
2372	leaq	32(%rcx),%rcx
2373	xorps	%xmm0,%xmm2
2374L$oop_dec1_12:
2375.byte	102,15,56,222,209
2376	decl	%eax
2377	movups	(%rcx),%xmm1
2378	leaq	16(%rcx),%rcx
2379	jnz	L$oop_dec1_12
2380.byte	102,15,56,223,209
2381	xorps	%xmm10,%xmm2
/* Shift the next two tweaks into xmm10/xmm11 for the stealing step. */
2382	movdqa	%xmm11,%xmm10
2383	movups	%xmm2,(%rsi)
2384	movdqa	%xmm12,%xmm11
2385	leaq	16(%rsi),%rsi
2386	jmp	L$xts_dec_done
2387
/*
 * L$xts_dec_two: two remaining whole blocks.
 * Blocks are XORed with the tweaks in xmm10/xmm11 before and after
 * _aesni_decrypt2, which works on xmm2/xmm3 with the key in rcx and the
 * round counter in eax.
 */
2388.p2align	4
2389L$xts_dec_two:
2390	movups	(%rdi),%xmm2
2391	movups	16(%rdi),%xmm3
2392	leaq	32(%rdi),%rdi
2393	xorps	%xmm10,%xmm2
2394	xorps	%xmm11,%xmm3
2395
2396	call	_aesni_decrypt2
2397
2398	xorps	%xmm10,%xmm2
/* Shift the next two tweaks into xmm10/xmm11 for the stealing step. */
2399	movdqa	%xmm12,%xmm10
2400	xorps	%xmm11,%xmm3
2401	movdqa	%xmm13,%xmm11
2402	movups	%xmm2,(%rsi)
2403	movups	%xmm3,16(%rsi)
2404	leaq	32(%rsi),%rsi
2405	jmp	L$xts_dec_done
2406
/*
 * L$xts_dec_three: three remaining whole blocks.
 * Blocks are XORed with the tweaks in xmm10..xmm12 before and after the
 * call.  _aesni_decrypt3 is defined outside this view; presumably it works
 * on xmm2..xmm4 with the key in rcx and rounds in eax -- confirm there.
 */
2407.p2align	4
2408L$xts_dec_three:
2409	movups	(%rdi),%xmm2
2410	movups	16(%rdi),%xmm3
2411	movups	32(%rdi),%xmm4
2412	leaq	48(%rdi),%rdi
2413	xorps	%xmm10,%xmm2
2414	xorps	%xmm11,%xmm3
2415	xorps	%xmm12,%xmm4
2416
2417	call	_aesni_decrypt3
2418
2419	xorps	%xmm10,%xmm2
/* Shift the next two tweaks into xmm10/xmm11 for the stealing step. */
2420	movdqa	%xmm13,%xmm10
2421	xorps	%xmm11,%xmm3
2422	movdqa	%xmm14,%xmm11
2423	xorps	%xmm12,%xmm4
2424	movups	%xmm2,(%rsi)
2425	movups	%xmm3,16(%rsi)
2426	movups	%xmm4,32(%rsi)
2427	leaq	48(%rsi),%rsi
2428	jmp	L$xts_dec_done
2429
/*
 * L$xts_dec_four: four remaining whole blocks.
 * Blocks are XORed with the tweaks in xmm10..xmm13 before and after the
 * call.  _aesni_decrypt4 is defined outside this view; presumably it works
 * on xmm2..xmm5 with the key in rcx and rounds in eax -- confirm there.
 */
2430.p2align	4
2431L$xts_dec_four:
2432	movups	(%rdi),%xmm2
2433	movups	16(%rdi),%xmm3
2434	movups	32(%rdi),%xmm4
2435	xorps	%xmm10,%xmm2
2436	movups	48(%rdi),%xmm5
2437	leaq	64(%rdi),%rdi
2438	xorps	%xmm11,%xmm3
2439	xorps	%xmm12,%xmm4
2440	xorps	%xmm13,%xmm5
2441
2442	call	_aesni_decrypt4
2443
2444	pxor	%xmm10,%xmm2
/* Shift the next two tweaks (xmm14, xmm15) into xmm10/xmm11 for stealing. */
2445	movdqa	%xmm14,%xmm10
2446	pxor	%xmm11,%xmm3
2447	movdqa	%xmm15,%xmm11
2448	pxor	%xmm12,%xmm4
2449	movdqu	%xmm2,(%rsi)
2450	pxor	%xmm13,%xmm5
2451	movdqu	%xmm3,16(%rsi)
2452	movdqu	%xmm4,32(%rsi)
2453	movdqu	%xmm5,48(%rsi)
2454	leaq	64(%rsi),%rsi
2455	jmp	L$xts_dec_done
2456
/*
 * L$xts_dec_done: handle a trailing partial block (ciphertext stealing).
 * r9 & 15 is the number of leftover bytes.  On arrival xmm10 and xmm11
 * hold two consecutive tweaks (xmm11 the later one), rbp the key schedule
 * and r10d the round counter.
 * 102,15,56,222,209 = aesdec %xmm1,%xmm2; ...,223,209 = aesdeclast.
 */
2457.p2align	4
2458L$xts_dec_done:
2459	andq	$15,%r9
2460	jz	L$xts_dec_ret
2461L$xts_dec_done2:
2462	movq	%r9,%rdx
2463	movq	%rbp,%rcx
2464	movl	%r10d,%eax
2465
/* Decrypt the last full input block with the LATER tweak (xmm11). */
2466	movups	(%rdi),%xmm2
2467	xorps	%xmm11,%xmm2
2468	movups	(%rcx),%xmm0
2469	movups	16(%rcx),%xmm1
2470	leaq	32(%rcx),%rcx
2471	xorps	%xmm0,%xmm2
2472L$oop_dec1_13:
2473.byte	102,15,56,222,209
2474	decl	%eax
2475	movups	(%rcx),%xmm1
2476	leaq	16(%rcx),%rcx
2477	jnz	L$oop_dec1_13
2478.byte	102,15,56,223,209
2479	xorps	%xmm11,%xmm2
2480	movups	%xmm2,(%rsi)
2481
/*
 * For each leftover byte: the byte just written at (%rsi) moves to the
 * partial output position 16(%rsi), and the partial input byte at
 * 16(%rdi) takes its place.
 */
2482L$xts_dec_steal:
2483	movzbl	16(%rdi),%eax
2484	movzbl	(%rsi),%ecx
2485	leaq	1(%rdi),%rdi
2486	movb	%al,(%rsi)
2487	movb	%cl,16(%rsi)
2488	leaq	1(%rsi),%rsi
2489	subq	$1,%rdx
2490	jnz	L$xts_dec_steal
2491
/* Rewind rsi and decrypt the patched block in place with the EARLIER */
/* tweak (xmm10). */
2492	subq	%r9,%rsi
2493	movq	%rbp,%rcx
2494	movl	%r10d,%eax
2495
2496	movups	(%rsi),%xmm2
2497	xorps	%xmm10,%xmm2
2498	movups	(%rcx),%xmm0
2499	movups	16(%rcx),%xmm1
2500	leaq	32(%rcx),%rcx
2501	xorps	%xmm0,%xmm2
2502L$oop_dec1_14:
2503.byte	102,15,56,222,209
2504	decl	%eax
2505	movups	(%rcx),%xmm1
2506	leaq	16(%rcx),%rcx
2507	jnz	L$oop_dec1_14
2508.byte	102,15,56,223,209
2509	xorps	%xmm10,%xmm2
2510	movups	%xmm2,(%rsi)
2511
/*
 * L$xts_dec_ret: common exit.
 * Zeroes xmm0..xmm15 and the 112 bytes of stack scratch at 0..96(%rsp)
 * so no key or tweak material is left behind, restores rbp from the slot
 * below the entry rsp (kept in r11) and restores rsp.
 */
2512L$xts_dec_ret:
2513	xorps	%xmm0,%xmm0
2514	pxor	%xmm1,%xmm1
2515	pxor	%xmm2,%xmm2
2516	pxor	%xmm3,%xmm3
2517	pxor	%xmm4,%xmm4
2518	pxor	%xmm5,%xmm5
2519	pxor	%xmm6,%xmm6
2520	pxor	%xmm7,%xmm7
2521	movaps	%xmm0,0(%rsp)
2522	pxor	%xmm8,%xmm8
2523	movaps	%xmm0,16(%rsp)
2524	pxor	%xmm9,%xmm9
2525	movaps	%xmm0,32(%rsp)
2526	pxor	%xmm10,%xmm10
2527	movaps	%xmm0,48(%rsp)
2528	pxor	%xmm11,%xmm11
2529	movaps	%xmm0,64(%rsp)
2530	pxor	%xmm12,%xmm12
2531	movaps	%xmm0,80(%rsp)
2532	pxor	%xmm13,%xmm13
2533	movaps	%xmm0,96(%rsp)
2534	pxor	%xmm14,%xmm14
2535	pxor	%xmm15,%xmm15
2536	movq	-8(%r11),%rbp
2537	leaq	(%r11),%rsp
2538L$xts_dec_epilogue:
/* 0xf3,0xc3 = rep ret. */
2539	.byte	0xf3,0xc3
2540
/*
 * _aesni_ocb_encrypt: OCB-style bulk encryption driver.
 * Registers and stack slots as read by the code below:
 *   rdi = input pointer        rsi = output pointer
 *   rdx = number of 16-byte blocks
 *   rcx = key schedule (round counter at offset 240)
 *   r8  = running block index; bsf of it selects table entries
 *   r9  = pointer to the 16-byte running offset (read, then written back)
 *   first stack argument  -> rbx: table of 16-byte entries; entry 0 is
 *                            kept in xmm10
 *   second stack argument -> rbp: pointer to a 16-byte accumulator that
 *                            is XORed with every input block (xmm8)
 * rbx, rbp, r12, r13 and r14 are saved on entry and restored on exit.
 * __ocb_encrypt1 is only partly visible here; statements about it below
 * are assumptions to confirm against its full definition.
 */
2541.globl	_aesni_ocb_encrypt
2542.private_extern _aesni_ocb_encrypt
2543
2544.p2align	5
2545_aesni_ocb_encrypt:
2546	leaq	(%rsp),%rax
2547	pushq	%rbx
2548	pushq	%rbp
2549	pushq	%r12
2550	pushq	%r13
2551	pushq	%r14
2552	movq	8(%rax),%rbx
2553	movq	8+8(%rax),%rbp
2554
/*
 * r11 = key schedule, r10 = 16 * round counter,
 * xmm9 = round key 0, xmm1 = last round key.
 */
2555	movl	240(%rcx),%r10d
2556	movq	%rcx,%r11
2557	shll	$4,%r10d
2558	movups	(%rcx),%xmm9
2559	movups	16(%rcx,%r10,1),%xmm1
2560
/*
 * xmm9 becomes round key 0 ^ last round key and xmm15 becomes
 * offset ^ last round key; the helpers keep the offset in this masked
 * form between calls.
 */
2561	movdqu	(%r9),%xmm15
2562	pxor	%xmm1,%xmm9
2563	pxor	%xmm1,%xmm15
2564
/*
 * rcx = key + 32 + 16*rounds, rax = r10 = 48 - 16*rounds (negative
 * offset counted up to zero by the helper round loops), xmm1 = round key 1.
 */
2565	movl	$16+32,%eax
2566	leaq	32(%r11,%r10,1),%rcx
2567	movups	16(%r11),%xmm1
2568	subq	%r10,%rax
2569	movq	%rax,%r10
2570
2571	movdqu	(%rbx),%xmm10
2572	movdqu	(%rbp),%xmm8
2573
/* If the block index is even, process one block alone first so that the */
/* index is odd when the bulk paths start. */
2574	testq	$1,%r8
2575	jnz	L$ocb_enc_odd
2576
/* xmm7 = table entry selected by the lowest set bit of r8. */
2577	bsfq	%r8,%r12
2578	addq	$1,%r8
2579	shlq	$4,%r12
2580	movdqu	(%rbx,%r12,1),%xmm7
2581	movdqu	(%rdi),%xmm2
2582	leaq	16(%rdi),%rdi
2583
2584	call	__ocb_encrypt1
2585
/* xmm7 is taken as the updated masked offset after the single block. */
2586	movdqa	%xmm7,%xmm15
2587	movups	%xmm2,(%rsi)
2588	leaq	16(%rsi),%rsi
2589	subq	$1,%rdx
2590	jz	L$ocb_enc_done
2591
/*
 * Precompute byte offsets into the table for the even-numbered blocks of
 * the next group of six: 16 * bsf(r8+1), 16 * bsf(r8+3), 16 * bsf(r8+5).
 * The odd-numbered blocks of the group use table entry 0 (xmm10).
 */
2592L$ocb_enc_odd:
2593	leaq	1(%r8),%r12
2594	leaq	3(%r8),%r13
2595	leaq	5(%r8),%r14
2596	leaq	6(%r8),%r8
2597	bsfq	%r12,%r12
2598	bsfq	%r13,%r13
2599	bsfq	%r14,%r14
2600	shlq	$4,%r12
2601	shlq	$4,%r13
2602	shlq	$4,%r14
2603
2604	subq	$6,%rdx
2605	jc	L$ocb_enc_short
2606	jmp	L$ocb_enc_grandloop
2607
2608.p2align	5
/* Six blocks per iteration through __ocb_encrypt6. */
2609L$ocb_enc_grandloop:
2610	movdqu	0(%rdi),%xmm2
2611	movdqu	16(%rdi),%xmm3
2612	movdqu	32(%rdi),%xmm4
2613	movdqu	48(%rdi),%xmm5
2614	movdqu	64(%rdi),%xmm6
2615	movdqu	80(%rdi),%xmm7
2616	leaq	96(%rdi),%rdi
2617
2618	call	__ocb_encrypt6
2619
2620	movups	%xmm2,0(%rsi)
2621	movups	%xmm3,16(%rsi)
2622	movups	%xmm4,32(%rsi)
2623	movups	%xmm5,48(%rsi)
2624	movups	%xmm6,64(%rsi)
2625	movups	%xmm7,80(%rsi)
2626	leaq	96(%rsi),%rsi
2627	subq	$6,%rdx
2628	jnc	L$ocb_enc_grandloop
2629
/*
 * 0..5 blocks remain.  movdqu does not modify flags, so each je below
 * tests the cmpq that precedes the intervening load.
 */
2630L$ocb_enc_short:
2631	addq	$6,%rdx
2632	jz	L$ocb_enc_done
2633
2634	movdqu	0(%rdi),%xmm2
2635	cmpq	$2,%rdx
2636	jb	L$ocb_enc_one
2637	movdqu	16(%rdi),%xmm3
2638	je	L$ocb_enc_two
2639
2640	movdqu	32(%rdi),%xmm4
2641	cmpq	$4,%rdx
2642	jb	L$ocb_enc_three
2643	movdqu	48(%rdi),%xmm5
2644	je	L$ocb_enc_four
2645
/* Five blocks: run the six-block helper with a zero sixth lane. */
2646	movdqu	64(%rdi),%xmm6
2647	pxor	%xmm7,%xmm7
2648
2649	call	__ocb_encrypt6
2650
/* Keep the offset that belongs to the fifth block (xmm14). */
2651	movdqa	%xmm14,%xmm15
2652	movups	%xmm2,0(%rsi)
2653	movups	%xmm3,16(%rsi)
2654	movups	%xmm4,32(%rsi)
2655	movups	%xmm5,48(%rsi)
2656	movups	%xmm6,64(%rsi)
2657
2658	jmp	L$ocb_enc_done
2659
2660.p2align	4
/* One block: the index is odd here, so table entry 0 (xmm10) is used. */
2661L$ocb_enc_one:
2662	movdqa	%xmm10,%xmm7
2663
2664	call	__ocb_encrypt1
2665
2666	movdqa	%xmm7,%xmm15
2667	movups	%xmm2,0(%rsi)
2668	jmp	L$ocb_enc_done
2669
2670.p2align	4
/* Two blocks: four-block helper with lanes 3 and 4 zeroed; the offset of */
/* the second block (xmm11) is kept. */
2671L$ocb_enc_two:
2672	pxor	%xmm4,%xmm4
2673	pxor	%xmm5,%xmm5
2674
2675	call	__ocb_encrypt4
2676
2677	movdqa	%xmm11,%xmm15
2678	movups	%xmm2,0(%rsi)
2679	movups	%xmm3,16(%rsi)
2680
2681	jmp	L$ocb_enc_done
2682
2683.p2align	4
/* Three blocks: lane 4 zeroed; the offset of the third block is kept. */
2684L$ocb_enc_three:
2685	pxor	%xmm5,%xmm5
2686
2687	call	__ocb_encrypt4
2688
2689	movdqa	%xmm12,%xmm15
2690	movups	%xmm2,0(%rsi)
2691	movups	%xmm3,16(%rsi)
2692	movups	%xmm4,32(%rsi)
2693
2694	jmp	L$ocb_enc_done
2695
2696.p2align	4
/* Four blocks: the offset of the fourth block (xmm13) is kept. */
2697L$ocb_enc_four:
2698	call	__ocb_encrypt4
2699
2700	movdqa	%xmm13,%xmm15
2701	movups	%xmm2,0(%rsi)
2702	movups	%xmm3,16(%rsi)
2703	movups	%xmm4,32(%rsi)
2704	movups	%xmm5,48(%rsi)
2705
/*
 * Write results back.  xmm0 is expected to hold the last round key as
 * left by the helper round loops, so the pxor strips the mask from the
 * offset before it is stored at (%r9).
 * NOTE(review): if this label is reached without any helper call (rdx
 * of 0 on entry with an odd index), xmm0 is whatever the caller left in
 * it -- confirm callers never pass zero blocks.
 */
2706L$ocb_enc_done:
2707	pxor	%xmm0,%xmm15
2708	movdqu	%xmm8,(%rbp)
2709	movdqu	%xmm15,(%r9)
2710
/* Scrub all xmm registers, then restore the five saved registers. */
2711	xorps	%xmm0,%xmm0
2712	pxor	%xmm1,%xmm1
2713	pxor	%xmm2,%xmm2
2714	pxor	%xmm3,%xmm3
2715	pxor	%xmm4,%xmm4
2716	pxor	%xmm5,%xmm5
2717	pxor	%xmm6,%xmm6
2718	pxor	%xmm7,%xmm7
2719	pxor	%xmm8,%xmm8
2720	pxor	%xmm9,%xmm9
2721	pxor	%xmm10,%xmm10
2722	pxor	%xmm11,%xmm11
2723	pxor	%xmm12,%xmm12
2724	pxor	%xmm13,%xmm13
2725	pxor	%xmm14,%xmm14
2726	pxor	%xmm15,%xmm15
/* rax = rsp + 40 is the entry rsp (five 8-byte pushes). */
2727	leaq	40(%rsp),%rax
2728	movq	-40(%rax),%r14
2729	movq	-32(%rax),%r13
2730	movq	-24(%rax),%r12
2731	movq	-16(%rax),%rbp
2732	movq	-8(%rax),%rbx
2733	leaq	(%rax),%rsp
2734L$ocb_enc_epilogue:
/* 0xf3,0xc3 = rep ret. */
2735	.byte	0xf3,0xc3
2736
2737
2738
/*
 * __ocb_encrypt6: encrypt six blocks held in xmm2..xmm7 in place.
 * Inputs as read by the code:
 *   xmm15 = running offset ^ last round key
 *   xmm9  = round key 0 ^ last round key
 *   xmm10 = table entry 0 (reloaded from (%rbx) before returning)
 *   rbx   = table base; r12, r13, r14 = byte offsets of the entries for
 *           blocks 2, 4 and 6
 *   xmm8  = XOR accumulator over the input blocks
 *   xmm1  = round key 1, r11 = key schedule, rcx/rax = end-relative
 *           round-key pointer and negative offset, r10 = initial rax
 *   r8    = block index
 * Outputs: xmm2..xmm7 encrypted; xmm10..xmm15 hold the six per-block
 * offsets ^ last round key on the way out (xmm10 is then reloaded);
 * r8 advanced by 6 and r12/r13/r14 recomputed for the next group;
 * xmm0 = last round key, xmm1 = round key 1, rax = r10.
 * Opcodes: 102,15,56,220,x = aesenc (%xmm1 or %xmm0 source),
 * 102,65,15,56,221,x = aesenclast with %xmm10..%xmm15 as source.
 */
2739.p2align	5
2740__ocb_encrypt6:
/* xmm15 = offset ^ round key 0 (the two last-round-key terms cancel). */
2741	pxor	%xmm9,%xmm15
2742	movdqu	(%rbx,%r12,1),%xmm11
2743	movdqa	%xmm10,%xmm12
2744	movdqu	(%rbx,%r13,1),%xmm13
2745	movdqa	%xmm10,%xmm14
/*
 * Chain the offsets: each one is the previous offset XOR a table entry.
 * Each input block is first folded into xmm8 and then XORed with its
 * offset ^ round key 0, which also performs AES round 0.
 */
2746	pxor	%xmm15,%xmm10
2747	movdqu	(%rbx,%r14,1),%xmm15
2748	pxor	%xmm10,%xmm11
2749	pxor	%xmm2,%xmm8
2750	pxor	%xmm10,%xmm2
2751	pxor	%xmm11,%xmm12
2752	pxor	%xmm3,%xmm8
2753	pxor	%xmm11,%xmm3
2754	pxor	%xmm12,%xmm13
2755	pxor	%xmm4,%xmm8
2756	pxor	%xmm12,%xmm4
2757	pxor	%xmm13,%xmm14
2758	pxor	%xmm5,%xmm8
2759	pxor	%xmm13,%xmm5
2760	pxor	%xmm14,%xmm15
2761	pxor	%xmm6,%xmm8
2762	pxor	%xmm14,%xmm6
2763	pxor	%xmm7,%xmm8
2764	pxor	%xmm15,%xmm7
2765	movups	32(%r11),%xmm0
2766
/* Precompute table indices for the next group of six; r8 += 6. */
2767	leaq	1(%r8),%r12
2768	leaq	3(%r8),%r13
2769	leaq	5(%r8),%r14
2770	addq	$6,%r8
/* Re-mask the offsets: XOR with xmm9 turns offset ^ round key 0 into */
/* offset ^ last round key, the operand for aesenclast. */
2771	pxor	%xmm9,%xmm10
2772	bsfq	%r12,%r12
2773	bsfq	%r13,%r13
2774	bsfq	%r14,%r14
2775
2776.byte	102,15,56,220,209
2777.byte	102,15,56,220,217
2778.byte	102,15,56,220,225
2779.byte	102,15,56,220,233
2780	pxor	%xmm9,%xmm11
2781	pxor	%xmm9,%xmm12
2782.byte	102,15,56,220,241
2783	pxor	%xmm9,%xmm13
2784	pxor	%xmm9,%xmm14
2785.byte	102,15,56,220,249
2786	movups	48(%r11),%xmm1
2787	pxor	%xmm9,%xmm15
2788
2789.byte	102,15,56,220,208
2790.byte	102,15,56,220,216
2791.byte	102,15,56,220,224
2792.byte	102,15,56,220,232
2793.byte	102,15,56,220,240
2794.byte	102,15,56,220,248
2795	movups	64(%r11),%xmm0
2796	shlq	$4,%r12
2797	shlq	$4,%r13
2798	jmp	L$ocb_enc_loop6
2799
2800.p2align	5
/* Two rounds per iteration; ends when rax reaches zero.  The jnz tests */
/* the addq flags, which aesenc and movups leave untouched. */
2801L$ocb_enc_loop6:
2802.byte	102,15,56,220,209
2803.byte	102,15,56,220,217
2804.byte	102,15,56,220,225
2805.byte	102,15,56,220,233
2806.byte	102,15,56,220,241
2807.byte	102,15,56,220,249
2808	movups	(%rcx,%rax,1),%xmm1
2809	addq	$32,%rax
2810
2811.byte	102,15,56,220,208
2812.byte	102,15,56,220,216
2813.byte	102,15,56,220,224
2814.byte	102,15,56,220,232
2815.byte	102,15,56,220,240
2816.byte	102,15,56,220,248
2817	movups	-16(%rcx,%rax,1),%xmm0
2818	jnz	L$ocb_enc_loop6
2819
2820.byte	102,15,56,220,209
2821.byte	102,15,56,220,217
2822.byte	102,15,56,220,225
2823.byte	102,15,56,220,233
2824.byte	102,15,56,220,241
2825.byte	102,15,56,220,249
/* Restore xmm1 = round key 1; finish scaling r14 (deferred until here). */
2826	movups	16(%r11),%xmm1
2827	shlq	$4,%r14
2828
/* Final round: aesenclast %xmm10..%xmm15 into %xmm2..%xmm7. */
2829.byte	102,65,15,56,221,210
2830	movdqu	(%rbx),%xmm10
2831	movq	%r10,%rax
2832.byte	102,65,15,56,221,219
2833.byte	102,65,15,56,221,228
2834.byte	102,65,15,56,221,237
2835.byte	102,65,15,56,221,246
2836.byte	102,65,15,56,221,255
/* 0xf3,0xc3 = rep ret. */
2837	.byte	0xf3,0xc3
2838
2839
2840
2841.p2align	5
/*
 * __ocb_encrypt4 -- file-local helper that runs the four blocks held in
 * %xmm2..%xmm5 through the AES encryption rounds in parallel.
 *
 * Register contract, as used by the code below:
 *   in   %xmm2..%xmm5  blocks to process (each is first XORed into %xmm8)
 *   in   %xmm9, %xmm15 mask values combined into every per-block mask
 *   in   %xmm10        base mask; the caller is expected to have loaded it
 *   in   %rbx          base of a table of 16-byte entries
 *   in   %r12, %r13    byte offsets into that table
 *   in   %r11          start of the round-key schedule
 *   in   %rcx, %rax    round-key cursor: keys are read at (%rcx,%rax) while
 *                      %rax steps by 32 up to zero
 *   in   %r10          value copied back into %rax before returning
 *   out  %xmm2..%xmm5  results
 *   out  %xmm10..%xmm13 masks consumed by the final aesenclast
 *   out  %xmm0/%xmm1   clobbered (%xmm1 = 16(%r11) on return)
 *
 * Opcode bytes: 102,15,56,220,M = aesenc, 102,[65,]15,56,221,M = aesenclast,
 * 0xf3,0xc3 = rep ret.
 * NOTE(review): %xmm8 looks like the OCB checksum accumulator and
 * %xmm10..%xmm13 like per-block OCB offsets -- confirm against the caller.
 */
2842__ocb_encrypt4:
2843	pxor	%xmm9,%xmm15
2844	movdqu	(%rbx,%r12,1),%xmm11
2845	movdqa	%xmm10,%xmm12
2846	movdqu	(%rbx,%r13,1),%xmm13
2847	pxor	%xmm15,%xmm10
2848	pxor	%xmm10,%xmm11
/* Each block is accumulated into %xmm8 before its mask is applied. */
2849	pxor	%xmm2,%xmm8
2850	pxor	%xmm10,%xmm2
2851	pxor	%xmm11,%xmm12
2852	pxor	%xmm3,%xmm8
2853	pxor	%xmm11,%xmm3
2854	pxor	%xmm12,%xmm13
2855	pxor	%xmm4,%xmm8
2856	pxor	%xmm12,%xmm4
2857	pxor	%xmm5,%xmm8
2858	pxor	%xmm13,%xmm5
2859	movups	32(%r11),%xmm0
2860
/* Fold %xmm9 into the masks that the closing aesenclast will apply. */
2861	pxor	%xmm9,%xmm10
2862	pxor	%xmm9,%xmm11
2863	pxor	%xmm9,%xmm12
2864	pxor	%xmm9,%xmm13
2865
/* aesenc %xmm1 into %xmm2..%xmm5 (round key preloaded by the caller) */
2866.byte	102,15,56,220,209
2867.byte	102,15,56,220,217
2868.byte	102,15,56,220,225
2869.byte	102,15,56,220,233
2870	movups	48(%r11),%xmm1
2871
/* aesenc %xmm0 into %xmm2..%xmm5 */
2872.byte	102,15,56,220,208
2873.byte	102,15,56,220,216
2874.byte	102,15,56,220,224
2875.byte	102,15,56,220,232
2876	movups	64(%r11),%xmm0
2877	jmp	L$ocb_enc_loop4
2878
2879.p2align	5
/* Two rounds per iteration; the addq sets ZF for the jnz at the bottom. */
2880L$ocb_enc_loop4:
2881.byte	102,15,56,220,209
2882.byte	102,15,56,220,217
2883.byte	102,15,56,220,225
2884.byte	102,15,56,220,233
2885	movups	(%rcx,%rax,1),%xmm1
2886	addq	$32,%rax
2887
2888.byte	102,15,56,220,208
2889.byte	102,15,56,220,216
2890.byte	102,15,56,220,224
2891.byte	102,15,56,220,232
2892	movups	-16(%rcx,%rax,1),%xmm0
2893	jnz	L$ocb_enc_loop4
2894
2895.byte	102,15,56,220,209
2896.byte	102,15,56,220,217
2897.byte	102,15,56,220,225
2898.byte	102,15,56,220,233
/* Re-arm %xmm1 and %rax for the next helper call. */
2899	movups	16(%r11),%xmm1
2900	movq	%r10,%rax
2901
/* aesenclast with %xmm10..%xmm13 as the last-round operand of %xmm2..%xmm5 */
2902.byte	102,65,15,56,221,210
2903.byte	102,65,15,56,221,219
2904.byte	102,65,15,56,221,228
2905.byte	102,65,15,56,221,237
2906	.byte	0xf3,0xc3
2907
2908
2909
2910.p2align	5
/*
 * __ocb_encrypt1 -- file-local helper that runs the single block in %xmm2
 * through the AES encryption rounds.
 *
 *   in   %xmm2         block to process (first XORed into %xmm8)
 *   in   %xmm7         mask input; XORed with %xmm15 here
 *   in   %xmm9, %xmm15 mask values
 *   in   %r11          start of the round-key schedule
 *   in   %rcx, %rax    round-key cursor (%rax steps by 32 up to zero)
 *   in   %r10          value copied back into %rax before returning
 *   out  %xmm2         result
 *   out  %xmm7         original %xmm7 XOR %xmm15 (the %xmm9 term is removed
 *                      again before the final round)
 *   out  %xmm0/%xmm1   clobbered (%xmm1 = 16(%r11) on return)
 *
 * Opcode bytes: 102,15,56,220,M = aesenc, 102,15,56,221,M = aesenclast.
 */
2911__ocb_encrypt1:
2912	pxor	%xmm15,%xmm7
2913	pxor	%xmm9,%xmm7
2914	pxor	%xmm2,%xmm8
2915	pxor	%xmm7,%xmm2
2916	movups	32(%r11),%xmm0
2917
/* aesenc %xmm1,%xmm2 */
2918.byte	102,15,56,220,209
2919	movups	48(%r11),%xmm1
/* Second XOR with %xmm9 cancels the one applied at entry. */
2920	pxor	%xmm9,%xmm7
2921
/* aesenc %xmm0,%xmm2 */
2922.byte	102,15,56,220,208
2923	movups	64(%r11),%xmm0
2924	jmp	L$ocb_enc_loop1
2925
2926.p2align	5
/* Two rounds per iteration; loop ends when %rax reaches zero. */
2927L$ocb_enc_loop1:
2928.byte	102,15,56,220,209
2929	movups	(%rcx,%rax,1),%xmm1
2930	addq	$32,%rax
2931
2932.byte	102,15,56,220,208
2933	movups	-16(%rcx,%rax,1),%xmm0
2934	jnz	L$ocb_enc_loop1
2935
2936.byte	102,15,56,220,209
2937	movups	16(%r11),%xmm1
2938	movq	%r10,%rax
2939
/* aesenclast %xmm7,%xmm2 */
2940.byte	102,15,56,221,215
2941	.byte	0xf3,0xc3
2942
2943
2944.globl	_aesni_ocb_decrypt
2945.private_extern _aesni_ocb_decrypt
2946
2947.p2align	5
/*
 * _aesni_ocb_decrypt -- exported entry point that decrypts a run of whole
 * 16-byte blocks using the __ocb_decrypt{1,4,6} helpers.
 *
 * Arguments as consumed below (System V AMD64 register order):
 *   %rdi        input pointer, advanced 16 bytes per block
 *   %rsi        output pointer, advanced 16 bytes per block
 *   %rdx        number of blocks (decremented by 1 / 6)
 *   %rcx        key schedule; the dword at offset 240 is the round count
 *   %r8         starting block index; its parity and bsf() pick table entries
 *   %r9         pointer to a 16-byte value loaded into %xmm15 and written
 *               back on exit
 *   8(%rsp)     at entry: pointer kept in %rbx, base of a table of 16-byte
 *               entries
 *   16(%rsp)    at entry: pointer kept in %rbp, 16-byte accumulator loaded
 *               into %xmm8 and written back on exit
 *
 * Saves and restores %rbx, %rbp, %r12, %r13, %r14; zeroes %xmm0..%xmm15
 * before returning.
 * NOTE(review): %r9 presumably points at the running OCB offset and %rbp at
 * the OCB checksum -- confirm against the C prototype.
 * NOTE(review): the exit path XORs %xmm0 into %xmm15, which relies on a helper
 * call having left the last round key in %xmm0; a block count of 0 reaches
 * L$ocb_dec_done without such a call -- confirm callers never pass 0.
 */
2948_aesni_ocb_decrypt:
/* %rax = entry %rsp, used to reach the two stack-passed arguments. */
2949	leaq	(%rsp),%rax
2950	pushq	%rbx
2951	pushq	%rbp
2952	pushq	%r12
2953	pushq	%r13
2954	pushq	%r14
2955	movq	8(%rax),%rbx
2956	movq	8+8(%rax),%rbp
2957
/* %r10d = rounds*16; %xmm9 = first round key XOR last round key. */
2958	movl	240(%rcx),%r10d
2959	movq	%rcx,%r11
2960	shll	$4,%r10d
2961	movups	(%rcx),%xmm9
2962	movups	16(%rcx,%r10,1),%xmm1
2963
2964	movdqu	(%r9),%xmm15
2965	pxor	%xmm1,%xmm9
2966	pxor	%xmm1,%xmm15
2967
/*
 * %rcx = key + 32 + rounds*16, and %rax = %r10 = 48 - rounds*16 is the
 * negative starting offset the helpers count up to zero.
 */
2968	movl	$16+32,%eax
2969	leaq	32(%r11,%r10,1),%rcx
2970	movups	16(%r11),%xmm1
2971	subq	%r10,%rax
2972	movq	%rax,%r10
2973
2974	movdqu	(%rbx),%xmm10
2975	movdqu	(%rbp),%xmm8
2976
/* Even starting index: handle one block first so the index becomes odd. */
2977	testq	$1,%r8
2978	jnz	L$ocb_dec_odd
2979
2980	bsfq	%r8,%r12
2981	addq	$1,%r8
2982	shlq	$4,%r12
2983	movdqu	(%rbx,%r12,1),%xmm7
2984	movdqu	(%rdi),%xmm2
2985	leaq	16(%rdi),%rdi
2986
2987	call	__ocb_decrypt1
2988
2989	movdqa	%xmm7,%xmm15
2990	movups	%xmm2,(%rsi)
2991	xorps	%xmm2,%xmm8
2992	leaq	16(%rsi),%rsi
2993	subq	$1,%rdx
2994	jz	L$ocb_dec_done
2995
/*
 * Precompute table offsets bsf(index+1), bsf(index+3), bsf(index+5), each
 * scaled by 16, for the next group of blocks; index advances by 6.
 */
2996L$ocb_dec_odd:
2997	leaq	1(%r8),%r12
2998	leaq	3(%r8),%r13
2999	leaq	5(%r8),%r14
3000	leaq	6(%r8),%r8
3001	bsfq	%r12,%r12
3002	bsfq	%r13,%r13
3003	bsfq	%r14,%r14
3004	shlq	$4,%r12
3005	shlq	$4,%r13
3006	shlq	$4,%r14
3007
/* Borrow from the subtraction means fewer than 6 blocks remain. */
3008	subq	$6,%rdx
3009	jc	L$ocb_dec_short
3010	jmp	L$ocb_dec_grandloop
3011
3012.p2align	5
/* Main loop: six blocks per iteration. */
3013L$ocb_dec_grandloop:
3014	movdqu	0(%rdi),%xmm2
3015	movdqu	16(%rdi),%xmm3
3016	movdqu	32(%rdi),%xmm4
3017	movdqu	48(%rdi),%xmm5
3018	movdqu	64(%rdi),%xmm6
3019	movdqu	80(%rdi),%xmm7
3020	leaq	96(%rdi),%rdi
3021
3022	call	__ocb_decrypt6
3023
/* Store each result and accumulate it into %xmm8. */
3024	movups	%xmm2,0(%rsi)
3025	pxor	%xmm2,%xmm8
3026	movups	%xmm3,16(%rsi)
3027	pxor	%xmm3,%xmm8
3028	movups	%xmm4,32(%rsi)
3029	pxor	%xmm4,%xmm8
3030	movups	%xmm5,48(%rsi)
3031	pxor	%xmm5,%xmm8
3032	movups	%xmm6,64(%rsi)
3033	pxor	%xmm6,%xmm8
3034	movups	%xmm7,80(%rsi)
3035	pxor	%xmm7,%xmm8
3036	leaq	96(%rsi),%rsi
3037	subq	$6,%rdx
3038	jnc	L$ocb_dec_grandloop
3039
/* 0..5 blocks left: undo the bias and dispatch on the exact count. */
3040L$ocb_dec_short:
3041	addq	$6,%rdx
3042	jz	L$ocb_dec_done
3043
3044	movdqu	0(%rdi),%xmm2
3045	cmpq	$2,%rdx
3046	jb	L$ocb_dec_one
3047	movdqu	16(%rdi),%xmm3
3048	je	L$ocb_dec_two
3049
3050	movdqu	32(%rdi),%xmm4
3051	cmpq	$4,%rdx
3052	jb	L$ocb_dec_three
3053	movdqu	48(%rdi),%xmm5
3054	je	L$ocb_dec_four
3055
/* Five blocks: use the 6-wide helper with a zeroed sixth lane. */
3056	movdqu	64(%rdi),%xmm6
3057	pxor	%xmm7,%xmm7
3058
3059	call	__ocb_decrypt6
3060
/* Keep the mask of the last real block (%xmm14) as the running value. */
3061	movdqa	%xmm14,%xmm15
3062	movups	%xmm2,0(%rsi)
3063	pxor	%xmm2,%xmm8
3064	movups	%xmm3,16(%rsi)
3065	pxor	%xmm3,%xmm8
3066	movups	%xmm4,32(%rsi)
3067	pxor	%xmm4,%xmm8
3068	movups	%xmm5,48(%rsi)
3069	pxor	%xmm5,%xmm8
3070	movups	%xmm6,64(%rsi)
3071	pxor	%xmm6,%xmm8
3072
3073	jmp	L$ocb_dec_done
3074
3075.p2align	4
3076L$ocb_dec_one:
3077	movdqa	%xmm10,%xmm7
3078
3079	call	__ocb_decrypt1
3080
3081	movdqa	%xmm7,%xmm15
3082	movups	%xmm2,0(%rsi)
3083	xorps	%xmm2,%xmm8
3084	jmp	L$ocb_dec_done
3085
3086.p2align	4
/* Two blocks: 4-wide helper with lanes 3 and 4 zeroed. */
3087L$ocb_dec_two:
3088	pxor	%xmm4,%xmm4
3089	pxor	%xmm5,%xmm5
3090
3091	call	__ocb_decrypt4
3092
3093	movdqa	%xmm11,%xmm15
3094	movups	%xmm2,0(%rsi)
3095	xorps	%xmm2,%xmm8
3096	movups	%xmm3,16(%rsi)
3097	xorps	%xmm3,%xmm8
3098
3099	jmp	L$ocb_dec_done
3100
3101.p2align	4
/* Three blocks: 4-wide helper with lane 4 zeroed. */
3102L$ocb_dec_three:
3103	pxor	%xmm5,%xmm5
3104
3105	call	__ocb_decrypt4
3106
3107	movdqa	%xmm12,%xmm15
3108	movups	%xmm2,0(%rsi)
3109	xorps	%xmm2,%xmm8
3110	movups	%xmm3,16(%rsi)
3111	xorps	%xmm3,%xmm8
3112	movups	%xmm4,32(%rsi)
3113	xorps	%xmm4,%xmm8
3114
3115	jmp	L$ocb_dec_done
3116
3117.p2align	4
3118L$ocb_dec_four:
3119	call	__ocb_decrypt4
3120
3121	movdqa	%xmm13,%xmm15
3122	movups	%xmm2,0(%rsi)
3123	pxor	%xmm2,%xmm8
3124	movups	%xmm3,16(%rsi)
3125	pxor	%xmm3,%xmm8
3126	movups	%xmm4,32(%rsi)
3127	pxor	%xmm4,%xmm8
3128	movups	%xmm5,48(%rsi)
3129	pxor	%xmm5,%xmm8
3130
/* Write the two 16-byte state values back through %rbp and %r9. */
3131L$ocb_dec_done:
3132	pxor	%xmm0,%xmm15
3133	movdqu	%xmm8,(%rbp)
3134	movdqu	%xmm15,(%r9)
3135
/* Scrub every vector register before returning. */
3136	xorps	%xmm0,%xmm0
3137	pxor	%xmm1,%xmm1
3138	pxor	%xmm2,%xmm2
3139	pxor	%xmm3,%xmm3
3140	pxor	%xmm4,%xmm4
3141	pxor	%xmm5,%xmm5
3142	pxor	%xmm6,%xmm6
3143	pxor	%xmm7,%xmm7
3144	pxor	%xmm8,%xmm8
3145	pxor	%xmm9,%xmm9
3146	pxor	%xmm10,%xmm10
3147	pxor	%xmm11,%xmm11
3148	pxor	%xmm12,%xmm12
3149	pxor	%xmm13,%xmm13
3150	pxor	%xmm14,%xmm14
3151	pxor	%xmm15,%xmm15
/* Reload the five pushed registers and restore the entry %rsp. */
3152	leaq	40(%rsp),%rax
3153	movq	-40(%rax),%r14
3154	movq	-32(%rax),%r13
3155	movq	-24(%rax),%r12
3156	movq	-16(%rax),%rbp
3157	movq	-8(%rax),%rbx
3158	leaq	(%rax),%rsp
3159L$ocb_dec_epilogue:
3160	.byte	0xf3,0xc3
3161
3162
3163
3164.p2align	5
/*
 * __ocb_decrypt6 -- file-local helper that runs the six blocks held in
 * %xmm2..%xmm7 through the AES decryption rounds in parallel.
 *
 *   in   %xmm2..%xmm7   blocks to process
 *   in   %xmm9, %xmm15  mask values combined into every per-block mask
 *   in   %xmm10         base mask (reloaded from (%rbx) before returning)
 *   in   %rbx           base of a table of 16-byte entries
 *   in   %r12,%r13,%r14 byte offsets into that table for this call
 *   in   %r8            block index; advanced by 6 here
 *   in   %r11           start of the round-key schedule
 *   in   %rcx, %rax     round-key cursor (%rax steps by 32 up to zero)
 *   in   %r10           value copied back into %rax before returning
 *   out  %xmm2..%xmm7   results
 *   out  %xmm11..%xmm15 masks consumed by the final aesdeclast
 *   out  %r12,%r13,%r14 bsf(index+1), bsf(index+3), bsf(index+5) scaled by
 *                       16, computed from the incoming %r8 for the next call
 *   out  %xmm0/%xmm1    clobbered (%xmm1 = 16(%r11) on return)
 *
 * Opcode bytes: 102,15,56,222,M = aesdec, 102,65,15,56,223,M = aesdeclast.
 */
3165__ocb_decrypt6:
3166	pxor	%xmm9,%xmm15
3167	movdqu	(%rbx,%r12,1),%xmm11
3168	movdqa	%xmm10,%xmm12
3169	movdqu	(%rbx,%r13,1),%xmm13
3170	movdqa	%xmm10,%xmm14
3171	pxor	%xmm15,%xmm10
3172	movdqu	(%rbx,%r14,1),%xmm15
/* Chain the masks %xmm10 -> %xmm15 and apply one to each block. */
3173	pxor	%xmm10,%xmm11
3174	pxor	%xmm10,%xmm2
3175	pxor	%xmm11,%xmm12
3176	pxor	%xmm11,%xmm3
3177	pxor	%xmm12,%xmm13
3178	pxor	%xmm12,%xmm4
3179	pxor	%xmm13,%xmm14
3180	pxor	%xmm13,%xmm5
3181	pxor	%xmm14,%xmm15
3182	pxor	%xmm14,%xmm6
3183	pxor	%xmm15,%xmm7
3184	movups	32(%r11),%xmm0
3185
/* Table offsets for the following group, interleaved with the rounds. */
3186	leaq	1(%r8),%r12
3187	leaq	3(%r8),%r13
3188	leaq	5(%r8),%r14
3189	addq	$6,%r8
3190	pxor	%xmm9,%xmm10
3191	bsfq	%r12,%r12
3192	bsfq	%r13,%r13
3193	bsfq	%r14,%r14
3194
/* aesdec %xmm1 into %xmm2..%xmm7, with the %xmm9 mask folds in between */
3195.byte	102,15,56,222,209
3196.byte	102,15,56,222,217
3197.byte	102,15,56,222,225
3198.byte	102,15,56,222,233
3199	pxor	%xmm9,%xmm11
3200	pxor	%xmm9,%xmm12
3201.byte	102,15,56,222,241
3202	pxor	%xmm9,%xmm13
3203	pxor	%xmm9,%xmm14
3204.byte	102,15,56,222,249
3205	movups	48(%r11),%xmm1
3206	pxor	%xmm9,%xmm15
3207
/* aesdec %xmm0 into %xmm2..%xmm7 */
3208.byte	102,15,56,222,208
3209.byte	102,15,56,222,216
3210.byte	102,15,56,222,224
3211.byte	102,15,56,222,232
3212.byte	102,15,56,222,240
3213.byte	102,15,56,222,248
3214	movups	64(%r11),%xmm0
3215	shlq	$4,%r12
3216	shlq	$4,%r13
3217	jmp	L$ocb_dec_loop6
3218
3219.p2align	5
/* Two rounds per iteration; the addq sets ZF for the jnz at the bottom. */
3220L$ocb_dec_loop6:
3221.byte	102,15,56,222,209
3222.byte	102,15,56,222,217
3223.byte	102,15,56,222,225
3224.byte	102,15,56,222,233
3225.byte	102,15,56,222,241
3226.byte	102,15,56,222,249
3227	movups	(%rcx,%rax,1),%xmm1
3228	addq	$32,%rax
3229
3230.byte	102,15,56,222,208
3231.byte	102,15,56,222,216
3232.byte	102,15,56,222,224
3233.byte	102,15,56,222,232
3234.byte	102,15,56,222,240
3235.byte	102,15,56,222,248
3236	movups	-16(%rcx,%rax,1),%xmm0
3237	jnz	L$ocb_dec_loop6
3238
3239.byte	102,15,56,222,209
3240.byte	102,15,56,222,217
3241.byte	102,15,56,222,225
3242.byte	102,15,56,222,233
3243.byte	102,15,56,222,241
3244.byte	102,15,56,222,249
3245	movups	16(%r11),%xmm1
3246	shlq	$4,%r14
3247
/*
 * aesdeclast with %xmm10..%xmm15 as the last-round operand of %xmm2..%xmm7.
 * %xmm10 is reloaded from the table base right after its use, and %rax is
 * reset for the next call.
 */
3248.byte	102,65,15,56,223,210
3249	movdqu	(%rbx),%xmm10
3250	movq	%r10,%rax
3251.byte	102,65,15,56,223,219
3252.byte	102,65,15,56,223,228
3253.byte	102,65,15,56,223,237
3254.byte	102,65,15,56,223,246
3255.byte	102,65,15,56,223,255
3256	.byte	0xf3,0xc3
3257
3258
3259
3260.p2align	5
/*
 * __ocb_decrypt4 -- file-local helper that runs the four blocks held in
 * %xmm2..%xmm5 through the AES decryption rounds in parallel.
 *
 *   in   %xmm2..%xmm5   blocks to process
 *   in   %xmm9, %xmm15  mask values combined into every per-block mask
 *   in   %xmm10         base mask
 *   in   %rbx           base of a table of 16-byte entries
 *   in   %r12, %r13     byte offsets into that table
 *   in   %r11           start of the round-key schedule
 *   in   %rcx, %rax     round-key cursor (%rax steps by 32 up to zero)
 *   in   %r10           value copied back into %rax before returning
 *   out  %xmm2..%xmm5   results
 *   out  %xmm10..%xmm13 masks consumed by the final aesdeclast
 *   out  %xmm0/%xmm1    clobbered (%xmm1 = 16(%r11) on return)
 *
 * Unlike __ocb_decrypt6 this routine does not advance %r8 or recompute
 * %r12..%r14, and it does not reload %xmm10.
 * Opcode bytes: 102,15,56,222,M = aesdec, 102,65,15,56,223,M = aesdeclast.
 */
3261__ocb_decrypt4:
3262	pxor	%xmm9,%xmm15
3263	movdqu	(%rbx,%r12,1),%xmm11
3264	movdqa	%xmm10,%xmm12
3265	movdqu	(%rbx,%r13,1),%xmm13
3266	pxor	%xmm15,%xmm10
/* Chain the masks %xmm10 -> %xmm13 and apply one to each block. */
3267	pxor	%xmm10,%xmm11
3268	pxor	%xmm10,%xmm2
3269	pxor	%xmm11,%xmm12
3270	pxor	%xmm11,%xmm3
3271	pxor	%xmm12,%xmm13
3272	pxor	%xmm12,%xmm4
3273	pxor	%xmm13,%xmm5
3274	movups	32(%r11),%xmm0
3275
/* Fold %xmm9 into the masks that the closing aesdeclast will apply. */
3276	pxor	%xmm9,%xmm10
3277	pxor	%xmm9,%xmm11
3278	pxor	%xmm9,%xmm12
3279	pxor	%xmm9,%xmm13
3280
/* aesdec %xmm1 into %xmm2..%xmm5 */
3281.byte	102,15,56,222,209
3282.byte	102,15,56,222,217
3283.byte	102,15,56,222,225
3284.byte	102,15,56,222,233
3285	movups	48(%r11),%xmm1
3286
/* aesdec %xmm0 into %xmm2..%xmm5 */
3287.byte	102,15,56,222,208
3288.byte	102,15,56,222,216
3289.byte	102,15,56,222,224
3290.byte	102,15,56,222,232
3291	movups	64(%r11),%xmm0
3292	jmp	L$ocb_dec_loop4
3293
3294.p2align	5
/* Two rounds per iteration; loop ends when %rax reaches zero. */
3295L$ocb_dec_loop4:
3296.byte	102,15,56,222,209
3297.byte	102,15,56,222,217
3298.byte	102,15,56,222,225
3299.byte	102,15,56,222,233
3300	movups	(%rcx,%rax,1),%xmm1
3301	addq	$32,%rax
3302
3303.byte	102,15,56,222,208
3304.byte	102,15,56,222,216
3305.byte	102,15,56,222,224
3306.byte	102,15,56,222,232
3307	movups	-16(%rcx,%rax,1),%xmm0
3308	jnz	L$ocb_dec_loop4
3309
3310.byte	102,15,56,222,209
3311.byte	102,15,56,222,217
3312.byte	102,15,56,222,225
3313.byte	102,15,56,222,233
3314	movups	16(%r11),%xmm1
3315	movq	%r10,%rax
3316
/* aesdeclast with %xmm10..%xmm13 as the last-round operand of %xmm2..%xmm5 */
3317.byte	102,65,15,56,223,210
3318.byte	102,65,15,56,223,219
3319.byte	102,65,15,56,223,228
3320.byte	102,65,15,56,223,237
3321	.byte	0xf3,0xc3
3322
3323
3324
3325.p2align	5
/*
 * __ocb_decrypt1 -- file-local helper that runs the single block in %xmm2
 * through the AES decryption rounds.
 *
 *   in   %xmm2         block to process
 *   in   %xmm7         mask input; XORed with %xmm15 here
 *   in   %xmm9, %xmm15 mask values
 *   in   %r11          start of the round-key schedule
 *   in   %rcx, %rax    round-key cursor (%rax steps by 32 up to zero)
 *   in   %r10          value copied back into %rax before returning
 *   out  %xmm2         result
 *   out  %xmm7         original %xmm7 XOR %xmm15 (the %xmm9 term is removed
 *                      again before the final round)
 *   out  %xmm0/%xmm1   clobbered (%xmm1 = 16(%r11) on return)
 *
 * Opcode bytes: 102,15,56,222,M = aesdec, 102,15,56,223,M = aesdeclast.
 */
3326__ocb_decrypt1:
3327	pxor	%xmm15,%xmm7
3328	pxor	%xmm9,%xmm7
3329	pxor	%xmm7,%xmm2
3330	movups	32(%r11),%xmm0
3331
/* aesdec %xmm1,%xmm2 */
3332.byte	102,15,56,222,209
3333	movups	48(%r11),%xmm1
/* Second XOR with %xmm9 cancels the one applied at entry. */
3334	pxor	%xmm9,%xmm7
3335
/* aesdec %xmm0,%xmm2 */
3336.byte	102,15,56,222,208
3337	movups	64(%r11),%xmm0
3338	jmp	L$ocb_dec_loop1
3339
3340.p2align	5
/* Two rounds per iteration; loop ends when %rax reaches zero. */
3341L$ocb_dec_loop1:
3342.byte	102,15,56,222,209
3343	movups	(%rcx,%rax,1),%xmm1
3344	addq	$32,%rax
3345
3346.byte	102,15,56,222,208
3347	movups	-16(%rcx,%rax,1),%xmm0
3348	jnz	L$ocb_dec_loop1
3349
3350.byte	102,15,56,222,209
3351	movups	16(%r11),%xmm1
3352	movq	%r10,%rax
3353
/* aesdeclast %xmm7,%xmm2 */
3354.byte	102,15,56,223,215
3355	.byte	0xf3,0xc3
3356
3357.globl	_aesni_cbc_encrypt
3358.private_extern _aesni_cbc_encrypt
3359
3360.p2align	4
/*
 * _aesni_cbc_encrypt -- exported CBC-mode encrypt/decrypt entry point.
 *
 * Arguments as consumed below (System V AMD64 register order):
 *   %rdi   input pointer
 *   %rsi   output pointer
 *   %rdx   length in bytes; 0 returns immediately
 *   %rcx   key schedule; the dword at offset 240 is the round count
 *   %r8    pointer to the 16-byte chaining value, updated on exit
 *   %r9d   nonzero = encrypt, zero = decrypt
 *
 * Encryption is serial, one block at a time. Decryption of exactly 16 bytes
 * takes a short stack-free path; any other length goes to the bulk path,
 * which handles 8 or 6 blocks per iteration and then a 1..7-block tail.
 * The bulk path saves %rbp and keeps the entry %rsp in %r11.
 *
 * Opcode bytes: 102,[68,]15,56,220,M = aesenc, 221 = aesenclast,
 * 222 = aesdec, 223 = aesdeclast; .long 0x9066A4F3 = rep movsb + 2-byte nop;
 * .long 0x9066AAF3 = rep stosb + 2-byte nop; 0xf3,0xc3 = rep ret.
 */
3361_aesni_cbc_encrypt:
3362	testq	%rdx,%rdx
3363	jz	L$cbc_ret
3364
/* %r10d = round count, %r11 = key schedule (both kept for reloading). */
3365	movl	240(%rcx),%r10d
3366	movq	%rcx,%r11
3367	testl	%r9d,%r9d
3368	jz	L$cbc_decrypt
3369
/* ---- encrypt: %xmm2 carries the chaining value ---- */
3370	movups	(%r8),%xmm2
3371	movl	%r10d,%eax
3372	cmpq	$16,%rdx
3373	jb	L$cbc_enc_tail
3374	subq	$16,%rdx
3375	jmp	L$cbc_enc_loop
3376.p2align	4
3377L$cbc_enc_loop:
3378	movups	(%rdi),%xmm3
3379	leaq	16(%rdi),%rdi
3380
/* %xmm2 = chaining value XOR input XOR round key 0, then %eax rounds. */
3381	movups	(%rcx),%xmm0
3382	movups	16(%rcx),%xmm1
3383	xorps	%xmm0,%xmm3
3384	leaq	32(%rcx),%rcx
3385	xorps	%xmm3,%xmm2
3386L$oop_enc1_15:
3387.byte	102,15,56,220,209
3388	decl	%eax
3389	movups	(%rcx),%xmm1
3390	leaq	16(%rcx),%rcx
3391	jnz	L$oop_enc1_15
3392.byte	102,15,56,221,209
/* Rewind the round counter and key pointer for the next block. */
3393	movl	%r10d,%eax
3394	movq	%r11,%rcx
3395	movups	%xmm2,0(%rsi)
3396	leaq	16(%rsi),%rsi
3397	subq	$16,%rdx
3398	jnc	L$cbc_enc_loop
/* Nonzero here means a partial trailing block remains. */
3399	addq	$16,%rdx
3400	jnz	L$cbc_enc_tail
/* Done: scrub scratch registers and store the final chaining value. */
3401	pxor	%xmm0,%xmm0
3402	pxor	%xmm1,%xmm1
3403	movups	%xmm2,(%r8)
3404	pxor	%xmm2,%xmm2
3405	pxor	%xmm3,%xmm3
3406	jmp	L$cbc_ret
3407
/*
 * Partial final block: copy the %rdx remaining input bytes into the output
 * buffer, zero-fill up to 16 bytes, then point both %rdi and %rsi at that
 * padded block and run the loop once more with %rdx = 0.
 */
3408L$cbc_enc_tail:
3409	movq	%rdx,%rcx
3410	xchgq	%rdi,%rsi
3411.long	0x9066A4F3
3412	movl	$16,%ecx
3413	subq	%rdx,%rcx
3414	xorl	%eax,%eax
3415.long	0x9066AAF3
3416	leaq	-16(%rdi),%rdi
3417	movl	%r10d,%eax
3418	movq	%rdi,%rsi
3419	movq	%r11,%rcx
3420	xorq	%rdx,%rdx
3421	jmp	L$cbc_enc_loop
3422
3423.p2align	4
/* ---- decrypt ---- */
3424L$cbc_decrypt:
3425	cmpq	$16,%rdx
3426	jne	L$cbc_decrypt_bulk
3427
3428
3429
/*
 * Exactly one block: %xmm3 = chaining value, %xmm4 = copy of the input
 * block, which becomes the new chaining value.
 */
3430	movdqu	(%rdi),%xmm2
3431	movdqu	(%r8),%xmm3
3432	movdqa	%xmm2,%xmm4
3433	movups	(%rcx),%xmm0
3434	movups	16(%rcx),%xmm1
3435	leaq	32(%rcx),%rcx
3436	xorps	%xmm0,%xmm2
3437L$oop_dec1_16:
3438.byte	102,15,56,222,209
3439	decl	%r10d
3440	movups	(%rcx),%xmm1
3441	leaq	16(%rcx),%rcx
3442	jnz	L$oop_dec1_16
3443.byte	102,15,56,223,209
3444	pxor	%xmm0,%xmm0
3445	pxor	%xmm1,%xmm1
3446	movdqu	%xmm4,(%r8)
3447	xorps	%xmm3,%xmm2
3448	pxor	%xmm3,%xmm3
3449	movups	%xmm2,(%rsi)
3450	pxor	%xmm2,%xmm2
3451	jmp	L$cbc_ret
3452.p2align	4
/*
 * Bulk decrypt. %r11 remembers the entry %rsp; a 16-byte-aligned scratch
 * slot is carved at (%rsp); %rbp holds the key schedule; %xmm10 holds the
 * chaining value.
 */
3453L$cbc_decrypt_bulk:
3454	leaq	(%rsp),%r11
3455	pushq	%rbp
3456	subq	$16,%rsp
3457	andq	$-16,%rsp
3458	movq	%rcx,%rbp
3459	movups	(%r8),%xmm10
3460	movl	%r10d,%eax
3461	cmpq	$0x50,%rdx
3462	jbe	L$cbc_dec_tail
3463
/*
 * Load six input blocks; %xmm11..%xmm15 keep copies of the first five for
 * the chaining XOR after decryption.
 */
3464	movups	(%rcx),%xmm0
3465	movdqu	0(%rdi),%xmm2
3466	movdqu	16(%rdi),%xmm3
3467	movdqa	%xmm2,%xmm11
3468	movdqu	32(%rdi),%xmm4
3469	movdqa	%xmm3,%xmm12
3470	movdqu	48(%rdi),%xmm5
3471	movdqa	%xmm4,%xmm13
3472	movdqu	64(%rdi),%xmm6
3473	movdqa	%xmm5,%xmm14
3474	movdqu	80(%rdi),%xmm7
3475	movdqa	%xmm6,%xmm15
3476	leaq	_OPENSSL_ia32cap_P(%rip),%r9
3477	movl	4(%r9),%r9d
3478	cmpq	$0x70,%rdx
3479	jbe	L$cbc_dec_six_or_seven
3480
/*
 * Mask 71303168 = 0x04400000; a result of 4194304 = 0x00400000 selects the
 * six-block loop, anything else the eight-block loop.
 * NOTE(review): presumably a CPU-model test on the capability word --
 * confirm which feature bits these are.
 */
3481	andl	$71303168,%r9d
3482	subq	$0x50,%rdx
3483	cmpl	$4194304,%r9d
3484	je	L$cbc_dec_loop6_enter
3485	subq	$0x20,%rdx
3486	leaq	112(%rcx),%rcx
3487	jmp	L$cbc_dec_loop8_enter
3488.p2align	4
/* Eight-block loop; %rcx is biased by +112 so key offsets read N-112(%rcx). */
3489L$cbc_dec_loop8:
3490	movups	%xmm9,(%rsi)
3491	leaq	16(%rsi),%rsi
3492L$cbc_dec_loop8_enter:
3493	movdqu	96(%rdi),%xmm8
3494	pxor	%xmm0,%xmm2
3495	movdqu	112(%rdi),%xmm9
3496	pxor	%xmm0,%xmm3
3497	movups	16-112(%rcx),%xmm1
3498	pxor	%xmm0,%xmm4
/*
 * The cmpq sets CF when %rdx < 0x70; the adcq/andq/addq further down turn
 * that into %rbp = %rdi + 128 (no carry) or %rbp = %rdi (carry). %rbp is
 * the address the next batch is preloaded from in L$cbc_dec_done.
 */
3499	movq	$-1,%rbp
3500	cmpq	$0x70,%rdx
3501	pxor	%xmm0,%xmm5
3502	pxor	%xmm0,%xmm6
3503	pxor	%xmm0,%xmm7
3504	pxor	%xmm0,%xmm8
3505
3506.byte	102,15,56,222,209
3507	pxor	%xmm0,%xmm9
3508	movups	32-112(%rcx),%xmm0
3509.byte	102,15,56,222,217
3510.byte	102,15,56,222,225
3511.byte	102,15,56,222,233
3512.byte	102,15,56,222,241
3513.byte	102,15,56,222,249
3514.byte	102,68,15,56,222,193
3515	adcq	$0,%rbp
3516	andq	$128,%rbp
3517.byte	102,68,15,56,222,201
3518	addq	%rdi,%rbp
3519	movups	48-112(%rcx),%xmm1
3520.byte	102,15,56,222,208
3521.byte	102,15,56,222,216
3522.byte	102,15,56,222,224
3523.byte	102,15,56,222,232
3524.byte	102,15,56,222,240
3525.byte	102,15,56,222,248
3526.byte	102,68,15,56,222,192
3527.byte	102,68,15,56,222,200
3528	movups	64-112(%rcx),%xmm0
3529	nop
3530.byte	102,15,56,222,209
3531.byte	102,15,56,222,217
3532.byte	102,15,56,222,225
3533.byte	102,15,56,222,233
3534.byte	102,15,56,222,241
3535.byte	102,15,56,222,249
3536.byte	102,68,15,56,222,193
3537.byte	102,68,15,56,222,201
3538	movups	80-112(%rcx),%xmm1
3539	nop
3540.byte	102,15,56,222,208
3541.byte	102,15,56,222,216
3542.byte	102,15,56,222,224
3543.byte	102,15,56,222,232
3544.byte	102,15,56,222,240
3545.byte	102,15,56,222,248
3546.byte	102,68,15,56,222,192
3547.byte	102,68,15,56,222,200
3548	movups	96-112(%rcx),%xmm0
3549	nop
3550.byte	102,15,56,222,209
3551.byte	102,15,56,222,217
3552.byte	102,15,56,222,225
3553.byte	102,15,56,222,233
3554.byte	102,15,56,222,241
3555.byte	102,15,56,222,249
3556.byte	102,68,15,56,222,193
3557.byte	102,68,15,56,222,201
3558	movups	112-112(%rcx),%xmm1
3559	nop
3560.byte	102,15,56,222,208
3561.byte	102,15,56,222,216
3562.byte	102,15,56,222,224
3563.byte	102,15,56,222,232
3564.byte	102,15,56,222,240
3565.byte	102,15,56,222,248
3566.byte	102,68,15,56,222,192
3567.byte	102,68,15,56,222,200
3568	movups	128-112(%rcx),%xmm0
3569	nop
3570.byte	102,15,56,222,209
3571.byte	102,15,56,222,217
3572.byte	102,15,56,222,225
3573.byte	102,15,56,222,233
3574.byte	102,15,56,222,241
3575.byte	102,15,56,222,249
3576.byte	102,68,15,56,222,193
3577.byte	102,68,15,56,222,201
3578	movups	144-112(%rcx),%xmm1
/*
 * Flags from this compare are consumed by the jb and je below; the
 * intervening aesdec and movups do not modify flags.
 */
3579	cmpl	$11,%eax
3580.byte	102,15,56,222,208
3581.byte	102,15,56,222,216
3582.byte	102,15,56,222,224
3583.byte	102,15,56,222,232
3584.byte	102,15,56,222,240
3585.byte	102,15,56,222,248
3586.byte	102,68,15,56,222,192
3587.byte	102,68,15,56,222,200
3588	movups	160-112(%rcx),%xmm0
/* Round count below 11: the schedule ends here. */
3589	jb	L$cbc_dec_done
3590.byte	102,15,56,222,209
3591.byte	102,15,56,222,217
3592.byte	102,15,56,222,225
3593.byte	102,15,56,222,233
3594.byte	102,15,56,222,241
3595.byte	102,15,56,222,249
3596.byte	102,68,15,56,222,193
3597.byte	102,68,15,56,222,201
3598	movups	176-112(%rcx),%xmm1
3599	nop
3600.byte	102,15,56,222,208
3601.byte	102,15,56,222,216
3602.byte	102,15,56,222,224
3603.byte	102,15,56,222,232
3604.byte	102,15,56,222,240
3605.byte	102,15,56,222,248
3606.byte	102,68,15,56,222,192
3607.byte	102,68,15,56,222,200
3608	movups	192-112(%rcx),%xmm0
/* Round count equal to 11: the schedule ends here. */
3609	je	L$cbc_dec_done
3610.byte	102,15,56,222,209
3611.byte	102,15,56,222,217
3612.byte	102,15,56,222,225
3613.byte	102,15,56,222,233
3614.byte	102,15,56,222,241
3615.byte	102,15,56,222,249
3616.byte	102,68,15,56,222,193
3617.byte	102,68,15,56,222,201
3618	movups	208-112(%rcx),%xmm1
3619	nop
3620.byte	102,15,56,222,208
3621.byte	102,15,56,222,216
3622.byte	102,15,56,222,224
3623.byte	102,15,56,222,232
3624.byte	102,15,56,222,240
3625.byte	102,15,56,222,248
3626.byte	102,68,15,56,222,192
3627.byte	102,68,15,56,222,200
3628	movups	224-112(%rcx),%xmm0
3629	jmp	L$cbc_dec_done
3630.p2align	4
/*
 * Final rounds. %xmm0 (last round key) is XORed into the saved chaining
 * blocks so that each aesdeclast applies both the last round key and the
 * chaining XOR. The next batch is preloaded from %rbp at the same time.
 */
3631L$cbc_dec_done:
3632.byte	102,15,56,222,209
3633.byte	102,15,56,222,217
3634	pxor	%xmm0,%xmm10
3635	pxor	%xmm0,%xmm11
3636.byte	102,15,56,222,225
3637.byte	102,15,56,222,233
3638	pxor	%xmm0,%xmm12
3639	pxor	%xmm0,%xmm13
3640.byte	102,15,56,222,241
3641.byte	102,15,56,222,249
3642	pxor	%xmm0,%xmm14
3643	pxor	%xmm0,%xmm15
3644.byte	102,68,15,56,222,193
3645.byte	102,68,15,56,222,201
3646	movdqu	80(%rdi),%xmm1
3647
3648.byte	102,65,15,56,223,210
3649	movdqu	96(%rdi),%xmm10
3650	pxor	%xmm0,%xmm1
3651.byte	102,65,15,56,223,219
3652	pxor	%xmm0,%xmm10
3653	movdqu	112(%rdi),%xmm0
3654.byte	102,65,15,56,223,228
3655	leaq	128(%rdi),%rdi
3656	movdqu	0(%rbp),%xmm11
3657.byte	102,65,15,56,223,237
3658.byte	102,65,15,56,223,246
3659	movdqu	16(%rbp),%xmm12
3660	movdqu	32(%rbp),%xmm13
3661.byte	102,65,15,56,223,255
3662.byte	102,68,15,56,223,193
3663	movdqu	48(%rbp),%xmm14
3664	movdqu	64(%rbp),%xmm15
3665.byte	102,69,15,56,223,202
/* The eighth input block of this batch becomes the next chaining value. */
3666	movdqa	%xmm0,%xmm10
3667	movdqu	80(%rbp),%xmm1
3668	movups	-112(%rcx),%xmm0
3669
/* Store seven results; the eighth (%xmm9) is stored at the loop top. */
3670	movups	%xmm2,(%rsi)
3671	movdqa	%xmm11,%xmm2
3672	movups	%xmm3,16(%rsi)
3673	movdqa	%xmm12,%xmm3
3674	movups	%xmm4,32(%rsi)
3675	movdqa	%xmm13,%xmm4
3676	movups	%xmm5,48(%rsi)
3677	movdqa	%xmm14,%xmm5
3678	movups	%xmm6,64(%rsi)
3679	movdqa	%xmm15,%xmm6
3680	movups	%xmm7,80(%rsi)
3681	movdqa	%xmm1,%xmm7
3682	movups	%xmm8,96(%rsi)
3683	leaq	112(%rsi),%rsi
3684
3685	subq	$0x80,%rdx
3686	ja	L$cbc_dec_loop8
3687
/* Leave the 8-wide loop: undo the %rcx bias and the length bias. */
3688	movaps	%xmm9,%xmm2
3689	leaq	-112(%rcx),%rcx
3690	addq	$0x70,%rdx
3691	jle	L$cbc_dec_clear_tail_collected
3692	movups	%xmm9,(%rsi)
3693	leaq	16(%rsi),%rsi
3694	cmpq	$0x50,%rdx
3695	jbe	L$cbc_dec_tail
3696
3697	movaps	%xmm11,%xmm2
/* More than 0x50 and at most 0x70 bytes: six or seven blocks in one shot. */
3698L$cbc_dec_six_or_seven:
3699	cmpq	$0x60,%rdx
3700	ja	L$cbc_dec_seven
3701
3702	movaps	%xmm7,%xmm8
3703	call	_aesni_decrypt6
3704	pxor	%xmm10,%xmm2
3705	movaps	%xmm8,%xmm10
3706	pxor	%xmm11,%xmm3
3707	movdqu	%xmm2,(%rsi)
3708	pxor	%xmm12,%xmm4
3709	movdqu	%xmm3,16(%rsi)
3710	pxor	%xmm3,%xmm3
3711	pxor	%xmm13,%xmm5
3712	movdqu	%xmm4,32(%rsi)
3713	pxor	%xmm4,%xmm4
3714	pxor	%xmm14,%xmm6
3715	movdqu	%xmm5,48(%rsi)
3716	pxor	%xmm5,%xmm5
3717	pxor	%xmm15,%xmm7
3718	movdqu	%xmm6,64(%rsi)
3719	pxor	%xmm6,%xmm6
3720	leaq	80(%rsi),%rsi
3721	movdqa	%xmm7,%xmm2
3722	pxor	%xmm7,%xmm7
3723	jmp	L$cbc_dec_tail_collected
3724
3725.p2align	4
/* Seven blocks: 8-wide helper with a zeroed eighth lane. */
3726L$cbc_dec_seven:
3727	movups	96(%rdi),%xmm8
3728	xorps	%xmm9,%xmm9
3729	call	_aesni_decrypt8
3730	movups	80(%rdi),%xmm9
3731	pxor	%xmm10,%xmm2
3732	movups	96(%rdi),%xmm10
3733	pxor	%xmm11,%xmm3
3734	movdqu	%xmm2,(%rsi)
3735	pxor	%xmm12,%xmm4
3736	movdqu	%xmm3,16(%rsi)
3737	pxor	%xmm3,%xmm3
3738	pxor	%xmm13,%xmm5
3739	movdqu	%xmm4,32(%rsi)
3740	pxor	%xmm4,%xmm4
3741	pxor	%xmm14,%xmm6
3742	movdqu	%xmm5,48(%rsi)
3743	pxor	%xmm5,%xmm5
3744	pxor	%xmm15,%xmm7
3745	movdqu	%xmm6,64(%rsi)
3746	pxor	%xmm6,%xmm6
3747	pxor	%xmm9,%xmm8
3748	movdqu	%xmm7,80(%rsi)
3749	pxor	%xmm7,%xmm7
3750	leaq	96(%rsi),%rsi
3751	movdqa	%xmm8,%xmm2
3752	pxor	%xmm8,%xmm8
3753	pxor	%xmm9,%xmm9
3754	jmp	L$cbc_dec_tail_collected
3755
3756.p2align	4
/* Six-block loop, using the shared _aesni_decrypt6 routine. */
3757L$cbc_dec_loop6:
3758	movups	%xmm7,(%rsi)
3759	leaq	16(%rsi),%rsi
3760	movdqu	0(%rdi),%xmm2
3761	movdqu	16(%rdi),%xmm3
3762	movdqa	%xmm2,%xmm11
3763	movdqu	32(%rdi),%xmm4
3764	movdqa	%xmm3,%xmm12
3765	movdqu	48(%rdi),%xmm5
3766	movdqa	%xmm4,%xmm13
3767	movdqu	64(%rdi),%xmm6
3768	movdqa	%xmm5,%xmm14
3769	movdqu	80(%rdi),%xmm7
3770	movdqa	%xmm6,%xmm15
3771L$cbc_dec_loop6_enter:
3772	leaq	96(%rdi),%rdi
3773	movdqa	%xmm7,%xmm8
3774
3775	call	_aesni_decrypt6
3776
/* Chaining XORs, stores, and reload of key pointer / round count. */
3777	pxor	%xmm10,%xmm2
3778	movdqa	%xmm8,%xmm10
3779	pxor	%xmm11,%xmm3
3780	movdqu	%xmm2,(%rsi)
3781	pxor	%xmm12,%xmm4
3782	movdqu	%xmm3,16(%rsi)
3783	pxor	%xmm13,%xmm5
3784	movdqu	%xmm4,32(%rsi)
3785	pxor	%xmm14,%xmm6
3786	movq	%rbp,%rcx
3787	movdqu	%xmm5,48(%rsi)
3788	pxor	%xmm15,%xmm7
3789	movl	%r10d,%eax
3790	movdqu	%xmm6,64(%rsi)
3791	leaq	80(%rsi),%rsi
3792	subq	$0x60,%rdx
3793	ja	L$cbc_dec_loop6
3794
3795	movdqa	%xmm7,%xmm2
3796	addq	$0x50,%rdx
3797	jle	L$cbc_dec_clear_tail_collected
3798	movups	%xmm7,(%rsi)
3799	leaq	16(%rsi),%rsi
3800
/*
 * Tail: at most five blocks. Each subq/jbe peels one block off the length
 * and dispatches once it is used up; %xmm11..%xmm15 collect input copies.
 */
3801L$cbc_dec_tail:
3802	movups	(%rdi),%xmm2
3803	subq	$0x10,%rdx
3804	jbe	L$cbc_dec_one
3805
3806	movups	16(%rdi),%xmm3
3807	movaps	%xmm2,%xmm11
3808	subq	$0x10,%rdx
3809	jbe	L$cbc_dec_two
3810
3811	movups	32(%rdi),%xmm4
3812	movaps	%xmm3,%xmm12
3813	subq	$0x10,%rdx
3814	jbe	L$cbc_dec_three
3815
3816	movups	48(%rdi),%xmm5
3817	movaps	%xmm4,%xmm13
3818	subq	$0x10,%rdx
3819	jbe	L$cbc_dec_four
3820
/* Five blocks: 6-wide helper with a zeroed sixth lane. */
3821	movups	64(%rdi),%xmm6
3822	movaps	%xmm5,%xmm14
3823	movaps	%xmm6,%xmm15
3824	xorps	%xmm7,%xmm7
3825	call	_aesni_decrypt6
3826	pxor	%xmm10,%xmm2
3827	movaps	%xmm15,%xmm10
3828	pxor	%xmm11,%xmm3
3829	movdqu	%xmm2,(%rsi)
3830	pxor	%xmm12,%xmm4
3831	movdqu	%xmm3,16(%rsi)
3832	pxor	%xmm3,%xmm3
3833	pxor	%xmm13,%xmm5
3834	movdqu	%xmm4,32(%rsi)
3835	pxor	%xmm4,%xmm4
3836	pxor	%xmm14,%xmm6
3837	movdqu	%xmm5,48(%rsi)
3838	pxor	%xmm5,%xmm5
3839	leaq	64(%rsi),%rsi
3840	movdqa	%xmm6,%xmm2
3841	pxor	%xmm6,%xmm6
3842	pxor	%xmm7,%xmm7
3843	subq	$0x10,%rdx
3844	jmp	L$cbc_dec_tail_collected
3845
3846.p2align	4
3847L$cbc_dec_one:
3848	movaps	%xmm2,%xmm11
3849	movups	(%rcx),%xmm0
3850	movups	16(%rcx),%xmm1
3851	leaq	32(%rcx),%rcx
3852	xorps	%xmm0,%xmm2
3853L$oop_dec1_17:
3854.byte	102,15,56,222,209
3855	decl	%eax
3856	movups	(%rcx),%xmm1
3857	leaq	16(%rcx),%rcx
3858	jnz	L$oop_dec1_17
3859.byte	102,15,56,223,209
3860	xorps	%xmm10,%xmm2
3861	movaps	%xmm11,%xmm10
3862	jmp	L$cbc_dec_tail_collected
3863.p2align	4
3864L$cbc_dec_two:
3865	movaps	%xmm3,%xmm12
3866	call	_aesni_decrypt2
3867	pxor	%xmm10,%xmm2
3868	movaps	%xmm12,%xmm10
3869	pxor	%xmm11,%xmm3
3870	movdqu	%xmm2,(%rsi)
3871	movdqa	%xmm3,%xmm2
3872	pxor	%xmm3,%xmm3
3873	leaq	16(%rsi),%rsi
3874	jmp	L$cbc_dec_tail_collected
3875.p2align	4
3876L$cbc_dec_three:
3877	movaps	%xmm4,%xmm13
3878	call	_aesni_decrypt3
3879	pxor	%xmm10,%xmm2
3880	movaps	%xmm13,%xmm10
3881	pxor	%xmm11,%xmm3
3882	movdqu	%xmm2,(%rsi)
3883	pxor	%xmm12,%xmm4
3884	movdqu	%xmm3,16(%rsi)
3885	pxor	%xmm3,%xmm3
3886	movdqa	%xmm4,%xmm2
3887	pxor	%xmm4,%xmm4
3888	leaq	32(%rsi),%rsi
3889	jmp	L$cbc_dec_tail_collected
3890.p2align	4
3891L$cbc_dec_four:
3892	movaps	%xmm5,%xmm14
3893	call	_aesni_decrypt4
3894	pxor	%xmm10,%xmm2
3895	movaps	%xmm14,%xmm10
3896	pxor	%xmm11,%xmm3
3897	movdqu	%xmm2,(%rsi)
3898	pxor	%xmm12,%xmm4
3899	movdqu	%xmm3,16(%rsi)
3900	pxor	%xmm3,%xmm3
3901	pxor	%xmm13,%xmm5
3902	movdqu	%xmm4,32(%rsi)
3903	pxor	%xmm4,%xmm4
3904	movdqa	%xmm5,%xmm2
3905	pxor	%xmm5,%xmm5
3906	leaq	48(%rsi),%rsi
3907	jmp	L$cbc_dec_tail_collected
3908
3909.p2align	4
/* Entry used by the wide loops, which did not scrub %xmm3..%xmm9 yet. */
3910L$cbc_dec_clear_tail_collected:
3911	pxor	%xmm3,%xmm3
3912	pxor	%xmm4,%xmm4
3913	pxor	%xmm5,%xmm5
3914	pxor	%xmm6,%xmm6
3915	pxor	%xmm7,%xmm7
3916	pxor	%xmm8,%xmm8
3917	pxor	%xmm9,%xmm9
/*
 * %xmm2 holds the last output block, %xmm10 the new chaining value.
 * %rdx & 15 nonzero means only that many bytes of %xmm2 are written.
 */
3918L$cbc_dec_tail_collected:
3919	movups	%xmm10,(%r8)
3920	andq	$15,%rdx
3921	jnz	L$cbc_dec_tail_partial
3922	movups	%xmm2,(%rsi)
3923	pxor	%xmm2,%xmm2
3924	jmp	L$cbc_dec_ret
3925.p2align	4
/*
 * Spill %xmm2 to the aligned scratch slot, byte-copy from it to the output
 * with rep movsb, then overwrite the scratch slot with zeroes.
 */
3926L$cbc_dec_tail_partial:
3927	movaps	%xmm2,(%rsp)
3928	pxor	%xmm2,%xmm2
3929	movq	$16,%rcx
3930	movq	%rsi,%rdi
3931	subq	%rdx,%rcx
3932	leaq	(%rsp),%rsi
3933.long	0x9066A4F3
3934	movdqa	%xmm2,(%rsp)
3935
/* Bulk-path epilogue: restore %rbp and the entry %rsp saved in %r11. */
3936L$cbc_dec_ret:
3937	xorps	%xmm0,%xmm0
3938	pxor	%xmm1,%xmm1
3939	movq	-8(%r11),%rbp
3940	leaq	(%r11),%rsp
3941L$cbc_ret:
3942	.byte	0xf3,0xc3
3943
3944.globl	_aesni_set_decrypt_key
3945.private_extern _aesni_set_decrypt_key
3946
3947.p2align	4
/*
 * _aesni_set_decrypt_key -- exported entry point that builds a decryption
 * key schedule in place.
 *
 * It calls __aesni_set_encrypt_key with the caller's arguments untouched
 * (%rdi, %esi, %rdx), then reverses the order of the round keys and applies
 * aesimc to every key except the first and last of the reversed schedule.
 * Returns %eax from the encrypt-key routine (nonzero = failure, schedule
 * not converted).
 *
 * Opcode bytes: 0x48,0x83,0xEC,0x08 = subq $8,%rsp;
 * 102,15,56,219,M = aesimc; 0xf3,0xc3 = rep ret.
 */
3948_aesni_set_decrypt_key:
3949.byte	0x48,0x83,0xEC,0x08
3950	call	__aesni_set_encrypt_key
/* %esi comes back holding the stored round count; scale it to bytes. */
3951	shll	$4,%esi
3952	testl	%eax,%eax
3953	jnz	L$dec_key_ret
/* %rdi = address of the last round key. */
3954	leaq	16(%rdx,%rsi,1),%rdi
3955
/* Swap the first and last round keys without transforming them. */
3956	movups	(%rdx),%xmm0
3957	movups	(%rdi),%xmm1
3958	movups	%xmm0,(%rdi)
3959	movups	%xmm1,(%rdx)
3960	leaq	16(%rdx),%rdx
3961	leaq	-16(%rdi),%rdi
3962
/* Walk inward from both ends: aesimc each pair and store them swapped. */
3963L$dec_key_inverse:
3964	movups	(%rdx),%xmm0
3965	movups	(%rdi),%xmm1
3966.byte	102,15,56,219,192
3967.byte	102,15,56,219,201
3968	leaq	16(%rdx),%rdx
3969	leaq	-16(%rdi),%rdi
3970	movups	%xmm0,16(%rdi)
3971	movups	%xmm1,-16(%rdx)
3972	cmpq	%rdx,%rdi
3973	ja	L$dec_key_inverse
3974
/* Middle key: transformed in place. */
3975	movups	(%rdx),%xmm0
3976.byte	102,15,56,219,192
3977	pxor	%xmm1,%xmm1
3978	movups	%xmm0,(%rdi)
3979	pxor	%xmm0,%xmm0
3980L$dec_key_ret:
3981	addq	$8,%rsp
3982	.byte	0xf3,0xc3
3983L$SEH_end_set_decrypt_key:
3984
3985.globl	_aesni_set_encrypt_key
3986.private_extern _aesni_set_encrypt_key
3987
3988.p2align	4
/*
 * _aesni_set_encrypt_key / __aesni_set_encrypt_key -- expand a user key into
 * an AES encryption key schedule.
 *
 * Arguments as consumed below (System V AMD64 register order):
 *   %rdi   user key bytes
 *   %esi   key size in bits: 128, 192 or 256
 *   %rdx   output key schedule
 *
 * Returns in %rax: 0 on success, -1 if %rdi or %rdx is NULL, -2 for any
 * other key size. On success round keys are written from (%rdx) onward and
 * the value 9, 11 or 13 is stored in the dword at offset 240 of the
 * schedule; %esi holds that same value on return. %xmm0..%xmm5 are zeroed
 * on every exit path.
 *
 * Each key size has two implementations: one built on aeskeygenassist plus
 * the L$key_expansion_* subroutines, and an "_alt" variant built on
 * pshufb + aesenclast. The variant is chosen from the dword at offset 4 of
 * _OPENSSL_ia32cap_P masked with 268437504 (0x10000800): the "_alt" path is
 * taken when the result equals 268435456 (0x10000000).
 * NOTE(review): presumably "AVX present, XOP absent" -- confirm the bit
 * meanings against the capability-vector documentation.
 *
 * Opcode bytes: 0x48,0x83,0xEC,0x08 = subq $8,%rsp;
 * 102,15,58,223,M,I = aeskeygenassist $I; 102,15,56,0,M = pshufb;
 * 102,15,56,221,M = aesenclast; 0xf3,0xc3 = rep ret.
 */
3989_aesni_set_encrypt_key:
3990__aesni_set_encrypt_key:
3991.byte	0x48,0x83,0xEC,0x08
3992	movq	$-1,%rax
3993	testq	%rdi,%rdi
3994	jz	L$enc_key_ret
3995	testq	%rdx,%rdx
3996	jz	L$enc_key_ret
3997
/* %xmm0 = first 16 key bytes; %rax = write cursor for round key 1. */
3998	movups	(%rdi),%xmm0
3999	xorps	%xmm4,%xmm4
4000	leaq	_OPENSSL_ia32cap_P(%rip),%r10
4001	movl	4(%r10),%r10d
4002	andl	$268437504,%r10d
4003	leaq	16(%rdx),%rax
4004	cmpl	$256,%esi
4005	je	L$14rounds
4006	cmpl	$192,%esi
4007	je	L$12rounds
4008	cmpl	$128,%esi
4009	jne	L$bad_keybits
4010
/* ---- 128-bit key ---- */
4011L$10rounds:
4012	movl	$9,%esi
4013	cmpl	$268435456,%r10d
4014	je	L$10rounds_alt
4015
/* aeskeygenassist $rcon,%xmm0,%xmm1 followed by one expansion step each. */
4016	movups	%xmm0,(%rdx)
4017.byte	102,15,58,223,200,1
4018	call	L$key_expansion_128_cold
4019.byte	102,15,58,223,200,2
4020	call	L$key_expansion_128
4021.byte	102,15,58,223,200,4
4022	call	L$key_expansion_128
4023.byte	102,15,58,223,200,8
4024	call	L$key_expansion_128
4025.byte	102,15,58,223,200,16
4026	call	L$key_expansion_128
4027.byte	102,15,58,223,200,32
4028	call	L$key_expansion_128
4029.byte	102,15,58,223,200,64
4030	call	L$key_expansion_128
4031.byte	102,15,58,223,200,128
4032	call	L$key_expansion_128
4033.byte	102,15,58,223,200,27
4034	call	L$key_expansion_128
4035.byte	102,15,58,223,200,54
4036	call	L$key_expansion_128
/* Store the last round key; 80(%rax) is offset 240 of the schedule. */
4037	movups	%xmm0,(%rax)
4038	movl	%esi,80(%rax)
4039	xorl	%eax,%eax
4040	jmp	L$enc_key_ret
4041
4042.p2align	4
/* 128-bit, alternative path: 8 loop iterations plus two unrolled steps. */
4043L$10rounds_alt:
4044	movdqa	L$key_rotate(%rip),%xmm5
4045	movl	$8,%r10d
4046	movdqa	L$key_rcon1(%rip),%xmm4
4047	movdqa	%xmm0,%xmm2
4048	movdqu	%xmm0,(%rdx)
4049	jmp	L$oop_key128
4050
4051.p2align	4
4052L$oop_key128:
/* pshufb %xmm5,%xmm0 ; aesenclast %xmm4,%xmm0 ; then shift %xmm4 left by 1 */
4053.byte	102,15,56,0,197
4054.byte	102,15,56,221,196
4055	pslld	$1,%xmm4
4056	leaq	16(%rax),%rax
4057
/* %xmm2 = previous key XORed with itself shifted by 4, 8 and 12 bytes. */
4058	movdqa	%xmm2,%xmm3
4059	pslldq	$4,%xmm2
4060	pxor	%xmm2,%xmm3
4061	pslldq	$4,%xmm2
4062	pxor	%xmm2,%xmm3
4063	pslldq	$4,%xmm2
4064	pxor	%xmm3,%xmm2
4065
4066	pxor	%xmm2,%xmm0
4067	movdqu	%xmm0,-16(%rax)
4068	movdqa	%xmm0,%xmm2
4069
4070	decl	%r10d
4071	jnz	L$oop_key128
4072
/* Last two round keys use a different constant, loaded from L$key_rcon1b. */
4073	movdqa	L$key_rcon1b(%rip),%xmm4
4074
4075.byte	102,15,56,0,197
4076.byte	102,15,56,221,196
4077	pslld	$1,%xmm4
4078
4079	movdqa	%xmm2,%xmm3
4080	pslldq	$4,%xmm2
4081	pxor	%xmm2,%xmm3
4082	pslldq	$4,%xmm2
4083	pxor	%xmm2,%xmm3
4084	pslldq	$4,%xmm2
4085	pxor	%xmm3,%xmm2
4086
4087	pxor	%xmm2,%xmm0
4088	movdqu	%xmm0,(%rax)
4089
4090	movdqa	%xmm0,%xmm2
4091.byte	102,15,56,0,197
4092.byte	102,15,56,221,196
4093
4094	movdqa	%xmm2,%xmm3
4095	pslldq	$4,%xmm2
4096	pxor	%xmm2,%xmm3
4097	pslldq	$4,%xmm2
4098	pxor	%xmm2,%xmm3
4099	pslldq	$4,%xmm2
4100	pxor	%xmm3,%xmm2
4101
4102	pxor	%xmm2,%xmm0
4103	movdqu	%xmm0,16(%rax)
4104
/* 96(%rax) is offset 240 of the schedule at this point. */
4105	movl	%esi,96(%rax)
4106	xorl	%eax,%eax
4107	jmp	L$enc_key_ret
4108
4109.p2align	4
/* ---- 192-bit key: %xmm2 holds key bytes 16..23 ---- */
4110L$12rounds:
4111	movq	16(%rdi),%xmm2
4112	movl	$11,%esi
4113	cmpl	$268435456,%r10d
4114	je	L$12rounds_alt
4115
/* aeskeygenassist $rcon,%xmm2,%xmm1, alternating the 192a / 192b steps. */
4116	movups	%xmm0,(%rdx)
4117.byte	102,15,58,223,202,1
4118	call	L$key_expansion_192a_cold
4119.byte	102,15,58,223,202,2
4120	call	L$key_expansion_192b
4121.byte	102,15,58,223,202,4
4122	call	L$key_expansion_192a
4123.byte	102,15,58,223,202,8
4124	call	L$key_expansion_192b
4125.byte	102,15,58,223,202,16
4126	call	L$key_expansion_192a
4127.byte	102,15,58,223,202,32
4128	call	L$key_expansion_192b
4129.byte	102,15,58,223,202,64
4130	call	L$key_expansion_192a
4131.byte	102,15,58,223,202,128
4132	call	L$key_expansion_192b
/* Store the last round key; 48(%rax) is offset 240 of the schedule. */
4133	movups	%xmm0,(%rax)
4134	movl	%esi,48(%rax)
4135	xorq	%rax,%rax
4136	jmp	L$enc_key_ret
4137
4138.p2align	4
/* 192-bit, alternative path: 8 iterations, 24 bytes of schedule each. */
4139L$12rounds_alt:
4140	movdqa	L$key_rotate192(%rip),%xmm5
4141	movdqa	L$key_rcon1(%rip),%xmm4
4142	movl	$8,%r10d
4143	movdqu	%xmm0,(%rdx)
4144	jmp	L$oop_key192
4145
4146.p2align	4
4147L$oop_key192:
4148	movq	%xmm2,0(%rax)
4149	movdqa	%xmm2,%xmm1
/* pshufb %xmm5,%xmm2 ; aesenclast %xmm4,%xmm2 */
4150.byte	102,15,56,0,213
4151.byte	102,15,56,221,212
4152	pslld	$1,%xmm4
4153	leaq	24(%rax),%rax
4154
4155	movdqa	%xmm0,%xmm3
4156	pslldq	$4,%xmm0
4157	pxor	%xmm0,%xmm3
4158	pslldq	$4,%xmm0
4159	pxor	%xmm0,%xmm3
4160	pslldq	$4,%xmm0
4161	pxor	%xmm3,%xmm0
4162
4163	pshufd	$0xff,%xmm0,%xmm3
4164	pxor	%xmm1,%xmm3
4165	pslldq	$4,%xmm1
4166	pxor	%xmm1,%xmm3
4167
4168	pxor	%xmm2,%xmm0
4169	pxor	%xmm3,%xmm2
4170	movdqu	%xmm0,-16(%rax)
4171
4172	decl	%r10d
4173	jnz	L$oop_key192
4174
/* 32(%rax) is offset 240 of the schedule at this point. */
4175	movl	%esi,32(%rax)
4176	xorl	%eax,%eax
4177	jmp	L$enc_key_ret
4178
4179.p2align	4
/* ---- 256-bit key: %xmm2 holds key bytes 16..31; cursor starts at +32 ---- */
4180L$14rounds:
4181	movups	16(%rdi),%xmm2
4182	movl	$13,%esi
4183	leaq	16(%rax),%rax
4184	cmpl	$268435456,%r10d
4185	je	L$14rounds_alt
4186
/*
 * The aeskeygenassist source alternates between %xmm2 (before a 256a step)
 * and %xmm0 (before a 256b step), reusing each rcon value for both.
 */
4187	movups	%xmm0,(%rdx)
4188	movups	%xmm2,16(%rdx)
4189.byte	102,15,58,223,202,1
4190	call	L$key_expansion_256a_cold
4191.byte	102,15,58,223,200,1
4192	call	L$key_expansion_256b
4193.byte	102,15,58,223,202,2
4194	call	L$key_expansion_256a
4195.byte	102,15,58,223,200,2
4196	call	L$key_expansion_256b
4197.byte	102,15,58,223,202,4
4198	call	L$key_expansion_256a
4199.byte	102,15,58,223,200,4
4200	call	L$key_expansion_256b
4201.byte	102,15,58,223,202,8
4202	call	L$key_expansion_256a
4203.byte	102,15,58,223,200,8
4204	call	L$key_expansion_256b
4205.byte	102,15,58,223,202,16
4206	call	L$key_expansion_256a
4207.byte	102,15,58,223,200,16
4208	call	L$key_expansion_256b
4209.byte	102,15,58,223,202,32
4210	call	L$key_expansion_256a
4211.byte	102,15,58,223,200,32
4212	call	L$key_expansion_256b
4213.byte	102,15,58,223,202,64
4214	call	L$key_expansion_256a
/* Store the last round key; 16(%rax) is offset 240 of the schedule. */
4215	movups	%xmm0,(%rax)
4216	movl	%esi,16(%rax)
4217	xorq	%rax,%rax
4218	jmp	L$enc_key_ret
4219
4220.p2align	4
/* 256-bit, alternative path: 7 passes, the last exiting mid-loop. */
4221L$14rounds_alt:
4222	movdqa	L$key_rotate(%rip),%xmm5
4223	movdqa	L$key_rcon1(%rip),%xmm4
4224	movl	$7,%r10d
4225	movdqu	%xmm0,0(%rdx)
4226	movdqa	%xmm2,%xmm1
4227	movdqu	%xmm2,16(%rdx)
4228	jmp	L$oop_key256
4229
4230.p2align	4
4231L$oop_key256:
/* pshufb %xmm5,%xmm2 ; aesenclast %xmm4,%xmm2 */
4232.byte	102,15,56,0,213
4233.byte	102,15,56,221,212
4234
4235	movdqa	%xmm0,%xmm3
4236	pslldq	$4,%xmm0
4237	pxor	%xmm0,%xmm3
4238	pslldq	$4,%xmm0
4239	pxor	%xmm0,%xmm3
4240	pslldq	$4,%xmm0
4241	pxor	%xmm3,%xmm0
4242	pslld	$1,%xmm4
4243
4244	pxor	%xmm2,%xmm0
4245	movdqu	%xmm0,(%rax)
4246
4247	decl	%r10d
4248	jz	L$done_key256
4249
/* Second half of the pair: aesenclast %xmm3,%xmm2 with %xmm3 zeroed. */
4250	pshufd	$0xff,%xmm0,%xmm2
4251	pxor	%xmm3,%xmm3
4252.byte	102,15,56,221,211
4253
4254	movdqa	%xmm1,%xmm3
4255	pslldq	$4,%xmm1
4256	pxor	%xmm1,%xmm3
4257	pslldq	$4,%xmm1
4258	pxor	%xmm1,%xmm3
4259	pslldq	$4,%xmm1
4260	pxor	%xmm3,%xmm1
4261
4262	pxor	%xmm1,%xmm2
4263	movdqu	%xmm2,16(%rax)
4264	leaq	32(%rax),%rax
4265	movdqa	%xmm2,%xmm1
4266
4267	jmp	L$oop_key256
4268
/* 16(%rax) is offset 240 of the schedule at this point. */
4269L$done_key256:
4270	movl	%esi,16(%rax)
4271	xorl	%eax,%eax
4272	jmp	L$enc_key_ret
4273
4274.p2align	4
4275L$bad_keybits:
4276	movq	$-2,%rax
/* Common exit: scrub key material from %xmm0..%xmm5 and pop the pad. */
4277L$enc_key_ret:
4278	pxor	%xmm0,%xmm0
4279	pxor	%xmm1,%xmm1
4280	pxor	%xmm2,%xmm2
4281	pxor	%xmm3,%xmm3
4282	pxor	%xmm4,%xmm4
4283	pxor	%xmm5,%xmm5
4284	addq	$8,%rsp
4285	.byte	0xf3,0xc3
4286L$SEH_end_set_encrypt_key:
4287
4288.p2align	4
// Key-schedule helper with two entry points.
//   L$key_expansion_128:      stores xmm0 at (%rax), advances %rax by 16,
//                             then falls through into the _cold entry.
//   L$key_expansion_128_cold: the same computation without the store/advance.
// Computation (dwords written low to high as {d0, d1, d2, d3}):
//   xmm0[i] <- xmm0[0] ^ ... ^ xmm0[i]      (running XOR over the dwords)
//   xmm0    <- xmm0 ^ broadcast(xmm1[3])
// The running XOR is exact only when dword 0 of xmm4 is zero on entry. Both
// shufps steps copy xmm4[0] onto itself, so that dword is unchanged on exit.
// NOTE(review): xmm1 is presumably an AESKEYGENASSIST result prepared by the
// caller; no call site of this helper is visible in this part of the file.
// Modifies xmm0, xmm1, xmm4; the first entry also writes 16 bytes and moves %rax.
4289L$key_expansion_128:
4290	movups	%xmm0,(%rax)	// emit the current xmm0 to the output cursor
4291	leaq	16(%rax),%rax	// advance the cursor by 16 bytes
4292L$key_expansion_128_cold:
4293	shufps	$16,%xmm0,%xmm4	// xmm4 = {xmm4[0], xmm4[0], xmm0[1], xmm0[0]}
4294	xorps	%xmm4,%xmm0
4295	shufps	$140,%xmm0,%xmm4	// xmm4 = {xmm4[0], xmm4[3], xmm0[0], xmm0[2]}
4296	xorps	%xmm4,%xmm0	// xmm0 now holds the running XOR of its original dwords
4297	shufps	$255,%xmm1,%xmm1	// broadcast dword 3 of xmm1 to all four dwords
4298	xorps	%xmm1,%xmm0
4299	.byte	0xf3,0xc3	// machine code for "rep ret"
4300
4301.p2align	4
// Key-schedule helper with three entry points that share one tail.
//   L$key_expansion_192a:      stores xmm0 at (%rax), advances %rax by 16,
//                              then falls through.
//   L$key_expansion_192a_cold: copies xmm2 into xmm5, then falls through.
//   L$key_expansion_192b_warm: shared tail only; L$key_expansion_192b jumps here.
// Shared tail (dwords written low to high as {d0, d1, d2, d3}):
//   xmm0[i] <- xmm0[0] ^ ... ^ xmm0[i], then xmm0 ^= broadcast(xmm1[1])
//   xmm2    <- xmm2 ^ (xmm2 shifted left by 4 bytes) ^ broadcast(new xmm0[3])
// The running XOR on xmm0 is exact only when dword 0 of xmm4 is zero on entry;
// the shufps steps leave xmm4[0] unchanged.
// NOTE(review): xmm1 is presumably an AESKEYGENASSIST result prepared by the
// caller; no call site of this helper is visible in this part of the file.
// Modifies xmm0, xmm1, xmm2, xmm3, xmm4 (and xmm5 via the 192a/192a_cold entries).
4302L$key_expansion_192a:
4303	movups	%xmm0,(%rax)	// emit the current xmm0 to the output cursor
4304	leaq	16(%rax),%rax	// advance the cursor by 16 bytes
4305L$key_expansion_192a_cold:
4306	movaps	%xmm2,%xmm5	// keep a copy of xmm2; L$key_expansion_192b reads xmm5
4307L$key_expansion_192b_warm:
4308	shufps	$16,%xmm0,%xmm4	// xmm4 = {xmm4[0], xmm4[0], xmm0[1], xmm0[0]}
4309	movdqa	%xmm2,%xmm3	// xmm3 = copy of xmm2 for the byte shift below
4310	xorps	%xmm4,%xmm0
4311	shufps	$140,%xmm0,%xmm4	// xmm4 = {xmm4[0], xmm4[3], xmm0[0], xmm0[2]}
4312	pslldq	$4,%xmm3	// xmm3 = xmm2 shifted left by 4 bytes (one dword)
4313	xorps	%xmm4,%xmm0	// xmm0 now holds the running XOR of its original dwords
4314	pshufd	$85,%xmm1,%xmm1	// broadcast dword 1 of xmm1 to all four dwords
4315	pxor	%xmm3,%xmm2	// xmm2 ^= (xmm2 shifted left by 4 bytes)
4316	pxor	%xmm1,%xmm0
4317	pshufd	$255,%xmm0,%xmm3	// xmm3 = broadcast of dword 3 of the updated xmm0
4318	pxor	%xmm3,%xmm2
4319	.byte	0xf3,0xc3	// machine code for "rep ret"
4320
4321.p2align	4
// Writes 32 bytes at (%rax) assembled from xmm5, xmm0 and xmm2, advances %rax
// by 32, then jumps into the shared tail at L$key_expansion_192b_warm.
// Because this is a jmp rather than a call, the "rep ret" that ends that tail
// returns straight to the caller of this helper.
// Stored layout (dwords written low to high):
//   (%rax)   = {xmm5[0], xmm5[1], xmm0[0], xmm0[1]}
//   16(%rax) = {xmm0[2], xmm0[3], xmm2[0], xmm2[1]}
// xmm5 is the copy of xmm2 taken at L$key_expansion_192a_cold.
// Modifies xmm3 and xmm5 here, plus everything the shared tail modifies.
4322L$key_expansion_192b:
4323	movaps	%xmm0,%xmm3	// work on a copy so xmm0 stays intact for the tail
4324	shufps	$68,%xmm0,%xmm5	// xmm5 = {xmm5[0], xmm5[1], xmm0[0], xmm0[1]}
4325	movups	%xmm5,(%rax)
4326	shufps	$78,%xmm2,%xmm3	// xmm3 = {xmm0[2], xmm0[3], xmm2[0], xmm2[1]}
4327	movups	%xmm3,16(%rax)
4328	leaq	32(%rax),%rax	// advance the cursor past both stores
4329	jmp	L$key_expansion_192b_warm
4330
4331.p2align	4
// Key-schedule helper used by the L$14rounds path, with two entry points.
//   L$key_expansion_256a:      stores xmm2 at (%rax), advances %rax by 16,
//                              then falls through into the _cold entry.
//   L$key_expansion_256a_cold: the same computation without the store/advance.
// Computation (dwords written low to high as {d0, d1, d2, d3}):
//   xmm0[i] <- xmm0[0] ^ ... ^ xmm0[i]      (running XOR over the dwords)
//   xmm0    <- xmm0 ^ broadcast(xmm1[3])
// Each call site in L$14rounds first emits bytes 102,15,58,223,202,imm, which
// encode "aeskeygenassist $imm,%xmm2,%xmm1", so xmm1 carries that result here.
// The running XOR is exact only when dword 0 of xmm4 is zero on entry; the
// shufps steps leave xmm4[0] unchanged.
// Modifies xmm0, xmm1, xmm4; the first entry also writes 16 bytes and moves %rax.
4332L$key_expansion_256a:
4333	movups	%xmm2,(%rax)	// emit the current xmm2 to the output cursor
4334	leaq	16(%rax),%rax	// advance the cursor by 16 bytes
4335L$key_expansion_256a_cold:
4336	shufps	$16,%xmm0,%xmm4	// xmm4 = {xmm4[0], xmm4[0], xmm0[1], xmm0[0]}
4337	xorps	%xmm4,%xmm0
4338	shufps	$140,%xmm0,%xmm4	// xmm4 = {xmm4[0], xmm4[3], xmm0[0], xmm0[2]}
4339	xorps	%xmm4,%xmm0	// xmm0 now holds the running XOR of its original dwords
4340	shufps	$255,%xmm1,%xmm1	// broadcast dword 3 of xmm1 to all four dwords
4341	xorps	%xmm1,%xmm0
4342	.byte	0xf3,0xc3	// machine code for "rep ret"
4343
4344.p2align	4
// Key-schedule helper used by the L$14rounds path; counterpart of
// L$key_expansion_256a that updates xmm2 instead of xmm0.
// Stores xmm0 at (%rax), advances %rax by 16, then computes
// (dwords written low to high as {d0, d1, d2, d3}):
//   xmm2[i] <- xmm2[0] ^ ... ^ xmm2[i]      (running XOR over the dwords)
//   xmm2    <- xmm2 ^ broadcast(xmm1[2])
// Each call site in L$14rounds first emits bytes 102,15,58,223,200,imm, which
// encode "aeskeygenassist $imm,%xmm0,%xmm1", so xmm1 carries that result here.
// The running XOR is exact only when dword 0 of xmm4 is zero on entry; the
// shufps steps leave xmm4[0] unchanged.
// Modifies xmm1, xmm2, xmm4, %rax and 16 bytes of memory at the old (%rax).
4345L$key_expansion_256b:
4346	movups	%xmm0,(%rax)	// emit the current xmm0 to the output cursor
4347	leaq	16(%rax),%rax	// advance the cursor by 16 bytes
4348
4349	shufps	$16,%xmm2,%xmm4	// xmm4 = {xmm4[0], xmm4[0], xmm2[1], xmm2[0]}
4350	xorps	%xmm4,%xmm2
4351	shufps	$140,%xmm2,%xmm4	// xmm4 = {xmm4[0], xmm4[3], xmm2[0], xmm2[2]}
4352	xorps	%xmm4,%xmm2	// xmm2 now holds the running XOR of its original dwords
4353	shufps	$170,%xmm1,%xmm1	// broadcast dword 2 of xmm1 to all four dwords
4354	xorps	%xmm1,%xmm2
4355	.byte	0xf3,0xc3	// machine code for "rep ret"
4356
4357
4358.p2align	6
// Constant tables. The block starts on a 64-byte boundary (see the .p2align 6
// directly above) and every table is exactly 16 bytes, so each label below is
// 16-byte aligned. The movdqa loads of L$key_rotate and L$key_rcon1 in
// L$14rounds_alt depend on that alignment.

// Byte indices 15 down to 0.
// NOTE(review): presumably a pshufb control that reverses 16 bytes; the use
// site is not visible in this part of the file.
4359L$bswap_mask:
4360.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
// Three dwords of 6 followed by a zero dword.
// NOTE(review): use site not visible here; confirm what the step of 6 matches.
4361L$increment32:
4362.long	6,6,6,0
// The value 1 in the lowest dword, all other dwords zero.
4363L$increment64:
4364.long	1,0,0,0
// Dwords 0x87, 0, 1, 0.
// NOTE(review): 0x87 is presumably the XTS GF(2^128) reduction constant;
// confirm at the use site.
4365L$xts_magic:
4366.long	0x87,0,1,0
// Fifteen zero bytes followed by a single 1 in the highest-addressed byte.
4367L$increment1:
4368.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
// Loaded into xmm5 by L$14rounds_alt and used as the pshufb control in
// L$oop_key256. In memory each dword is bytes 0x0d,0x0e,0x0f,0x0c, so every
// destination dword receives source bytes 13,14,15,12.
4369L$key_rotate:
4370.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
// Same pattern as L$key_rotate but selecting source bytes 5,6,7,4.
// NOTE(review): use site not visible in this part of the file.
4371L$key_rotate192:
4372.long	0x04070605,0x04070605,0x04070605,0x04070605
// Loaded into xmm4 by L$14rounds_alt; L$oop_key256 doubles it with
// "pslld $1,%xmm4" on every iteration.
4373L$key_rcon1:
4374.long	1,1,1,1
// The value 0x1b in every dword.
// NOTE(review): use site not visible in this part of the file.
4375L$key_rcon1b:
4376.long	0x1b,0x1b,0x1b,0x1b
4377
// NUL-terminated ASCII: "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>"
4378.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
4379.p2align	6
4380#endif
4381