• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11#include "ring_core_generated/prefix_symbols_asm.h"
12.text
13
# _aes_hw_encrypt: encrypt a single 16-byte block with AES-NI.
# Registers on entry, as used by the code below:
#   rdi - address the 16-byte input block is loaded from
#   rsi - address the 16-byte result is stored to
#   rdx - address of the key schedule: 16-byte round keys starting at
#         offset 0, and a 32-bit loop count at byte offset 240
# Clobbers eax, rdx, flags and xmm0-xmm2. The three xmm registers are
# zeroed before returning.
# Hand-encoded instructions used here:
#   aesenc %xmm1,%xmm2      <- .byte 102,15,56,220,209
#   aesenclast %xmm1,%xmm2  <- .byte 102,15,56,221,209
#   rep ret                 <- .byte 0xf3,0xc3
14.globl	_aes_hw_encrypt
15.private_extern _aes_hw_encrypt
16
17.p2align	4
18_aes_hw_encrypt:
19
20	movups	(%rdi),%xmm2
21	movl	240(%rdx),%eax
22	movups	(%rdx),%xmm0
23	movups	16(%rdx),%xmm1
24	leaq	32(%rdx),%rdx
# Initial whitening: XOR the block with the first round key.
25	xorps	%xmm0,%xmm2
# One aesenc per pass, each with the next 16-byte round key. The jnz tests
# ZF produced by decl; the movups and leaq in between leave flags alone.
26L$oop_enc1_1:
27.byte	102,15,56,220,209
28	decl	%eax
29	movups	(%rdx),%xmm1
30	leaq	16(%rdx),%rdx
31	jnz	L$oop_enc1_1
# Final round (aesenclast), then wipe key material from xmm0/xmm1, store
# the result, and wipe the result register too.
32.byte	102,15,56,221,209
33	pxor	%xmm0,%xmm0
34	pxor	%xmm1,%xmm1
35	movups	%xmm2,(%rsi)
36	pxor	%xmm2,%xmm2
37	.byte	0xf3,0xc3
38
39
40
# _aesni_encrypt2: run the AES round sequence on two blocks in parallel.
# Registers on entry, as used by the code below:
#   rcx        - address of the key schedule (16-byte round keys)
#   eax        - loop count; shifted left by 4 to become a byte length
#   xmm2, xmm3 - the two blocks; overwritten with the results
# Clobbers rax, rcx, flags, xmm0 and xmm1 (key material is left in xmm0
# and xmm1 on return). No call to this label is visible in this file.
# Hand-encoded instructions: 102,15,56,220,N is aesenc and
# 102,15,56,221,N is aesenclast; N = 209/217 applies xmm1 to xmm2/xmm3,
# N = 208/216 applies xmm0 to xmm2/xmm3. 0xf3,0xc3 is rep ret.
41.p2align	4
42_aesni_encrypt2:
43
44	movups	(%rcx),%xmm0
45	shll	$4,%eax
46	movups	16(%rcx),%xmm1
47	xorps	%xmm0,%xmm2
48	xorps	%xmm0,%xmm3
49	movups	32(%rcx),%xmm0
# Point rcx past the schedule (start + 32 + 16*count) and turn rax into a
# negative byte offset that climbs by 32 per pass until it reaches zero.
50	leaq	32(%rcx,%rax,1),%rcx
51	negq	%rax
52	addq	$16,%rax
53
# Two rounds per pass, alternating round keys held in xmm1 and xmm0.
# The jnz tests ZF from addq; movups does not modify flags.
54L$enc_loop2:
55.byte	102,15,56,220,209
56.byte	102,15,56,220,217
57	movups	(%rcx,%rax,1),%xmm1
58	addq	$32,%rax
59.byte	102,15,56,220,208
60.byte	102,15,56,220,216
61	movups	-16(%rcx,%rax,1),%xmm0
62	jnz	L$enc_loop2
63
# Last aesenc with xmm1, then aesenclast with xmm0.
64.byte	102,15,56,220,209
65.byte	102,15,56,220,217
66.byte	102,15,56,221,208
67.byte	102,15,56,221,216
68	.byte	0xf3,0xc3
69
70
71
# _aesni_encrypt3: run the AES round sequence on three blocks in parallel.
# Registers on entry, as used by the code below:
#   rcx       - address of the key schedule (16-byte round keys)
#   eax       - loop count; shifted left by 4 to become a byte length
#   xmm2-xmm4 - the three blocks; overwritten with the results
# Clobbers rax, rcx, flags, xmm0 and xmm1 (key material is left in xmm0
# and xmm1 on return). No call to this label is visible in this file.
# Hand-encoded instructions: 102,15,56,220,N is aesenc and
# 102,15,56,221,N is aesenclast; N = 209/217/225 applies xmm1 to
# xmm2/xmm3/xmm4, N = 208/216/224 applies xmm0. 0xf3,0xc3 is rep ret.
72.p2align	4
73_aesni_encrypt3:
74
75	movups	(%rcx),%xmm0
76	shll	$4,%eax
77	movups	16(%rcx),%xmm1
78	xorps	%xmm0,%xmm2
79	xorps	%xmm0,%xmm3
80	xorps	%xmm0,%xmm4
81	movups	32(%rcx),%xmm0
# rcx = start + 32 + 16*count; rax = negative byte offset that climbs by
# 32 per pass until it reaches zero.
82	leaq	32(%rcx,%rax,1),%rcx
83	negq	%rax
84	addq	$16,%rax
85
# Two rounds per pass, alternating round keys held in xmm1 and xmm0.
# The jnz tests ZF from addq; movups does not modify flags.
86L$enc_loop3:
87.byte	102,15,56,220,209
88.byte	102,15,56,220,217
89.byte	102,15,56,220,225
90	movups	(%rcx,%rax,1),%xmm1
91	addq	$32,%rax
92.byte	102,15,56,220,208
93.byte	102,15,56,220,216
94.byte	102,15,56,220,224
95	movups	-16(%rcx,%rax,1),%xmm0
96	jnz	L$enc_loop3
97
# Last aesenc with xmm1, then aesenclast with xmm0.
98.byte	102,15,56,220,209
99.byte	102,15,56,220,217
100.byte	102,15,56,220,225
101.byte	102,15,56,221,208
102.byte	102,15,56,221,216
103.byte	102,15,56,221,224
104	.byte	0xf3,0xc3
105
106
107
# _aesni_encrypt4: run the AES round sequence on four blocks in parallel.
# Registers on entry, as used by the code below:
#   rcx       - address of the key schedule (16-byte round keys)
#   eax       - loop count; shifted left by 4 to become a byte length
#   xmm2-xmm5 - the four blocks; overwritten with the results
# Clobbers rax, rcx, flags, xmm0 and xmm1 (key material is left in xmm0
# and xmm1 on return). No call to this label is visible in this file.
# Hand-encoded instructions: 102,15,56,220,N is aesenc and
# 102,15,56,221,N is aesenclast; N = 209/217/225/233 applies xmm1 to
# xmm2..xmm5, N = 208/216/224/232 applies xmm0. 0xf3,0xc3 is rep ret.
108.p2align	4
109_aesni_encrypt4:
110
111	movups	(%rcx),%xmm0
112	shll	$4,%eax
113	movups	16(%rcx),%xmm1
114	xorps	%xmm0,%xmm2
115	xorps	%xmm0,%xmm3
116	xorps	%xmm0,%xmm4
117	xorps	%xmm0,%xmm5
118	movups	32(%rcx),%xmm0
# rcx = start + 32 + 16*count; rax = negative byte offset that climbs by
# 32 per pass until it reaches zero.
119	leaq	32(%rcx,%rax,1),%rcx
120	negq	%rax
# The bytes 0x0f,0x1f,0x00 encode a three-byte nop (nopl (%rax));
# presumably padding that positions the loop entry.
121.byte	0x0f,0x1f,0x00
122	addq	$16,%rax
123
# Two rounds per pass, alternating round keys held in xmm1 and xmm0.
# The jnz tests ZF from addq; movups does not modify flags.
124L$enc_loop4:
125.byte	102,15,56,220,209
126.byte	102,15,56,220,217
127.byte	102,15,56,220,225
128.byte	102,15,56,220,233
129	movups	(%rcx,%rax,1),%xmm1
130	addq	$32,%rax
131.byte	102,15,56,220,208
132.byte	102,15,56,220,216
133.byte	102,15,56,220,224
134.byte	102,15,56,220,232
135	movups	-16(%rcx,%rax,1),%xmm0
136	jnz	L$enc_loop4
137
# Last aesenc with xmm1, then aesenclast with xmm0.
138.byte	102,15,56,220,209
139.byte	102,15,56,220,217
140.byte	102,15,56,220,225
141.byte	102,15,56,220,233
142.byte	102,15,56,221,208
143.byte	102,15,56,221,216
144.byte	102,15,56,221,224
145.byte	102,15,56,221,232
146	.byte	0xf3,0xc3
147
148
149
# _aesni_encrypt6: run the AES round sequence on six blocks in parallel.
# Registers on entry, as used by the code below:
#   rcx       - address of the key schedule (16-byte round keys)
#   eax       - loop count; shifted left by 4 to become a byte length
#   xmm2-xmm7 - the six blocks; overwritten with the results
# Clobbers rax, rcx, flags, xmm0 and xmm1 (key material is left in xmm0
# and xmm1 on return).
# L$enc_loop6 is also entered directly by a call from L$ctr32_loop6, which
# prepares rcx, rax, xmm0 and xmm1 itself and has already applied the
# first rounds; the rep ret at the bottom returns to that caller as well.
# Hand-encoded instructions: 102,15,56,220,N is aesenc and
# 102,15,56,221,N is aesenclast; N = 209,217,225,233,241,249 applies xmm1
# to xmm2..xmm7, N = 208,216,224,232,240,248 applies xmm0.
150.p2align	4
151_aesni_encrypt6:
152
153	movups	(%rcx),%xmm0
154	shll	$4,%eax
155	movups	16(%rcx),%xmm1
# Whitening of the six blocks is interleaved with the first aesenc on
# xmm2-xmm4 and with the pointer setup (rcx = start + 32 + 16*count,
# rax = negative byte offset).
156	xorps	%xmm0,%xmm2
157	pxor	%xmm0,%xmm3
158	pxor	%xmm0,%xmm4
159.byte	102,15,56,220,209
160	leaq	32(%rcx,%rax,1),%rcx
161	negq	%rax
162.byte	102,15,56,220,217
163	pxor	%xmm0,%xmm5
164	pxor	%xmm0,%xmm6
165.byte	102,15,56,220,225
166	pxor	%xmm0,%xmm7
167	movups	(%rcx,%rax,1),%xmm0
168	addq	$16,%rax
# Join the loop after its first three aesenc lines, which were already
# performed above for xmm2-xmm4.
169	jmp	L$enc_loop6_enter
170.p2align	4
# Two rounds per pass, alternating round keys held in xmm1 and xmm0.
# The jnz tests ZF from addq; movups does not modify flags.
171L$enc_loop6:
172.byte	102,15,56,220,209
173.byte	102,15,56,220,217
174.byte	102,15,56,220,225
175L$enc_loop6_enter:
176.byte	102,15,56,220,233
177.byte	102,15,56,220,241
178.byte	102,15,56,220,249
179	movups	(%rcx,%rax,1),%xmm1
180	addq	$32,%rax
181.byte	102,15,56,220,208
182.byte	102,15,56,220,216
183.byte	102,15,56,220,224
184.byte	102,15,56,220,232
185.byte	102,15,56,220,240
186.byte	102,15,56,220,248
187	movups	-16(%rcx,%rax,1),%xmm0
188	jnz	L$enc_loop6
189
# Last aesenc with xmm1, then aesenclast with xmm0.
190.byte	102,15,56,220,209
191.byte	102,15,56,220,217
192.byte	102,15,56,220,225
193.byte	102,15,56,220,233
194.byte	102,15,56,220,241
195.byte	102,15,56,220,249
196.byte	102,15,56,221,208
197.byte	102,15,56,221,216
198.byte	102,15,56,221,224
199.byte	102,15,56,221,232
200.byte	102,15,56,221,240
201.byte	102,15,56,221,248
202	.byte	0xf3,0xc3
203
204
205
# _aesni_encrypt8: run the AES round sequence on eight blocks in parallel.
# Registers on entry, as used by the code below:
#   rcx       - address of the key schedule (16-byte round keys)
#   eax       - loop count; shifted left by 4 to become a byte length
#   xmm2-xmm9 - the eight blocks; overwritten with the results
# Clobbers rax, rcx, flags, xmm0 and xmm1 (key material is left in xmm0
# and xmm1 on return).
# L$enc_loop8_enter is also entered directly by a call from L$ctr32_tail,
# which prepares rcx, rax, xmm0 and xmm1 itself and has already applied
# the first round; the rep ret at the bottom returns to that caller too.
# Hand-encoded instructions: 102,15,56,220,N is aesenc and
# 102,15,56,221,N is aesenclast; N = 209,217,225,233,241,249 applies xmm1
# to xmm2..xmm7, N = 208,216,224,232,240,248 applies xmm0. The six-byte
# forms with 68 after 102 target xmm8 (N = 193 or 192) and xmm9 (N = 201
# or 200), again with xmm1 or xmm0 as the round key.
206.p2align	4
207_aesni_encrypt8:
208
209	movups	(%rcx),%xmm0
210	shll	$4,%eax
211	movups	16(%rcx),%xmm1
# Whitening of the eight blocks is interleaved with the first aesenc on
# xmm2 and xmm3 and with the pointer setup (rcx = start + 32 + 16*count,
# rax = negative byte offset).
212	xorps	%xmm0,%xmm2
213	xorps	%xmm0,%xmm3
214	pxor	%xmm0,%xmm4
215	pxor	%xmm0,%xmm5
216	pxor	%xmm0,%xmm6
217	leaq	32(%rcx,%rax,1),%rcx
218	negq	%rax
219.byte	102,15,56,220,209
220	pxor	%xmm0,%xmm7
221	pxor	%xmm0,%xmm8
222.byte	102,15,56,220,217
223	pxor	%xmm0,%xmm9
224	movups	(%rcx,%rax,1),%xmm0
225	addq	$16,%rax
# Join the loop after its first two aesenc lines, which were already
# performed above for xmm2 and xmm3.
226	jmp	L$enc_loop8_inner
227.p2align	4
# Two rounds per pass, alternating round keys held in xmm1 and xmm0.
# The jnz tests ZF from addq; movups does not modify flags.
228L$enc_loop8:
229.byte	102,15,56,220,209
230.byte	102,15,56,220,217
231L$enc_loop8_inner:
232.byte	102,15,56,220,225
233.byte	102,15,56,220,233
234.byte	102,15,56,220,241
235.byte	102,15,56,220,249
236.byte	102,68,15,56,220,193
237.byte	102,68,15,56,220,201
238L$enc_loop8_enter:
239	movups	(%rcx,%rax,1),%xmm1
240	addq	$32,%rax
241.byte	102,15,56,220,208
242.byte	102,15,56,220,216
243.byte	102,15,56,220,224
244.byte	102,15,56,220,232
245.byte	102,15,56,220,240
246.byte	102,15,56,220,248
247.byte	102,68,15,56,220,192
248.byte	102,68,15,56,220,200
249	movups	-16(%rcx,%rax,1),%xmm0
250	jnz	L$enc_loop8
251
# Last aesenc with xmm1, then aesenclast with xmm0.
252.byte	102,15,56,220,209
253.byte	102,15,56,220,217
254.byte	102,15,56,220,225
255.byte	102,15,56,220,233
256.byte	102,15,56,220,241
257.byte	102,15,56,220,249
258.byte	102,68,15,56,220,193
259.byte	102,68,15,56,220,201
260.byte	102,15,56,221,208
261.byte	102,15,56,221,216
262.byte	102,15,56,221,224
263.byte	102,15,56,221,232
264.byte	102,15,56,221,240
265.byte	102,15,56,221,248
266.byte	102,68,15,56,221,192
267.byte	102,68,15,56,221,200
268	.byte	0xf3,0xc3
269
270
# _aes_hw_ctr32_encrypt_blocks: AES counter mode over whole 16-byte blocks.
# Each output block is the input block XORed with the encryption of a
# counter block; only the last 32-bit word of the counter block changes
# from block to block (32-bit arithmetic on its byte-swapped value).
# Registers on entry, as used by the code below:
#   rdi - input address, advanced as blocks are consumed
#   rsi - output address, advanced as blocks are produced
#   rdx - number of 16-byte blocks
#   rcx - key schedule address (16-byte round keys from offset 0 and a
#         32-bit loop count at byte offset 240)
#   r8  - address of the 16-byte initial counter block; it is only read,
#         no store to it appears in this function
# The single-block case (rdx == 1) uses no stack. Every other count takes
# the bulk path, which keeps the entry rsp in r11, saves rbp, and reserves
# 128 bytes of 16-byte-aligned stack holding eight counter blocks that are
# already XORed with the first round key. On the way out of the bulk path
# xmm0-xmm15, ebp and the 128-byte scratch area are zeroed.
# NOTE(review): a block count of zero is not special-cased. It takes the
# bulk path, reaches L$ctr32_loop3, and the first block is loaded from rdi
# and stored to rsi before rdx is compared, so callers must pass a count
# of at least 1.
# Hand-encoded instructions used here:
#   aesenc      <- 102,15,56,220,N     (N as in the _aesni_encryptN code:
#                                       209..249 use xmm1, 208..248 use xmm0)
#   aesenclast  <- 102,15,56,221,N
#   forms with 68, 65 or 69 after 102 carry a REX prefix that selects
#   xmm8-xmm15 as destination and/or source
#   pinsrd $3   <- 102,15,58,34,N,3    (216: eax into xmm3, 226: edx into
#                                       xmm4, 232: eax into xmm5)
#   movbe %eax,D(%rsp) <- 0x0f,0x38,0xf1,0x44,0x24,D
#   two-byte nop       <- 0x66,0x90
#   rep ret            <- 0xf3,0xc3
271.globl	_aes_hw_ctr32_encrypt_blocks
272.private_extern _aes_hw_ctr32_encrypt_blocks
273
274.p2align	4
275_aes_hw_ctr32_encrypt_blocks:
276
277	cmpq	$1,%rdx
278	jne	L$ctr32_bulk
279
280
281
# Single block: encrypt the counter block as-is (same one-block loop shape
# as _aes_hw_encrypt, with edx as the loop count), XOR with the input
# block, store, and wipe xmm0-xmm3.
282	movups	(%r8),%xmm2
283	movups	(%rdi),%xmm3
284	movl	240(%rcx),%edx
285	movups	(%rcx),%xmm0
286	movups	16(%rcx),%xmm1
287	leaq	32(%rcx),%rcx
288	xorps	%xmm0,%xmm2
289L$oop_enc1_2:
290.byte	102,15,56,220,209
291	decl	%edx
292	movups	(%rcx),%xmm1
293	leaq	16(%rcx),%rcx
294	jnz	L$oop_enc1_2
295.byte	102,15,56,221,209
296	pxor	%xmm0,%xmm0
297	pxor	%xmm1,%xmm1
298	xorps	%xmm3,%xmm2
299	pxor	%xmm3,%xmm3
300	movups	%xmm2,(%rsi)
301	xorps	%xmm2,%xmm2
302	jmp	L$ctr32_epilogue
303
304.p2align	4
# Bulk path prologue: r11 remembers the entry rsp, so the pushed rbp lives
# at -8(%r11); the scratch area is then carved out and aligned to 16.
305L$ctr32_bulk:
306	leaq	(%rsp),%r11
307
308	pushq	%rbp
309
310	subq	$128,%rsp
311	andq	$-16,%rsp
312
313
314
315
# Build eight counter blocks. xmm2 = counter block XOR first round key.
# r8d = last counter word in host byte order (bswapl); ebp = last word of
# the first round key, XORed into every new counter word so that the
# stored blocks stay pre-whitened. rdx is parked in r10 while edx is used
# as scratch.
316	movdqu	(%r8),%xmm2
317	movdqu	(%rcx),%xmm0
318	movl	12(%r8),%r8d
319	pxor	%xmm0,%xmm2
320	movl	12(%rcx),%ebp
321	movdqa	%xmm2,0(%rsp)
322	bswapl	%r8d
323	movdqa	%xmm2,%xmm3
324	movdqa	%xmm2,%xmm4
325	movdqa	%xmm2,%xmm5
326	movdqa	%xmm2,64(%rsp)
327	movdqa	%xmm2,80(%rsp)
328	movdqa	%xmm2,96(%rsp)
329	movq	%rdx,%r10
330	movdqa	%xmm2,112(%rsp)
331
# Counters +1, +2, +3 go into word 3 of xmm3, xmm4, xmm5 via pinsrd and
# are saved in the slots at offsets 16, 32 and 48.
332	leaq	1(%r8),%rax
333	leaq	2(%r8),%rdx
334	bswapl	%eax
335	bswapl	%edx
336	xorl	%ebp,%eax
337	xorl	%ebp,%edx
338.byte	102,15,58,34,216,3
339	leaq	3(%r8),%rax
340	movdqa	%xmm3,16(%rsp)
341.byte	102,15,58,34,226,3
342	bswapl	%eax
343	movq	%r10,%rdx
344	leaq	4(%r8),%r10
345	movdqa	%xmm4,32(%rsp)
346	xorl	%ebp,%eax
347	bswapl	%r10d
348.byte	102,15,58,34,232,3
349	xorl	%ebp,%r10d
350	movdqa	%xmm5,48(%rsp)
# Counters +4 .. +7 are patched straight into word 3 of the slots at
# offsets 64, 80, 96 and 112. eax receives the loop count from the key.
351	leaq	5(%r8),%r9
352	movl	%r10d,64+12(%rsp)
353	bswapl	%r9d
354	leaq	6(%r8),%r10
355	movl	240(%rcx),%eax
356	xorl	%ebp,%r9d
357	bswapl	%r10d
358	movl	%r9d,80+12(%rsp)
359	xorl	%ebp,%r10d
360	leaq	7(%r8),%r9
361	movl	%r10d,96+12(%rsp)
362	bswapl	%r9d
# Capability probe: r10d = second 32-bit word of _OPENSSL_ia32cap_P masked
# with 71303168 (0x04400000, bits 22 and 26).
# NOTE(review): in OpenSSL-style capability vectors this word mirrors
# CPUID.1:ECX, where bit 22 is MOVBE and bit 26 is XSAVE - confirm against
# the definition of _OPENSSL_ia32cap_P used by this project.
363	leaq	_OPENSSL_ia32cap_P(%rip),%r10
364	movl	4(%r10),%r10d
365	xorl	%ebp,%r9d
366	andl	$71303168,%r10d
367	movl	%r9d,112+12(%rsp)
368
369	movups	16(%rcx),%xmm1
370
371	movdqa	64(%rsp),%xmm6
372	movdqa	80(%rsp),%xmm7
373
# Fewer than 8 blocks: go straight to the tail code.
374	cmpq	$8,%rdx
375	jb	L$ctr32_tail
376
# Masked value 4194304 (0x00400000: bit 22 set, bit 26 clear) selects the
# six-blocks-per-pass loop, which uses movbe. Otherwise use the
# eight-blocks-per-pass loop with rcx biased by +128 and rdx by -8.
377	subq	$6,%rdx
378	cmpl	$4194304,%r10d
379	je	L$ctr32_6x
380
381	leaq	128(%rcx),%rcx
382	subq	$2,%rdx
383	jmp	L$ctr32_loop8
384
385.p2align	4
# Setup for the six-block loop: rcx = key + 32 + 16*count and
# r10 = 48 - 16*count, so key-relative loads are written as
# D(%rcx,%r10,1). ebp is byte-swapped because the movbe stores below swap
# the counter word after the XOR.
386L$ctr32_6x:
387	shll	$4,%eax
388	movl	$48,%r10d
389	bswapl	%ebp
390	leaq	32(%rcx,%rax,1),%rcx
391	subq	%rax,%r10
392	jmp	L$ctr32_loop6
393
394.p2align	4
# Six blocks per pass. The first rounds on xmm2-xmm7 are interleaved with
# refreshing word 3 of the first six stack slots (offsets 12, 28, 44, 60,
# 76, 92) with the counters for the following pass; the remaining rounds
# run in L$enc_loop6 inside _aesni_encrypt6.
395L$ctr32_loop6:
396	addl	$6,%r8d
397	movups	-48(%rcx,%r10,1),%xmm0
398.byte	102,15,56,220,209
399	movl	%r8d,%eax
400	xorl	%ebp,%eax
401.byte	102,15,56,220,217
402.byte	0x0f,0x38,0xf1,0x44,0x24,12
403	leal	1(%r8),%eax
404.byte	102,15,56,220,225
405	xorl	%ebp,%eax
406.byte	0x0f,0x38,0xf1,0x44,0x24,28
407.byte	102,15,56,220,233
408	leal	2(%r8),%eax
409	xorl	%ebp,%eax
410.byte	102,15,56,220,241
411.byte	0x0f,0x38,0xf1,0x44,0x24,44
412	leal	3(%r8),%eax
413.byte	102,15,56,220,249
414	movups	-32(%rcx,%r10,1),%xmm1
415	xorl	%ebp,%eax
416
417.byte	102,15,56,220,208
418.byte	0x0f,0x38,0xf1,0x44,0x24,60
419	leal	4(%r8),%eax
420.byte	102,15,56,220,216
421	xorl	%ebp,%eax
422.byte	0x0f,0x38,0xf1,0x44,0x24,76
423.byte	102,15,56,220,224
424	leal	5(%r8),%eax
425	xorl	%ebp,%eax
426.byte	102,15,56,220,232
427.byte	0x0f,0x38,0xf1,0x44,0x24,92
428	movq	%r10,%rax
429.byte	102,15,56,220,240
430.byte	102,15,56,220,248
431	movups	-16(%rcx,%r10,1),%xmm0
432
433	call	L$enc_loop6
434
# XOR the six keystream blocks with six input blocks, reload the next six
# counter blocks from the stack, reload xmm1 from key + 16, store output.
435	movdqu	(%rdi),%xmm8
436	movdqu	16(%rdi),%xmm9
437	movdqu	32(%rdi),%xmm10
438	movdqu	48(%rdi),%xmm11
439	movdqu	64(%rdi),%xmm12
440	movdqu	80(%rdi),%xmm13
441	leaq	96(%rdi),%rdi
442	movups	-64(%rcx,%r10,1),%xmm1
443	pxor	%xmm2,%xmm8
444	movaps	0(%rsp),%xmm2
445	pxor	%xmm3,%xmm9
446	movaps	16(%rsp),%xmm3
447	pxor	%xmm4,%xmm10
448	movaps	32(%rsp),%xmm4
449	pxor	%xmm5,%xmm11
450	movaps	48(%rsp),%xmm5
451	pxor	%xmm6,%xmm12
452	movaps	64(%rsp),%xmm6
453	pxor	%xmm7,%xmm13
454	movaps	80(%rsp),%xmm7
455	movdqu	%xmm8,(%rsi)
456	movdqu	%xmm9,16(%rsi)
457	movdqu	%xmm10,32(%rsi)
458	movdqu	%xmm11,48(%rsi)
459	movdqu	%xmm12,64(%rsi)
460	movdqu	%xmm13,80(%rsi)
461	leaq	96(%rsi),%rsi
462
# rdx was pre-decremented by 6, so no borrow means another full pass.
463	subq	$6,%rdx
464	jnc	L$ctr32_loop6
465
466	addq	$6,%rdx
467	jz	L$ctr32_done
468
# Leftover blocks: undo the biasing (eax back to the loop count, rcx back
# to the start of the key schedule) and finish in the tail code.
469	leal	-48(%r10),%eax
470	leaq	-80(%rcx,%r10,1),%rcx
471	negl	%eax
472	shrl	$4,%eax
473	jmp	L$ctr32_tail
474
475.p2align	5
# Eight blocks per pass (xmm2-xmm9). rcx is the key address plus 128, so
# round keys are addressed as N-128(%rcx). Each round is interleaved with
# one refreshed counter word: r9d = bswap(r8d + i) XOR ebp is written to
# word 3 of stack slot i for the following pass.
476L$ctr32_loop8:
477	addl	$8,%r8d
478	movdqa	96(%rsp),%xmm8
479.byte	102,15,56,220,209
480	movl	%r8d,%r9d
481	movdqa	112(%rsp),%xmm9
482.byte	102,15,56,220,217
483	bswapl	%r9d
484	movups	32-128(%rcx),%xmm0
485.byte	102,15,56,220,225
486	xorl	%ebp,%r9d
487	nop
488.byte	102,15,56,220,233
489	movl	%r9d,0+12(%rsp)
490	leaq	1(%r8),%r9
491.byte	102,15,56,220,241
492.byte	102,15,56,220,249
493.byte	102,68,15,56,220,193
494.byte	102,68,15,56,220,201
495	movups	48-128(%rcx),%xmm1
496	bswapl	%r9d
497.byte	102,15,56,220,208
498.byte	102,15,56,220,216
499	xorl	%ebp,%r9d
500.byte	0x66,0x90
501.byte	102,15,56,220,224
502.byte	102,15,56,220,232
503	movl	%r9d,16+12(%rsp)
504	leaq	2(%r8),%r9
505.byte	102,15,56,220,240
506.byte	102,15,56,220,248
507.byte	102,68,15,56,220,192
508.byte	102,68,15,56,220,200
509	movups	64-128(%rcx),%xmm0
510	bswapl	%r9d
511.byte	102,15,56,220,209
512.byte	102,15,56,220,217
513	xorl	%ebp,%r9d
514.byte	0x66,0x90
515.byte	102,15,56,220,225
516.byte	102,15,56,220,233
517	movl	%r9d,32+12(%rsp)
518	leaq	3(%r8),%r9
519.byte	102,15,56,220,241
520.byte	102,15,56,220,249
521.byte	102,68,15,56,220,193
522.byte	102,68,15,56,220,201
523	movups	80-128(%rcx),%xmm1
524	bswapl	%r9d
525.byte	102,15,56,220,208
526.byte	102,15,56,220,216
527	xorl	%ebp,%r9d
528.byte	0x66,0x90
529.byte	102,15,56,220,224
530.byte	102,15,56,220,232
531	movl	%r9d,48+12(%rsp)
532	leaq	4(%r8),%r9
533.byte	102,15,56,220,240
534.byte	102,15,56,220,248
535.byte	102,68,15,56,220,192
536.byte	102,68,15,56,220,200
537	movups	96-128(%rcx),%xmm0
538	bswapl	%r9d
539.byte	102,15,56,220,209
540.byte	102,15,56,220,217
541	xorl	%ebp,%r9d
542.byte	0x66,0x90
543.byte	102,15,56,220,225
544.byte	102,15,56,220,233
545	movl	%r9d,64+12(%rsp)
546	leaq	5(%r8),%r9
547.byte	102,15,56,220,241
548.byte	102,15,56,220,249
549.byte	102,68,15,56,220,193
550.byte	102,68,15,56,220,201
551	movups	112-128(%rcx),%xmm1
552	bswapl	%r9d
553.byte	102,15,56,220,208
554.byte	102,15,56,220,216
555	xorl	%ebp,%r9d
556.byte	0x66,0x90
557.byte	102,15,56,220,224
558.byte	102,15,56,220,232
559	movl	%r9d,80+12(%rsp)
560	leaq	6(%r8),%r9
561.byte	102,15,56,220,240
562.byte	102,15,56,220,248
563.byte	102,68,15,56,220,192
564.byte	102,68,15,56,220,200
565	movups	128-128(%rcx),%xmm0
566	bswapl	%r9d
567.byte	102,15,56,220,209
568.byte	102,15,56,220,217
569	xorl	%ebp,%r9d
570.byte	0x66,0x90
571.byte	102,15,56,220,225
572.byte	102,15,56,220,233
573	movl	%r9d,96+12(%rsp)
574	leaq	7(%r8),%r9
575.byte	102,15,56,220,241
576.byte	102,15,56,220,249
577.byte	102,68,15,56,220,193
578.byte	102,68,15,56,220,201
579	movups	144-128(%rcx),%xmm1
580	bswapl	%r9d
581.byte	102,15,56,220,208
582.byte	102,15,56,220,216
583.byte	102,15,56,220,224
584	xorl	%ebp,%r9d
585	movdqu	0(%rdi),%xmm10
586.byte	102,15,56,220,232
587	movl	%r9d,112+12(%rsp)
# The cmpl result is consumed by the jb further down; the aesenc and
# movups lines in between do not modify flags.
588	cmpl	$11,%eax
589.byte	102,15,56,220,240
590.byte	102,15,56,220,248
591.byte	102,68,15,56,220,192
592.byte	102,68,15,56,220,200
593	movups	160-128(%rcx),%xmm0
594
# Loop count below 11: eight rounds are done and the last two are applied
# in L$ctr32_enc_done. Any count of 11 or more runs four additional
# rounds first; there is no separate exit for an intermediate key size.
595	jb	L$ctr32_enc_done
596
597.byte	102,15,56,220,209
598.byte	102,15,56,220,217
599.byte	102,15,56,220,225
600.byte	102,15,56,220,233
601.byte	102,15,56,220,241
602.byte	102,15,56,220,249
603.byte	102,68,15,56,220,193
604.byte	102,68,15,56,220,201
605	movups	176-128(%rcx),%xmm1
606
607.byte	102,15,56,220,208
608.byte	102,15,56,220,216
609.byte	102,15,56,220,224
610.byte	102,15,56,220,232
611.byte	102,15,56,220,240
612.byte	102,15,56,220,248
613.byte	102,68,15,56,220,192
614.byte	102,68,15,56,220,200
615	movups	192-128(%rcx),%xmm0
616
617
618
619.byte	102,15,56,220,209
620.byte	102,15,56,220,217
621.byte	102,15,56,220,225
622.byte	102,15,56,220,233
623.byte	102,15,56,220,241
624.byte	102,15,56,220,249
625.byte	102,68,15,56,220,193
626.byte	102,68,15,56,220,201
627	movups	208-128(%rcx),%xmm1
628
629.byte	102,15,56,220,208
630.byte	102,15,56,220,216
631.byte	102,15,56,220,224
632.byte	102,15,56,220,232
633.byte	102,15,56,220,240
634.byte	102,15,56,220,248
635.byte	102,68,15,56,220,192
636.byte	102,68,15,56,220,200
637	movups	224-128(%rcx),%xmm0
638	jmp	L$ctr32_enc_done
639
640.p2align	4
# Final two rounds with the input XOR folded in: each input block is XORed
# with the last round key (xmm0) and the result is used as the round-key
# operand of aesenclast, so aesenclast yields the output block directly.
# Next-pass counter blocks are fetched from the stack in between.
641L$ctr32_enc_done:
642	movdqu	16(%rdi),%xmm11
643	pxor	%xmm0,%xmm10
644	movdqu	32(%rdi),%xmm12
645	pxor	%xmm0,%xmm11
646	movdqu	48(%rdi),%xmm13
647	pxor	%xmm0,%xmm12
648	movdqu	64(%rdi),%xmm14
649	pxor	%xmm0,%xmm13
650	movdqu	80(%rdi),%xmm15
651	pxor	%xmm0,%xmm14
652	pxor	%xmm0,%xmm15
653.byte	102,15,56,220,209
654.byte	102,15,56,220,217
655.byte	102,15,56,220,225
656.byte	102,15,56,220,233
657.byte	102,15,56,220,241
658.byte	102,15,56,220,249
659.byte	102,68,15,56,220,193
660.byte	102,68,15,56,220,201
661	movdqu	96(%rdi),%xmm1
662	leaq	128(%rdi),%rdi
663
664.byte	102,65,15,56,221,210
665	pxor	%xmm0,%xmm1
666	movdqu	112-128(%rdi),%xmm10
667.byte	102,65,15,56,221,219
668	pxor	%xmm0,%xmm10
669	movdqa	0(%rsp),%xmm11
670.byte	102,65,15,56,221,228
671.byte	102,65,15,56,221,237
672	movdqa	16(%rsp),%xmm12
673	movdqa	32(%rsp),%xmm13
674.byte	102,65,15,56,221,246
675.byte	102,65,15,56,221,255
676	movdqa	48(%rsp),%xmm14
677	movdqa	64(%rsp),%xmm15
678.byte	102,68,15,56,221,193
679	movdqa	80(%rsp),%xmm0
680	movups	16-128(%rcx),%xmm1
681.byte	102,69,15,56,221,202
682
# Store eight output blocks while moving the next counter blocks into
# xmm2-xmm7 (xmm8 and xmm9 are reloaded at the top of the loop).
683	movups	%xmm2,(%rsi)
684	movdqa	%xmm11,%xmm2
685	movups	%xmm3,16(%rsi)
686	movdqa	%xmm12,%xmm3
687	movups	%xmm4,32(%rsi)
688	movdqa	%xmm13,%xmm4
689	movups	%xmm5,48(%rsi)
690	movdqa	%xmm14,%xmm5
691	movups	%xmm6,64(%rsi)
692	movdqa	%xmm15,%xmm6
693	movups	%xmm7,80(%rsi)
694	movdqa	%xmm0,%xmm7
695	movups	%xmm8,96(%rsi)
696	movups	%xmm9,112(%rsi)
697	leaq	128(%rsi),%rsi
698
# rdx was pre-decremented by 8, so no borrow means another full pass.
699	subq	$8,%rdx
700	jnc	L$ctr32_loop8
701
702	addq	$8,%rdx
703	jz	L$ctr32_done
704	leaq	-128(%rcx),%rcx
705
# Tail: fewer than 8 blocks remain. Here rcx is the start of the key
# schedule, eax the loop count, xmm1 the round key at offset 16, and
# xmm2-xmm7 hold pre-whitened counter blocks. Counts below 4 use
# L$ctr32_loop3, exactly 4 uses L$ctr32_loop4, and 5 to 7 fall through.
706L$ctr32_tail:
707
708
709	leaq	16(%rcx),%rcx
710	cmpq	$4,%rdx
711	jb	L$ctr32_loop3
712	je	L$ctr32_loop4
713
714
# Five to seven blocks: run the eight-lane code with xmm8 taken from the
# slot at offset 96 and xmm9 zeroed as an unused lane. The first round on
# xmm2-xmm8 is applied here, then L$enc_loop8_enter does the rest.
715	shll	$4,%eax
716	movdqa	96(%rsp),%xmm8
717	pxor	%xmm9,%xmm9
718
719	movups	16(%rcx),%xmm0
720.byte	102,15,56,220,209
721.byte	102,15,56,220,217
722	leaq	32-16(%rcx,%rax,1),%rcx
723	negq	%rax
724.byte	102,15,56,220,225
725	addq	$16,%rax
726	movups	(%rdi),%xmm10
727.byte	102,15,56,220,233
728.byte	102,15,56,220,241
729	movups	16(%rdi),%xmm11
730	movups	32(%rdi),%xmm12
731.byte	102,15,56,220,249
732.byte	102,68,15,56,220,193
733
734	call	L$enc_loop8_enter
735
# Five blocks are always written; the sixth and seventh depend on rdx.
# The je after the sixth store still tests the flags of cmpq $6: the
# movups and xorps in between do not modify flags.
736	movdqu	48(%rdi),%xmm13
737	pxor	%xmm10,%xmm2
738	movdqu	64(%rdi),%xmm10
739	pxor	%xmm11,%xmm3
740	movdqu	%xmm2,(%rsi)
741	pxor	%xmm12,%xmm4
742	movdqu	%xmm3,16(%rsi)
743	pxor	%xmm13,%xmm5
744	movdqu	%xmm4,32(%rsi)
745	pxor	%xmm10,%xmm6
746	movdqu	%xmm5,48(%rsi)
747	movdqu	%xmm6,64(%rsi)
748	cmpq	$6,%rdx
749	jb	L$ctr32_done
750
751	movups	80(%rdi),%xmm11
752	xorps	%xmm11,%xmm7
753	movups	%xmm7,80(%rsi)
754	je	L$ctr32_done
755
756	movups	96(%rdi),%xmm12
757	xorps	%xmm12,%xmm8
758	movups	%xmm8,96(%rsi)
759	jmp	L$ctr32_done
760
761.p2align	5
# Exactly four blocks: one round per pass on xmm2-xmm5, with eax counting
# down (jnz tests ZF from decl), then aesenclast, XOR with input, store.
762L$ctr32_loop4:
763.byte	102,15,56,220,209
764	leaq	16(%rcx),%rcx
765	decl	%eax
766.byte	102,15,56,220,217
767.byte	102,15,56,220,225
768.byte	102,15,56,220,233
769	movups	(%rcx),%xmm1
770	jnz	L$ctr32_loop4
771.byte	102,15,56,221,209
772.byte	102,15,56,221,217
773	movups	(%rdi),%xmm10
774	movups	16(%rdi),%xmm11
775.byte	102,15,56,221,225
776.byte	102,15,56,221,233
777	movups	32(%rdi),%xmm12
778	movups	48(%rdi),%xmm13
779
780	xorps	%xmm10,%xmm2
781	movups	%xmm2,(%rsi)
782	xorps	%xmm11,%xmm3
783	movups	%xmm3,16(%rsi)
784	pxor	%xmm12,%xmm4
785	movdqu	%xmm4,32(%rsi)
786	pxor	%xmm13,%xmm5
787	movdqu	%xmm5,48(%rsi)
788	jmp	L$ctr32_done
789
790.p2align	5
# Fewer than four blocks: three lanes (xmm2-xmm4) are always encrypted and
# the first block is always loaded and stored; the second and third are
# written only when rdx is at least 2 or 3. The je after the second store
# still tests the flags of cmpq $2.
791L$ctr32_loop3:
792.byte	102,15,56,220,209
793	leaq	16(%rcx),%rcx
794	decl	%eax
795.byte	102,15,56,220,217
796.byte	102,15,56,220,225
797	movups	(%rcx),%xmm1
798	jnz	L$ctr32_loop3
799.byte	102,15,56,221,209
800.byte	102,15,56,221,217
801.byte	102,15,56,221,225
802
803	movups	(%rdi),%xmm10
804	xorps	%xmm10,%xmm2
805	movups	%xmm2,(%rsi)
806	cmpq	$2,%rdx
807	jb	L$ctr32_done
808
809	movups	16(%rdi),%xmm11
810	xorps	%xmm11,%xmm3
811	movups	%xmm3,16(%rsi)
812	je	L$ctr32_done
813
814	movups	32(%rdi),%xmm12
815	xorps	%xmm12,%xmm4
816	movups	%xmm4,32(%rsi)
817
# Bulk path exit: zero xmm0-xmm15, ebp and all eight stack slots, then
# restore rbp from below the entry rsp kept in r11 and restore rsp.
818L$ctr32_done:
819	xorps	%xmm0,%xmm0
820	xorl	%ebp,%ebp
821	pxor	%xmm1,%xmm1
822	pxor	%xmm2,%xmm2
823	pxor	%xmm3,%xmm3
824	pxor	%xmm4,%xmm4
825	pxor	%xmm5,%xmm5
826	pxor	%xmm6,%xmm6
827	pxor	%xmm7,%xmm7
828	movaps	%xmm0,0(%rsp)
829	pxor	%xmm8,%xmm8
830	movaps	%xmm0,16(%rsp)
831	pxor	%xmm9,%xmm9
832	movaps	%xmm0,32(%rsp)
833	pxor	%xmm10,%xmm10
834	movaps	%xmm0,48(%rsp)
835	pxor	%xmm11,%xmm11
836	movaps	%xmm0,64(%rsp)
837	pxor	%xmm12,%xmm12
838	movaps	%xmm0,80(%rsp)
839	pxor	%xmm13,%xmm13
840	movaps	%xmm0,96(%rsp)
841	pxor	%xmm14,%xmm14
842	movaps	%xmm0,112(%rsp)
843	pxor	%xmm15,%xmm15
844	movq	-8(%r11),%rbp
845
846	leaq	(%r11),%rsp
847
# Shared return point; the single-block path jumps here directly.
848L$ctr32_epilogue:
849	.byte	0xf3,0xc3
850
851
# _aes_hw_set_encrypt_key (second label __aesni_set_encrypt_key): expand a
# user key into an AES encryption key schedule.
# Registers on entry, as used by the code below:
#   rdi - address of the user key (16 bytes are read, plus 16 more on the
#         256-bit path)
#   esi - key length in bits; only 128 and 256 are accepted
#   rdx - address of the output key schedule
# Result in rax: 0 on success, -1 when rdi or rdx is zero, -2 for any
# other bit length.
# On success the round keys are written from offset 0 of the schedule in
# 16-byte steps, and the value 9 (128-bit key) or 13 (256-bit key) is
# stored as a 32-bit word at byte offset 240.
# Clobbers rax, esi, r10, flags and xmm0-xmm5; xmm0-xmm5 are zeroed on
# every return path.
# Each key size has two implementations. One uses aeskeygenassist plus the
# L$key_expansion_* helpers placed after this function. The other uses
# pshufb and aesenclast with the L$key_rotate and L$key_rcon1 constants;
# it is taken when the second word of _OPENSSL_ia32cap_P masked with
# 268437504 (0x10000800, bits 11 and 28) equals 268435456 (bit 28 only).
# NOTE(review): in OpenSSL-style capability vectors bit 28 of that word is
# AVX and bit 11 is an AMD XOP flag - confirm against the definition of
# _OPENSSL_ia32cap_P used by this project.
# Hand-encoded instructions used here:
#   subq $8,%rsp                    <- 0x48,0x83,0xEC,0x08
#   aeskeygenassist $N,%xmm0,%xmm1  <- 102,15,58,223,200,N
#   aeskeygenassist $N,%xmm2,%xmm1  <- 102,15,58,223,202,N
#   pshufb %xmm5,%xmm0              <- 102,15,56,0,197
#   pshufb %xmm5,%xmm2              <- 102,15,56,0,213
#   aesenclast %xmm4,%xmm0          <- 102,15,56,221,196
#   aesenclast %xmm4,%xmm2          <- 102,15,56,221,212
#   aesenclast %xmm3,%xmm2          <- 102,15,56,221,211
#   rep ret                         <- 0xf3,0xc3
852.globl	_aes_hw_set_encrypt_key
853.private_extern _aes_hw_set_encrypt_key
854
855.p2align	4
856_aes_hw_set_encrypt_key:
857__aesni_set_encrypt_key:
858
859.byte	0x48,0x83,0xEC,0x08
860
# Null checks: rax is preloaded with -1 as the failure result.
861	movq	$-1,%rax
862	testq	%rdi,%rdi
863	jz	L$enc_key_ret
864	testq	%rdx,%rdx
865	jz	L$enc_key_ret
866
# xmm0 = first 16 key bytes, xmm4 = 0 (scratch expected by the helpers),
# r10d = masked capability bits, rax = address of the second round key.
867	movups	(%rdi),%xmm0
868	xorps	%xmm4,%xmm4
869	leaq	_OPENSSL_ia32cap_P(%rip),%r10
870	movl	4(%r10),%r10d
871	andl	$268437504,%r10d
872	leaq	16(%rdx),%rax
873	cmpl	$256,%esi
874	je	L$14rounds
875
876	cmpl	$128,%esi
877	jne	L$bad_keybits
878
# 128-bit key. The immediates 1,2,4,...,128,27,54 are the round constants
# passed to aeskeygenassist. The first call uses the _cold entry, which
# skips the store because the user key itself was just written at (%rdx).
879L$10rounds:
880	movl	$9,%esi
881	cmpl	$268435456,%r10d
882	je	L$10rounds_alt
883
884	movups	%xmm0,(%rdx)
885.byte	102,15,58,223,200,1
886	call	L$key_expansion_128_cold
887.byte	102,15,58,223,200,2
888	call	L$key_expansion_128
889.byte	102,15,58,223,200,4
890	call	L$key_expansion_128
891.byte	102,15,58,223,200,8
892	call	L$key_expansion_128
893.byte	102,15,58,223,200,16
894	call	L$key_expansion_128
895.byte	102,15,58,223,200,32
896	call	L$key_expansion_128
897.byte	102,15,58,223,200,64
898	call	L$key_expansion_128
899.byte	102,15,58,223,200,128
900	call	L$key_expansion_128
901.byte	102,15,58,223,200,27
902	call	L$key_expansion_128
903.byte	102,15,58,223,200,54
904	call	L$key_expansion_128
# Store the last round key; rax is then 80 bytes short of offset 240,
# where the value 9 is written. Return 0.
905	movups	%xmm0,(%rax)
906	movl	%esi,80(%rax)
907	xorl	%eax,%eax
908	jmp	L$enc_key_ret
909
910.p2align	4
# Alternative 128-bit path. xmm5 = byte-shuffle mask, xmm4 = round
# constant vector (doubled with pslld after each use), xmm2 = copy of the
# previous round key.
911L$10rounds_alt:
912	movdqa	L$key_rotate(%rip),%xmm5
913	movl	$8,%r10d
914	movdqa	L$key_rcon1(%rip),%xmm4
915	movdqa	%xmm0,%xmm2
916	movdqu	%xmm0,(%rdx)
917	jmp	L$oop_key128
918
919.p2align	4
# Eight passes. pshufb fills every word of xmm0 with a byte-rotated copy
# of its last word; aesenclast then substitutes those bytes and XORs in
# the round constant (all four columns are equal, so its row shift has no
# effect). The pslldq/pxor ladder turns xmm2 into the running XOR of the
# previous round key words, which is combined into the new key.
920L$oop_key128:
921.byte	102,15,56,0,197
922.byte	102,15,56,221,196
923	pslld	$1,%xmm4
924	leaq	16(%rax),%rax
925
926	movdqa	%xmm2,%xmm3
927	pslldq	$4,%xmm2
928	pxor	%xmm2,%xmm3
929	pslldq	$4,%xmm2
930	pxor	%xmm2,%xmm3
931	pslldq	$4,%xmm2
932	pxor	%xmm3,%xmm2
933
934	pxor	%xmm2,%xmm0
935	movdqu	%xmm0,-16(%rax)
936	movdqa	%xmm0,%xmm2
937
938	decl	%r10d
939	jnz	L$oop_key128
940
# The last two round keys use the constant from L$key_rcon1b (0x1b) and
# its doubled value instead of continuing to double the first constant.
941	movdqa	L$key_rcon1b(%rip),%xmm4
942
943.byte	102,15,56,0,197
944.byte	102,15,56,221,196
945	pslld	$1,%xmm4
946
947	movdqa	%xmm2,%xmm3
948	pslldq	$4,%xmm2
949	pxor	%xmm2,%xmm3
950	pslldq	$4,%xmm2
951	pxor	%xmm2,%xmm3
952	pslldq	$4,%xmm2
953	pxor	%xmm3,%xmm2
954
955	pxor	%xmm2,%xmm0
956	movdqu	%xmm0,(%rax)
957
958	movdqa	%xmm0,%xmm2
959.byte	102,15,56,0,197
960.byte	102,15,56,221,196
961
962	movdqa	%xmm2,%xmm3
963	pslldq	$4,%xmm2
964	pxor	%xmm2,%xmm3
965	pslldq	$4,%xmm2
966	pxor	%xmm2,%xmm3
967	pslldq	$4,%xmm2
968	pxor	%xmm3,%xmm2
969
970	pxor	%xmm2,%xmm0
971	movdqu	%xmm0,16(%rax)
972
# Here rax is 96 bytes short of offset 240. Return 0.
973	movl	%esi,96(%rax)
974	xorl	%eax,%eax
975	jmp	L$enc_key_ret
976
977
978
979.p2align	4
# 256-bit key. xmm2 = second 16 key bytes; rax now addresses the third
# round key slot. The two halves of the key are expanded alternately by
# the 256a and 256b helpers, which share each round constant.
980L$14rounds:
981	movups	16(%rdi),%xmm2
982	movl	$13,%esi
983	leaq	16(%rax),%rax
984	cmpl	$268435456,%r10d
985	je	L$14rounds_alt
986
987	movups	%xmm0,(%rdx)
988	movups	%xmm2,16(%rdx)
989.byte	102,15,58,223,202,1
990	call	L$key_expansion_256a_cold
991.byte	102,15,58,223,200,1
992	call	L$key_expansion_256b
993.byte	102,15,58,223,202,2
994	call	L$key_expansion_256a
995.byte	102,15,58,223,200,2
996	call	L$key_expansion_256b
997.byte	102,15,58,223,202,4
998	call	L$key_expansion_256a
999.byte	102,15,58,223,200,4
1000	call	L$key_expansion_256b
1001.byte	102,15,58,223,202,8
1002	call	L$key_expansion_256a
1003.byte	102,15,58,223,200,8
1004	call	L$key_expansion_256b
1005.byte	102,15,58,223,202,16
1006	call	L$key_expansion_256a
1007.byte	102,15,58,223,200,16
1008	call	L$key_expansion_256b
1009.byte	102,15,58,223,202,32
1010	call	L$key_expansion_256a
1011.byte	102,15,58,223,200,32
1012	call	L$key_expansion_256b
1013.byte	102,15,58,223,202,64
1014	call	L$key_expansion_256a
# Store the last round key; rax is then 16 bytes short of offset 240,
# where the value 13 is written. Return 0.
1015	movups	%xmm0,(%rax)
1016	movl	%esi,16(%rax)
1017	xorq	%rax,%rax
1018	jmp	L$enc_key_ret
1019
1020.p2align	4
# Alternative 256-bit path. xmm5 = byte-shuffle mask, xmm4 = round
# constant vector, xmm0 and xmm1 = the two most recent round keys.
1021L$14rounds_alt:
1022	movdqa	L$key_rotate(%rip),%xmm5
1023	movdqa	L$key_rcon1(%rip),%xmm4
1024	movl	$7,%r10d
1025	movdqu	%xmm0,0(%rdx)
1026	movdqa	%xmm2,%xmm1
1027	movdqu	%xmm2,16(%rdx)
1028	jmp	L$oop_key256
1029
1030.p2align	4
# Seven passes. Each pass first derives an even-numbered round key in xmm0
# (rotate + substitute + round constant, as in the 128-bit alternative
# path). Unless it was the last pass, it then derives the following round
# key in xmm2 from a broadcast of the last word of xmm0, substituted by
# aesenclast with an all-zero round key (no rotation, no constant).
1031L$oop_key256:
1032.byte	102,15,56,0,213
1033.byte	102,15,56,221,212
1034
1035	movdqa	%xmm0,%xmm3
1036	pslldq	$4,%xmm0
1037	pxor	%xmm0,%xmm3
1038	pslldq	$4,%xmm0
1039	pxor	%xmm0,%xmm3
1040	pslldq	$4,%xmm0
1041	pxor	%xmm3,%xmm0
1042	pslld	$1,%xmm4
1043
1044	pxor	%xmm2,%xmm0
1045	movdqu	%xmm0,(%rax)
1046
1047	decl	%r10d
1048	jz	L$done_key256
1049
1050	pshufd	$0xff,%xmm0,%xmm2
1051	pxor	%xmm3,%xmm3
1052.byte	102,15,56,221,211
1053
1054	movdqa	%xmm1,%xmm3
1055	pslldq	$4,%xmm1
1056	pxor	%xmm1,%xmm3
1057	pslldq	$4,%xmm1
1058	pxor	%xmm1,%xmm3
1059	pslldq	$4,%xmm1
1060	pxor	%xmm3,%xmm1
1061
1062	pxor	%xmm1,%xmm2
1063	movdqu	%xmm2,16(%rax)
1064	leaq	32(%rax),%rax
1065	movdqa	%xmm2,%xmm1
1066
1067	jmp	L$oop_key256
1068
# Here rax is 16 bytes short of offset 240. Return 0.
1069L$done_key256:
1070	movl	%esi,16(%rax)
1071	xorl	%eax,%eax
1072	jmp	L$enc_key_ret
1073
1074.p2align	4
# Unsupported bit length: return -2.
1075L$bad_keybits:
1076	movq	$-2,%rax
# Common exit: wipe xmm0-xmm5 and undo the 8-byte stack adjustment made
# at entry.
1077L$enc_key_ret:
1078	pxor	%xmm0,%xmm0
1079	pxor	%xmm1,%xmm1
1080	pxor	%xmm2,%xmm2
1081	pxor	%xmm3,%xmm3
1082	pxor	%xmm4,%xmm4
1083	pxor	%xmm5,%xmm5
1084	addq	$8,%rsp
1085
1086	.byte	0xf3,0xc3
1087
1088L$SEH_end_set_encrypt_key:
1089
# L$key_expansion_128: helper called from the 128-bit path of
# _aes_hw_set_encrypt_key.
# Inputs: xmm0 = most recent round key, xmm1 = aeskeygenassist result,
#         xmm4 = scratch whose lowest word must be zero (the caller zeroes
#         xmm4 once and this code keeps that word zero),
#         rax  = address where xmm0 is to be stored.
# Stores xmm0 at (%rax), advances rax by 16, then replaces xmm0 with the
# next round key: the running XOR of its four words, XORed with word 3 of
# xmm1 broadcast to all lanes. Also overwrites xmm1 and xmm4.
# The _cold entry performs the same computation without the store.
1090.p2align	4
1091L$key_expansion_128:
1092	movups	%xmm0,(%rax)
1093	leaq	16(%rax),%rax
1094L$key_expansion_128_cold:
1095	shufps	$16,%xmm0,%xmm4
1096	xorps	%xmm4,%xmm0
1097	shufps	$140,%xmm0,%xmm4
1098	xorps	%xmm4,%xmm0
1099	shufps	$255,%xmm1,%xmm1
1100	xorps	%xmm1,%xmm0
1101	.byte	0xf3,0xc3
1102
# L$key_expansion_192a / _192a_cold / _192b_warm / L$key_expansion_192b:
# key-expansion helpers operating on xmm0 and xmm2 with xmm1 as the
# aeskeygenassist result (word 1 of xmm1 is broadcast by pshufd $85).
# No call or jump into this group from outside it is visible in this
# file: _aes_hw_set_encrypt_key only dispatches 128-bit and 256-bit keys.
# NOTE(review): presumably left over from 192-bit key support - confirm
# before relying on or removing it.
# Entry 192a stores xmm0 at (%rax) and advances rax by 16; entry 192b
# stores two 16-byte values assembled from xmm5, xmm0 and xmm2 and
# advances rax by 32, then continues at 192b_warm. All entries overwrite
# xmm3, xmm4 and xmm1; 192a also copies xmm2 into xmm5.
1103.p2align	4
1104L$key_expansion_192a:
1105	movups	%xmm0,(%rax)
1106	leaq	16(%rax),%rax
1107L$key_expansion_192a_cold:
1108	movaps	%xmm2,%xmm5
1109L$key_expansion_192b_warm:
1110	shufps	$16,%xmm0,%xmm4
1111	movdqa	%xmm2,%xmm3
1112	xorps	%xmm4,%xmm0
1113	shufps	$140,%xmm0,%xmm4
1114	pslldq	$4,%xmm3
1115	xorps	%xmm4,%xmm0
1116	pshufd	$85,%xmm1,%xmm1
1117	pxor	%xmm3,%xmm2
1118	pxor	%xmm1,%xmm0
1119	pshufd	$255,%xmm0,%xmm3
1120	pxor	%xmm3,%xmm2
1121	.byte	0xf3,0xc3
1122
1123.p2align	4
1124L$key_expansion_192b:
1125	movaps	%xmm0,%xmm3
1126	shufps	$68,%xmm0,%xmm5
1127	movups	%xmm5,(%rax)
1128	shufps	$78,%xmm2,%xmm3
1129	movups	%xmm3,16(%rax)
1130	leaq	32(%rax),%rax
1131	jmp	L$key_expansion_192b_warm
1132
# L$key_expansion_256a: helper called from the 256-bit path of
# _aes_hw_set_encrypt_key.
# Inputs: xmm0 = round key from two steps back, xmm2 = most recent round
#         key, xmm1 = aeskeygenassist result computed from xmm2,
#         xmm4 = scratch whose lowest word must be zero,
#         rax  = address where xmm2 is to be stored.
# Stores xmm2 at (%rax), advances rax by 16, then replaces xmm0 with the
# next round key: the running XOR of its four words, XORed with word 3 of
# xmm1 broadcast to all lanes. Also overwrites xmm1 and xmm4.
# The _cold entry performs the same computation without the store.
1133.p2align	4
1134L$key_expansion_256a:
1135	movups	%xmm2,(%rax)
1136	leaq	16(%rax),%rax
1137L$key_expansion_256a_cold:
1138	shufps	$16,%xmm0,%xmm4
1139	xorps	%xmm4,%xmm0
1140	shufps	$140,%xmm0,%xmm4
1141	xorps	%xmm4,%xmm0
1142	shufps	$255,%xmm1,%xmm1
1143	xorps	%xmm1,%xmm0
1144	.byte	0xf3,0xc3
1145
# L$key_expansion_256b: helper called from the 256-bit path of
# _aes_hw_set_encrypt_key, alternating with L$key_expansion_256a.
# Inputs: xmm0 = most recent round key, xmm2 = round key from two steps
#         back, xmm1 = aeskeygenassist result computed from xmm0,
#         xmm4 = scratch whose lowest word must be zero,
#         rax  = address where xmm0 is to be stored.
# Stores xmm0 at (%rax), advances rax by 16, then replaces xmm2 with the
# next round key: the running XOR of its four words, XORed with word 2 of
# xmm1 broadcast to all lanes (shufps $170). Also overwrites xmm1 and
# xmm4.
1146.p2align	4
1147L$key_expansion_256b:
1148	movups	%xmm0,(%rax)
1149	leaq	16(%rax),%rax
1150
1151	shufps	$16,%xmm2,%xmm4
1152	xorps	%xmm4,%xmm2
1153	shufps	$140,%xmm2,%xmm4
1154	xorps	%xmm4,%xmm2
1155	shufps	$170,%xmm1,%xmm1
1156	xorps	%xmm1,%xmm2
1157	.byte	0xf3,0xc3
1158
1159
# Constant table, aligned to 64 bytes and emitted in the same section as
# the code above (no section directive precedes it).
# Referenced by code visible in this file: L$key_rotate, L$key_rcon1 and
# L$key_rcon1b, all from the alternative key-expansion paths of
# _aes_hw_set_encrypt_key (loaded with movdqa, so their 16-byte alignment
# matters). No reference to L$bswap_mask, L$increment32, L$increment64,
# L$increment1 or L$key_rotate192 is visible in this file.
1160.p2align	6
# Byte indices 15 down to 0: a full 16-byte reversal mask.
1161L$bswap_mask:
1162.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1163L$increment32:
1164.long	6,6,6,0
1165L$increment64:
1166.long	1,0,0,0
1167L$increment1:
1168.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
# Shuffle mask for pshufb: every output word takes source bytes
# 13,14,15,12, that is the last word rotated by one byte.
1169L$key_rotate:
1170.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
1171L$key_rotate192:
1172.long	0x04070605,0x04070605,0x04070605,0x04070605
# Round constant vectors: the starting value 1 (doubled by the code after
# each use) and 0x1b for the final two 128-bit round keys.
1173L$key_rcon1:
1174.long	1,1,1,1
1175L$key_rcon1b:
1176.long	0x1b,0x1b,0x1b,0x1b
1177
# NUL-terminated ASCII identification string:
# "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>"
1178.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1179.p2align	6
1180#endif
1181