• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
# Build guards: under MemorySanitizer, OPENSSL_NO_ASM is defined (unless it
# already is), and the x86-64 guard further down then excludes the assembly.
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
# The code from here on is assembled only for x86-64 targets when
# OPENSSL_NO_ASM is not defined.
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11.text
12
# _GFp_aes_hw_encrypt: encrypt one 16-byte block with AES-NI.
# Register use, as read from the code below:
#   rdi  address of the 16-byte input block
#   rsi  address where the 16-byte result is stored
#   rdx  address of the round-key array; the 32-bit loop count is read
#        from offset 240 of that array
# Clobbers eax, rdx, flags and xmm0-xmm2; the three xmm registers are zeroed
# before returning.
# Hand-encoded instructions:
#   aesenc %xmm1,%xmm2      is .byte 102,15,56,220,209
#   aesenclast %xmm1,%xmm2  is .byte 102,15,56,221,209
#   rep ret                 is .byte 0xf3,0xc3
13.globl	_GFp_aes_hw_encrypt
14.private_extern _GFp_aes_hw_encrypt
15
16.p2align	4
17_GFp_aes_hw_encrypt:
18
# Load the block, the loop count and round keys 0 and 1, then XOR round
# key 0 into the block.
19	movups	(%rdi),%xmm2
20	movl	240(%rdx),%eax
21	movups	(%rdx),%xmm0
22	movups	16(%rdx),%xmm1
23	leaq	32(%rdx),%rdx
24	xorps	%xmm0,%xmm2
# One aesenc per iteration. decl sets ZF; the movups and leaq after it do not
# modify flags, so the jnz tests the counter.
25L$oop_enc1_1:
26.byte	102,15,56,220,209
27	decl	%eax
28	movups	(%rdx),%xmm1
29	leaq	16(%rdx),%rdx
30	jnz	L$oop_enc1_1
# Final round with the key loaded by the last loop iteration, then wipe the
# key registers, store the result and wipe the data register.
31.byte	102,15,56,221,209
32	pxor	%xmm0,%xmm0
33	pxor	%xmm1,%xmm1
34	movups	%xmm2,(%rsi)
35	pxor	%xmm2,%xmm2
36	.byte	0xf3,0xc3
37
38
39
# _aesni_encrypt2: internal helper that encrypts two blocks in parallel.
#   In   rcx = round-key array, eax = number of aesenc rounds (one extra
#        aesenclast round follows), xmm2 and xmm3 = the two blocks
#   Out  xmm2 and xmm3 = encrypted blocks
# Clobbers rax, rcx, xmm0, xmm1 and flags.
# Not referenced by name elsewhere in the visible part of this file.
# The .byte 102,15,56,220,modrm lines are aesenc and the 102,15,56,221,modrm
# lines are aesenclast; modrm selects the block register and whether the
# round key comes from xmm0 or xmm1. The .byte 0xf3,0xc3 is rep ret.
40.p2align	4
41_aesni_encrypt2:
42
# XOR round key 0 into both blocks. eax is scaled to a byte count, rcx is
# moved to the end of the key array and rax becomes a negative byte index
# that counts up towards zero.
43	movups	(%rcx),%xmm0
44	shll	$4,%eax
45	movups	16(%rcx),%xmm1
46	xorps	%xmm0,%xmm2
47	xorps	%xmm0,%xmm3
48	movups	32(%rcx),%xmm0
49	leaq	32(%rcx,%rax,1),%rcx
50	negq	%rax
51	addq	$16,%rax
52
# Two rounds per iteration, alternating between xmm1 and xmm0 as the round
# key. addq sets ZF; the aesenc and movups after it leave flags untouched,
# so the jnz tests the index.
53L$enc_loop2:
54.byte	102,15,56,220,209
55.byte	102,15,56,220,217
56	movups	(%rcx,%rax,1),%xmm1
57	addq	$32,%rax
58.byte	102,15,56,220,208
59.byte	102,15,56,220,216
60	movups	-16(%rcx,%rax,1),%xmm0
61	jnz	L$enc_loop2
62
# Last aesenc round with xmm1, then aesenclast with xmm0.
63.byte	102,15,56,220,209
64.byte	102,15,56,220,217
65.byte	102,15,56,221,208
66.byte	102,15,56,221,216
67	.byte	0xf3,0xc3
68
69
70
# _aesni_encrypt3: internal helper that encrypts three blocks in parallel.
#   In   rcx = round-key array, eax = number of aesenc rounds (one extra
#        aesenclast round follows), xmm2-xmm4 = the three blocks
#   Out  xmm2-xmm4 = encrypted blocks
# Clobbers rax, rcx, xmm0, xmm1 and flags.
# Not referenced by name elsewhere in the visible part of this file.
# The .byte 102,15,56,220,modrm lines are aesenc, 102,15,56,221,modrm are
# aesenclast and .byte 0xf3,0xc3 is rep ret.
71.p2align	4
72_aesni_encrypt3:
73
# XOR round key 0 into the blocks; turn eax into a byte count, point rcx at
# the end of the key array and make rax a negative index counting up to zero.
74	movups	(%rcx),%xmm0
75	shll	$4,%eax
76	movups	16(%rcx),%xmm1
77	xorps	%xmm0,%xmm2
78	xorps	%xmm0,%xmm3
79	xorps	%xmm0,%xmm4
80	movups	32(%rcx),%xmm0
81	leaq	32(%rcx,%rax,1),%rcx
82	negq	%rax
83	addq	$16,%rax
84
# Two rounds per iteration (key in xmm1, then key in xmm0). The jnz uses the
# ZF produced by addq; the instructions in between do not modify flags.
85L$enc_loop3:
86.byte	102,15,56,220,209
87.byte	102,15,56,220,217
88.byte	102,15,56,220,225
89	movups	(%rcx,%rax,1),%xmm1
90	addq	$32,%rax
91.byte	102,15,56,220,208
92.byte	102,15,56,220,216
93.byte	102,15,56,220,224
94	movups	-16(%rcx,%rax,1),%xmm0
95	jnz	L$enc_loop3
96
# Last aesenc round with xmm1, then aesenclast with xmm0.
97.byte	102,15,56,220,209
98.byte	102,15,56,220,217
99.byte	102,15,56,220,225
100.byte	102,15,56,221,208
101.byte	102,15,56,221,216
102.byte	102,15,56,221,224
103	.byte	0xf3,0xc3
104
105
106
# _aesni_encrypt4: internal helper that encrypts four blocks in parallel.
#   In   rcx = round-key array, eax = number of aesenc rounds (one extra
#        aesenclast round follows), xmm2-xmm5 = the four blocks
#   Out  xmm2-xmm5 = encrypted blocks
# Clobbers rax, rcx, xmm0, xmm1 and flags.
# Not referenced by name elsewhere in the visible part of this file.
# The .byte 102,15,56,220,modrm lines are aesenc, 102,15,56,221,modrm are
# aesenclast, .byte 0x0f,0x1f,0x00 is a three-byte nop and .byte 0xf3,0xc3
# is rep ret.
107.p2align	4
108_aesni_encrypt4:
109
# XOR round key 0 into the blocks; turn eax into a byte count, point rcx at
# the end of the key array and make rax a negative index counting up to zero.
110	movups	(%rcx),%xmm0
111	shll	$4,%eax
112	movups	16(%rcx),%xmm1
113	xorps	%xmm0,%xmm2
114	xorps	%xmm0,%xmm3
115	xorps	%xmm0,%xmm4
116	xorps	%xmm0,%xmm5
117	movups	32(%rcx),%xmm0
118	leaq	32(%rcx,%rax,1),%rcx
119	negq	%rax
120.byte	0x0f,0x1f,0x00
121	addq	$16,%rax
122
# Two rounds per iteration (key in xmm1, then key in xmm0). The jnz uses the
# ZF produced by addq; the instructions in between do not modify flags.
123L$enc_loop4:
124.byte	102,15,56,220,209
125.byte	102,15,56,220,217
126.byte	102,15,56,220,225
127.byte	102,15,56,220,233
128	movups	(%rcx,%rax,1),%xmm1
129	addq	$32,%rax
130.byte	102,15,56,220,208
131.byte	102,15,56,220,216
132.byte	102,15,56,220,224
133.byte	102,15,56,220,232
134	movups	-16(%rcx,%rax,1),%xmm0
135	jnz	L$enc_loop4
136
# Last aesenc round with xmm1, then aesenclast with xmm0.
137.byte	102,15,56,220,209
138.byte	102,15,56,220,217
139.byte	102,15,56,220,225
140.byte	102,15,56,220,233
141.byte	102,15,56,221,208
142.byte	102,15,56,221,216
143.byte	102,15,56,221,224
144.byte	102,15,56,221,232
145	.byte	0xf3,0xc3
146
147
148
# _aesni_encrypt6: internal helper that encrypts six blocks in parallel.
#   In   rcx = round-key array, eax = number of aesenc rounds (one extra
#        aesenclast round follows), xmm2-xmm7 = the six blocks
#   Out  xmm2-xmm7 = encrypted blocks
# Clobbers rax, rcx, xmm0, xmm1 and flags.
# The inner label L$enc_loop6 is also a call target: the CTR code in this
# file calls it directly after setting up rcx, rax, xmm0 and xmm1 itself.
# The .byte 102,15,56,220,modrm lines are aesenc, 102,15,56,221,modrm are
# aesenclast and .byte 0xf3,0xc3 is rep ret.
149.p2align	4
150_aesni_encrypt6:
151
# Round key 0 is XORed into each block, interleaved with the first aesenc
# (key in xmm1) on xmm2-xmm4. rcx is moved to the end of the key array and
# rax becomes a negative byte index counting up to zero.
152	movups	(%rcx),%xmm0
153	shll	$4,%eax
154	movups	16(%rcx),%xmm1
155	xorps	%xmm0,%xmm2
156	pxor	%xmm0,%xmm3
157	pxor	%xmm0,%xmm4
158.byte	102,15,56,220,209
159	leaq	32(%rcx,%rax,1),%rcx
160	negq	%rax
161.byte	102,15,56,220,217
162	pxor	%xmm0,%xmm5
163	pxor	%xmm0,%xmm6
164.byte	102,15,56,220,225
165	pxor	%xmm0,%xmm7
166	movups	(%rcx,%rax,1),%xmm0
167	addq	$16,%rax
# Enter the loop in the middle, because the first round on xmm2-xmm4 has
# already been issued above.
168	jmp	L$enc_loop6_enter
169.p2align	4
# Two rounds per iteration (key in xmm1, then key in xmm0). The jnz uses the
# ZF produced by addq; the instructions in between do not modify flags.
170L$enc_loop6:
171.byte	102,15,56,220,209
172.byte	102,15,56,220,217
173.byte	102,15,56,220,225
174L$enc_loop6_enter:
175.byte	102,15,56,220,233
176.byte	102,15,56,220,241
177.byte	102,15,56,220,249
178	movups	(%rcx,%rax,1),%xmm1
179	addq	$32,%rax
180.byte	102,15,56,220,208
181.byte	102,15,56,220,216
182.byte	102,15,56,220,224
183.byte	102,15,56,220,232
184.byte	102,15,56,220,240
185.byte	102,15,56,220,248
186	movups	-16(%rcx,%rax,1),%xmm0
187	jnz	L$enc_loop6
188
# Last aesenc round with xmm1, then aesenclast with xmm0.
189.byte	102,15,56,220,209
190.byte	102,15,56,220,217
191.byte	102,15,56,220,225
192.byte	102,15,56,220,233
193.byte	102,15,56,220,241
194.byte	102,15,56,220,249
195.byte	102,15,56,221,208
196.byte	102,15,56,221,216
197.byte	102,15,56,221,224
198.byte	102,15,56,221,232
199.byte	102,15,56,221,240
200.byte	102,15,56,221,248
201	.byte	0xf3,0xc3
202
203
204
# _aesni_encrypt8: internal helper that encrypts eight blocks in parallel.
#   In   rcx = round-key array, eax = number of aesenc rounds (one extra
#        aesenclast round follows), xmm2-xmm9 = the eight blocks
#   Out  xmm2-xmm9 = encrypted blocks
# Clobbers rax, rcx, xmm0, xmm1 and flags.
# The inner label L$enc_loop8_enter is also a call target: the CTR tail code
# in this file calls it directly after issuing the first round itself.
# The .byte 102,15,56,220,modrm lines are aesenc and 102,15,56,221,modrm are
# aesenclast; the forms with an extra 68 byte operate on xmm8 and xmm9.
# The .byte 0xf3,0xc3 is rep ret.
205.p2align	4
206_aesni_encrypt8:
207
# Round key 0 is XORed into each block, interleaved with the first aesenc
# (key in xmm1) on xmm2 and xmm3. rcx is moved to the end of the key array
# and rax becomes a negative byte index counting up to zero.
208	movups	(%rcx),%xmm0
209	shll	$4,%eax
210	movups	16(%rcx),%xmm1
211	xorps	%xmm0,%xmm2
212	xorps	%xmm0,%xmm3
213	pxor	%xmm0,%xmm4
214	pxor	%xmm0,%xmm5
215	pxor	%xmm0,%xmm6
216	leaq	32(%rcx,%rax,1),%rcx
217	negq	%rax
218.byte	102,15,56,220,209
219	pxor	%xmm0,%xmm7
220	pxor	%xmm0,%xmm8
221.byte	102,15,56,220,217
222	pxor	%xmm0,%xmm9
223	movups	(%rcx,%rax,1),%xmm0
224	addq	$16,%rax
# Enter the loop after the two aesenc lines that were already issued above.
225	jmp	L$enc_loop8_inner
226.p2align	4
# Two rounds per iteration (key in xmm1, then key in xmm0). The jnz uses the
# ZF produced by addq; the instructions in between do not modify flags.
227L$enc_loop8:
228.byte	102,15,56,220,209
229.byte	102,15,56,220,217
230L$enc_loop8_inner:
231.byte	102,15,56,220,225
232.byte	102,15,56,220,233
233.byte	102,15,56,220,241
234.byte	102,15,56,220,249
235.byte	102,68,15,56,220,193
236.byte	102,68,15,56,220,201
237L$enc_loop8_enter:
238	movups	(%rcx,%rax,1),%xmm1
239	addq	$32,%rax
240.byte	102,15,56,220,208
241.byte	102,15,56,220,216
242.byte	102,15,56,220,224
243.byte	102,15,56,220,232
244.byte	102,15,56,220,240
245.byte	102,15,56,220,248
246.byte	102,68,15,56,220,192
247.byte	102,68,15,56,220,200
248	movups	-16(%rcx,%rax,1),%xmm0
249	jnz	L$enc_loop8
250
# Last aesenc round with xmm1, then aesenclast with xmm0.
251.byte	102,15,56,220,209
252.byte	102,15,56,220,217
253.byte	102,15,56,220,225
254.byte	102,15,56,220,233
255.byte	102,15,56,220,241
256.byte	102,15,56,220,249
257.byte	102,68,15,56,220,193
258.byte	102,68,15,56,220,201
259.byte	102,15,56,221,208
260.byte	102,15,56,221,216
261.byte	102,15,56,221,224
262.byte	102,15,56,221,232
263.byte	102,15,56,221,240
264.byte	102,15,56,221,248
265.byte	102,68,15,56,221,192
266.byte	102,68,15,56,221,200
267	.byte	0xf3,0xc3
268
269
# _GFp_aes_hw_ctr32_encrypt_blocks: CTR-mode keystream XOR using AES-NI.
# Arguments, as read from the code below:
#   rdi  input data
#   rsi  output data
#   rdx  number of 16-byte blocks
#   rcx  round-key array, with the round count at offset 240
#   r8   the 16-byte counter block; it is only read, never written back
# The last four bytes of the counter block are handled as a big-endian
# counter of 32 bits: they are byte-swapped, incremented per block and
# swapped back (or stored with movbe) when each counter block is built.
# The bulk path keeps eight counter blocks, already XORed with round key 0,
# in a 16-byte-aligned scratch area of 128 bytes on the stack. r11 holds the
# entry value of rsp, and rbp is saved on entry to the bulk path and restored
# before returning.
# NOTE(review): a block count of zero is not special-cased; it takes the
# bulk path and the short tail still stores one block. Presumably callers
# never pass zero - confirm.
# Hand-encoded instructions used below:
#   aesenc      is .byte 102,15,56,220,modrm (102,68,... for xmm8 and xmm9)
#   aesenclast  is .byte 102,15,56,221,modrm (102,65 / 68 / 69 with xmm8-15)
#   pinsrd $3   is .byte 102,15,58,34,modrm,3
#   movbe %eax,N(%rsp) is .byte 0x0f,0x38,0xf1,0x44,0x24,N
#   two-byte nop is .byte 0x66,0x90 and rep ret is .byte 0xf3,0xc3
270.globl	_GFp_aes_hw_ctr32_encrypt_blocks
271.private_extern _GFp_aes_hw_ctr32_encrypt_blocks
272
273.p2align	4
274_GFp_aes_hw_ctr32_encrypt_blocks:
275
276	cmpq	$1,%rdx
277	jne	L$ctr32_bulk
278
279
280
# Single-block case: encrypt the counter block with a one-block loop (edx is
# reused as the round counter), XOR it with the input block, store it, and
# wipe the xmm registers used. No stack frame is set up on this path.
281	movups	(%r8),%xmm2
282	movups	(%rdi),%xmm3
283	movl	240(%rcx),%edx
284	movups	(%rcx),%xmm0
285	movups	16(%rcx),%xmm1
286	leaq	32(%rcx),%rcx
287	xorps	%xmm0,%xmm2
288L$oop_enc1_2:
289.byte	102,15,56,220,209
290	decl	%edx
291	movups	(%rcx),%xmm1
292	leaq	16(%rcx),%rcx
293	jnz	L$oop_enc1_2
294.byte	102,15,56,221,209
295	pxor	%xmm0,%xmm0
296	pxor	%xmm1,%xmm1
297	xorps	%xmm3,%xmm2
298	pxor	%xmm3,%xmm3
299	movups	%xmm2,(%rsi)
300	xorps	%xmm2,%xmm2
301	jmp	L$ctr32_epilogue
302
303.p2align	4
# Bulk path: remember the entry rsp in r11, save rbp, and carve out the
# aligned scratch area of 128 bytes.
304L$ctr32_bulk:
305	leaq	(%rsp),%r11
306
307	pushq	%rbp
308
309	subq	$128,%rsp
310	andq	$-16,%rsp
311
312
313
314
# Build the eight counter blocks. xmm2 = counter block XOR round key 0 and
# is copied to every slot; r8d = the counter word in host byte order;
# ebp = the last dword of round key 0. For block i the word (counter + i) is
# swapped back, XORed with ebp and written into dword 3 of slot i. rdx is
# parked in r10 while edx is used as a temporary.
315	movdqu	(%r8),%xmm2
316	movdqu	(%rcx),%xmm0
317	movl	12(%r8),%r8d
318	pxor	%xmm0,%xmm2
319	movl	12(%rcx),%ebp
320	movdqa	%xmm2,0(%rsp)
321	bswapl	%r8d
322	movdqa	%xmm2,%xmm3
323	movdqa	%xmm2,%xmm4
324	movdqa	%xmm2,%xmm5
325	movdqa	%xmm2,64(%rsp)
326	movdqa	%xmm2,80(%rsp)
327	movdqa	%xmm2,96(%rsp)
328	movq	%rdx,%r10
329	movdqa	%xmm2,112(%rsp)
330
331	leaq	1(%r8),%rax
332	leaq	2(%r8),%rdx
333	bswapl	%eax
334	bswapl	%edx
335	xorl	%ebp,%eax
336	xorl	%ebp,%edx
337.byte	102,15,58,34,216,3
338	leaq	3(%r8),%rax
339	movdqa	%xmm3,16(%rsp)
340.byte	102,15,58,34,226,3
341	bswapl	%eax
342	movq	%r10,%rdx
343	leaq	4(%r8),%r10
344	movdqa	%xmm4,32(%rsp)
345	xorl	%ebp,%eax
346	bswapl	%r10d
347.byte	102,15,58,34,232,3
348	xorl	%ebp,%r10d
349	movdqa	%xmm5,48(%rsp)
350	leaq	5(%r8),%r9
351	movl	%r10d,64+12(%rsp)
352	bswapl	%r9d
353	leaq	6(%r8),%r10
354	movl	240(%rcx),%eax
355	xorl	%ebp,%r9d
356	bswapl	%r10d
357	movl	%r9d,80+12(%rsp)
358	xorl	%ebp,%r10d
359	leaq	7(%r8),%r9
360	movl	%r10d,96+12(%rsp)
361	bswapl	%r9d
# Read the second 32-bit word of _GFp_ia32cap_P and keep bits 22 and 26
# (mask 0x4400000). NOTE(review): presumably CPU capability bits; the exact
# meaning is defined outside this file.
362	leaq	_GFp_ia32cap_P(%rip),%r10
363	movl	4(%r10),%r10d
364	xorl	%ebp,%r9d
365	andl	$71303168,%r10d
366	movl	%r9d,112+12(%rsp)
367
368	movups	16(%rcx),%xmm1
369
370	movdqa	64(%rsp),%xmm6
371	movdqa	80(%rsp),%xmm7
372
# Dispatch: fewer than 8 blocks go straight to the tail. Otherwise use the
# six-block loop when only bit 22 of the masked word is set (0x400000), and
# the eight-block loop in every other case. rdx is pre-decremented by the
# batch size, and for the eight-block loop rcx is biased by +128.
373	cmpq	$8,%rdx
374	jb	L$ctr32_tail
375
376	subq	$6,%rdx
377	cmpl	$4194304,%r10d
378	je	L$ctr32_6x
379
380	leaq	128(%rcx),%rcx
381	subq	$2,%rdx
382	jmp	L$ctr32_loop8
383
384.p2align	4
# Six-block setup: eax becomes the round count times 16, rcx points past the
# key array, r10 = 48 minus that byte count is the starting index, and ebp is
# byte-swapped because this path stores the counter words with movbe.
385L$ctr32_6x:
386	shll	$4,%eax
387	movl	$48,%r10d
388	bswapl	%ebp
389	leaq	32(%rcx,%rax,1),%rcx
390	subq	%rax,%r10
391	jmp	L$ctr32_loop6
392
393.p2align	4
# Six-block loop: the first three rounds are issued inline while the counter
# words for the following batch are written to the first six stack slots;
# the remaining rounds run in L$enc_loop6 with rax as its index.
394L$ctr32_loop6:
395	addl	$6,%r8d
396	movups	-48(%rcx,%r10,1),%xmm0
397.byte	102,15,56,220,209
398	movl	%r8d,%eax
399	xorl	%ebp,%eax
400.byte	102,15,56,220,217
401.byte	0x0f,0x38,0xf1,0x44,0x24,12
402	leal	1(%r8),%eax
403.byte	102,15,56,220,225
404	xorl	%ebp,%eax
405.byte	0x0f,0x38,0xf1,0x44,0x24,28
406.byte	102,15,56,220,233
407	leal	2(%r8),%eax
408	xorl	%ebp,%eax
409.byte	102,15,56,220,241
410.byte	0x0f,0x38,0xf1,0x44,0x24,44
411	leal	3(%r8),%eax
412.byte	102,15,56,220,249
413	movups	-32(%rcx,%r10,1),%xmm1
414	xorl	%ebp,%eax
415
416.byte	102,15,56,220,208
417.byte	0x0f,0x38,0xf1,0x44,0x24,60
418	leal	4(%r8),%eax
419.byte	102,15,56,220,216
420	xorl	%ebp,%eax
421.byte	0x0f,0x38,0xf1,0x44,0x24,76
422.byte	102,15,56,220,224
423	leal	5(%r8),%eax
424	xorl	%ebp,%eax
425.byte	102,15,56,220,232
426.byte	0x0f,0x38,0xf1,0x44,0x24,92
427	movq	%r10,%rax
428.byte	102,15,56,220,240
429.byte	102,15,56,220,248
430	movups	-16(%rcx,%r10,1),%xmm0
431
432	call	L$enc_loop6
433
# XOR the six keystream blocks with the input, reload xmm2-xmm7 with the
# next counter blocks from the stack and xmm1 with round key 1, then store.
434	movdqu	(%rdi),%xmm8
435	movdqu	16(%rdi),%xmm9
436	movdqu	32(%rdi),%xmm10
437	movdqu	48(%rdi),%xmm11
438	movdqu	64(%rdi),%xmm12
439	movdqu	80(%rdi),%xmm13
440	leaq	96(%rdi),%rdi
441	movups	-64(%rcx,%r10,1),%xmm1
442	pxor	%xmm2,%xmm8
443	movaps	0(%rsp),%xmm2
444	pxor	%xmm3,%xmm9
445	movaps	16(%rsp),%xmm3
446	pxor	%xmm4,%xmm10
447	movaps	32(%rsp),%xmm4
448	pxor	%xmm5,%xmm11
449	movaps	48(%rsp),%xmm5
450	pxor	%xmm6,%xmm12
451	movaps	64(%rsp),%xmm6
452	pxor	%xmm7,%xmm13
453	movaps	80(%rsp),%xmm7
454	movdqu	%xmm8,(%rsi)
455	movdqu	%xmm9,16(%rsi)
456	movdqu	%xmm10,32(%rsi)
457	movdqu	%xmm11,48(%rsi)
458	movdqu	%xmm12,64(%rsi)
459	movdqu	%xmm13,80(%rsi)
460	leaq	96(%rsi),%rsi
461
462	subq	$6,%rdx
463	jnc	L$ctr32_loop6
464
# Leftover after the six-block loop: undo the pre-decrement; when blocks
# remain, restore rcx to the start of the key array and eax to the round
# count before entering the tail.
465	addq	$6,%rdx
466	jz	L$ctr32_done
467
468	leal	-48(%r10),%eax
469	leaq	-80(%rcx,%r10,1),%rcx
470	negl	%eax
471	shrl	$4,%eax
472	jmp	L$ctr32_tail
473
474.p2align	5
# Eight-block loop, fully unrolled. Round keys are addressed relative to the
# biased rcx (offset minus 128). While the rounds execute, the counter words
# for the following batch are computed in r9d and written to the stack slots.
475L$ctr32_loop8:
476	addl	$8,%r8d
477	movdqa	96(%rsp),%xmm8
478.byte	102,15,56,220,209
479	movl	%r8d,%r9d
480	movdqa	112(%rsp),%xmm9
481.byte	102,15,56,220,217
482	bswapl	%r9d
483	movups	32-128(%rcx),%xmm0
484.byte	102,15,56,220,225
485	xorl	%ebp,%r9d
486	nop
487.byte	102,15,56,220,233
488	movl	%r9d,0+12(%rsp)
489	leaq	1(%r8),%r9
490.byte	102,15,56,220,241
491.byte	102,15,56,220,249
492.byte	102,68,15,56,220,193
493.byte	102,68,15,56,220,201
494	movups	48-128(%rcx),%xmm1
495	bswapl	%r9d
496.byte	102,15,56,220,208
497.byte	102,15,56,220,216
498	xorl	%ebp,%r9d
499.byte	0x66,0x90
500.byte	102,15,56,220,224
501.byte	102,15,56,220,232
502	movl	%r9d,16+12(%rsp)
503	leaq	2(%r8),%r9
504.byte	102,15,56,220,240
505.byte	102,15,56,220,248
506.byte	102,68,15,56,220,192
507.byte	102,68,15,56,220,200
508	movups	64-128(%rcx),%xmm0
509	bswapl	%r9d
510.byte	102,15,56,220,209
511.byte	102,15,56,220,217
512	xorl	%ebp,%r9d
513.byte	0x66,0x90
514.byte	102,15,56,220,225
515.byte	102,15,56,220,233
516	movl	%r9d,32+12(%rsp)
517	leaq	3(%r8),%r9
518.byte	102,15,56,220,241
519.byte	102,15,56,220,249
520.byte	102,68,15,56,220,193
521.byte	102,68,15,56,220,201
522	movups	80-128(%rcx),%xmm1
523	bswapl	%r9d
524.byte	102,15,56,220,208
525.byte	102,15,56,220,216
526	xorl	%ebp,%r9d
527.byte	0x66,0x90
528.byte	102,15,56,220,224
529.byte	102,15,56,220,232
530	movl	%r9d,48+12(%rsp)
531	leaq	4(%r8),%r9
532.byte	102,15,56,220,240
533.byte	102,15,56,220,248
534.byte	102,68,15,56,220,192
535.byte	102,68,15,56,220,200
536	movups	96-128(%rcx),%xmm0
537	bswapl	%r9d
538.byte	102,15,56,220,209
539.byte	102,15,56,220,217
540	xorl	%ebp,%r9d
541.byte	0x66,0x90
542.byte	102,15,56,220,225
543.byte	102,15,56,220,233
544	movl	%r9d,64+12(%rsp)
545	leaq	5(%r8),%r9
546.byte	102,15,56,220,241
547.byte	102,15,56,220,249
548.byte	102,68,15,56,220,193
549.byte	102,68,15,56,220,201
550	movups	112-128(%rcx),%xmm1
551	bswapl	%r9d
552.byte	102,15,56,220,208
553.byte	102,15,56,220,216
554	xorl	%ebp,%r9d
555.byte	0x66,0x90
556.byte	102,15,56,220,224
557.byte	102,15,56,220,232
558	movl	%r9d,80+12(%rsp)
559	leaq	6(%r8),%r9
560.byte	102,15,56,220,240
561.byte	102,15,56,220,248
562.byte	102,68,15,56,220,192
563.byte	102,68,15,56,220,200
564	movups	128-128(%rcx),%xmm0
565	bswapl	%r9d
566.byte	102,15,56,220,209
567.byte	102,15,56,220,217
568	xorl	%ebp,%r9d
569.byte	0x66,0x90
570.byte	102,15,56,220,225
571.byte	102,15,56,220,233
572	movl	%r9d,96+12(%rsp)
573	leaq	7(%r8),%r9
574.byte	102,15,56,220,241
575.byte	102,15,56,220,249
576.byte	102,68,15,56,220,193
577.byte	102,68,15,56,220,201
578	movups	144-128(%rcx),%xmm1
579	bswapl	%r9d
580.byte	102,15,56,220,208
581.byte	102,15,56,220,216
582.byte	102,15,56,220,224
583	xorl	%ebp,%r9d
584	movdqu	0(%rdi),%xmm10
585.byte	102,15,56,220,232
586	movl	%r9d,112+12(%rsp)
# Round-count check: the flags set by this cmpl survive the aesenc and
# movups lines that follow, so the jb below is taken when eax is below 11
# and skips the four extra rounds used for the longer key schedule.
587	cmpl	$11,%eax
588.byte	102,15,56,220,240
589.byte	102,15,56,220,248
590.byte	102,68,15,56,220,192
591.byte	102,68,15,56,220,200
592	movups	160-128(%rcx),%xmm0
593
594	jb	L$ctr32_enc_done
595
596.byte	102,15,56,220,209
597.byte	102,15,56,220,217
598.byte	102,15,56,220,225
599.byte	102,15,56,220,233
600.byte	102,15,56,220,241
601.byte	102,15,56,220,249
602.byte	102,68,15,56,220,193
603.byte	102,68,15,56,220,201
604	movups	176-128(%rcx),%xmm1
605
606.byte	102,15,56,220,208
607.byte	102,15,56,220,216
608.byte	102,15,56,220,224
609.byte	102,15,56,220,232
610.byte	102,15,56,220,240
611.byte	102,15,56,220,248
612.byte	102,68,15,56,220,192
613.byte	102,68,15,56,220,200
614	movups	192-128(%rcx),%xmm0
615
616
617
618.byte	102,15,56,220,209
619.byte	102,15,56,220,217
620.byte	102,15,56,220,225
621.byte	102,15,56,220,233
622.byte	102,15,56,220,241
623.byte	102,15,56,220,249
624.byte	102,68,15,56,220,193
625.byte	102,68,15,56,220,201
626	movups	208-128(%rcx),%xmm1
627
628.byte	102,15,56,220,208
629.byte	102,15,56,220,216
630.byte	102,15,56,220,224
631.byte	102,15,56,220,232
632.byte	102,15,56,220,240
633.byte	102,15,56,220,248
634.byte	102,68,15,56,220,192
635.byte	102,68,15,56,220,200
636	movups	224-128(%rcx),%xmm0
637	jmp	L$ctr32_enc_done
638
639.p2align	4
# Finish the batch: xmm0 holds the last round key. Each input block is XORed
# with it and then used as the key operand of aesenclast, so the final round
# and the XOR with the input happen in one instruction. In between, the next
# counter blocks are loaded from the stack into xmm11-xmm15 and xmm0, and
# xmm1 is reloaded with round key 1.
640L$ctr32_enc_done:
641	movdqu	16(%rdi),%xmm11
642	pxor	%xmm0,%xmm10
643	movdqu	32(%rdi),%xmm12
644	pxor	%xmm0,%xmm11
645	movdqu	48(%rdi),%xmm13
646	pxor	%xmm0,%xmm12
647	movdqu	64(%rdi),%xmm14
648	pxor	%xmm0,%xmm13
649	movdqu	80(%rdi),%xmm15
650	pxor	%xmm0,%xmm14
651	pxor	%xmm0,%xmm15
652.byte	102,15,56,220,209
653.byte	102,15,56,220,217
654.byte	102,15,56,220,225
655.byte	102,15,56,220,233
656.byte	102,15,56,220,241
657.byte	102,15,56,220,249
658.byte	102,68,15,56,220,193
659.byte	102,68,15,56,220,201
660	movdqu	96(%rdi),%xmm1
661	leaq	128(%rdi),%rdi
662
663.byte	102,65,15,56,221,210
664	pxor	%xmm0,%xmm1
665	movdqu	112-128(%rdi),%xmm10
666.byte	102,65,15,56,221,219
667	pxor	%xmm0,%xmm10
668	movdqa	0(%rsp),%xmm11
669.byte	102,65,15,56,221,228
670.byte	102,65,15,56,221,237
671	movdqa	16(%rsp),%xmm12
672	movdqa	32(%rsp),%xmm13
673.byte	102,65,15,56,221,246
674.byte	102,65,15,56,221,255
675	movdqa	48(%rsp),%xmm14
676	movdqa	64(%rsp),%xmm15
677.byte	102,68,15,56,221,193
678	movdqa	80(%rsp),%xmm0
679	movups	16-128(%rcx),%xmm1
680.byte	102,69,15,56,221,202
681
# Store the eight output blocks and move the next counter blocks into
# xmm2-xmm7 for the following iteration.
682	movups	%xmm2,(%rsi)
683	movdqa	%xmm11,%xmm2
684	movups	%xmm3,16(%rsi)
685	movdqa	%xmm12,%xmm3
686	movups	%xmm4,32(%rsi)
687	movdqa	%xmm13,%xmm4
688	movups	%xmm5,48(%rsi)
689	movdqa	%xmm14,%xmm5
690	movups	%xmm6,64(%rsi)
691	movdqa	%xmm15,%xmm6
692	movups	%xmm7,80(%rsi)
693	movdqa	%xmm0,%xmm7
694	movups	%xmm8,96(%rsi)
695	movups	%xmm9,112(%rsi)
696	leaq	128(%rsi),%rsi
697
698	subq	$8,%rdx
699	jnc	L$ctr32_loop8
700
# Leftover after the eight-block loop: undo the pre-decrement and, when
# blocks remain, remove the +128 bias from rcx before falling into the tail.
701	addq	$8,%rdx
702	jz	L$ctr32_done
703	leaq	-128(%rcx),%rcx
704
# Tail: rcx is advanced to round key 1. Fewer than four blocks use the
# three-block loop, exactly four use the four-block loop, and five to seven
# fall through to the eight-way round loop.
705L$ctr32_tail:
706
707
708	leaq	16(%rcx),%rcx
709	cmpq	$4,%rdx
710	jb	L$ctr32_loop3
711	je	L$ctr32_loop4
712
713
# Five to seven blocks: xmm8 takes the counter block from slot 96 and xmm9
# is zeroed. The first round is issued here on xmm2-xmm8, then the shared
# loop is entered at L$enc_loop8_enter. Three input blocks are preloaded.
714	shll	$4,%eax
715	movdqa	96(%rsp),%xmm8
716	pxor	%xmm9,%xmm9
717
718	movups	16(%rcx),%xmm0
719.byte	102,15,56,220,209
720.byte	102,15,56,220,217
721	leaq	32-16(%rcx,%rax,1),%rcx
722	negq	%rax
723.byte	102,15,56,220,225
724	addq	$16,%rax
725	movups	(%rdi),%xmm10
726.byte	102,15,56,220,233
727.byte	102,15,56,220,241
728	movups	16(%rdi),%xmm11
729	movups	32(%rdi),%xmm12
730.byte	102,15,56,220,249
731.byte	102,68,15,56,220,193
732
733	call	L$enc_loop8_enter
734
# Five blocks are always written; the sixth and seventh depend on the
# cmpq $6 result. The movups and xorps between the compare and the je do not
# modify flags.
735	movdqu	48(%rdi),%xmm13
736	pxor	%xmm10,%xmm2
737	movdqu	64(%rdi),%xmm10
738	pxor	%xmm11,%xmm3
739	movdqu	%xmm2,(%rsi)
740	pxor	%xmm12,%xmm4
741	movdqu	%xmm3,16(%rsi)
742	pxor	%xmm13,%xmm5
743	movdqu	%xmm4,32(%rsi)
744	pxor	%xmm10,%xmm6
745	movdqu	%xmm5,48(%rsi)
746	movdqu	%xmm6,64(%rsi)
747	cmpq	$6,%rdx
748	jb	L$ctr32_done
749
750	movups	80(%rdi),%xmm11
751	xorps	%xmm11,%xmm7
752	movups	%xmm7,80(%rsi)
753	je	L$ctr32_done
754
755	movups	96(%rdi),%xmm12
756	xorps	%xmm12,%xmm8
757	movups	%xmm8,96(%rsi)
758	jmp	L$ctr32_done
759
760.p2align	5
# Exactly four blocks: one round per iteration on xmm2-xmm5, with eax as the
# round counter. The ZF from decl survives the aesenc and movups lines.
761L$ctr32_loop4:
762.byte	102,15,56,220,209
763	leaq	16(%rcx),%rcx
764	decl	%eax
765.byte	102,15,56,220,217
766.byte	102,15,56,220,225
767.byte	102,15,56,220,233
768	movups	(%rcx),%xmm1
769	jnz	L$ctr32_loop4
770.byte	102,15,56,221,209
771.byte	102,15,56,221,217
772	movups	(%rdi),%xmm10
773	movups	16(%rdi),%xmm11
774.byte	102,15,56,221,225
775.byte	102,15,56,221,233
776	movups	32(%rdi),%xmm12
777	movups	48(%rdi),%xmm13
778
779	xorps	%xmm10,%xmm2
780	movups	%xmm2,(%rsi)
781	xorps	%xmm11,%xmm3
782	movups	%xmm3,16(%rsi)
783	pxor	%xmm12,%xmm4
784	movdqu	%xmm4,32(%rsi)
785	pxor	%xmm13,%xmm5
786	movdqu	%xmm5,48(%rsi)
787	jmp	L$ctr32_done
788
789.p2align	5
# Fewer than four blocks: three keystream blocks are always computed, but
# only as many as rdx asks for are XORed with input and stored. The first
# block is stored before rdx is compared with 2.
790L$ctr32_loop3:
791.byte	102,15,56,220,209
792	leaq	16(%rcx),%rcx
793	decl	%eax
794.byte	102,15,56,220,217
795.byte	102,15,56,220,225
796	movups	(%rcx),%xmm1
797	jnz	L$ctr32_loop3
798.byte	102,15,56,221,209
799.byte	102,15,56,221,217
800.byte	102,15,56,221,225
801
802	movups	(%rdi),%xmm10
803	xorps	%xmm10,%xmm2
804	movups	%xmm2,(%rsi)
805	cmpq	$2,%rdx
806	jb	L$ctr32_done
807
808	movups	16(%rdi),%xmm11
809	xorps	%xmm11,%xmm3
810	movups	%xmm3,16(%rsi)
811	je	L$ctr32_done
812
813	movups	32(%rdi),%xmm12
814	xorps	%xmm12,%xmm4
815	movups	%xmm4,32(%rsi)
816
# Common exit of the bulk path: zero xmm0-xmm15, ebp and the whole scratch
# area, then restore rbp from the slot below the entry rsp and restore rsp.
817L$ctr32_done:
818	xorps	%xmm0,%xmm0
819	xorl	%ebp,%ebp
820	pxor	%xmm1,%xmm1
821	pxor	%xmm2,%xmm2
822	pxor	%xmm3,%xmm3
823	pxor	%xmm4,%xmm4
824	pxor	%xmm5,%xmm5
825	pxor	%xmm6,%xmm6
826	pxor	%xmm7,%xmm7
827	movaps	%xmm0,0(%rsp)
828	pxor	%xmm8,%xmm8
829	movaps	%xmm0,16(%rsp)
830	pxor	%xmm9,%xmm9
831	movaps	%xmm0,32(%rsp)
832	pxor	%xmm10,%xmm10
833	movaps	%xmm0,48(%rsp)
834	pxor	%xmm11,%xmm11
835	movaps	%xmm0,64(%rsp)
836	pxor	%xmm12,%xmm12
837	movaps	%xmm0,80(%rsp)
838	pxor	%xmm13,%xmm13
839	movaps	%xmm0,96(%rsp)
840	pxor	%xmm14,%xmm14
841	movaps	%xmm0,112(%rsp)
842	pxor	%xmm15,%xmm15
843	movq	-8(%r11),%rbp
844
845	leaq	(%r11),%rsp
846
# Shared return; the single-block path jumps here directly.
847L$ctr32_epilogue:
848	.byte	0xf3,0xc3
849
850
# _GFp_aes_hw_set_encrypt_key (second label __aesni_set_encrypt_key): expand
# a user key of 128 or 256 bits into the round-key array.
#   In   rdi = user key bytes, esi = key length in bits, rdx = output array
#   Out  rax = 0 on success, -1 when rdi or rdx is zero, -2 when esi is
#        neither 128 nor 256
# On success the round keys are written starting at offset 0 of the output
# array and the value 9 (for 128 bits) or 13 (for 256 bits) is stored at
# offset 240. xmm0-xmm5 are zeroed on every return path.
# Two code paths exist per key size: one built on aeskeygenassist plus the
# L$key_expansion_* subroutines, and an alternate one built on pshufb and
# aesenclast. The alternate path is taken when the second word of
# _GFp_ia32cap_P, masked with 0x10000800, equals 0x10000000.
# NOTE(review): presumably these are CPU capability bits; their meaning is
# defined outside this file.
# Hand-encoded instructions used below:
#   subq $8,%rsp      is .byte 0x48,0x83,0xEC,0x08
#   aeskeygenassist   is .byte 102,15,58,223,modrm,rcon (200 reads xmm0,
#                     202 reads xmm2, both write xmm1)
#   pshufb %xmm5,reg  is .byte 102,15,56,0,modrm
#   aesenclast        is .byte 102,15,56,221,modrm
#   rep ret           is .byte 0xf3,0xc3
851.globl	_GFp_aes_hw_set_encrypt_key
852.private_extern _GFp_aes_hw_set_encrypt_key
853
854.p2align	4
855_GFp_aes_hw_set_encrypt_key:
856__aesni_set_encrypt_key:
857
858.byte	0x48,0x83,0xEC,0x08
859
# Argument checks: rax is preloaded with -1 and returned as is when either
# pointer is zero.
860	movq	$-1,%rax
861	testq	%rdi,%rdi
862	jz	L$enc_key_ret
863	testq	%rdx,%rdx
864	jz	L$enc_key_ret
865
# Load the first 16 key bytes, zero xmm4 (scratch for the expansion
# subroutines), fetch the masked capability word into r10d and point rax at
# the second round-key slot. Then dispatch on the key size.
866	movups	(%rdi),%xmm0
867	xorps	%xmm4,%xmm4
868	leaq	_GFp_ia32cap_P(%rip),%r10
869	movl	4(%r10),%r10d
870	andl	$268437504,%r10d
871	leaq	16(%rdx),%rax
872	cmpl	$256,%esi
873	je	L$14rounds
874
875	cmpl	$128,%esi
876	jne	L$bad_keybits
877
# 128-bit key, aeskeygenassist path: ten expansion steps with the round
# constants 1, 2, 4, 8, 16, 32, 64, 128, 27 and 54. The first call enters at
# the cold label because round key 0 has been stored just before it.
878L$10rounds:
879	movl	$9,%esi
880	cmpl	$268435456,%r10d
881	je	L$10rounds_alt
882
883	movups	%xmm0,(%rdx)
884.byte	102,15,58,223,200,1
885	call	L$key_expansion_128_cold
886.byte	102,15,58,223,200,2
887	call	L$key_expansion_128
888.byte	102,15,58,223,200,4
889	call	L$key_expansion_128
890.byte	102,15,58,223,200,8
891	call	L$key_expansion_128
892.byte	102,15,58,223,200,16
893	call	L$key_expansion_128
894.byte	102,15,58,223,200,32
895	call	L$key_expansion_128
896.byte	102,15,58,223,200,64
897	call	L$key_expansion_128
898.byte	102,15,58,223,200,128
899	call	L$key_expansion_128
900.byte	102,15,58,223,200,27
901	call	L$key_expansion_128
902.byte	102,15,58,223,200,54
903	call	L$key_expansion_128
# Store the last round key and the value 9; at this point 80(%rax) is
# offset 240 of the output array.
904	movups	%xmm0,(%rax)
905	movl	%esi,80(%rax)
906	xorl	%eax,%eax
907	jmp	L$enc_key_ret
908
909.p2align	4
# 128-bit key, alternate path: xmm5 = byte-shuffle mask from L$key_rotate,
# xmm4 = round constant (starts at 1 and is doubled with pslld each step),
# xmm2 = copy of the previous round key. Eight steps run in the loop.
910L$10rounds_alt:
911	movdqa	L$key_rotate(%rip),%xmm5
912	movl	$8,%r10d
913	movdqa	L$key_rcon1(%rip),%xmm4
914	movdqa	%xmm0,%xmm2
915	movdqu	%xmm0,(%rdx)
916	jmp	L$oop_key128
917
918.p2align	4
919L$oop_key128:
920.byte	102,15,56,0,197
921.byte	102,15,56,221,196
922	pslld	$1,%xmm4
923	leaq	16(%rax),%rax
924
# Fold the previous round key into itself shifted left by 4, 8 and 12
# bytes, then XOR with the transformed word held in xmm0.
925	movdqa	%xmm2,%xmm3
926	pslldq	$4,%xmm2
927	pxor	%xmm2,%xmm3
928	pslldq	$4,%xmm2
929	pxor	%xmm2,%xmm3
930	pslldq	$4,%xmm2
931	pxor	%xmm3,%xmm2
932
933	pxor	%xmm2,%xmm0
934	movdqu	%xmm0,-16(%rax)
935	movdqa	%xmm0,%xmm2
936
937	decl	%r10d
938	jnz	L$oop_key128
939
# The last two steps restart the round constant from L$key_rcon1b (0x1b);
# the pslld after the first of them doubles it for the second.
940	movdqa	L$key_rcon1b(%rip),%xmm4
941
942.byte	102,15,56,0,197
943.byte	102,15,56,221,196
944	pslld	$1,%xmm4
945
946	movdqa	%xmm2,%xmm3
947	pslldq	$4,%xmm2
948	pxor	%xmm2,%xmm3
949	pslldq	$4,%xmm2
950	pxor	%xmm2,%xmm3
951	pslldq	$4,%xmm2
952	pxor	%xmm3,%xmm2
953
954	pxor	%xmm2,%xmm0
955	movdqu	%xmm0,(%rax)
956
957	movdqa	%xmm0,%xmm2
958.byte	102,15,56,0,197
959.byte	102,15,56,221,196
960
961	movdqa	%xmm2,%xmm3
962	pslldq	$4,%xmm2
963	pxor	%xmm2,%xmm3
964	pslldq	$4,%xmm2
965	pxor	%xmm2,%xmm3
966	pslldq	$4,%xmm2
967	pxor	%xmm3,%xmm2
968
969	pxor	%xmm2,%xmm0
970	movdqu	%xmm0,16(%rax)
971
# Here 96(%rax) is offset 240 of the output array.
972	movl	%esi,96(%rax)
973	xorl	%eax,%eax
974	jmp	L$enc_key_ret
975
976
977
978.p2align	4
# 256-bit key: xmm2 takes the second 16 key bytes and rax moves on to the
# third round-key slot. The aeskeygenassist path alternates the 256a and
# 256b subroutines, each round constant being used for one a/b pair.
979L$14rounds:
980	movups	16(%rdi),%xmm2
981	movl	$13,%esi
982	leaq	16(%rax),%rax
983	cmpl	$268435456,%r10d
984	je	L$14rounds_alt
985
986	movups	%xmm0,(%rdx)
987	movups	%xmm2,16(%rdx)
988.byte	102,15,58,223,202,1
989	call	L$key_expansion_256a_cold
990.byte	102,15,58,223,200,1
991	call	L$key_expansion_256b
992.byte	102,15,58,223,202,2
993	call	L$key_expansion_256a
994.byte	102,15,58,223,200,2
995	call	L$key_expansion_256b
996.byte	102,15,58,223,202,4
997	call	L$key_expansion_256a
998.byte	102,15,58,223,200,4
999	call	L$key_expansion_256b
1000.byte	102,15,58,223,202,8
1001	call	L$key_expansion_256a
1002.byte	102,15,58,223,200,8
1003	call	L$key_expansion_256b
1004.byte	102,15,58,223,202,16
1005	call	L$key_expansion_256a
1006.byte	102,15,58,223,200,16
1007	call	L$key_expansion_256b
1008.byte	102,15,58,223,202,32
1009	call	L$key_expansion_256a
1010.byte	102,15,58,223,200,32
1011	call	L$key_expansion_256b
1012.byte	102,15,58,223,202,64
1013	call	L$key_expansion_256a
# Store the last round key and the value 13; at this point 16(%rax) is
# offset 240 of the output array.
1014	movups	%xmm0,(%rax)
1015	movl	%esi,16(%rax)
1016	xorq	%rax,%rax
1017	jmp	L$enc_key_ret
1018
1019.p2align	4
# 256-bit key, alternate path: seven loop passes. Each pass derives one
# round key from xmm0 (shuffle, aesenclast with the round constant, fold)
# and, except on the last pass, a second one from xmm1 using pshufd and an
# aesenclast whose key operand xmm3 has just been zeroed.
1020L$14rounds_alt:
1021	movdqa	L$key_rotate(%rip),%xmm5
1022	movdqa	L$key_rcon1(%rip),%xmm4
1023	movl	$7,%r10d
1024	movdqu	%xmm0,0(%rdx)
1025	movdqa	%xmm2,%xmm1
1026	movdqu	%xmm2,16(%rdx)
1027	jmp	L$oop_key256
1028
1029.p2align	4
1030L$oop_key256:
1031.byte	102,15,56,0,213
1032.byte	102,15,56,221,212
1033
1034	movdqa	%xmm0,%xmm3
1035	pslldq	$4,%xmm0
1036	pxor	%xmm0,%xmm3
1037	pslldq	$4,%xmm0
1038	pxor	%xmm0,%xmm3
1039	pslldq	$4,%xmm0
1040	pxor	%xmm3,%xmm0
1041	pslld	$1,%xmm4
1042
1043	pxor	%xmm2,%xmm0
1044	movdqu	%xmm0,(%rax)
1045
1046	decl	%r10d
1047	jz	L$done_key256
1048
1049	pshufd	$0xff,%xmm0,%xmm2
1050	pxor	%xmm3,%xmm3
1051.byte	102,15,56,221,211
1052
1053	movdqa	%xmm1,%xmm3
1054	pslldq	$4,%xmm1
1055	pxor	%xmm1,%xmm3
1056	pslldq	$4,%xmm1
1057	pxor	%xmm1,%xmm3
1058	pslldq	$4,%xmm1
1059	pxor	%xmm3,%xmm1
1060
1061	pxor	%xmm1,%xmm2
1062	movdqu	%xmm2,16(%rax)
1063	leaq	32(%rax),%rax
1064	movdqa	%xmm2,%xmm1
1065
1066	jmp	L$oop_key256
1067
# Here 16(%rax) is offset 240 of the output array.
1068L$done_key256:
1069	movl	%esi,16(%rax)
1070	xorl	%eax,%eax
1071	jmp	L$enc_key_ret
1072
1073.p2align	4
# Unsupported key size: return -2.
1074L$bad_keybits:
1075	movq	$-2,%rax
# Common exit: wipe xmm0-xmm5 and undo the 8-byte stack adjustment made at
# entry. rax already holds the return value.
1076L$enc_key_ret:
1077	pxor	%xmm0,%xmm0
1078	pxor	%xmm1,%xmm1
1079	pxor	%xmm2,%xmm2
1080	pxor	%xmm3,%xmm3
1081	pxor	%xmm4,%xmm4
1082	pxor	%xmm5,%xmm5
1083	addq	$8,%rsp
1084
1085	.byte	0xf3,0xc3
1086
# End-of-function marker label; nothing in the visible part of this file
# refers to it.
1087L$SEH_end_GFp_set_encrypt_key:
1088
# L$key_expansion_128: one step of the 128-bit key schedule, called from the
# key-setup routine after an aeskeygenassist has filled xmm1.
#   In   xmm0 = current round key, xmm1 = aeskeygenassist result,
#        xmm4 = scratch (zeroed by the caller before the first call),
#        rax = address of the next round-key slot
#   Out  xmm0 = next round key; rax advanced by 16 (warm entry only)
# The warm entry first stores the current key at (%rax); the _cold entry
# skips the store and the pointer advance. The .byte 0xf3,0xc3 is rep ret.
1089.p2align	4
1090L$key_expansion_128:
1091	movups	%xmm0,(%rax)
1092	leaq	16(%rax),%rax
1093L$key_expansion_128_cold:
1094	shufps	$16,%xmm0,%xmm4
1095	xorps	%xmm4,%xmm0
1096	shufps	$140,%xmm0,%xmm4
1097	xorps	%xmm4,%xmm0
# Broadcast dword 3 of the aeskeygenassist result and mix it in.
1098	shufps	$255,%xmm1,%xmm1
1099	xorps	%xmm1,%xmm0
1100	.byte	0xf3,0xc3
1101
# L$key_expansion_192a and L$key_expansion_192b: key-schedule steps that work
# on xmm0 and xmm2 together, with xmm1 as the aeskeygenassist result and
# xmm3, xmm4, xmm5 as scratch. 192a stores one 16-byte slot at (%rax), and
# 192b stores two slots before jumping into the shared body at
# L$key_expansion_192b_warm.
# No call to either entry point appears in the visible part of this file.
# NOTE(review): presumably left over from 192-bit key support in the
# generator script - confirm before relying on them.
# The .byte 0xf3,0xc3 is rep ret.
1102.p2align	4
1103L$key_expansion_192a:
1104	movups	%xmm0,(%rax)
1105	leaq	16(%rax),%rax
1106L$key_expansion_192a_cold:
1107	movaps	%xmm2,%xmm5
1108L$key_expansion_192b_warm:
1109	shufps	$16,%xmm0,%xmm4
1110	movdqa	%xmm2,%xmm3
1111	xorps	%xmm4,%xmm0
1112	shufps	$140,%xmm0,%xmm4
1113	pslldq	$4,%xmm3
1114	xorps	%xmm4,%xmm0
1115	pshufd	$85,%xmm1,%xmm1
1116	pxor	%xmm3,%xmm2
1117	pxor	%xmm1,%xmm0
1118	pshufd	$255,%xmm0,%xmm3
1119	pxor	%xmm3,%xmm2
1120	.byte	0xf3,0xc3
1121
1122.p2align	4
1123L$key_expansion_192b:
1124	movaps	%xmm0,%xmm3
1125	shufps	$68,%xmm0,%xmm5
1126	movups	%xmm5,(%rax)
1127	shufps	$78,%xmm2,%xmm3
1128	movups	%xmm3,16(%rax)
1129	leaq	32(%rax),%rax
1130	jmp	L$key_expansion_192b_warm
1131
# L$key_expansion_256a: first half of a 256-bit key-schedule step; it updates
# xmm0 and is called after an aeskeygenassist on xmm2 has filled xmm1.
#   In   xmm0 and xmm2 = the two most recent round keys, xmm1 =
#        aeskeygenassist result, xmm4 = scratch, rax = next round-key slot
#   Out  xmm0 = next round key; rax advanced by 16 (warm entry only)
# The warm entry first stores xmm2 at (%rax); the _cold entry skips the
# store and the pointer advance. The .byte 0xf3,0xc3 is rep ret.
1132.p2align	4
1133L$key_expansion_256a:
1134	movups	%xmm2,(%rax)
1135	leaq	16(%rax),%rax
1136L$key_expansion_256a_cold:
1137	shufps	$16,%xmm0,%xmm4
1138	xorps	%xmm4,%xmm0
1139	shufps	$140,%xmm0,%xmm4
1140	xorps	%xmm4,%xmm0
# Broadcast dword 3 of the aeskeygenassist result and mix it in.
1141	shufps	$255,%xmm1,%xmm1
1142	xorps	%xmm1,%xmm0
1143	.byte	0xf3,0xc3
1144
# L$key_expansion_256b: second half of a 256-bit key-schedule step; it
# updates xmm2 and is called after an aeskeygenassist on xmm0 has filled xmm1.
#   In   xmm0 and xmm2 = the two most recent round keys, xmm1 =
#        aeskeygenassist result, xmm4 = scratch, rax = next round-key slot
#   Out  xmm2 = next round key; xmm0 stored at (%rax); rax advanced by 16
# The .byte 0xf3,0xc3 is rep ret.
1145.p2align	4
1146L$key_expansion_256b:
1147	movups	%xmm0,(%rax)
1148	leaq	16(%rax),%rax
1149
1150	shufps	$16,%xmm2,%xmm4
1151	xorps	%xmm4,%xmm2
1152	shufps	$140,%xmm2,%xmm4
1153	xorps	%xmm4,%xmm2
# Broadcast dword 2 of the aeskeygenassist result (selector 170) and mix it
# in; the 256a half uses dword 3 instead.
1154	shufps	$170,%xmm1,%xmm1
1155	xorps	%xmm1,%xmm2
1156	.byte	0xf3,0xc3
1157
1158
# Constant pool, aligned to 64 bytes. In the part of the file shown here
# only L$key_rotate, L$key_rcon1 and L$key_rcon1b are referenced (by the
# alternate key-schedule paths); the other labels have no visible users.
1159.p2align	6
1160L$bswap_mask:
1161.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1162L$increment32:
1163.long	6,6,6,0
1164L$increment64:
1165.long	1,0,0,0
1166L$increment1:
1167.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
# Byte-shuffle mask loaded into xmm5 and used with pshufb by the alternate
# key-schedule paths.
1168L$key_rotate:
1169.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
1170L$key_rotate192:
1171.long	0x04070605,0x04070605,0x04070605,0x04070605
# Round constants for the alternate key-schedule paths: the initial value 1,
# and 0x1b which is loaded for the last two steps of the 128-bit schedule.
1172L$key_rcon1:
1173.long	1,1,1,1
1174L$key_rcon1b:
1175.long	0x1b,0x1b,0x1b,0x1b
1176
# NUL-terminated ASCII string: AES for Intel AES-NI, CRYPTOGAMS by
# (followed by the author address).
1177.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1178.p2align	6
1179#endif
1180