/*
 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
64
65#include <linux/linkage.h>
66
67#define VMOVDQ		vmovdqu
68
69/*
70 * Note: the "x" prefix in these aliases means "this is an xmm register".  The
71 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
72 * counter".
73 */
74#define xdata0		%xmm0
75#define xdata1		%xmm1
76#define xdata2		%xmm2
77#define xdata3		%xmm3
78#define xdata4		%xmm4
79#define xdata5		%xmm5
80#define xdata6		%xmm6
81#define xdata7		%xmm7
82#define xcounter	%xmm8	// CTR mode only
83#define xiv		%xmm8	// XCTR mode only
84#define xbyteswap	%xmm9	// CTR mode only
85#define xtmp		%xmm9	// XCTR mode only
86#define xkey0		%xmm10
87#define xkey4		%xmm11
88#define xkey8		%xmm12
89#define xkey12		%xmm13
90#define xkeyA		%xmm14
91#define xkeyB		%xmm15
92
93#define p_in		%rdi
94#define p_iv		%rsi
95#define p_keys		%rdx
96#define p_out		%rcx
97#define num_bytes	%r8
98#define counter		%r9	// XCTR mode only
99#define tmp		%r10
100#define	DDQ_DATA	0
101#define	XDATA		1
102#define KEY_128		1
103#define KEY_192		2
104#define KEY_256		3
105
106.section .rodata
107.align 16
108
109byteswap_const:
110	.octa 0x000102030405060708090A0B0C0D0E0F
111ddq_low_msk:
112	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
113ddq_high_add_1:
114	.octa 0x00000000000000010000000000000000
115ddq_add_1:
116	.octa 0x00000000000000000000000000000001
117ddq_add_2:
118	.octa 0x00000000000000000000000000000002
119ddq_add_3:
120	.octa 0x00000000000000000000000000000003
121ddq_add_4:
122	.octa 0x00000000000000000000000000000004
123ddq_add_5:
124	.octa 0x00000000000000000000000000000005
125ddq_add_6:
126	.octa 0x00000000000000000000000000000006
127ddq_add_7:
128	.octa 0x00000000000000000000000000000007
129ddq_add_8:
130	.octa 0x00000000000000000000000000000008
131
132.text
133
134/* generate a unique variable for ddq_add_x */
135
136/* generate a unique variable for xmm register */
137.macro setxdata n
138	var_xdata = %xmm\n
139.endm
140
141/* club the numeric 'id' to the symbol 'name' */
142
143.macro club name, id
144.altmacro
145	.if \name == XDATA
146		setxdata %\id
147	.endif
148.noaltmacro
149.endm
150
151/*
152 * do_aes num_in_par load_keys key_len
153 * This increments p_in, but not p_out
154 */
155.macro do_aes b, k, key_len, xctr
156	.set by, \b
157	.set load_keys, \k
158	.set klen, \key_len
159
160	.if (load_keys)
161		vmovdqa	0*16(p_keys), xkey0
162	.endif
163
164	.if \xctr
165		movq counter, xtmp
166		.set i, 0
167		.rept (by)
168			club XDATA, i
169			vpaddq	(ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
170			.set i, (i +1)
171		.endr
172		.set i, 0
173		.rept (by)
174			club	XDATA, i
175			vpxor	xiv, var_xdata, var_xdata
176			.set i, (i +1)
177		.endr
178	.else
179		vpshufb	xbyteswap, xcounter, xdata0
180		.set i, 1
181		.rept (by - 1)
182			club XDATA, i
183			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
184			vptest	ddq_low_msk(%rip), var_xdata
185			jnz 1f
186			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
187			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
188			1:
189			vpshufb	xbyteswap, var_xdata, var_xdata
190			.set i, (i +1)
191		.endr
192	.endif
193
194	vmovdqa	1*16(p_keys), xkeyA
195
196	vpxor	xkey0, xdata0, xdata0
197	.if \xctr
198		add $by, counter
199	.else
200		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
201		vptest	ddq_low_msk(%rip), xcounter
202		jnz	1f
203		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
204		1:
205	.endif
206
207	.set i, 1
208	.rept (by - 1)
209		club XDATA, i
210		vpxor	xkey0, var_xdata, var_xdata
211		.set i, (i +1)
212	.endr
213
214	vmovdqa	2*16(p_keys), xkeyB
215
216	.set i, 0
217	.rept by
218		club XDATA, i
219		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
220		.set i, (i +1)
221	.endr
222
223	.if (klen == KEY_128)
224		.if (load_keys)
225			vmovdqa	3*16(p_keys), xkey4
226		.endif
227	.else
228		vmovdqa	3*16(p_keys), xkeyA
229	.endif
230
231	.set i, 0
232	.rept by
233		club XDATA, i
234		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
235		.set i, (i +1)
236	.endr
237
238	add	$(16*by), p_in
239
240	.if (klen == KEY_128)
241		vmovdqa	4*16(p_keys), xkeyB
242	.else
243		.if (load_keys)
244			vmovdqa	4*16(p_keys), xkey4
245		.endif
246	.endif
247
248	.set i, 0
249	.rept by
250		club XDATA, i
251		/* key 3 */
252		.if (klen == KEY_128)
253			vaesenc	xkey4, var_xdata, var_xdata
254		.else
255			vaesenc	xkeyA, var_xdata, var_xdata
256		.endif
257		.set i, (i +1)
258	.endr
259
260	vmovdqa	5*16(p_keys), xkeyA
261
262	.set i, 0
263	.rept by
264		club XDATA, i
265		/* key 4 */
266		.if (klen == KEY_128)
267			vaesenc	xkeyB, var_xdata, var_xdata
268		.else
269			vaesenc	xkey4, var_xdata, var_xdata
270		.endif
271		.set i, (i +1)
272	.endr
273
274	.if (klen == KEY_128)
275		.if (load_keys)
276			vmovdqa	6*16(p_keys), xkey8
277		.endif
278	.else
279		vmovdqa	6*16(p_keys), xkeyB
280	.endif
281
282	.set i, 0
283	.rept by
284		club XDATA, i
285		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
286		.set i, (i +1)
287	.endr
288
289	vmovdqa	7*16(p_keys), xkeyA
290
291	.set i, 0
292	.rept by
293		club XDATA, i
294		/* key 6 */
295		.if (klen == KEY_128)
296			vaesenc	xkey8, var_xdata, var_xdata
297		.else
298			vaesenc	xkeyB, var_xdata, var_xdata
299		.endif
300		.set i, (i +1)
301	.endr
302
303	.if (klen == KEY_128)
304		vmovdqa	8*16(p_keys), xkeyB
305	.else
306		.if (load_keys)
307			vmovdqa	8*16(p_keys), xkey8
308		.endif
309	.endif
310
311	.set i, 0
312	.rept by
313		club XDATA, i
314		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
315		.set i, (i +1)
316	.endr
317
318	.if (klen == KEY_128)
319		.if (load_keys)
320			vmovdqa	9*16(p_keys), xkey12
321		.endif
322	.else
323		vmovdqa	9*16(p_keys), xkeyA
324	.endif
325
326	.set i, 0
327	.rept by
328		club XDATA, i
329		/* key 8 */
330		.if (klen == KEY_128)
331			vaesenc	xkeyB, var_xdata, var_xdata
332		.else
333			vaesenc	xkey8, var_xdata, var_xdata
334		.endif
335		.set i, (i +1)
336	.endr
337
338	vmovdqa	10*16(p_keys), xkeyB
339
340	.set i, 0
341	.rept by
342		club XDATA, i
343		/* key 9 */
344		.if (klen == KEY_128)
345			vaesenc	xkey12, var_xdata, var_xdata
346		.else
347			vaesenc	xkeyA, var_xdata, var_xdata
348		.endif
349		.set i, (i +1)
350	.endr
351
352	.if (klen != KEY_128)
353		vmovdqa	11*16(p_keys), xkeyA
354	.endif
355
356	.set i, 0
357	.rept by
358		club XDATA, i
359		/* key 10 */
360		.if (klen == KEY_128)
361			vaesenclast	xkeyB, var_xdata, var_xdata
362		.else
363			vaesenc	xkeyB, var_xdata, var_xdata
364		.endif
365		.set i, (i +1)
366	.endr
367
368	.if (klen != KEY_128)
369		.if (load_keys)
370			vmovdqa	12*16(p_keys), xkey12
371		.endif
372
373		.set i, 0
374		.rept by
375			club XDATA, i
376			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
377			.set i, (i +1)
378		.endr
379
380		.if (klen == KEY_256)
381			vmovdqa	13*16(p_keys), xkeyA
382		.endif
383
384		.set i, 0
385		.rept by
386			club XDATA, i
387			.if (klen == KEY_256)
388				/* key 12 */
389				vaesenc	xkey12, var_xdata, var_xdata
390			.else
391				vaesenclast xkey12, var_xdata, var_xdata
392			.endif
393			.set i, (i +1)
394		.endr
395
396		.if (klen == KEY_256)
397			vmovdqa	14*16(p_keys), xkeyB
398
399			.set i, 0
400			.rept by
401				club XDATA, i
402				/* key 13 */
403				vaesenc	xkeyA, var_xdata, var_xdata
404				.set i, (i +1)
405			.endr
406
407			.set i, 0
408			.rept by
409				club XDATA, i
410				/* key 14 */
411				vaesenclast	xkeyB, var_xdata, var_xdata
412				.set i, (i +1)
413			.endr
414		.endif
415	.endif
416
417	.set i, 0
418	.rept (by / 2)
419		.set j, (i+1)
420		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
421		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
422		club XDATA, i
423		vpxor	xkeyA, var_xdata, var_xdata
424		club XDATA, j
425		vpxor	xkeyB, var_xdata, var_xdata
426		.set i, (i+2)
427	.endr
428
429	.if (i < by)
430		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
431		club XDATA, i
432		vpxor	xkeyA, var_xdata, var_xdata
433	.endif
434
435	.set i, 0
436	.rept by
437		club XDATA, i
438		VMOVDQ	var_xdata, i*16(p_out)
439		.set i, (i+1)
440	.endr
441.endm
442
443.macro do_aes_load val, key_len, xctr
444	do_aes \val, 1, \key_len, \xctr
445.endm
446
447.macro do_aes_noload val, key_len, xctr
448	do_aes \val, 0, \key_len, \xctr
449.endm
450
451/* main body of aes ctr load */
452
453.macro do_aes_ctrmain key_len, xctr
454	cmp	$16, num_bytes
455	jb	.Ldo_return2\xctr\key_len
456
457	.if \xctr
458		shr	$4, counter
459		vmovdqu	(p_iv), xiv
460	.else
461		vmovdqa	byteswap_const(%rip), xbyteswap
462		vmovdqu	(p_iv), xcounter
463		vpshufb	xbyteswap, xcounter, xcounter
464	.endif
465
466	mov	num_bytes, tmp
467	and	$(7*16), tmp
468	jz	.Lmult_of_8_blks\xctr\key_len
469
470	/* 1 <= tmp <= 7 */
471	cmp	$(4*16), tmp
472	jg	.Lgt4\xctr\key_len
473	je	.Leq4\xctr\key_len
474
475.Llt4\xctr\key_len:
476	cmp	$(2*16), tmp
477	jg	.Leq3\xctr\key_len
478	je	.Leq2\xctr\key_len
479
480.Leq1\xctr\key_len:
481	do_aes_load	1, \key_len, \xctr
482	add	$(1*16), p_out
483	and	$(~7*16), num_bytes
484	jz	.Ldo_return2\xctr\key_len
485	jmp	.Lmain_loop2\xctr\key_len
486
487.Leq2\xctr\key_len:
488	do_aes_load	2, \key_len, \xctr
489	add	$(2*16), p_out
490	and	$(~7*16), num_bytes
491	jz	.Ldo_return2\xctr\key_len
492	jmp	.Lmain_loop2\xctr\key_len
493
494
495.Leq3\xctr\key_len:
496	do_aes_load	3, \key_len, \xctr
497	add	$(3*16), p_out
498	and	$(~7*16), num_bytes
499	jz	.Ldo_return2\xctr\key_len
500	jmp	.Lmain_loop2\xctr\key_len
501
502.Leq4\xctr\key_len:
503	do_aes_load	4, \key_len, \xctr
504	add	$(4*16), p_out
505	and	$(~7*16), num_bytes
506	jz	.Ldo_return2\xctr\key_len
507	jmp	.Lmain_loop2\xctr\key_len
508
509.Lgt4\xctr\key_len:
510	cmp	$(6*16), tmp
511	jg	.Leq7\xctr\key_len
512	je	.Leq6\xctr\key_len
513
514.Leq5\xctr\key_len:
515	do_aes_load	5, \key_len, \xctr
516	add	$(5*16), p_out
517	and	$(~7*16), num_bytes
518	jz	.Ldo_return2\xctr\key_len
519	jmp	.Lmain_loop2\xctr\key_len
520
521.Leq6\xctr\key_len:
522	do_aes_load	6, \key_len, \xctr
523	add	$(6*16), p_out
524	and	$(~7*16), num_bytes
525	jz	.Ldo_return2\xctr\key_len
526	jmp	.Lmain_loop2\xctr\key_len
527
528.Leq7\xctr\key_len:
529	do_aes_load	7, \key_len, \xctr
530	add	$(7*16), p_out
531	and	$(~7*16), num_bytes
532	jz	.Ldo_return2\xctr\key_len
533	jmp	.Lmain_loop2\xctr\key_len
534
535.Lmult_of_8_blks\xctr\key_len:
536	.if (\key_len != KEY_128)
537		vmovdqa	0*16(p_keys), xkey0
538		vmovdqa	4*16(p_keys), xkey4
539		vmovdqa	8*16(p_keys), xkey8
540		vmovdqa	12*16(p_keys), xkey12
541	.else
542		vmovdqa	0*16(p_keys), xkey0
543		vmovdqa	3*16(p_keys), xkey4
544		vmovdqa	6*16(p_keys), xkey8
545		vmovdqa	9*16(p_keys), xkey12
546	.endif
547.align 16
548.Lmain_loop2\xctr\key_len:
549	/* num_bytes is a multiple of 8 and >0 */
550	do_aes_noload	8, \key_len, \xctr
551	add	$(8*16), p_out
552	sub	$(8*16), num_bytes
553	jne	.Lmain_loop2\xctr\key_len
554
555.Ldo_return2\xctr\key_len:
556	.if !\xctr
557		/* return updated IV */
558		vpshufb	xbyteswap, xcounter, xcounter
559		vmovdqu	xcounter, (p_iv)
560	.endif
561	RET
562.endm
563
564/*
565 * routine to do AES128 CTR enc/decrypt "by8"
566 * XMM registers are clobbered.
567 * Saving/restoring must be done at a higher level
568 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
569 *			unsigned int num_bytes)
570 */
571SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
572	/* call the aes main loop */
573	do_aes_ctrmain KEY_128 0
574
575SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
576
577/*
578 * routine to do AES192 CTR enc/decrypt "by8"
579 * XMM registers are clobbered.
580 * Saving/restoring must be done at a higher level
581 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
582 *			unsigned int num_bytes)
583 */
584SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
585	/* call the aes main loop */
586	do_aes_ctrmain KEY_192 0
587
588SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
589
590/*
591 * routine to do AES256 CTR enc/decrypt "by8"
592 * XMM registers are clobbered.
593 * Saving/restoring must be done at a higher level
594 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
595 *			unsigned int num_bytes)
596 */
597SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
598	/* call the aes main loop */
599	do_aes_ctrmain KEY_256 0
600
601SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
602
603/*
604 * routine to do AES128 XCTR enc/decrypt "by8"
605 * XMM registers are clobbered.
606 * Saving/restoring must be done at a higher level
607 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
608 * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
609 */
610SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
611	/* call the aes main loop */
612	do_aes_ctrmain KEY_128 1
613
614SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
615
616/*
617 * routine to do AES192 XCTR enc/decrypt "by8"
618 * XMM registers are clobbered.
619 * Saving/restoring must be done at a higher level
620 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
621 * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
622 */
623SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
624	/* call the aes main loop */
625	do_aes_ctrmain KEY_192 1
626
627SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
628
629/*
630 * routine to do AES256 XCTR enc/decrypt "by8"
631 * XMM registers are clobbered.
632 * Saving/restoring must be done at a higher level
633 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
634 * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
635 */
636SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
637	/* call the aes main loop */
638	do_aes_ctrmain KEY_256 1
639
640SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
641