• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3 *
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
6 *
7 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11 *
12 * This file is provided under a dual BSD/GPLv2 license.  When using or
13 * redistributing this file, you may do so under either license.
14 *
15 * GPL LICENSE SUMMARY
16 *
17 * Copyright(c) 2014 Intel Corporation.
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
22 *
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
26 * General Public License for more details.
27 *
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
32 *
33 * BSD LICENSE
34 *
35 * Copyright(c) 2014 Intel Corporation.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 *
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
46 * distribution.
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 *
63 */
64
65#include <linux/linkage.h>
66#include <asm/inst.h>
67
68#define VMOVDQ		vmovdqu
69
70#define xdata0		%xmm0
71#define xdata1		%xmm1
72#define xdata2		%xmm2
73#define xdata3		%xmm3
74#define xdata4		%xmm4
75#define xdata5		%xmm5
76#define xdata6		%xmm6
77#define xdata7		%xmm7
78#define xcounter	%xmm8
79#define xbyteswap	%xmm9
80#define xkey0		%xmm10
81#define xkey4		%xmm11
82#define xkey8		%xmm12
83#define xkey12		%xmm13
84#define xkeyA		%xmm14
85#define xkeyB		%xmm15
86
87#define p_in		%rdi
88#define p_iv		%rsi
89#define p_keys		%rdx
90#define p_out		%rcx
91#define num_bytes	%r8
92
93#define tmp		%r10
94#define	DDQ_DATA	0
95#define	XDATA		1
96#define KEY_128		1
97#define KEY_192		2
98#define KEY_256		3
99
100.section .rodata
101.align 16
102
103byteswap_const:
104	.octa 0x000102030405060708090A0B0C0D0E0F
105ddq_low_msk:
106	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
107ddq_high_add_1:
108	.octa 0x00000000000000010000000000000000
109ddq_add_1:
110	.octa 0x00000000000000000000000000000001
111ddq_add_2:
112	.octa 0x00000000000000000000000000000002
113ddq_add_3:
114	.octa 0x00000000000000000000000000000003
115ddq_add_4:
116	.octa 0x00000000000000000000000000000004
117ddq_add_5:
118	.octa 0x00000000000000000000000000000005
119ddq_add_6:
120	.octa 0x00000000000000000000000000000006
121ddq_add_7:
122	.octa 0x00000000000000000000000000000007
123ddq_add_8:
124	.octa 0x00000000000000000000000000000008
125
126.text
127
128/* generate a unique variable for ddq_add_x */
129
130/* generate a unique variable for xmm register */
131.macro setxdata n
132	var_xdata = %xmm\n
133.endm
134
135/* club the numeric 'id' to the symbol 'name' */
136
137.macro club name, id
138.altmacro
139	.if \name == XDATA
140		setxdata %\id
141	.endif
142.noaltmacro
143.endm
144
145/*
146 * do_aes num_in_par load_keys key_len
147 * This increments p_in, but not p_out
148 */
149.macro do_aes b, k, key_len
150	.set by, \b
151	.set load_keys, \k
152	.set klen, \key_len
153
154	.if (load_keys)
155		vmovdqa	0*16(p_keys), xkey0
156	.endif
157
158	vpshufb	xbyteswap, xcounter, xdata0
159
160	.set i, 1
161	.rept (by - 1)
162		club XDATA, i
163		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
164		vptest	ddq_low_msk(%rip), var_xdata
165		jnz 1f
166		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
167		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
168		1:
169		vpshufb	xbyteswap, var_xdata, var_xdata
170		.set i, (i +1)
171	.endr
172
173	vmovdqa	1*16(p_keys), xkeyA
174
175	vpxor	xkey0, xdata0, xdata0
176	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
177	vptest	ddq_low_msk(%rip), xcounter
178	jnz	1f
179	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
180	1:
181
182	.set i, 1
183	.rept (by - 1)
184		club XDATA, i
185		vpxor	xkey0, var_xdata, var_xdata
186		.set i, (i +1)
187	.endr
188
189	vmovdqa	2*16(p_keys), xkeyB
190
191	.set i, 0
192	.rept by
193		club XDATA, i
194		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
195		.set i, (i +1)
196	.endr
197
198	.if (klen == KEY_128)
199		.if (load_keys)
200			vmovdqa	3*16(p_keys), xkey4
201		.endif
202	.else
203		vmovdqa	3*16(p_keys), xkeyA
204	.endif
205
206	.set i, 0
207	.rept by
208		club XDATA, i
209		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
210		.set i, (i +1)
211	.endr
212
213	add	$(16*by), p_in
214
215	.if (klen == KEY_128)
216		vmovdqa	4*16(p_keys), xkeyB
217	.else
218		.if (load_keys)
219			vmovdqa	4*16(p_keys), xkey4
220		.endif
221	.endif
222
223	.set i, 0
224	.rept by
225		club XDATA, i
226		/* key 3 */
227		.if (klen == KEY_128)
228			vaesenc	xkey4, var_xdata, var_xdata
229		.else
230			vaesenc	xkeyA, var_xdata, var_xdata
231		.endif
232		.set i, (i +1)
233	.endr
234
235	vmovdqa	5*16(p_keys), xkeyA
236
237	.set i, 0
238	.rept by
239		club XDATA, i
240		/* key 4 */
241		.if (klen == KEY_128)
242			vaesenc	xkeyB, var_xdata, var_xdata
243		.else
244			vaesenc	xkey4, var_xdata, var_xdata
245		.endif
246		.set i, (i +1)
247	.endr
248
249	.if (klen == KEY_128)
250		.if (load_keys)
251			vmovdqa	6*16(p_keys), xkey8
252		.endif
253	.else
254		vmovdqa	6*16(p_keys), xkeyB
255	.endif
256
257	.set i, 0
258	.rept by
259		club XDATA, i
260		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
261		.set i, (i +1)
262	.endr
263
264	vmovdqa	7*16(p_keys), xkeyA
265
266	.set i, 0
267	.rept by
268		club XDATA, i
269		/* key 6 */
270		.if (klen == KEY_128)
271			vaesenc	xkey8, var_xdata, var_xdata
272		.else
273			vaesenc	xkeyB, var_xdata, var_xdata
274		.endif
275		.set i, (i +1)
276	.endr
277
278	.if (klen == KEY_128)
279		vmovdqa	8*16(p_keys), xkeyB
280	.else
281		.if (load_keys)
282			vmovdqa	8*16(p_keys), xkey8
283		.endif
284	.endif
285
286	.set i, 0
287	.rept by
288		club XDATA, i
289		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
290		.set i, (i +1)
291	.endr
292
293	.if (klen == KEY_128)
294		.if (load_keys)
295			vmovdqa	9*16(p_keys), xkey12
296		.endif
297	.else
298		vmovdqa	9*16(p_keys), xkeyA
299	.endif
300
301	.set i, 0
302	.rept by
303		club XDATA, i
304		/* key 8 */
305		.if (klen == KEY_128)
306			vaesenc	xkeyB, var_xdata, var_xdata
307		.else
308			vaesenc	xkey8, var_xdata, var_xdata
309		.endif
310		.set i, (i +1)
311	.endr
312
313	vmovdqa	10*16(p_keys), xkeyB
314
315	.set i, 0
316	.rept by
317		club XDATA, i
318		/* key 9 */
319		.if (klen == KEY_128)
320			vaesenc	xkey12, var_xdata, var_xdata
321		.else
322			vaesenc	xkeyA, var_xdata, var_xdata
323		.endif
324		.set i, (i +1)
325	.endr
326
327	.if (klen != KEY_128)
328		vmovdqa	11*16(p_keys), xkeyA
329	.endif
330
331	.set i, 0
332	.rept by
333		club XDATA, i
334		/* key 10 */
335		.if (klen == KEY_128)
336			vaesenclast	xkeyB, var_xdata, var_xdata
337		.else
338			vaesenc	xkeyB, var_xdata, var_xdata
339		.endif
340		.set i, (i +1)
341	.endr
342
343	.if (klen != KEY_128)
344		.if (load_keys)
345			vmovdqa	12*16(p_keys), xkey12
346		.endif
347
348		.set i, 0
349		.rept by
350			club XDATA, i
351			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
352			.set i, (i +1)
353		.endr
354
355		.if (klen == KEY_256)
356			vmovdqa	13*16(p_keys), xkeyA
357		.endif
358
359		.set i, 0
360		.rept by
361			club XDATA, i
362			.if (klen == KEY_256)
363				/* key 12 */
364				vaesenc	xkey12, var_xdata, var_xdata
365			.else
366				vaesenclast xkey12, var_xdata, var_xdata
367			.endif
368			.set i, (i +1)
369		.endr
370
371		.if (klen == KEY_256)
372			vmovdqa	14*16(p_keys), xkeyB
373
374			.set i, 0
375			.rept by
376				club XDATA, i
377				/* key 13 */
378				vaesenc	xkeyA, var_xdata, var_xdata
379				.set i, (i +1)
380			.endr
381
382			.set i, 0
383			.rept by
384				club XDATA, i
385				/* key 14 */
386				vaesenclast	xkeyB, var_xdata, var_xdata
387				.set i, (i +1)
388			.endr
389		.endif
390	.endif
391
392	.set i, 0
393	.rept (by / 2)
394		.set j, (i+1)
395		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
396		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
397		club XDATA, i
398		vpxor	xkeyA, var_xdata, var_xdata
399		club XDATA, j
400		vpxor	xkeyB, var_xdata, var_xdata
401		.set i, (i+2)
402	.endr
403
404	.if (i < by)
405		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
406		club XDATA, i
407		vpxor	xkeyA, var_xdata, var_xdata
408	.endif
409
410	.set i, 0
411	.rept by
412		club XDATA, i
413		VMOVDQ	var_xdata, i*16(p_out)
414		.set i, (i+1)
415	.endr
416.endm
417
418.macro do_aes_load val, key_len
419	do_aes \val, 1, \key_len
420.endm
421
422.macro do_aes_noload val, key_len
423	do_aes \val, 0, \key_len
424.endm
425
426/* main body of aes ctr load */
427
428.macro do_aes_ctrmain key_len
429	cmp	$16, num_bytes
430	jb	.Ldo_return2\key_len
431
432	vmovdqa	byteswap_const(%rip), xbyteswap
433	vmovdqu	(p_iv), xcounter
434	vpshufb	xbyteswap, xcounter, xcounter
435
436	mov	num_bytes, tmp
437	and	$(7*16), tmp
438	jz	.Lmult_of_8_blks\key_len
439
440	/* 1 <= tmp <= 7 */
441	cmp	$(4*16), tmp
442	jg	.Lgt4\key_len
443	je	.Leq4\key_len
444
445.Llt4\key_len:
446	cmp	$(2*16), tmp
447	jg	.Leq3\key_len
448	je	.Leq2\key_len
449
450.Leq1\key_len:
451	do_aes_load	1, \key_len
452	add	$(1*16), p_out
453	and	$(~7*16), num_bytes
454	jz	.Ldo_return2\key_len
455	jmp	.Lmain_loop2\key_len
456
457.Leq2\key_len:
458	do_aes_load	2, \key_len
459	add	$(2*16), p_out
460	and	$(~7*16), num_bytes
461	jz	.Ldo_return2\key_len
462	jmp	.Lmain_loop2\key_len
463
464
465.Leq3\key_len:
466	do_aes_load	3, \key_len
467	add	$(3*16), p_out
468	and	$(~7*16), num_bytes
469	jz	.Ldo_return2\key_len
470	jmp	.Lmain_loop2\key_len
471
472.Leq4\key_len:
473	do_aes_load	4, \key_len
474	add	$(4*16), p_out
475	and	$(~7*16), num_bytes
476	jz	.Ldo_return2\key_len
477	jmp	.Lmain_loop2\key_len
478
479.Lgt4\key_len:
480	cmp	$(6*16), tmp
481	jg	.Leq7\key_len
482	je	.Leq6\key_len
483
484.Leq5\key_len:
485	do_aes_load	5, \key_len
486	add	$(5*16), p_out
487	and	$(~7*16), num_bytes
488	jz	.Ldo_return2\key_len
489	jmp	.Lmain_loop2\key_len
490
491.Leq6\key_len:
492	do_aes_load	6, \key_len
493	add	$(6*16), p_out
494	and	$(~7*16), num_bytes
495	jz	.Ldo_return2\key_len
496	jmp	.Lmain_loop2\key_len
497
498.Leq7\key_len:
499	do_aes_load	7, \key_len
500	add	$(7*16), p_out
501	and	$(~7*16), num_bytes
502	jz	.Ldo_return2\key_len
503	jmp	.Lmain_loop2\key_len
504
505.Lmult_of_8_blks\key_len:
506	.if (\key_len != KEY_128)
507		vmovdqa	0*16(p_keys), xkey0
508		vmovdqa	4*16(p_keys), xkey4
509		vmovdqa	8*16(p_keys), xkey8
510		vmovdqa	12*16(p_keys), xkey12
511	.else
512		vmovdqa	0*16(p_keys), xkey0
513		vmovdqa	3*16(p_keys), xkey4
514		vmovdqa	6*16(p_keys), xkey8
515		vmovdqa	9*16(p_keys), xkey12
516	.endif
517.align 16
518.Lmain_loop2\key_len:
519	/* num_bytes is a multiple of 8 and >0 */
520	do_aes_noload	8, \key_len
521	add	$(8*16), p_out
522	sub	$(8*16), num_bytes
523	jne	.Lmain_loop2\key_len
524
525.Ldo_return2\key_len:
526	/* return updated IV */
527	vpshufb	xbyteswap, xcounter, xcounter
528	vmovdqu	xcounter, (p_iv)
529	ret
530.endm
531
532/*
533 * routine to do AES128 CTR enc/decrypt "by8"
534 * XMM registers are clobbered.
535 * Saving/restoring must be done at a higher level
536 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
537 *			unsigned int num_bytes)
538 */
539ENTRY(aes_ctr_enc_128_avx_by8)
540	/* call the aes main loop */
541	do_aes_ctrmain KEY_128
542
543ENDPROC(aes_ctr_enc_128_avx_by8)
544
545/*
546 * routine to do AES192 CTR enc/decrypt "by8"
547 * XMM registers are clobbered.
548 * Saving/restoring must be done at a higher level
549 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
550 *			unsigned int num_bytes)
551 */
552ENTRY(aes_ctr_enc_192_avx_by8)
553	/* call the aes main loop */
554	do_aes_ctrmain KEY_192
555
556ENDPROC(aes_ctr_enc_192_avx_by8)
557
558/*
559 * routine to do AES256 CTR enc/decrypt "by8"
560 * XMM registers are clobbered.
561 * Saving/restoring must be done at a higher level
562 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
563 *			unsigned int num_bytes)
564 */
565ENTRY(aes_ctr_enc_256_avx_by8)
566	/* call the aes main loop */
567	do_aes_ctrmain KEY_256
568
569ENDPROC(aes_ctr_enc_256_avx_by8)
570