/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
#include "hitls_build.h"
#if defined(HITLS_CRYPTO_AES) && defined(HITLS_CRYPTO_XTS)

#include "crypt_aes_macro_x86_64.s"

.file   "crypt_aes_xts_x86_64.S"

.set    KEY, %rdi
.set    IN, %rsi
.set    OUT, %rdx
.set    LEN, %ecx
.set    TWEAK, %r8

.set    KTMP, %r9
.set    LTMP, %r15d
.set    TAILNUM,%r14d
.set    TMPOUT,%r13
.set    TMPIN,%r9

.set    ROUNDS, %eax
.set    RET, %eax
.set    TROUNDS, %r10
.set    ROUNDSQ,%rax
.set    KEYEND,%r9

.set    WTMP0,   %ecx
.set    WTMP1,   %r10d
.set    WTMP2,   %r11d

.set    XTMP0,   %rcx
.set    XTMP1,   %r10
.set    XTMP2,   %r11

.set    TWX0,    %r13
.set    TWX1,    %r14

.set    BLK0, %xmm8
.set    BLK1, %xmm9
.set    BLK2, %xmm10
.set    BLK3, %xmm11
.set    BLK4, %xmm12
.set    BLK5, %xmm13
.set    BLK6, %xmm14

.set    TWEAK0, %xmm0
.set    TWEAK1, %xmm1
.set    TWEAK2, %xmm2
.set    TWEAK3, %xmm3
.set    TWEAK4, %xmm4
.set    TWEAK5, %xmm5
.set    TWEAK6, %xmm6

.set    RDK, %xmm15
.set    RDK1, %xmm7
.set    TMPX, %xmm7
.set    GFP, %xmm6
.set    TWKTMP, %xmm14

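/*
 * NextTweakCore multiplies the 128-bit tweak by x in GF(2^128) with vector ops only:
 * the caller passes \twktmp pre-filled by "vpshufd $0x5f" with the top dword of each
 * 64-bit half of the tweak. vpsrad $31 turns those sign dwords into all-ones masks
 * wherever a qword is about to shift a 1 out; vpaddq doubles both qwords (dropping
 * the shifted-out bits); vpand with .Lgfp128 (dwords 0x87,0,1,0) converts the masks
 * into the carry terms (0x87 folded into the low qword for the bit leaving bit 127,
 * and 1 carried into bit 64 from bit 63); vpxor applies them. \twktmp itself is also
 * doubled dword-wise so its sign bits stay valid for the following call.
 */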
.macro NextTweakCore gfp, twkin, twktmp, tmp
    vmovdqa \twktmp,\tmp
    vpaddd \twktmp,\twktmp,\twktmp  // dword-wise << 1: keep the sign-broadcast copy current
    vpsrad $31,\tmp,\tmp            // arithmetic >> 31: all-ones dword where a qword overflows
    vpaddq \twkin,\twkin,\twkin     // qword-wise << 1
    vpand \gfp,\tmp,\tmp            // select carry terms: 0x87 (low qword), 1 (high qword)
    vpxor \tmp,\twkin,\twkin        // fold the carries back into the tweak
.endm

.macro NextTweak gfp, twkin, twkout, twktmp, tmp
    NextTweakCore \gfp,\twkin,\twktmp,\tmp
    vmovdqa \twkin,\twkout
.endm

.macro SAVE_STACK
    push %rbx
    push %rbp
    push %rsp
    push %r12
    push %r13
    push %r14
    push %r15
.endm

.macro LOAD_STACK
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rsp
    pop %rbp
    pop %rbx
.endm

.data
.align 64
// modulus of Galois Field x^128+x^7+x^2+x+1 => 0x87(0b10000111)
.Lgfp128:
.long 0x87,0,1,0

.text

/**
 *  Function description: AES encryption in XTS mode, assembly-accelerated implementation.
 *  Function prototype: int32_t CRYPT_AES_XTS_Encrypt(const CRYPT_AES_Key *ctx,
 *                                       const uint8_t *in, uint8_t *out, uint32_t len);
 *  Input registers:
 *        rdi: pointer to the key structure (expanded round keys, round count at offset 240).
 *        rsi: pointer to the input data.
 *        rdx: pointer to the output buffer.
 *        ecx: data length in bytes.
 *        r8:  pointer to the 128-bit tweak.
 *  Modified registers: xmm0-xmm15.
 *  Output register: eax (return value).
 *  Function/Macro Call: NextTweak, NextTweakCore, AES_ENC_*_BLK(S).
 */
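/*
 * Note: the tweak pointer arrives in r8 as the fifth System V argument. The 128-bit
 * tweak is loaded on entry and the advanced tweak is written back at .Lxts_ret, so a
 * stream can be continued across calls. Inputs whose length is not a multiple of 16
 * are finished with ciphertext stealing; the routine assumes at least one full block
 * (len >= 16), which the C wrapper is expected to guarantee.
 */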
.align 32
.globl CRYPT_AES_XTS_Encrypt
.type CRYPT_AES_XTS_Encrypt, @function
CRYPT_AES_XTS_Encrypt:
.cfi_startproc
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    sub $96,%rsp
    mov %rsp,%rbp
    and $-16,%rsp  // 16-byte stack alignment for the vmovdqa spills below

    movl LEN, LTMP
    movl LEN, TAILNUM
    andl $-16,LTMP
    andl $0xf,TAILNUM // LEN % 16
    movl 240(KEY), ROUNDS
    vmovdqa .Lgfp128(%rip),GFP
    vmovdqu (TWEAK), TWEAK0
    shl $4,ROUNDS  // rounds*16 = byte offset of the last round key
    lea 16(KEY, ROUNDSQ),KEYEND   // KEYEND = one past the last round key

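// Dispatch on the number of whole blocks left: 96 bytes or more takes the interleaved
// 6-block pipeline, smaller residues are finished in dedicated 1- to 5-block tails,
// and a non-zero LEN%16 remainder is handled afterwards by ciphertext stealing at
// .Lxts_aesenc_finish.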
.Lxts_aesenc_start:
    cmpl    $64, LTMP
    jae     .Lxts_enc_above_equal_4_blks
    cmpl    $32, LTMP
    jae     .Lxts_enc_above_equal_2_blks
    cmpl    $0, LTMP
    je      .Lxts_aesenc_finish
    jmp     .Lxts_enc_proc_1_blk

.Lxts_enc_above_equal_2_blks:
    cmpl    $48, LTMP
    jb      .Lxts_enc_proc_2_blks
    jmp     .Lxts_enc_proc_3_blks

.Lxts_enc_above_equal_4_blks:
    cmpl    $96, LTMP
    jae     .Lxts_enc_proc_6_blks_pre
    cmpl    $80, LTMP
    jb      .Lxts_enc_proc_4_blks
    jmp     .Lxts_enc_proc_5_blks

.align 16
.Lxts_enc_proc_1_blk:
    vmovdqu (IN),BLK0
.Lxts_enc_proc_1blk_loaded:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    vpxor RDK,BLK0,BLK0
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    AES_ENC_1_BLK KTMP ROUNDS RDK BLK0
    vpxor TWEAK0, BLK0, BLK0
    vmovdqu BLK0, (OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 16(IN),IN
    subl $16,LTMP
    lea 16(OUT),OUT

    je .Lxts_aesenc_finish

.align 16
.Lxts_enc_proc_2_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    AES_ENC_2_BLKS    KTMP ROUNDS RDK BLK0 BLK1
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 32(IN),IN
    subl $32,LTMP
    lea 32(OUT),OUT

    je .Lxts_aesenc_finish

.align 16
.Lxts_enc_proc_3_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    vpxor 32(IN), RDK, BLK2
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    AES_ENC_3_BLKS    KTMP ROUNDS RDK BLK0 BLK1 BLK2
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 48(IN),IN
    subl $48,LTMP
    lea 48(OUT),OUT
    je  .Lxts_aesenc_finish

.align 16
.Lxts_enc_proc_4_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    vpxor 32(IN), RDK, BLK2
    NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX
    vpxor 48(IN), RDK, BLK3
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    AES_ENC_4_BLKS    KTMP ROUNDS RDK BLK0 BLK1 BLK2 BLK3
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    vmovdqu BLK3, 48(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 64(IN),IN
    subl $64,LTMP
    lea 64(OUT),OUT
    je  .Lxts_aesenc_finish

.align 16
.Lxts_enc_proc_5_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    vpxor 32(IN), RDK, BLK2
    NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX
    vpxor 48(IN), RDK, BLK3
    NextTweak GFP, TWEAK5, TWEAK4, TWKTMP, TMPX
    vpxor 64(IN), RDK, BLK4
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    vpxor TWEAK4, BLK4, BLK4
    AES_ENC_5_BLKS    KTMP ROUNDS RDK BLK0 BLK1 BLK2 BLK3 BLK4
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    vpxor TWEAK4, BLK4, BLK4
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    vmovdqu BLK3, 48(OUT)
    vmovdqu BLK4, 64(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 80(IN),IN
    subl $80,LTMP
    lea 80(OUT),OUT
    je  .Lxts_aesenc_finish

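/*
 * 6-block main path: the six tweaks are computed up front, each tweak is XORed with
 * the last round key and spilled to the aligned stack area, so the final AddRoundKey
 * and the XTS output whitening are performed together by aesenclast's memory operand.
 * The tweaks for the next iteration are interleaved between the aesenc instructions
 * of the unrolled tail rounds to hide their latency.
 */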
.align 16
.Lxts_enc_proc_6_blks_pre:
    vpshufd $0x5f,TWEAK0,TWKTMP   // broadcast the top dword of each tweak qword (carry tracking)
    vmovdqa TWEAK0,TWEAK5    // copy first tweak
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX
    NextTweak GFP, TWEAK5, TWEAK4, TWKTMP, TMPX
    NextTweakCore GFP, TWEAK5, TWKTMP, TMPX

.Lxts_enc_proc_6_blks:
    vmovdqu (KEY), RDK
    vmovdqu (IN),BLK0
    vpxor TWEAK0,BLK0,BLK0    // blk0 ^= tweak0
    vpxor RDK,BLK0,BLK0   // blk0 = blk0 ^ tweak0 ^ rk0, prepared for the loop round
    vmovdqu -16(KEYEND),RDK1   // load last round key

    vmovdqu 16(IN),BLK1
    vpxor RDK1,TWEAK0,TWEAK0
    aesenc 16(KEY),BLK0  // first round: rk1
    vmovdqa TWEAK0,(%rsp)
    vpxor TWEAK1,BLK1,BLK1
    vpxor RDK,BLK1,BLK1

    vmovdqu 32(IN),BLK2
    vpxor RDK1,TWEAK1,TWEAK1
    aesenc 16(KEY),BLK1
    vmovdqa TWEAK1,16(%rsp)
    vpxor TWEAK2,BLK2,BLK2
    vpxor RDK,BLK2,BLK2

    vmovdqu 48(IN),BLK3
    vpxor RDK1,TWEAK2,TWEAK2
    aesenc 16(KEY),BLK2
    vmovdqa TWEAK2,32(%rsp)
    vpxor TWEAK3,BLK3,BLK3
    vpxor RDK,BLK3,BLK3

    vmovdqu 64(IN),BLK4
    vpxor RDK1,TWEAK3,TWEAK3
    aesenc 16(KEY),BLK3
    vmovdqa TWEAK3,48(%rsp)
    vpxor TWEAK4,BLK4,BLK4
    vpxor RDK,BLK4,BLK4

    vmovdqu 80(IN),BLK5
    vpxor RDK1,TWEAK4,TWEAK4
    aesenc 16(KEY),BLK4
    vmovdqa TWEAK4,64(%rsp)
    vpxor TWEAK5,BLK5,BLK5
    vpxor RDK,BLK5,BLK5
    vpxor RDK1,TWEAK5,TWEAK5
    aesenc 16(KEY),BLK5
    vmovdqa TWEAK5,80(%rsp)

    mov $(7*16),TROUNDS  // TROUNDS = 112 - rounds*16 (negative byte offset)
    sub ROUNDSQ,TROUNDS  // the loop below covers the rounds-7 middle rounds; the rest are unrolled
.align 16
.Lxts_6_blks_loop:
    vmovdqu -96(KEYEND,TROUNDS),RDK  // next middle round key; 5 unrolled rounds + aesenclast follow
    aesenc  RDK, BLK0
    aesenc  RDK, BLK1
    aesenc  RDK, BLK2
    add $16,TROUNDS
    aesenc  RDK, BLK3
    aesenc  RDK, BLK4
    aesenc  RDK, BLK5
    jnz .Lxts_6_blks_loop

    vpxor 80(%rsp),RDK1,TWEAK5  // recover tweak5: the slot holds tweak5^lastRK, XOR with lastRK undoes it
    vmovdqu -96(KEYEND,TROUNDS),RDK
    vpshufd $0x5f,TWEAK5,TWKTMP  // fresh sign broadcast for the next batch of tweaks
    vmovdqa TWKTMP,TMPX      // pre-compute next iteration's tweak0~tweak5 between the aesenc ops
    aesenc  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesenc  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesenc  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    aesenc  RDK, BLK3
    vmovdqa TWEAK5,TWEAK0
    aesenc  RDK, BLK4
    aesenc  RDK, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    aesenc  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesenc  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesenc  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    aesenc  RDK, BLK3
    vmovdqa TWEAK5,TWEAK1
    aesenc  RDK, BLK4
    aesenc  RDK, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    aesenc  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesenc  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesenc  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    aesenc  RDK, BLK3
    vmovdqa TWEAK5,TWEAK2
    aesenc  RDK, BLK4
    aesenc  RDK, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    aesenc  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesenc  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesenc  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    aesenc  RDK, BLK3
    vmovdqa TWEAK5,TWEAK3
    aesenc  RDK, BLK4
    aesenc  RDK, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    aesenc  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesenc  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesenc  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    aesenc  RDK, BLK3
    vmovdqa TWEAK5,TWEAK4
    aesenc  RDK, BLK4
    aesenc  RDK, BLK5

    vmovdqa TWKTMP,TMPX
    aesenclast (%rsp), BLK0
    aesenclast 16(%rsp), BLK1   // stack slots hold tweak^lastRK: aesenclast applies last round key and whitening at once
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesenclast 32(%rsp), BLK2
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesenclast 48(%rsp), BLK3
    vpxor TMPX,TWEAK5,TWEAK5
    aesenclast 64(%rsp), BLK4
    aesenclast 80(%rsp), BLK5

    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    vmovdqu BLK3, 48(OUT)
    vmovdqu BLK4, 64(OUT)
    vmovdqu BLK5, 80(OUT)

    leaq 96(IN), IN
    leaq 96(OUT), OUT
    sub $96, LTMP
    cmp $96, LTMP
    jb  .Lxts_aesenc_start
    jmp  .Lxts_enc_proc_6_blks

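/*
 * Tail handling (ciphertext stealing, encrypt direction): when LEN is not a multiple
 * of 16, the loop below copies the TAILNUM trailing plaintext bytes over the front of
 * the last ciphertext block already written, while moving that block's leading bytes
 * out to the end of the output. The patched block is then re-encrypted in place with
 * the next tweak via .Lxts_enc_proc_1blk_loaded.
 */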
.align 16
.Lxts_aesenc_finish:
    cmp $0,TAILNUM
    je .Lxts_ret
.Lxts_tail_proc:
    mov OUT,TMPOUT
    mov IN,TMPIN
.Lxts_tail_loop:
    sub $1,TAILNUM
    movzb -16(TMPOUT),%r10d
    movzb (TMPIN),%r11d
    mov %r10b,(TMPOUT)
    lea 1(TMPIN),TMPIN
    mov %r11b,-16(TMPOUT)
    lea 1(TMPOUT),TMPOUT
    ja .Lxts_tail_loop
    sub $16,OUT  // step back one block and re-encrypt the stolen block in place
    add $16,LTMP
    vmovdqu (OUT),BLK0
    jmp .Lxts_enc_proc_1blk_loaded

.Lxts_ret:
    vmovdqu TWEAK0, (TWEAK)
    vpxor BLK0, BLK0, BLK0
    vpxor BLK1, BLK1, BLK1
    vpxor BLK2, BLK2, BLK2
    vpxor BLK3, BLK3, BLK3
    vpxor BLK4, BLK4, BLK4
    vpxor BLK5, BLK5, BLK5
    vpxor BLK6, BLK6, BLK6
    vpxor RDK, RDK, RDK
    movl $0, RET

    mov %rbp,%rsp
    add $96,%rsp
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    popq %rbp
    popq %rbx
    ret
.cfi_endproc
.size CRYPT_AES_XTS_Encrypt, .-CRYPT_AES_XTS_Encrypt


/**
 *  Function description: AES decryption in XTS mode, assembly-accelerated implementation.
 *  Function prototype: int32_t CRYPT_AES_XTS_Decrypt(const CRYPT_AES_Key *ctx,
 *                                              const uint8_t *in, uint8_t *out, uint32_t len);
 *  Input registers:
 *        rdi: pointer to the key structure (expanded round keys, round count at offset 240).
 *        rsi: pointer to the input data.
 *        rdx: pointer to the output buffer.
 *        ecx: data length in bytes.
 *        r8:  pointer to the 128-bit tweak.
 *  Modified registers: xmm0-xmm15.
 *  Output register: eax (return value).
 *  Function/Macro Call: NextTweak, NextTweakCore, AES_DEC_*_BLK(S).
 */
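/*
 * As with encryption, the tweak pointer is passed in r8, the updated tweak is written
 * back at .Lxts_aesdec_finish, and a trailing LEN%16 remainder is handled by ciphertext
 * stealing; inputs shorter than one block are expected to be rejected by the C wrapper.
 */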
.align 32
.globl CRYPT_AES_XTS_Decrypt
.type CRYPT_AES_XTS_Decrypt, @function
CRYPT_AES_XTS_Decrypt:
.cfi_startproc
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    sub $96,%rsp
    mov %rsp,%rbp
    and $-16,%rsp  // 16-byte stack alignment for the vmovdqa spills below

    movl LEN, LTMP
    movl LEN, TAILNUM

    andl $-16,LTMP
    movl LTMP,WTMP2
    sub $16,WTMP2    // candidate length with the last full block held back
    andl $0xf,TAILNUM // LEN % 16
    cmovg WTMP2,LTMP // if a tail exists, keep the last full block for the stealing path
    movl 240(KEY), ROUNDS
    vmovdqa .Lgfp128(%rip),GFP
    vmovdqu (TWEAK), TWEAK0
    shl $4,ROUNDS  // rounds*16 = byte offset of the last round key
    lea 16(KEY, ROUNDSQ),KEYEND   // KEYEND = one past the last round key

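// Same dispatch as encryption, except that when a tail exists the last full block was
// held back above: once LTMP reaches 0, control goes to .Lxts_dec_last_2blks, which
// returns at once if there is no tail, or decrypts the held-back block and the stolen
// tail with swapped tweaks otherwise.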
.Lxts_aesdec_start:
    cmpl    $64, LTMP
    jae     .Lxts_dec_above_equal_4_blks
    cmpl    $32, LTMP
    jae     .Lxts_dec_above_equal_2_blks
    cmpl    $0, LTMP
    je      .Lxts_dec_last_2blks
    jmp     .Lxts_dec_proc_1_blk

.Lxts_dec_above_equal_2_blks:
    cmpl    $48, LTMP
    jb      .Lxts_dec_proc_2_blks
    jmp     .Lxts_dec_proc_3_blks

.Lxts_dec_above_equal_4_blks:
    cmpl    $96, LTMP
    jae     .Lxts_dec_proc_6_blks_pre
    cmpl    $80, LTMP
    jb      .Lxts_dec_proc_4_blks
    jmp     .Lxts_dec_proc_5_blks

.align 16
.Lxts_dec_tail_proc:
    cmp $0,TAILNUM
    je .Lxts_aesdec_finish
    vmovdqa TWEAK1,TWEAK0  // restore the saved tweak for the stolen block
    mov OUT,TMPOUT
    mov IN,TMPIN
.Lxts_dec_tail_loop:
    sub $1,TAILNUM
    movzb -16(TMPOUT),%r10d
    movzb (TMPIN),%r11d
    mov %r10b,(TMPOUT)
    lea 1(TMPIN),TMPIN
    mov %r11b,-16(TMPOUT)
    lea 1(TMPOUT),TMPOUT
    ja .Lxts_dec_tail_loop

    sub $16,OUT  // step back one block and decrypt the stolen block in place
    add $16,LTMP

    vmovdqu (OUT),BLK0
    jmp .Lxts_dec_proc_1blk_loaded

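/*
 * Decrypt-side ciphertext stealing: the last full ciphertext block must be decrypted
 * with the final tweak while the stolen partial block uses the previous one. The code
 * below therefore saves the current tweak in TWEAK1, advances TWEAK0 and decrypts the
 * held-back block with it; control then reaches .Lxts_dec_tail_proc above, which
 * restores TWEAK1, swaps the tail bytes into place and decrypts the rebuilt block.
 */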
.align 16
.Lxts_dec_last_2blks:
    cmp $0,TAILNUM
    je .Lxts_aesdec_finish
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK1    // keep the current tweak in TWEAK1 for the stolen block; the last full block uses the next tweak
    NextTweakCore GFP, TWEAK0, TWKTMP, TMPX
.Lxts_dec_proc_1_blk:
    vmovdqu (IN),BLK0
.Lxts_dec_proc_1blk_loaded:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    vpxor RDK,BLK0,BLK0
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    AES_DEC_1_BLK KTMP ROUNDS RDK BLK0
    vpxor TWEAK0, BLK0, BLK0
    vmovdqu BLK0, (OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 16(IN),IN
    subl $16,LTMP
    lea 16(OUT),OUT
    jl .Lxts_dec_tail_proc
    jmp .Lxts_aesdec_start

.align 16
.Lxts_dec_proc_2_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    AES_DEC_2_BLKS    KTMP ROUNDS RDK BLK0 BLK1
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 32(IN),IN
    subl $32,LTMP
    lea 32(OUT),OUT

    jge .Lxts_aesdec_start

.align 16
.Lxts_dec_proc_3_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    vpxor 32(IN), RDK, BLK2
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    AES_DEC_3_BLKS    KTMP ROUNDS RDK BLK0 BLK1 BLK2
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 48(IN),IN
    subl $48,LTMP
    lea 48(OUT),OUT
    jge .Lxts_aesdec_start

.align 16
.Lxts_dec_proc_4_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    vpxor 32(IN), RDK, BLK2
    NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX
    vpxor 48(IN), RDK, BLK3
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    AES_DEC_4_BLKS    KTMP ROUNDS RDK BLK0 BLK1 BLK2 BLK3
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    vmovdqu BLK3, 48(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 64(IN),IN
    subl $64,LTMP
    lea 64(OUT),OUT
    jge .Lxts_aesdec_start

.align 16
.Lxts_dec_proc_5_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    vpxor 32(IN), RDK, BLK2
    NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX
    vpxor 48(IN), RDK, BLK3
    NextTweak GFP, TWEAK5, TWEAK4, TWKTMP, TMPX
    vpxor 64(IN), RDK, BLK4
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    vpxor TWEAK4, BLK4, BLK4
    AES_DEC_5_BLKS    KTMP ROUNDS RDK BLK0 BLK1 BLK2 BLK3 BLK4
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    vpxor TWEAK4, BLK4, BLK4
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    vmovdqu BLK3, 48(OUT)
    vmovdqu BLK4, 64(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 80(IN),IN
    subl $80,LTMP
    lea 80(OUT),OUT
    jge .Lxts_aesdec_start

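/*
 * 6-block decrypt main path: same structure as the encrypt side, with aesdec in place
 * of aesenc; the tweak^lastRK values live on the stack so aesdeclast applies the final
 * round key and the XTS whitening in one step, while the next six tweaks are computed
 * in the instruction gaps.
 */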
.align 32
.Lxts_dec_proc_6_blks_pre:
    vpshufd $0x5f,TWEAK0,TWKTMP   // broadcast the top dword of each tweak qword (carry tracking)
    vmovdqa TWEAK0,TWEAK5    // copy first tweak
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX
    NextTweak GFP, TWEAK5, TWEAK4, TWKTMP, TMPX
    NextTweakCore GFP, TWEAK5, TWKTMP, TMPX
.align 32
.Lxts_dec_proc_6_blks:
    vmovdqu (KEY), RDK
    vmovdqu (IN),BLK0
    vpxor TWEAK0,BLK0,BLK0    // blk0 ^= tweak0
    vpxor RDK,BLK0,BLK0   // blk0 = blk0 ^ tweak0 ^ rk0, prepared for the loop round
    vmovdqu -16(KEYEND),RDK1   // load last round key

    vmovdqu 16(IN),BLK1
    vpxor RDK1,TWEAK0,TWEAK0
    aesdec 16(KEY),BLK0  // first round: rk1
    vmovdqa TWEAK0,(%rsp)
    vpxor TWEAK1,BLK1,BLK1
    vpxor RDK,BLK1,BLK1

    vmovdqu 32(IN),BLK2
    vpxor RDK1,TWEAK1,TWEAK1
    aesdec 16(KEY),BLK1
    vmovdqa TWEAK1,16(%rsp)
    vpxor TWEAK2,BLK2,BLK2
    vpxor RDK,BLK2,BLK2

    vmovdqu 48(IN),BLK3
    vpxor RDK1,TWEAK2,TWEAK2
    aesdec 16(KEY),BLK2
    vmovdqa TWEAK2,32(%rsp)
    vpxor TWEAK3,BLK3,BLK3
    vpxor RDK,BLK3,BLK3

    vmovdqu 64(IN),BLK4
    vpxor RDK1,TWEAK3,TWEAK3
    aesdec 16(KEY),BLK3
    vmovdqa TWEAK3,48(%rsp)
    vpxor TWEAK4,BLK4,BLK4
    vpxor RDK,BLK4,BLK4

    vmovdqu 80(IN),BLK5
    vpxor RDK1,TWEAK4,TWEAK4
    aesdec 16(KEY),BLK4
    vmovdqa TWEAK4,64(%rsp)
    vpxor TWEAK5,BLK5,BLK5
    vpxor RDK,BLK5,BLK5
    vpxor RDK1,TWEAK5,TWEAK5
    aesdec 16(KEY),BLK5
    vmovdqa TWEAK5,80(%rsp)

    mov $(7*16),TROUNDS  // TROUNDS = 112 - rounds*16 (negative byte offset)
    sub ROUNDSQ,TROUNDS  // the loop below covers the rounds-7 middle rounds; the rest are unrolled
.align 32
.Lxts_dec_6blks_loop:
    vmovdqu -96(KEYEND,TROUNDS),RDK  // next middle round key; 5 unrolled rounds + aesdeclast follow
    aesdec  RDK, BLK0
    aesdec  RDK, BLK1
    aesdec  RDK, BLK2
    add $16,TROUNDS
    aesdec  RDK, BLK3
    aesdec  RDK, BLK4
    aesdec  RDK, BLK5
    jnz .Lxts_dec_6blks_loop

    vpxor 80(%rsp),RDK1,TWEAK5  // recover tweak5: the slot holds tweak5^lastRK, XOR with lastRK undoes it
    vmovdqu -96(KEYEND,TROUNDS),RDK
    vpshufd $0x5f,TWEAK5,TWKTMP  // fresh sign broadcast for the next batch of tweaks
    vmovdqa TWKTMP,TMPX      // pre-compute next iteration's tweak0~tweak5 between the aesdec ops
    aesdec  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesdec  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesdec  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    aesdec  RDK, BLK3
    vmovdqa TWEAK5,TWEAK0
    aesdec  RDK, BLK4
    aesdec  RDK, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    aesdec  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesdec  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesdec  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    aesdec  RDK, BLK3
    vmovdqa TWEAK5,TWEAK1
    aesdec  RDK, BLK4
    aesdec  RDK, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    aesdec  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesdec  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesdec  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    aesdec  RDK, BLK3
    vmovdqa TWEAK5,TWEAK2
    aesdec  RDK, BLK4
    aesdec  RDK, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    aesdec  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesdec  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesdec  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    aesdec  RDK, BLK3
    vmovdqa TWEAK5,TWEAK3
    aesdec  RDK, BLK4
    aesdec  RDK, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    aesdec  RDK, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesdec  RDK, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesdec  RDK, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    aesdec  RDK, BLK3
    vmovdqa TWEAK5,TWEAK4
    aesdec  RDK, BLK4
    aesdec  RDK, BLK5

    vmovdqa TWKTMP,TMPX
    aesdeclast (%rsp), BLK0
    aesdeclast 16(%rsp), BLK1   // stack slots hold tweak^lastRK: aesdeclast applies last round key and whitening at once
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    aesdeclast 32(%rsp), BLK2
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    aesdeclast 48(%rsp), BLK3
    vpxor TMPX,TWEAK5,TWEAK5
    aesdeclast 64(%rsp), BLK4
    aesdeclast 80(%rsp), BLK5

    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    vmovdqu BLK3, 48(OUT)
    vmovdqu BLK4, 64(OUT)
    vmovdqu BLK5, 80(OUT)

    leaq 96(IN), IN
    leaq 96(OUT), OUT
    sub $96, LTMP
    cmp $96, LTMP
    jb  .Lxts_aesdec_start
    jmp  .Lxts_dec_proc_6_blks

.align 16
.Lxts_aesdec_finish:
    vmovdqu TWEAK0, (TWEAK)
    vpxor BLK0, BLK0, BLK0
    vpxor BLK1, BLK1, BLK1
    vpxor BLK2, BLK2, BLK2
    vpxor BLK3, BLK3, BLK3
    vpxor BLK4, BLK4, BLK4
    vpxor BLK5, BLK5, BLK5
    vpxor BLK6, BLK6, BLK6
    vpxor RDK, RDK, RDK
    movl $0, RET

    mov %rbp,%rsp
    add $96,%rsp
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    popq %rbp
    popq %rbx
    ret
.cfi_endproc
.size CRYPT_AES_XTS_Decrypt, .-CRYPT_AES_XTS_Decrypt

#endif