/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_AES) && defined(HITLS_CRYPTO_CTR)

.file   "crypt_aes_ctr_x86_64.S"
.text

.set    KEY, %rdi
.set    INPUT, %rsi
.set    OUTPUT, %rdx
.set    LEN, %ecx
.set    CTR_IV, %r8

.set    RDK, %xmm0
.set    RDK2, %xmm1
.set    KTMP, %r13
.set    ROUNDS, %eax
.set    RET, %eax

.set    IV0, %xmm2
.set    IV1, %xmm3
.set    IV2, %xmm4
.set    IV3, %xmm5
.set    IV4, %xmm6
.set    IV5, %xmm7
.set    IV6, %xmm8
.set    IV7, %xmm9
.set    BLK0, %xmm10
.set    BLK1, %xmm11
.set    BLK2, %xmm12
.set    BLK3, %xmm13
.set    BLK4, %xmm14
.set    BLK5, %xmm15

/**
 *    Macro description: Applies one AES encryption round to eight counter blocks.
 *      Input registers:
 *                  key: round key for this round.
 *             block0-7: counter blocks (IVs) being encrypted.
 *   Modified registers: block0-7.
 *     Output registers:
 *             block0-7: counter blocks after one more round of encryption.
 */
.macro ONE_ENC key block0 block1 block2 block3 block4 block5 block6 block7
    aesenc  \key, \block0
    aesenc  \key, \block1
    aesenc  \key, \block2
    aesenc  \key, \block3
    aesenc  \key, \block4
    aesenc  \key, \block5
    aesenc  \key, \block6
    aesenc  \key, \block7
.endm
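
/*
 * For reference, a C sketch of what one ONE_ENC invocation does, using AES-NI intrinsics
 * (illustrative only, not part of the build; assumes <wmmintrin.h> and compilation with -maes):
 *
 *     #include <wmmintrin.h>
 *
 *     // One AES round applied independently to eight 128-bit blocks.
 *     static inline void one_enc(__m128i rk, __m128i blk[8])
 *     {
 *         for (int i = 0; i < 8; i++) {
 *             blk[i] = _mm_aesenc_si128(blk[i], rk);   // SubBytes, ShiftRows, MixColumns, AddRoundKey
 *         }
 *     }
 *
 * The assembly unrolls this loop so the eight independent aesenc instructions can overlap
 * in the pipeline.
 */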

/**
 *  Macro description: Derives the next 32-bit counter word and XORs it with the round key.
 *  Input registers:
 *        ctr32: current 32-bit counter value (host byte order).
 *       offset: counter increment for this block.
 *         temp: 32-bit temporary register for the counter word.
 *        key32: last 32-bit word of round key 0.
 *   addrOffset: byte offset of this block within the stack buffer.
 *         addr: base address of the stack buffer.
 *  Modified registers: temp.
 */
.macro XOR_KEY ctr32 offset temp key32 addrOffset addr
    leal \offset(\ctr32), \temp                 // Counter + offset.
    bswapl \temp                                // Back to big-endian byte order.
    xorl \key32, \temp                          // Pre-whiten with round key 0.
    movl \temp, \addrOffset+12(\addr)           // Store into bytes 12..15 of the stack block.
.endm
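
/*
 * For reference, a C sketch of the counter-word construction performed by XOR_KEY
 * (illustrative only; the helper name and the use of __builtin_bswap32 are assumptions):
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *
 *     // stack_blk points at one 16-byte block on the stack whose bytes 0..11 already
 *     // hold iv ^ rk0; this fills in bytes 12..15 for block number i.
 *     static void xor_key(uint32_t ctr32, uint32_t i, uint32_t rk0_tail, uint8_t *stack_blk)
 *     {
 *         uint32_t word = __builtin_bswap32(ctr32 + i) ^ rk0_tail;
 *         memcpy(stack_blk + 12, &word, 4);
 *     }
 */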

/**
 *  Macro description: Loads one round key, applies that round to all eight counter blocks,
 *                     and, interleaved with the aesenc instructions, prepares one counter
 *                     word of the next batch and XORs it with round key 0.
 *  Input registers:
 *              key: pointer to the round keys.
 *           offset: byte offset of this round's key.
 *             temp: temporary xmm register for the round key.
 *            ctr32: current 32-bit counter value (host byte order).
 *          offset2: counter increment for the block being prepared.
 *            temp2: 32-bit temporary register for the counter word.
 *            key32: last 32-bit word of round key 0.
 *       addrOffset: byte offset of the block within the stack buffer.
 *             addr: base address of the stack buffer.
 *  Modified registers: temp, temp2, IV0-7.
 *  Output registers:
 *            IV0-7: counter blocks after one more round of encryption.
 */
.macro ONE_ENC_XOR_KEY key offset temp ctr32 offset2 temp2 key32 addrOffset addr
    vmovdqu \offset(\key), \temp
    aesenc  \temp, IV0
    leal    \offset2(\ctr32), \temp2                 // Next batch: counter + offset2.
    aesenc  \temp, IV1
    bswapl  \temp2
    aesenc  \temp, IV2
    aesenc  \temp, IV3
    xorl    \key32, \temp2
    aesenc  \temp, IV4
    aesenc  \temp, IV5
    movl    \temp2, \addrOffset+12(\addr)
    aesenc  \temp, IV6
    aesenc  \temp, IV7
.endm
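
/*
 * Note: ONE_ENC_XOR_KEY is essentially ONE_ENC plus one XOR_KEY step, with the scalar
 * counter arithmetic interleaved between the aesenc instructions so it executes in the
 * shadow of the AES unit's latency.
 */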

/**
 *  Macro description: Advances the input and output pointers and decreases the remaining length.
 *  Input registers:
 *               input: pointer to the input memory.
 *              output: pointer to the output memory.
 *                 len: remaining data length in bytes.
 *              offset: number of bytes consumed.
 *  Modified registers: input, output, len.
 *  Output registers:
 *               input, output, len.
 */
.macro UPDATE_DATA input output len offset
    leaq    \offset(\input), \input
    leaq    \offset(\output), \output
    subl    $\offset, \len
.endm

/**
 *  Function description: AES encryption in CTR mode, accelerated with AES-NI.
 *  Function prototype: int32_t CRYPT_AES_CTR_Encrypt(const CRYPT_AES_Key *ctx, const uint8_t *in, uint8_t *out,
 *                                                    uint32_t len, uint8_t *iv);
 *  Input registers:
 *        rdi: pointer to the key structure (expanded round keys; round count at offset 240).
 *        rsi: pointer to the input data.
 *        rdx: pointer to the output data.
 *        rcx: length of the data in bytes; the routine processes whole 16-byte blocks.
 *         r8: pointer to the 16-byte initialization vector (counter block).
 *  Modified registers: xmm0-xmm15, eax, r9-r11 (r12-r15 are saved and restored).
 *  Output registers: rdx, r8.
 */
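
/*
 * For reference, a portable C model of what this routine computes (illustrative only;
 * the CRYPT_AES_Key handling and the aes_encrypt_block() helper are assumptions used for
 * the sketch, not part of this file):
 *
 *     #include <stdint.h>
 *
 *     // out[i] = in[i] ^ AES(key, counter block), where the low 32 bits of the counter
 *     // block are incremented (big-endian) once per 16-byte block.
 *     static void aes_ctr_encrypt_model(const void *key, const uint8_t *in, uint8_t *out,
 *                                       uint32_t len, uint8_t iv[16])
 *     {
 *         uint8_t ks[16];
 *         for (uint32_t off = 0; off + 16 <= len; off += 16) {
 *             aes_encrypt_block(key, iv, ks);              // keystream block = AES(key, counter)
 *             for (int i = 0; i < 16; i++) {
 *                 out[off + i] = in[off + i] ^ ks[i];
 *             }
 *             for (int i = 15; i >= 12; i--) {             // increment low 32 bits, big-endian
 *                 if (++iv[i] != 0) {
 *                     break;
 *                 }
 *             }
 *         }
 *     }
 */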
.globl  CRYPT_AES_CTR_Encrypt
    .type   CRYPT_AES_CTR_Encrypt, @function
CRYPT_AES_CTR_Encrypt:
    .cfi_startproc
    pushq   %r12
    pushq   %r13
    pushq   %r14
    pushq   %r15
    mov     %rsp, %r12
    subq    $128, %rsp                      // Reserve 128 bytes of stack for eight counter blocks.
    andq    $-16, %rsp                      // 16-byte alignment, required by vmovdqa.

    vmovdqu (KEY), RDK                      // Round key 0.
    vpxor   (CTR_IV), RDK, IV0              // IV ^ round key 0 (pre-whitened counter block).
    vmovdqa IV0, 0(%rsp)                    // Seed all eight stack blocks with iv ^ rk0.
    vmovdqa IV0, 16(%rsp)
    vmovdqa IV0, 32(%rsp)
    vmovdqa IV0, 48(%rsp)
    vmovdqa IV0, 64(%rsp)
    vmovdqa IV0, 80(%rsp)
    vmovdqa IV0, 96(%rsp)
    vmovdqa IV0, 112(%rsp)

    movl    12(CTR_IV), %r11d              // Read the low 32-bit counter word of the IV.
    movl    12(KEY), %r9d                  // Read the last 32-bit word of round key 0.
    bswap   %r11d                          // Counter to host byte order.

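    // Dispatch on (len / 16) mod 8: any leftover 1-7 blocks are handled first by the
    // dedicated tail paths below, after which processing continues in batches of eight blocks.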
    mov     LEN, %r14d
    shr     $4, %r14d
    and     $7, %r14d
    cmp     $1, %r14d
    je .Lctr_enc_proc_1_blk
    cmp     $2, %r14d
    je .Lctr_enc_proc_2_blk
    cmp     $3, %r14d
    je .Lctr_enc_proc_3_blk
    cmp     $4, %r14d
    je .Lctr_enc_proc_4_blk
    cmp     $5, %r14d
    je .Lctr_enc_proc_5_blk
    cmp     $6, %r14d
    je .Lctr_enc_proc_6_blk
    cmp     $7, %r14d
    je .Lctr_enc_proc_7_blk

.Lctr_enc_proc_8_blk:
    cmp $0, LEN
    je .Lctr_aesenc_finish

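    // Precompute the counter words for the first batch of eight blocks and store them into
    // bytes 12..15 of the eight stack blocks, which already hold iv ^ round key 0.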
    leal 0(%r11d), %r15d
    leal 1(%r11d), %r10d
    bswapl %r15d
    bswapl %r10d
    xorl %r9d, %r15d
    xorl %r9d, %r10d
    leal 2(%r11d), %r14d
    movl %r15d, 12(%rsp)
    bswapl %r14d
    movl %r10d, 16+12(%rsp)
    xorl %r9d, %r14d
    leal 3(%r11d), %r15d
    leal 4(%r11d), %r10d
    bswapl %r15d
    bswapl %r10d
    movl %r14d, 32+12(%rsp)
    xorl %r9d, %r15d
    xorl %r9d, %r10d
    movl %r15d, 48+12(%rsp)
    leal 5(%r11d), %r14d
    bswapl %r14d
    movl %r10d, 64+12(%rsp)
    xorl %r9d, %r14d
    leal 6(%r11d), %r15d
    leal 7(%r11d), %r10d
    movl %r14d, 80+12(%rsp)
    bswapl %r15d
    bswapl %r10d
    xorl %r9d, %r15d
    xorl %r9d, %r10d
    movl %r15d, 96+12(%rsp)
    movl %r10d, 112+12(%rsp)

    vmovdqa (%rsp), IV0
    vmovdqa 16(%rsp), IV1
    vmovdqa 32(%rsp), IV2
    vmovdqa 48(%rsp), IV3
    vmovdqa 64(%rsp), IV4
    vmovdqa 80(%rsp), IV5
    vmovdqa 96(%rsp), IV6
    vmovdqa 112(%rsp), IV7
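    // Main loop: while the eight loaded counter blocks are encrypted, ONE_ENC_XOR_KEY also
    // computes the counter words for the next batch and writes them back into the stack
    // buffer, so the next iteration can start immediately.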
.align 16
.Lctr_aesenc_8_blks_enc_loop:
    addl    $8, %r11d                                                   // ctr += 8
    movl    240(KEY), ROUNDS
    ONE_ENC_XOR_KEY KEY, 16, RDK2, %r11d, 0, %r10d, %r9d, 0, %rsp       // Round 1 encryption
    ONE_ENC_XOR_KEY KEY, 32, RDK2, %r11d, 1, %r10d, %r9d, 16, %rsp      // Round 2 encryption
    ONE_ENC_XOR_KEY KEY, 48, RDK2, %r11d, 2, %r10d, %r9d, 32, %rsp      // Round 3 encryption
    ONE_ENC_XOR_KEY KEY, 64, RDK2, %r11d, 3, %r10d, %r9d, 48, %rsp      // Round 4 encryption
    ONE_ENC_XOR_KEY KEY, 80, RDK2, %r11d, 4, %r10d, %r9d, 64, %rsp      // Round 5 encryption
    ONE_ENC_XOR_KEY KEY, 96, RDK2, %r11d, 5, %r10d, %r9d, 80, %rsp      // Round 6 encryption
    ONE_ENC_XOR_KEY KEY, 112, RDK2, %r11d, 6, %r10d, %r9d, 96, %rsp     // Round 7 encryption
    ONE_ENC_XOR_KEY KEY, 128, RDK2, %r11d, 7, %r10d, %r9d, 112, %rsp    // Round 8 encryption

    vmovdqu 144(KEY), RDK                                               // Round 9 key load
    vmovdqu 160(KEY), RDK2                                              // Round 10 key load
    cmp     $12, ROUNDS
    jb .Lctr_aesenc_8_blks_enc_last

    ONE_ENC RDK, IV0, IV1, IV2, IV3, IV4, IV5, IV6, IV7                 // Round 9 encryption
    vmovdqu 176(KEY), RDK                                               // Round 11 key load
    ONE_ENC RDK2, IV0, IV1, IV2, IV3, IV4, IV5, IV6, IV7                // Round 10 encryption
    vmovdqu 192(KEY), RDK2                                              // Round 12 key load

    je .Lctr_aesenc_8_blks_enc_last

    ONE_ENC RDK, IV0, IV1, IV2, IV3, IV4, IV5, IV6, IV7                 // Round 11 encryption
    vmovdqu 208(KEY), RDK                                               // Round 13 key load
    ONE_ENC RDK2, IV0, IV1, IV2, IV3, IV4, IV5, IV6, IV7                // Round 12 encryption
    vmovdqu 224(KEY), RDK2                                              // Round 14 key load

.align 16
.Lctr_aesenc_8_blks_enc_last:
    vpxor   (INPUT), RDK2, BLK0         // Last-round key ^ plaintext.
    vpxor   16(INPUT), RDK2, BLK1
    vpxor   32(INPUT), RDK2, BLK2
    vpxor   48(INPUT), RDK2, BLK3

    ONE_ENC RDK, IV0, IV1, IV2, IV3, IV4, IV5, IV6, IV7

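    // aesenclast XORs its first operand into the state after the final ShiftRows/SubBytes.
    // Passing BLK0-3 (= last-round key ^ plaintext) instead of the plain round key therefore
    // folds the CTR keystream XOR into the final AES round for the first four blocks; the
    // remaining four blocks are finished with the plain last-round key and XORed below.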
    aesenclast BLK0, IV0                // Final round; keystream ^ plaintext folded in.
    aesenclast BLK1, IV1
    aesenclast BLK2, IV2
    aesenclast BLK3, IV3
    aesenclast RDK2, IV4
    aesenclast RDK2, IV5
    aesenclast RDK2, IV6
    aesenclast RDK2, IV7

    vmovdqu IV0, (OUTPUT)               // Store the first four ciphertext blocks.
    vmovdqu IV1, 16(OUTPUT)
    vmovdqu IV2, 32(OUTPUT)
    vmovdqu IV3, 48(OUTPUT)
    vpxor   64(INPUT), IV4, BLK0        // Keystream ^ plaintext for blocks 4-7.
    vpxor   80(INPUT), IV5, BLK1
    vpxor   96(INPUT), IV6, BLK2
    vpxor   112(INPUT), IV7, BLK3

    vmovdqu BLK0, 64(OUTPUT)
    vmovdqu BLK1, 80(OUTPUT)
    vmovdqu BLK2, 96(OUTPUT)            // Store the last four ciphertext blocks.
    vmovdqu BLK3, 112(OUTPUT)
    vmovdqa (%rsp), IV0                 // Load the next batch of counter blocks from the stack.
    vmovdqa 16(%rsp), IV1
    vmovdqa 32(%rsp), IV2
    vmovdqa 48(%rsp), IV3
    vmovdqa 64(%rsp), IV4
    vmovdqa 80(%rsp), IV5
    vmovdqa 96(%rsp), IV6
    vmovdqa 112(%rsp), IV7
    UPDATE_DATA INPUT, OUTPUT, LEN, 128
    cmpl    $0, LEN
    jbe .Lctr_aesenc_finish
    jmp .Lctr_aesenc_8_blks_enc_loop

.Lctr_enc_proc_1_blk:
    movl    240(KEY), ROUNDS
    movq    KEY, KTMP
    decl    ROUNDS
.align  16
.Laesenc_loop:
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenc  RDK, IV0
    decl    ROUNDS
    jnz .Laesenc_loop                   // Loop until ROUNDS reaches 0.
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenclast RDK, IV0
    addl    $1, %r11d                   // Update ctr32.
    vpxor   (INPUT), IV0, BLK0
    vmovdqu BLK0, (OUTPUT)              // Store the ciphertext block.
    UPDATE_DATA INPUT, OUTPUT, LEN, 16
    jmp .Lctr_enc_proc_8_blk
.Lctr_enc_proc_2_blk:
    movl    240(KEY), ROUNDS
    movq    KEY, KTMP
    decl    ROUNDS
    XOR_KEY %r11d, 1, %r10d, %r9d, 16, %rsp
    vmovdqa 16(%rsp), IV1
.align 16
.Laesenc_2_blks_loop:
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenc  RDK, IV0
    aesenc  RDK, IV1
    decl    ROUNDS
    jnz .Laesenc_2_blks_loop
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenclast RDK, IV0
    aesenclast RDK, IV1

    vpxor   (INPUT), IV0, BLK0
    vpxor   16(INPUT), IV1, BLK1
    vmovdqu BLK0, (OUTPUT)
    vmovdqu BLK1, 16(OUTPUT)
    addl    $2, %r11d
    UPDATE_DATA INPUT, OUTPUT, LEN, 32
    jmp  .Lctr_enc_proc_8_blk
.Lctr_enc_proc_3_blk:
    movl    240(KEY), ROUNDS
    movq    KEY, KTMP
    decl    ROUNDS
    XOR_KEY %r11d, 1, %r10d, %r9d, 16, %rsp
    XOR_KEY %r11d, 2, %r10d, %r9d, 32, %rsp
    vmovdqa 16(%rsp), IV1
    vmovdqa 32(%rsp), IV2
.align 16
.Laesenc_3_blks_loop:
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenc  RDK, IV0
    aesenc  RDK, IV1
    aesenc  RDK, IV2
    decl    ROUNDS
    jnz .Laesenc_3_blks_loop
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenclast RDK, IV0
    aesenclast RDK, IV1
    aesenclast RDK, IV2

    vpxor   (INPUT), IV0, BLK0
    vpxor   16(INPUT), IV1, BLK1
    vpxor   32(INPUT), IV2, BLK2

    vmovdqu BLK0, (OUTPUT)
    vmovdqu BLK1, 16(OUTPUT)
    vmovdqu BLK2, 32(OUTPUT)
    addl    $3, %r11d
    UPDATE_DATA INPUT, OUTPUT, LEN, 48
    jmp  .Lctr_enc_proc_8_blk
.Lctr_enc_proc_4_blk:
    movl    240(KEY), ROUNDS
    movq    KEY, KTMP
    decl    ROUNDS
    XOR_KEY %r11d, 1, %r10d, %r9d, 16, %rsp
    XOR_KEY %r11d, 2, %r10d, %r9d, 32, %rsp
    XOR_KEY %r11d, 3, %r10d, %r9d, 48, %rsp
    vmovdqa 16(%rsp), IV1
    vmovdqa 32(%rsp), IV2
    vmovdqa 48(%rsp), IV3
.align 16
.Laesenc_4_blks_loop:
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenc  RDK, IV0
    aesenc  RDK, IV1
    aesenc  RDK, IV2
    aesenc  RDK, IV3
    decl    ROUNDS
    jnz .Laesenc_4_blks_loop
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenclast RDK, IV0
    aesenclast RDK, IV1
    aesenclast RDK, IV2
    aesenclast RDK, IV3

    vpxor   (INPUT), IV0, BLK0
    vpxor   16(INPUT), IV1, BLK1
    vpxor   32(INPUT), IV2, BLK2
    vpxor   48(INPUT), IV3, BLK3

    vmovdqu BLK0, (OUTPUT)
    vmovdqu BLK1, 16(OUTPUT)
    vmovdqu BLK2, 32(OUTPUT)
    vmovdqu BLK3, 48(OUTPUT)
    addl    $4, %r11d
    UPDATE_DATA INPUT, OUTPUT, LEN, 64
    jmp  .Lctr_enc_proc_8_blk

.Lctr_enc_proc_5_blk:
    movl    240(KEY), ROUNDS
    movq    KEY, KTMP
    decl    ROUNDS
    XOR_KEY %r11d, 1, %r10d, %r9d, 16, %rsp
    XOR_KEY %r11d, 2, %r10d, %r9d, 32, %rsp
    XOR_KEY %r11d, 3, %r10d, %r9d, 48, %rsp
    XOR_KEY %r11d, 4, %r10d, %r9d, 64, %rsp
    vmovdqa 16(%rsp), IV1
    vmovdqa 32(%rsp), IV2
    vmovdqa 48(%rsp), IV3
    vmovdqa 64(%rsp), IV4
.align 16
.Laesenc_5_blks_loop:
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenc  RDK, IV0
    aesenc  RDK, IV1
    aesenc  RDK, IV2
    aesenc  RDK, IV3
    aesenc  RDK, IV4
    decl    ROUNDS
    jnz .Laesenc_5_blks_loop
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenclast RDK, IV0
    aesenclast RDK, IV1
    aesenclast RDK, IV2
    aesenclast RDK, IV3
    aesenclast RDK, IV4

    vpxor   (INPUT), IV0, BLK0
    vpxor   16(INPUT), IV1, BLK1
    vpxor   32(INPUT), IV2, BLK2
    vpxor   48(INPUT), IV3, BLK3
    vpxor   64(INPUT), IV4, BLK4
    vmovdqu BLK0, (OUTPUT)
    vmovdqu BLK1, 16(OUTPUT)
    vmovdqu BLK2, 32(OUTPUT)
    vmovdqu BLK3, 48(OUTPUT)
    vmovdqu BLK4, 64(OUTPUT)
    addl    $5, %r11d
    UPDATE_DATA INPUT, OUTPUT, LEN, 80
    jmp  .Lctr_enc_proc_8_blk
.Lctr_enc_proc_6_blk:
    movl    240(KEY), ROUNDS
    movq    KEY, KTMP
    decl    ROUNDS
    XOR_KEY %r11d, 1, %r10d, %r9d, 16, %rsp
    XOR_KEY %r11d, 2, %r10d, %r9d, 32, %rsp
    XOR_KEY %r11d, 3, %r10d, %r9d, 48, %rsp
    XOR_KEY %r11d, 4, %r10d, %r9d, 64, %rsp
    XOR_KEY %r11d, 5, %r10d, %r9d, 80, %rsp
    vmovdqa 16(%rsp), IV1
    vmovdqa 32(%rsp), IV2
    vmovdqa 48(%rsp), IV3
    vmovdqa 64(%rsp), IV4
    vmovdqa 80(%rsp), IV5
.align 16
.Laesenc_6_blks_loop:
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenc  RDK, IV0
    aesenc  RDK, IV1
    aesenc  RDK, IV2
    aesenc  RDK, IV3
    aesenc  RDK, IV4
    aesenc  RDK, IV5
    decl    ROUNDS
    jnz .Laesenc_6_blks_loop
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenclast RDK, IV0
    aesenclast RDK, IV1
    aesenclast RDK, IV2
    aesenclast RDK, IV3
    aesenclast RDK, IV4
    aesenclast RDK, IV5

    vpxor   (INPUT), IV0, BLK0
    vpxor   16(INPUT), IV1, BLK1
    vpxor   32(INPUT), IV2, BLK2
    vpxor   48(INPUT), IV3, BLK3
    vpxor   64(INPUT), IV4, BLK4
    vpxor   80(INPUT), IV5, BLK5
    vmovdqu BLK0, (OUTPUT)
    vmovdqu BLK1, 16(OUTPUT)
    vmovdqu BLK2, 32(OUTPUT)
    vmovdqu BLK3, 48(OUTPUT)
    vmovdqu BLK4, 64(OUTPUT)
    vmovdqu BLK5, 80(OUTPUT)
    addl    $6, %r11d
    UPDATE_DATA INPUT, OUTPUT, LEN, 96

    jmp  .Lctr_enc_proc_8_blk
.Lctr_enc_proc_7_blk:
    movl    240(KEY), ROUNDS
    movq    KEY, KTMP
    decl    ROUNDS
    XOR_KEY %r11d, 1, %r10d, %r9d, 16, %rsp
    XOR_KEY %r11d, 2, %r10d, %r9d, 32, %rsp
    XOR_KEY %r11d, 3, %r10d, %r9d, 48, %rsp
    XOR_KEY %r11d, 4, %r10d, %r9d, 64, %rsp
    XOR_KEY %r11d, 5, %r10d, %r9d, 80, %rsp
    XOR_KEY %r11d, 6, %r10d, %r9d, 96, %rsp
    vmovdqa 16(%rsp), IV1
    vmovdqa 32(%rsp), IV2
    vmovdqa 48(%rsp), IV3
    vmovdqa 64(%rsp), IV4
    vmovdqa 80(%rsp), IV5
    vmovdqa 96(%rsp), IV6

.align 16
.Laesenc_7_blks_loop:
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenc  RDK, IV0
    aesenc  RDK, IV1
    aesenc  RDK, IV2
    aesenc  RDK, IV3
    aesenc  RDK, IV4
    aesenc  RDK, IV5
    aesenc  RDK, IV6
    decl    ROUNDS
    jnz .Laesenc_7_blks_loop
    leaq    16(KTMP), KTMP
    vmovdqu (KTMP), RDK
    aesenclast RDK, IV0
    aesenclast RDK, IV1
    aesenclast RDK, IV2
    aesenclast RDK, IV3
    aesenclast RDK, IV4
    aesenclast RDK, IV5
    aesenclast RDK, IV6
    vpxor   (INPUT), IV0, BLK0
    vpxor   16(INPUT), IV1, BLK1
    vpxor   32(INPUT), IV2, BLK2
    vpxor   48(INPUT), IV3, BLK3
    vmovdqu BLK0, (OUTPUT)
    vmovdqu BLK1, 16(OUTPUT)
    vmovdqu BLK2, 32(OUTPUT)
    vmovdqu BLK3, 48(OUTPUT)
    vpxor   64(INPUT), IV4, BLK0
    vpxor   80(INPUT), IV5, BLK1
    vpxor   96(INPUT), IV6, BLK2
    vmovdqu BLK0, 64(OUTPUT)
    vmovdqu BLK1, 80(OUTPUT)
    vmovdqu BLK2, 96(OUTPUT)
    addl    $7, %r11d
    UPDATE_DATA INPUT, OUTPUT, LEN, 112
    jmp  .Lctr_enc_proc_8_blk

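/*
 * Exit path: write the updated 32-bit counter back into the caller's IV, then clear the
 * round-key and counter material from the xmm registers and the scratch stack buffer
 * before restoring the stack pointer and returning 0.
 */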
.Lctr_aesenc_finish:
    bswap   %r11d
    movl    %r11d, 12(CTR_IV)
    vpxor   IV0, IV0, IV0
    vpxor   IV1, IV1, IV1
    vpxor   IV2, IV2, IV2
    vpxor   IV3, IV3, IV3
    vpxor   IV4, IV4, IV4
    vpxor   IV5, IV5, IV5
    vpxor   IV6, IV6, IV6
    vpxor   IV7, IV7, IV7
    vpxor   RDK, RDK, RDK
    vmovdqa IV0, 0(%rsp)
    vmovdqa IV0, 16(%rsp)
    vmovdqa IV0, 32(%rsp)
    vmovdqa IV0, 48(%rsp)
    vmovdqa IV0, 64(%rsp)
    vmovdqa IV0, 80(%rsp)
    vmovdqa IV0, 96(%rsp)
    vmovdqa IV0, 112(%rsp)

    movq    %r12, %rsp
    popq    %r15
    popq    %r14
    popq    %r13
    popq    %r12

    movl    $0, RET
    ret
    .cfi_endproc
    .size CRYPT_AES_CTR_Encrypt, .-CRYPT_AES_CTR_Encrypt

#endif