/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

#include "chacha20_x8664_common.S"
.text
.align    64
g_ror16:
    .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size   g_ror16, .-g_ror16
.align    64
g_ror8:
    .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size   g_ror8, .-g_ror8
.align    64
g_ror16_128:
    .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd, \
            0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size   g_ror16_128, .-g_ror16_128
.align    64
g_ror8_128:
    .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe, \
            0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size   g_ror8_128, .-g_ror8_128
.align    64
g_addOne:
    .long   0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
    .size   g_addOne, .-g_addOne
.align    64
g_add4block:
    .long   0, 1, 2, 3
    .size   g_add4block, .-g_add4block
.align    64
g_addsecond4block:
    .long   4, 4, 4, 4
    .size   g_addsecond4block, .-g_addsecond4block
.align    64
g_add8block:
    .long   0, 1, 2, 3, 4, 5, 6, 7
    .size   g_add8block, .-g_add8block
.align    64
g_addsecond8block:
    .long   8, 8, 8, 8, 8, 8, 8, 8
    .size   g_addsecond8block, .-g_addsecond8block
.align    64
g_add16block:
    .long   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
    .size   g_add16block, .-g_add16block
.align    64
g_addsecond16block:
    .long   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .size   g_addsecond16block, .-g_addsecond16block
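
/*
 * Constant tables:
 *  - g_ror16/g_ror8 (and the 256-bit *_128 variants) are byte-shuffle masks that
 *    rotate each 32-bit lane left by 16 or 8 bits; the AVX-512 code in this file
 *    rotates with vprold instead.
 *  - g_addOne adds +0..+3 to the counter word of consecutive 128-bit lanes.
 *  - The g_add{4,8,16}block / g_addsecond{4,8,16}block tables provide the initial
 *    per-lane counter offsets (0..N-1) and the per-iteration counter step (+N)
 *    for the N-blocks-in-parallel layouts.
 */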

.set  IN, %rsi
.set OUT, %rdx

/*
 * Processing 64 bytes: 4 xmm registers, instructions per loop iteration: 21*2 = 42
 * xmm0 ~ xmm3:
 * xmm0 {0,  1,  2,  3}
 * xmm1 {4,  5,  6,  7}
 * xmm2 {8,  9,  10, 11}
 * xmm3 {12, 13, 14, 15}
 *
 * Processing 128 or 256 bytes: 4 ymm/zmm registers, instructions per loop iteration: 30
 * ymm0 ~ ymm3:
 * ymm0 {0,  1,  2,  3,  0,  1,  2,  3 }
 * ymm1 {4,  5,  6,  7,  4,  5,  6,  7 }
 * ymm2 {8,  9,  10, 11, 8,  9,  10, 11}
 * ymm3 {12, 13, 14, 15, 12, 13, 14, 15}
 *
 * Processing 512 bytes: ymm registers 0-15, 128 bytes of stack space and ymm registers 16-31,
 * instructions per loop iteration: 12*8 = 96
 * Processing 1024 bytes: zmm registers 0-15, 256 bytes of stack space and zmm registers 16-31,
 * instructions per loop iteration: 12*8 = 96
 * ymm0 ~ ymm15:
 * ymm0  {0,  0,  0,  0,  0,  0,  0,  0}
 * ymm1  {1,  1,  1,  1,  1,  1,  1,  1}
 * ymm2  {2,  2,  2,  2,  2,  2,  2,  2}
 * ymm3  {3,  3,  3,  3,  3,  3,  3,  3}
 * ......
 * ymm15 {15, 15, 15, 15, 15, 15, 15, 15}
 *
 * zmm0 ~ zmm31:
 * zmm0  {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0}
 * zmm1  {1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1}
 * zmm2  {2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2}
 * zmm3  {3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3}
 * ...
 * zmm15 {15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}
 */

.macro CHACHA20_ROUND s0 s1 s2 s3
    vpaddd   \s1, \s0, \s0
    vpxord   \s0, \s3, \s3
    vprold   $16, \s3, \s3

    vpaddd   \s3, \s2, \s2
    vpxord   \s2, \s1, \s1
    vprold   $12, \s1, \s1

    vpaddd   \s1, \s0, \s0
    vpxord   \s0, \s3, \s3
    vprold   $8, \s3, \s3

    vpaddd   \s3, \s2, \s2
    vpxord   \s2, \s1, \s1
    vprold   $7, \s1, \s1
.endm
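
/*
 * CHACHA20_ROUND applies the add/xor/rotate steps of the ChaCha20 quarter round
 * (RFC 8439) to all four columns (or diagonals) held in s0..s3 at once, one
 * 32-bit lane per column:
 *     s0 += s1; s3 ^= s0; s3 <<<= 16;
 *     s2 += s3; s1 ^= s2; s1 <<<= 12;
 *     s0 += s1; s3 ^= s0; s3 <<<=  8;
 *     s2 += s3; s1 ^= s2; s1 <<<=  7;
 * The AVX-512 vprold instruction performs the left rotates directly, so no
 * shuffle masks or shift/or sequences are needed here.
 */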

/* convert y registers and write back */
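/*
 * Extracts 128-bit lane <pos> from the two ymm state registers s0 and s1, packs
 * the two lanes into ymm16 (one contiguous 32-byte keystream chunk), XORs it
 * with the input at (IN), stores the result at (OUT), and then advances the
 * input/output pointers by 32 bytes. xmm16/xmm17 are used as scratch.
 */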
.macro CONVERT_Y s0 s1 pos inpos outpos
    /* ymm16 => {xmm16, xmm17} */
    vextracti32x4   \pos, \s0, %xmm16
    vextracti32x4   \pos, \s1, %xmm17
    vinserti32x4    $1, %xmm17, %ymm16, %ymm16

    vpxord      (IN), %ymm16, %ymm16
    vmovdqu64   %ymm16, (OUT)
    add $32, \inpos
    add $32, \outpos
.endm

/* convert z registers and write back */
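/*
 * Same as CONVERT_Y, but for the 16-block (zmm) layout: lane <pos> of four zmm
 * state registers is packed into zmm16 to form one 64-byte keystream chunk,
 * which is XORed with the input and written out; IN/OUT advance by 64 bytes.
 * xmm16..xmm19 are used as scratch.
 */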
.macro CONVERT_Z s0 s1 s2 s3 pos inpos outpos

    /* zmm16 => {xmm16, xmm17, xmm18, xmm19} */
    vextracti64x2  \pos, \s0, %xmm16
    vextracti64x2  \pos, \s1, %xmm17
    vextracti64x2  \pos, \s2, %xmm18
    vextracti64x2  \pos, \s3, %xmm19
    vinserti64x2  $1, %xmm17, %zmm16, %zmm16
    vinserti64x2  $2, %xmm18, %zmm16, %zmm16
    vinserti64x2  $3, %xmm19, %zmm16, %zmm16

    vpxord      (IN), %zmm16, %zmm16
    vmovdqu64   %zmm16, (OUT)
    add $64, \inpos
    add $64, \outpos
.endm

/**
 * C interface: void CHACHA20_Update(CRYPT_CHACHA20_Ctx *ctx, const uint8_t *in, uint8_t *out, uint32_t len);
 * @brief ChaCha20 algorithm
 * @param ctx [IN] Algorithm context, set up by the C interface and passed in
 * @param in [IN] Data to be encrypted
 * @param out [OUT] Encrypted data
 * @param len [IN] Number of bytes to encrypt
 * %rsp cannot be used, so 15 general-purpose registers are available for ctx, in, out, len and scratch values.
 * 16 state words are needed in each round, split as
 * {0,  1,  4,  5,  8,   9,  12, 13}
 * {2,  3,  6,  7,  10,  11, 14, 15}
**/
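
/*
 * SysV AMD64 calling convention: %rdi = ctx, %rsi = in (IN), %rdx = out (OUT),
 * %rcx = len. The 32-bit block counter (state word 12) lives at offset 48 of ctx
 * and is mirrored in %r11d. The prologue saves %rsp in %r9, then reserves 2048
 * bytes and aligns %rsp down to a 1024-byte boundary so that the 64-byte-aligned
 * vmovdqa64 spills used by the 1024-byte path are valid.
 */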

.globl CHACHA20_Update
.type CHACHA20_Update,%function
.align 64
CHACHA20_Update:
    .cfi_startproc
    mov  48(%rdi), %r11d
    mov  %rsp, %r9
    subq $2048,%rsp
    andq $-1024,%rsp

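/*
 * Dispatcher: each pass below consumes the largest chunk it can (1024, 512, 256,
 * 128 or 64 bytes), updates the block counter in the context, and jumps back here
 * until less than 64 bytes remain. A tail shorter than 64 bytes is not processed
 * in this routine.
 */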
.Lchacha20_start:
    cmp  $1024, %rcx
    jae  .Lchacha20_1024_start
    cmp  $512, %rcx
    jae  .Lchacha20_512_start
    cmp  $256, %rcx
    jae  .Lchacha20_256_start
    cmp  $128, %rcx
    jae  .Lchacha20_128_start
    cmp  $64, %rcx
    jae  .Lchacha20_64_start
    jmp  .Lchacha20_end

.Lchacha20_64_start:
    LOAD_STATE  %xmm0, %xmm1, %xmm2, %xmm3, %rdi

    vmovdqa %xmm0, %xmm10
    vmovdqa %xmm1, %xmm11
    vmovdqa %xmm2, %xmm12
    vmovdqa %xmm3, %xmm13
    mov $10, %r8

.Lchacha20_64_loop:
    /* 0 = 0 + 4, 12 = (12 ^ 0) <<< 16 | 8 = 8 + 12, 4 = (4 ^ 8) <<< 12 |
     * 0 = 0 + 4, 12 = (12 ^ 0) <<< 8 |  8 = 8 + 12, 4 = (4 ^ 8) <<< 7
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 16 | 9 = 9 + 13, 5 = (5 ^ 9) <<< 12 |
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 8 |  9 = 9 + 13, 5 = (5 ^ 9) <<< 7
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 16 | 10 = 10 + 14, 6 = (6 ^ 10) <<< 12 |
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 8 |  10 = 10 + 14, 6 = (6 ^ 10) <<< 7
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 16 | 11 = 11 + 15, 7 = (7 ^ 11) <<< 12 |
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 8 |  11 = 11 + 15, 7 = (7 ^ 11) <<< 7
     */
    CHACHA20_ROUND %xmm0, %xmm1, %xmm2, %xmm3

    vpshufd  $78, %xmm2, %xmm2       // {8  9  10 11} ==> {10 11 8  9}  01 00 11 10
    vpshufd  $57, %xmm1, %xmm1       // {4  5  6   7} ==> {5  6  7  4}  00 11 10 01
    vpshufd  $147, %xmm3, %xmm3      // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    /* 0 = 0 + 5, 15 = (15 ^ 0) <<< 16 | 10 = 10 + 15, 5 = (5 ^ 10) <<< 12 |
     * 0 = 0 + 5, 15 = (15 ^ 0) <<< 8 |  10 = 10 + 15, 5 = (5 ^ 10) <<< 7
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 16 | 11 = 11 + 12, 6 = (6 ^ 11) <<< 12 |
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 8 |  11 = 11 + 12,  6 = (6 ^ 11) <<< 7
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 16 | 8 = 8 + 13, 7 = (7 ^ 8) <<< 12 |
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 8 |  8 =  8 + 13, 7 = (7 ^ 8) <<< 7
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 16 | 9 = 9 + 14, 4 = (4 ^ 9) <<< 12 |
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 8 |  9 =  9 + 14, 4 = (4 ^ 9) <<< 7
     */
    CHACHA20_ROUND %xmm0, %xmm1, %xmm2, %xmm3

    vpshufd  $78, %xmm2, %xmm2       // {10 11 8  9} ==> {8  9  10 11}  01 00 11 10
    vpshufd  $147, %xmm1, %xmm1      // {5  6  7  4} ==> {4  5  6   7}  10 01 00 11
    vpshufd  $57, %xmm3, %xmm3       // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq   %r8
    jnz .Lchacha20_64_loop

    vpaddd  %xmm10, %xmm0, %xmm0
    vpaddd  %xmm11, %xmm1, %xmm1
    vpaddd  %xmm12, %xmm2, %xmm2
    vpaddd  %xmm13, %xmm3, %xmm3

    add     $1, %r11d
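    /*
     * WRITEBACK_64_AVX512 is defined in chacha20_x8664_common.S; it presumably
     * XORs the 64-byte keystream block held in the four registers with the data
     * at (IN), stores the result at (OUT), and advances both pointers.
     */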
    WRITEBACK_64_AVX512    IN, OUT, %xmm0, %xmm1, %xmm2, %xmm3
    mov  %r11d, 48(%rdi)
    jmp  .Lchacha20_end

.Lchacha20_128_start:

    vbroadcasti128 (%rdi),   %ymm0    // {0  1  2   3  0  1  2   3}
    vbroadcasti128 16(%rdi), %ymm1    // {4  5  6   7  4  5  6   7}
    vbroadcasti128 32(%rdi), %ymm2    // {8  9  10 11  8  9  10 11}
    vbroadcasti128 48(%rdi), %ymm3    // {12 13 14 15  12 13 14 15}
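    /*
     * g_addOne bumps the counter word of the upper 128-bit lane by 1, so the two
     * lanes of each ymm register hold the states of two consecutive blocks.
     */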
    vpaddd g_addOne(%rip), %ymm3, %ymm3

    vmovdqa32 %ymm0, %ymm16
    vmovdqa32 %ymm1, %ymm17
    vmovdqa32 %ymm2, %ymm18
    vmovdqa32 %ymm3, %ymm19
    mov $10, %r8

.Lchacha20_128_loop:

    CHACHA20_ROUND %ymm0, %ymm1, %ymm2, %ymm3

    vpshufd  $78, %ymm2, %ymm2       // {8  9  10 11} ==> {10 11 8  9}  01 00 11 10
    vpshufd  $57, %ymm1, %ymm1       // {4  5  6   7} ==> {5  6  7  4}  00 11 10 01
    vpshufd  $147, %ymm3, %ymm3      // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    CHACHA20_ROUND %ymm0, %ymm1, %ymm2, %ymm3

    vpshufd  $78, %ymm2, %ymm2       // {10 11 8  9} ==> {8  9  10 11}  01 00 11 10
    vpshufd  $147, %ymm1, %ymm1      // {5  6  7  4} ==> {4  5  6   7}  10 01 00 11
    vpshufd  $57, %ymm3, %ymm3       // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq   %r8
    jnz    .Lchacha20_128_loop

    vpaddd    %ymm16, %ymm0, %ymm0
    vpaddd    %ymm17, %ymm1, %ymm1
    vpaddd    %ymm18, %ymm2, %ymm2
    vpaddd    %ymm19, %ymm3, %ymm3

    vextracti32x4  $1, %ymm0, %xmm5     // ymm0 => {xmm0 xmm5}
    vextracti32x4  $1, %ymm1, %xmm6     // ymm1 => {xmm1 xmm6}
    vextracti32x4  $1, %ymm2, %xmm7     // ymm2 => {xmm2 xmm7}
    vextracti32x4  $1, %ymm3, %xmm8     // ymm3 => {xmm3 xmm8}

    WRITEBACK_64_AVX512     IN, OUT, %xmm0, %xmm1, %xmm2, %xmm3
    WRITEBACK_64_AVX512     IN, OUT, %xmm5, %xmm6, %xmm7, %xmm8

    add $2, %r11d
    sub $128, %rcx
    mov %r11d, 48(%rdi)
    jz  .Lchacha20_end
    jmp .Lchacha20_start

.Lchacha20_256_start:

    LOAD_1024_STATE %zmm0 %zmm1 %zmm2 %zmm3 %rdi
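    /*
     * g_addOne gives the four 128-bit lanes of zmm3 counter offsets +0..+3, so
     * one set of zmm registers covers four consecutive 64-byte blocks.
     */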
    vpaddd g_addOne(%rip), %zmm3, %zmm3

    vmovdqa64 %zmm0, %zmm16
    vmovdqa64 %zmm1, %zmm17
    vmovdqa64 %zmm2, %zmm18
    vmovdqa64 %zmm3, %zmm19
    mov $10, %r8

.Lchacha20_256_loop:

    CHACHA20_ROUND %zmm0, %zmm1, %zmm2, %zmm3

    vpshufd  $78, %zmm2, %zmm2       // {8  9  10 11} ==> {10 11 8  9}  01 00 11 10
    vpshufd  $57, %zmm1, %zmm1       // {4  5  6   7} ==> {5  6  7  4}  00 11 10 01
    vpshufd  $147, %zmm3, %zmm3      // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    CHACHA20_ROUND %zmm0, %zmm1, %zmm2, %zmm3

    vpshufd  $78, %zmm2, %zmm2       // {10 11 8  9} ==> {8  9  10 11}  01 00 11 10
    vpshufd  $147, %zmm1, %zmm1      // {5  6  7  4} ==> {4  5  6   7}  10 01 00 11
    vpshufd  $57, %zmm3, %zmm3       // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq   %r8
    jnz    .Lchacha20_256_loop

    vpaddd    %zmm16, %zmm0, %zmm0
    vpaddd    %zmm17, %zmm1, %zmm1
    vpaddd    %zmm18, %zmm2, %zmm2
    vpaddd    %zmm19, %zmm3, %zmm3

    vextracti64x2  $1, %zmm0, %xmm4
    vextracti64x2  $1, %zmm1, %xmm5
    vextracti64x2  $1, %zmm2, %xmm6
    vextracti64x2  $1, %zmm3, %xmm7

    vextracti64x2  $2, %zmm0, %xmm8
    vextracti64x2  $2, %zmm1, %xmm9
    vextracti64x2  $2, %zmm2, %xmm10
    vextracti64x2  $2, %zmm3, %xmm11

    vextracti64x2  $3, %zmm0, %xmm12
    vextracti64x2  $3, %zmm1, %xmm13
    vextracti64x2  $3, %zmm2, %xmm14
    vextracti64x2  $3, %zmm3, %xmm15

    WRITEBACK_64_AVX512 IN, OUT, %xmm0, %xmm1, %xmm2, %xmm3
    WRITEBACK_64_AVX512 IN, OUT, %xmm4, %xmm5, %xmm6, %xmm7
    WRITEBACK_64_AVX512 IN, OUT, %xmm8, %xmm9, %xmm10, %xmm11
    WRITEBACK_64_AVX512 IN, OUT, %xmm12, %xmm13, %xmm14, %xmm15

    add   $4, %r11d
    sub   $256, %rcx
    mov  %r11d, 48(%rdi)
    jz   .Lchacha20_end
    jmp  .Lchacha20_start

.Lchacha20_512_start:
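    /*
     * 8-block layout: every state word is broadcast into its own ymm register
     * (ymm0..ymm15, one word across 8 lanes); the counter word in ymm12 gets
     * per-lane offsets 0..7 from g_add8block, and ymm16..ymm31 keep a copy of
     * the original matrix for the final feed-forward addition.
     */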
    LOAD_512_STATE %ymm0, %ymm1, %ymm2, %ymm3, %rdi

    vpshufd $0b00000000, %ymm3, %ymm12
    vpshufd $0b01010101, %ymm3, %ymm13

    vpaddd g_add8block(%rip), %ymm12, %ymm12             // 0, 1, 2, 3, 4, 5, 6, 7
    vmovdqa32 %ymm12, %ymm28
    vpshufd $0b10101010, %ymm3, %ymm14
    vmovdqa32 %ymm13, %ymm29
    vpshufd $0b11111111, %ymm3, %ymm15
    vmovdqa32 %ymm14, %ymm30

    vpshufd $0b00000000, %ymm2, %ymm8
    vmovdqa32 %ymm15, %ymm31
    vpshufd $0b01010101, %ymm2, %ymm9
    vmovdqa32 %ymm8, %ymm24
    vpshufd $0b10101010, %ymm2, %ymm10
    vmovdqa32 %ymm9, %ymm25
    vpshufd $0b11111111, %ymm2, %ymm11
    vmovdqa32 %ymm10, %ymm26

    vpshufd $0b00000000, %ymm1, %ymm4
    vmovdqa32 %ymm11, %ymm27
    vpshufd $0b01010101, %ymm1, %ymm5
    vmovdqa32 %ymm4, %ymm20
    vpshufd $0b10101010, %ymm1, %ymm6
    vmovdqa32 %ymm5, %ymm21
    vpshufd $0b11111111, %ymm1, %ymm7
    vmovdqa32 %ymm6, %ymm22

    vpshufd $0b11111111, %ymm0, %ymm3
    vmovdqa32 %ymm7, %ymm23
    vpshufd $0b10101010, %ymm0, %ymm2
    vmovdqa32 %ymm3, %ymm19
    vpshufd $0b01010101, %ymm0, %ymm1
    vmovdqa32 %ymm2, %ymm18
    vpshufd $0b00000000, %ymm0, %ymm0
    vmovdqa32 %ymm1, %ymm17
    vmovdqa32 %ymm0, %ymm16
    mov $10, %r8

.Lchacha20_512_loop:

    CHACHA20_LOOP_AVX512 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, \
                        %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15

    decq  %r8
    jnz .Lchacha20_512_loop

    /* ymm16~31: original matrix */
    vpaddd %ymm16, %ymm0, %ymm0
    vpaddd %ymm17, %ymm1, %ymm1
    vpaddd %ymm18, %ymm2, %ymm2
    vpaddd %ymm19, %ymm3, %ymm3
    vpaddd %ymm20, %ymm4, %ymm4
    vpaddd %ymm21, %ymm5, %ymm5
    vpaddd %ymm22, %ymm6, %ymm6
    vpaddd %ymm23, %ymm7, %ymm7
    vpaddd %ymm24, %ymm8, %ymm8
    vpaddd %ymm25, %ymm9, %ymm9
    vpaddd %ymm26, %ymm10, %ymm10
    vpaddd %ymm27, %ymm11, %ymm11
    vpaddd %ymm28, %ymm12, %ymm12
    vpaddd %ymm29, %ymm13, %ymm13
    vpaddd %ymm30, %ymm14, %ymm14
    vpaddd %ymm31, %ymm15, %ymm15

    MATRIX_TO_STATE %ymm0, %ymm1, %ymm2, %ymm3, %ymm20, %ymm21              // set state 0, 3, 9, 10
    MATRIX_TO_STATE %ymm4, %ymm5, %ymm6, %ymm7, %ymm22, %ymm23              // set state 4, 7, 13, 14
    MATRIX_TO_STATE %ymm8, %ymm9, %ymm10, %ymm11, %ymm1, %ymm2              // set state 8, 11, 1, 2
    MATRIX_TO_STATE %ymm12, %ymm13, %ymm14, %ymm15, %ymm5, %ymm6            // set state 12, 15, 5, 6

    /*
     * {A0 A1 A2 A3 E0 E1 E2 E3}
     * {B0 B1 B2 B3 F0 F1 F2 F3}
     * {C0 C1 C2 C3 G0 G1 G2 G3}
     * {D0 D1 D2 D3 H0 H1 H2 H3}
     * ...
     * =>
     * {A0 A1 A2 A3 B0 B1 B2 B3}
     * {C0 C1 C2 C3 D0 D1 D2 D3}
     * ....
     */

    CONVERT_Y %ymm0, %ymm4, $0 IN OUT
    CONVERT_Y %ymm8, %ymm12, $0 IN OUT
    CONVERT_Y %ymm3, %ymm7, $0 IN OUT
    CONVERT_Y %ymm11, %ymm15, $0 IN OUT
    CONVERT_Y %ymm20, %ymm22, $0 IN OUT
    CONVERT_Y %ymm1, %ymm5, $0 IN OUT
    CONVERT_Y %ymm21, %ymm23, $0 IN OUT
    CONVERT_Y %ymm2, %ymm6, $0 IN OUT
    CONVERT_Y %ymm0, %ymm4, $1 IN OUT
    CONVERT_Y %ymm8, %ymm12, $1 IN OUT
    CONVERT_Y %ymm3, %ymm7, $1 IN OUT
    CONVERT_Y %ymm11, %ymm15, $1 IN OUT
    CONVERT_Y %ymm20, %ymm22, $1 IN OUT
    CONVERT_Y %ymm1, %ymm5, $1 IN OUT
    CONVERT_Y %ymm21, %ymm23, $1 IN OUT
    CONVERT_Y %ymm2, %ymm6, $1 IN OUT

    add   $8, %r11d
    sub   $512, %rcx
    mov   %r11d, 48(%rdi)
    jz   .Lchacha20_end
    jmp  .Lchacha20_start

.Lchacha20_1024_start:

    LOAD_1024_STATE %zmm0 %zmm1 %zmm2 %zmm3 %rdi

    STATE_TO_MATRIX_Z_AVX512 %zmm0, %zmm16, %zmm17, %zmm18, %zmm19
    STATE_TO_MATRIX_Z_AVX512 %zmm1, %zmm20, %zmm21, %zmm22, %zmm23
    STATE_TO_MATRIX_Z_AVX512 %zmm2, %zmm24, %zmm25, %zmm26, %zmm27
    STATE_TO_MATRIX_Z_AVX512 %zmm3, %zmm28, %zmm29, %zmm30, %zmm31
    vpaddd g_add16block(%rip), %zmm28, %zmm28
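    /*
     * 16-block layout: STATE_TO_MATRIX_Z_AVX512 broadcasts each state word across
     * a full zmm register (zmm16..zmm31), and g_add16block gives the counter word
     * in zmm28 per-lane offsets 0..15, so each pass produces 16 blocks (1024 bytes).
     * zmm16..zmm31 keep the original matrix for the feed-forward addition; the
     * working copy lives in zmm0..zmm15.
     */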

    vmovdqa64 %zmm16, %zmm0
    vmovdqa64 %zmm17, %zmm1
    vmovdqa64 %zmm18, %zmm2
    vmovdqa64 %zmm19, %zmm3
    vmovdqa64 %zmm20, %zmm4
    vmovdqa64 %zmm21, %zmm5
    vmovdqa64 %zmm22, %zmm6
    vmovdqa64 %zmm23, %zmm7
    vmovdqa64 %zmm24, %zmm8
    vmovdqa64 %zmm25, %zmm9
    vmovdqa64 %zmm26, %zmm10
    vmovdqa64 %zmm27, %zmm11
    vmovdqa64 %zmm28, %zmm12
    vmovdqa64 %zmm29, %zmm13
    vmovdqa64 %zmm30, %zmm14
    vmovdqa64 %zmm31, %zmm15
    mov $10, %r8
    jmp .Lchacha20_1024_loop

.Lchacha20_1024_start_cont:

    vmovdqa32 %zmm16, %zmm0
    vmovdqa32 %zmm17, %zmm1
    vmovdqa32 %zmm18, %zmm2
    vmovdqa32 %zmm19, %zmm3
    vmovdqa32 %zmm20, %zmm4
    vmovdqa32 %zmm21, %zmm5
    vmovdqa32 %zmm22, %zmm6
    vmovdqa32 %zmm23, %zmm7
    vmovdqa32 %zmm24, %zmm8
    vmovdqa32 %zmm25, %zmm9
    vmovdqa32 %zmm26, %zmm10
    vmovdqa32 %zmm27, %zmm11
    vmovdqa32 %zmm28, %zmm12
    vmovdqa32 %zmm29, %zmm13
    vpaddd g_addsecond16block(%rip), %zmm12, %zmm12                   // advance every counter lane by 16 for the next 16 blocks
    vmovdqa32 %zmm30, %zmm14
    vmovdqa32 %zmm31, %zmm15
    vmovdqa32 %zmm12, %zmm28
    mov $10, %r8

.Lchacha20_1024_loop:

    CHACHA20_LOOP_AVX512    %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, %zmm8, %zmm9, \
                            %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, %zmm15
    decq  %r8
    jnz .Lchacha20_1024_loop

    vpaddd %zmm16, %zmm0, %zmm0
    vpaddd %zmm17, %zmm1, %zmm1
    vpaddd %zmm18, %zmm2, %zmm2
    vpaddd %zmm19, %zmm3, %zmm3
    vpaddd %zmm20, %zmm4, %zmm4
    vpaddd %zmm21, %zmm5, %zmm5
    vpaddd %zmm22, %zmm6, %zmm6
    vpaddd %zmm23, %zmm7, %zmm7
    vpaddd %zmm24, %zmm8, %zmm8
    vpaddd %zmm25, %zmm9, %zmm9
    vpaddd %zmm26, %zmm10, %zmm10
    vpaddd %zmm27, %zmm11, %zmm11
    vpaddd %zmm28, %zmm12, %zmm12
    vpaddd %zmm29, %zmm13, %zmm13
    vpaddd %zmm30, %zmm14, %zmm14
    vpaddd %zmm31, %zmm15, %zmm15

    /* spill original matrix rows zmm16~19 to the stack; they are reused as scratch below and restored later */
    vmovdqa64 %zmm16,    (%rsp)
    vmovdqa64 %zmm17,  64(%rsp)
    vmovdqa64 %zmm18, 128(%rsp)
    vmovdqa64 %zmm19, 192(%rsp)

    /* store matrix 9, 10, 13, 14 in zmm16, 17, 18, 19 */
    vmovdqa64 %zmm9,  %zmm16                                            // zmm16: encrypt matrix zmm9
    vmovdqa64 %zmm10, %zmm17                                            // zmm17: encrypt matrix zmm10
    vmovdqa64 %zmm13, %zmm18                                            // zmm18: encrypt matrix zmm13
    vmovdqa64 %zmm14, %zmm19                                            // zmm19: encrypt matrix zmm14

    /* zmm0~15: encrypt matrix 0 ~ 15 */
    MATRIX_TO_STATE %zmm0, %zmm1, %zmm2, %zmm3, %zmm9, %zmm10           // set state 0, 3, 9, 10
    MATRIX_TO_STATE %zmm4, %zmm5, %zmm6, %zmm7, %zmm13, %zmm14          // set state 4, 7, 13, 14
    MATRIX_TO_STATE %zmm8, %zmm16, %zmm17, %zmm11, %zmm1, %zmm2         // set state 8, 11, 1, 2
    MATRIX_TO_STATE %zmm12, %zmm18, %zmm19, %zmm15, %zmm5, %zmm6        // set state 12, 15, 5, 6

    /*
     * {A0 A1 A2 A3 E0 E1 E2 E3 I0 I1 I2 I3 M0 M1 M2 M3}
     * {B0 B1 B2 B3 F0 F1 F2 F3 J0 J1 J2 J3 N0 N1 N2 N3}
     * {C0 C1 C2 C3 G0 G1 G2 G3 K0 K1 K2 K3 O0 O1 O2 O3}
     * {D0 D1 D2 D3 H0 H1 H2 H3 L0 L1 L2 L3 P0 P1 P2 P3}
     * ...
     * =>
     * {A0 A1 A2 A3 B0 B1 B2 B3 C0 C1 C2 C3 D0 D1 D2 D3}
     * {E0 E1 E2 E3 F0 F1 F2 F3 G0 G1 G2 G3 H0 H1 H2 H3}
     * {I0 I1 I2 I3 J0 J1 J2 J3 K0 K1 K2 K3 L0 L1 L2 L3}
     * ....
     */

    CONVERT_Z %zmm0, %zmm4, %zmm8, %zmm12, $0 IN OUT
    CONVERT_Z %zmm3, %zmm7, %zmm11, %zmm15, $0 IN OUT
    CONVERT_Z %zmm9, %zmm13, %zmm1, %zmm5, $0 IN OUT
    CONVERT_Z %zmm10, %zmm14, %zmm2, %zmm6, $0 IN OUT
    CONVERT_Z %zmm0, %zmm4, %zmm8, %zmm12, $1 IN OUT
    CONVERT_Z %zmm3, %zmm7, %zmm11, %zmm15, $1 IN OUT
    CONVERT_Z %zmm9, %zmm13, %zmm1, %zmm5, $1 IN OUT
    CONVERT_Z %zmm10, %zmm14, %zmm2, %zmm6, $1 IN OUT
    CONVERT_Z %zmm0, %zmm4, %zmm8, %zmm12, $2 IN OUT
    CONVERT_Z %zmm3, %zmm7, %zmm11, %zmm15, $2 IN OUT
    CONVERT_Z %zmm9, %zmm13, %zmm1, %zmm5, $2 IN OUT
    CONVERT_Z %zmm10, %zmm14, %zmm2, %zmm6, $2 IN OUT
    CONVERT_Z %zmm0, %zmm4, %zmm8, %zmm12, $3 IN OUT
    CONVERT_Z %zmm3, %zmm7, %zmm11, %zmm15, $3 IN OUT
    CONVERT_Z %zmm9, %zmm13, %zmm1, %zmm5, $3 IN OUT
    CONVERT_Z %zmm10, %zmm14, %zmm2, %zmm6, $3 IN OUT

    /* restore the original matrix rows from the stack into zmm16~19 */
    vmovdqa64    (%rsp), %zmm16
    vmovdqa64  64(%rsp), %zmm17
    vmovdqa64 128(%rsp), %zmm18
    vmovdqa64 192(%rsp), %zmm19

    add  $16, %r11d
    sub  $1024, %rcx
    mov  %r11d, 48(%rdi)
    jz   .Lchacha20_clear
    cmp  $1024, %rcx
    jae  .Lchacha20_1024_start_cont
    jmp  .Lchacha20_start

.Lchacha20_clear:
    /* clear sensitive data left on the stack */
    vpxord %zmm0, %zmm0, %zmm0
    vmovdqa64 %zmm0,    (%rsp)
    vmovdqa64 %zmm0,  64(%rsp)
    vmovdqa64 %zmm0, 128(%rsp)
    vmovdqa64 %zmm0, 192(%rsp)

.Lchacha20_end:
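    /* clear the register copy of the block counter and restore the caller's stack pointer */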
    xor   %r11d, %r11d
    mov   %r9, %rsp
    .cfi_endproc
    ret
.size CHACHA20_Update,.-CHACHA20_Update

#endif