/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

#include "chacha20_x8664_common.S"
.text
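
/*
 * Byte-shuffle masks for vpshufb: rotating a 32-bit word left by 16 or by 8 bits only moves
 * whole bytes, so a single byte permute per dword replaces the shift/shift/or sequence that
 * the 12- and 7-bit rotations still need (scalar equivalents: (x << 16) | (x >> 16) and
 * (x << 8) | (x >> 24)). The _128 and _512 variants repeat the 16-byte pattern so that both
 * 128-bit lanes of a ymm register are permuted the same way. The g_add*block constants are
 * the per-lane counter increments for the 4- and 8-block paths, and g_addOne bumps the block
 * counter of the second state in the 128-byte path.
 */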
.align    64
g_ror16_128:
    .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd, \
            0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size   g_ror16_128, .-g_ror16_128
.align    64
g_ror8_128:
    .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe, \
            0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size   g_ror8_128, .-g_ror8_128
.align    64
g_ror16:
    .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size   g_ror16, .-g_ror16
.align    64
g_ror8:
    .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size   g_ror8, .-g_ror8
.align    64
g_ror16_512:
    .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd, \
            0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size   g_ror16_512, .-g_ror16_512
.align    64
g_ror8_512:
    .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe, \
            0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size   g_ror8_512, .-g_ror8_512
.align    64
g_add4block:
    .long   0, 1, 2, 3
    .size   g_add4block, .-g_add4block
.align    64
g_addsecond4block:
    .long   4, 4, 4, 4
    .size   g_addsecond4block, .-g_addsecond4block
.align    64
g_add8block:
    .long   0, 1, 2, 3, 4, 5, 6, 7
    .size   g_add8block, .-g_add8block
.align    64
g_addsecond8block:
    .long   8, 8, 8, 8, 8, 8, 8, 8
    .size   g_addsecond8block, .-g_addsecond8block
.align    64
g_addOne:
    .long   0, 0, 0, 0, 1, 0, 0, 0
    .size   g_addOne, .-g_addOne

.set  IN, %rsi
.set OUT, %rdx

/* QUARTERROUND for one state */
.macro CHACHA20_ROUND s0 s1 s2 s3 cur ror16 ror8
    vpaddd  \s1, \s0, \s0
    vpxor   \s0, \s3, \s3
    vpshufb (\ror16), \s3, \s3

    vpaddd  \s3, \s2, \s2
    vpxor   \s2, \s1, \s1
    vmovdqa \s1, \cur
    vpsrld  $20, \s1, \s1
    vpslld  $12, \cur, \cur
    vpor    \cur, \s1, \s1

    vpaddd  \s1, \s0, \s0
    vpxor   \s0, \s3, \s3
    vpshufb (\ror8), \s3, \s3

    vpaddd  \s3, \s2, \s2
    vpxor   \s2, \s1, \s1
    vmovdqa \s1, \cur
    vpsrld  $25, \s1, \s1
    vpslld  $7, \cur, \cur
    vpor    \cur, \s1, \s1
.endm
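
/*
 * For reference, a C sketch of the quarter-round that the macro above applies to every dword
 * lane (illustration only, not part of the build; ROTL32 and QuarterRound are local helper
 * names, not openHiTLS APIs):
 *
 *     #include <stdint.h>
 *     #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
 *     static void QuarterRound(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
 *     {
 *         *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
 *         *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
 *         *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
 *         *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
 *     }
 *
 * The 16- and 8-bit rotations use vpshufb with the masks defined above; the 12- and 7-bit
 * rotations use vpsrld/vpslld/vpor with \cur as the scratch register.
 */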

/* QUARTERROUND for two states */
.macro CHACHA20_2_ROUND s0 s1 s2 s3 cur s4 s5 s6 s7 cur1 ror16 ror8
    vpaddd  \s1, \s0, \s0
    vpxor   \s0, \s3, \s3
    vpshufb (\ror16), \s3, \s3

    vpaddd  \s3, \s2, \s2
    vpxor   \s2, \s1, \s1
    vmovdqa \s1, \cur
    vpsrld  $20, \s1, \s1
    vpslld  $12, \cur, \cur
    vpor    \cur, \s1, \s1

    vpaddd  \s1, \s0, \s0
    vpxor   \s0, \s3, \s3
    vpshufb (\ror8), \s3, \s3

    vpaddd  \s3, \s2, \s2
    vpxor   \s2, \s1, \s1
    vmovdqa \s1, \cur
    vpsrld  $25, \s1, \s1
    vpslld  $7, \cur, \cur
    vpor    \cur, \s1, \s1

    vpaddd  \s5, \s4, \s4
    vpxor   \s4, \s7, \s7
    vpshufb (\ror16), \s7, \s7

    vpaddd  \s7, \s6, \s6
    vpxor   \s6, \s5, \s5
    vmovdqa \s5, \cur1
    vpsrld  $20, \s5, \s5
    vpslld  $12, \cur1, \cur1
    vpor    \cur1, \s5, \s5

    vpaddd  \s5, \s4, \s4
    vpxor   \s4, \s7, \s7
    vpshufb (\ror8), \s7, \s7

    vpaddd  \s7, \s6, \s6
    vpxor   \s6, \s5, \s5
    vmovdqa \s5, \cur1
    vpsrld  $25, \s5, \s5
    vpslld  $7, \cur1, \cur1
    vpor    \cur1, \s5, \s5
.endm
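
/*
 * CHACHA20_2_ROUND is the same quarter-round applied back to back to two independent register
 * sets, so one invocation advances two states; because the two dependency chains are
 * independent, the out-of-order core can overlap their latencies.
 */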

/* add the original matrix to the current matrix */
.macro LASTADD_MATRIX S0 S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 S11 S12 S13 S14 S15 PER
    vpaddd (%rsp), \S0, \S0
    vpaddd 1*\PER(%rsp), \S1, \S1
    vpaddd 2*\PER(%rsp), \S2, \S2
    vpaddd 3*\PER(%rsp), \S3, \S3
    vpaddd 4*\PER(%rsp), \S4, \S4
    vpaddd 5*\PER(%rsp), \S5, \S5
    vpaddd 6*\PER(%rsp), \S6, \S6
    vpaddd 7*\PER(%rsp), \S7, \S7
    vpaddd 8*\PER(%rsp), \S8, \S8
    vpaddd 9*\PER(%rsp), \S9, \S9
    vpaddd 10*\PER(%rsp), \S10, \S10
    vpaddd 11*\PER(%rsp), \S11, \S11
    vpaddd 12*\PER(%rsp), \S12, \S12
    vpaddd 13*\PER(%rsp), \S13, \S13
    vpaddd 14*\PER(%rsp), \S14, \S14
    vpaddd 15*\PER(%rsp), \S15, \S15
.endm
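
/*
 * LASTADD_MATRIX is the ChaCha20 feed-forward step: after the 20 rounds, each working-state
 * register is added to the matching row of the initial matrix that the multi-block paths keep
 * on the stack, with PER bytes between consecutive state words.
 */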

/* write output for left part of 512 bytes (ymm) */
.macro WRITE_BACK_512_L inpos outpos s0 s1 s2 s3 s4 s5 s6 s7 out0 out1 out2 out3

    /* {A0 B0 C0 D0 E0 F0 G0 H0} {A1 B1 C1 D1 E1 F1 G1 H1} => {A0 B0 C0 D0 A1 B1 C1 D1} */
    vperm2i128        $0x20, \s1, \s0, \out0
    vpxor           (\inpos), \out0, \out0
    vmovdqu         \out0, (\outpos)                      // write back output

    vperm2i128        $0x20, \s3, \s2, \out1
    vpxor           32(\inpos), \out1, \out1
    vmovdqu         \out1, 32(\outpos)

    vperm2i128        $0x20, \s5, \s4, \out2
    vpxor           64(\inpos), \out2, \out2                // write back output
    vmovdqu         \out2, 64(\outpos)

    vperm2i128        $0x20, \s7, \s6, \out3
    vpxor           96(\inpos), \out3, \out3
    vmovdqu         \out3, 96(\outpos)
.endm

/* write output for right part of 512 bytes (ymm) */
.macro WRITE_BACK_512_R inpos outpos s0 s1 s2 s3 s4 s5 s6 s7

    /* {A0 B0 C0 D0 E0 F0 G0 H0} {A1 B1 C1 D1 E1 F1 G1 H1} => {E0 F0 G0 H0 E1 F1 G1 H1} */
    vperm2i128        $0x31, \s1, \s0, \s1
    vpxor           (\inpos), \s1, \s1
    vmovdqu         \s1, (\outpos)                 // write back output

    vperm2i128        $0x31, \s3, \s2, \s3
    vpxor           32(\inpos), \s3, \s3
    vmovdqu         \s3, 32(\outpos)

    vperm2i128        $0x31, \s5, \s4, \s5
    vpxor           64(\inpos), \s5, \s5
    vmovdqu         \s5, 64(\outpos)              // write back output

    vperm2i128        $0x31, \s7, \s6, \s7
    vpxor           96(\inpos), \s7, \s7
    vmovdqu         \s7, 96(\outpos)
.endm
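
/*
 * In both write-back macros, vperm2i128 selects 128-bit lanes: immediate 0x20 combines the
 * low lanes of its two sources and 0x31 combines the high lanes. In the 512-byte path the low
 * and high lanes of each ymm register belong to different 64-byte blocks, so the _L and _R
 * variants emit them at the output positions of their respective blocks, XORing 128 bytes of
 * input each.
 */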

/*
 * Processing 64 bytes: 4 xmm registers, one state, four consecutive words per register
 * xmm0 ~ xmm3:
 * xmm0 {0,  1,  2,  3}
 * xmm1 {4,  5,  6,  7}
 * xmm2 {8,  9,  10, 11}
 * xmm3 {12, 13, 14, 15}
 *
 * Processing 128 bytes: 4 ymm registers, two states, one state per 128-bit lane
 * ymm0 ~ ymm3:
 * ymm0 {0,  1,  2,  3  | 0,  1,  2,  3}
 * ymm1 {4,  5,  6,  7  | 4,  5,  6,  7}
 * ymm2 {8,  9,  10, 11 | 8,  9,  10, 11}
 * ymm3 {12, 13, 14, 15 | 12, 13, 14, 15}
 *
 * Processing 256 bytes: 16 xmm registers, four states, register i holds word i of every state
 * xmm0 ~ xmm15:
 * xmm0 {0,  0,  0,  0}
 * xmm1 {1,  1,  1,  1}
 * xmm2 {2,  2,  2,  2}
 * xmm3 {3,  3,  3,  3}
 * ...
 * xmm15 {15, 15, 15, 15}
 *
 * Processing 512 bytes: 16 ymm registers, eight states, register i holds word i of every state
 * ymm0 ~ ymm15:
 * ymm0 {0,  0,  ..., 0}
 * ymm1 {1,  1,  ..., 1}
 * ymm2 {2,  2,  ..., 2}
 * ymm3 {3,  3,  ..., 3}
 * ...
 * ymm15 {15, 15, ..., 15}
 */
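
/*
 * A C sketch of the "vertical" layout used by the 256- and 512-byte paths (illustration only,
 * not part of the build; Vec and BuildVerticalState are hypothetical names): vector register i
 * holds state word i of N parallel blocks, so one SIMD quarter-round advances all N blocks at
 * once. N is 4 for xmm and 8 for ymm.
 *
 *     #include <stdint.h>
 *     #define N 4                                 // 4 dword lanes per xmm register
 *     typedef struct { uint32_t lane[N]; } Vec;   // stand-in for one vector register
 *
 *     static void BuildVerticalState(Vec v[16], const uint32_t state[16])
 *     {
 *         for (int word = 0; word < 16; word++) {
 *             for (int blk = 0; blk < N; blk++) {
 *                 v[word].lane[blk] = state[word];     // broadcast word i to every lane
 *             }
 *         }
 *         for (int blk = 0; blk < N; blk++) {
 *             v[12].lane[blk] = state[12] + blk;       // per-block counter, cf. g_add4block
 *         }
 *     }
 *
 * The 64- and 128-byte paths instead keep one whole state per 128-bit lane (four consecutive
 * words per register), which is why they need the vpshufd lane rotations between the column
 * and diagonal rounds.
 */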

/*
 * @Interconnection with the C interface: void CHACHA20_Update(CRYPT_CHACHA20_Ctx *ctx, const uint8_t *in, uint8_t *out, uint32_t len);
 * @brief ChaCha20 encryption/decryption (keystream generation XORed with the input)
 * @param ctx [IN] Algorithm context, set up and passed in by the C caller
 * @param in [IN] Data to be encrypted
 * @param out [OUT] Data after encryption
 * @param len [IN] Length of the data to be encrypted, in bytes
 * %rsp cannot be used as a scratch register, so 15 general-purpose registers are available
 * for ctx, in, out, len and temporaries. One round needs all 16 vector registers for the
 * state, so the state words are processed in two groups:
 * {0,  1,  4,  5,  8,   9,  12, 13}
 * {2,  3,  6,  7,  10,  11, 14, 15}
 */
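
/*
 * Register mapping on entry (System V AMD64 calling convention): %rdi = ctx, %rsi = in,
 * %rdx = out, %rcx = len. The code only relies on the 64-byte ChaCha20 state sitting at the
 * start of CRYPT_CHACHA20_Ctx, with the 32-bit block counter (state word 12) at byte offset
 * 48: it is loaded into %r11d, advanced by the number of blocks produced, and written back
 * before each exit path.
 */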

.globl CHACHA20_Update
.type CHACHA20_Update,%function
.align 64
CHACHA20_Update:
    .cfi_startproc
    mov     48(%rdi), %r11d
    mov     %rsp, %rax
    subq    $1024,%rsp
    andq    $-512,%rsp

.Lchacha20_start:
    cmp  $512, %rcx
    jae  .Lchacha20_512_start
    cmp  $256, %rcx
    jae  .Lchacha20_256_start
    cmp  $128, %rcx
    jae  .Lchacha20_128_start
    cmp  $64, %rcx
    jae  .Lchacha20_64_start
    jmp  .Lchacha20_end

.Lchacha20_64_start:

    LOAD_STATE %xmm0, %xmm1, %xmm2, %xmm3, %rdi

    vmovdqa %xmm0, %xmm10
    vmovdqa %xmm1, %xmm11
    vmovdqa %xmm2, %xmm12
    vmovdqa %xmm3, %xmm13

    leaq    g_ror16(%rip), %r9
    leaq    g_ror8(%rip), %r10
    mov     $10, %r8

.Lchacha20_64_loop:

    /* 0 = 0 + 4, 12 = (12 ^ 0) <<< 16 | 8 = 8 + 12, 4 = (4 ^ 8) <<< 12 |
     * 0 = 0 + 4, 12 = (12 ^ 0) <<< 8 |  8 = 8 + 12, 4 = (4 ^ 8) <<< 7
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 16 | 9 = 9 + 13, 5 = (5 ^ 9) <<< 12 |
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 8 |  9 = 9 + 13, 5 = (5 ^ 9) <<< 7
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 16 | 10 = 10 + 14, 6 = (6 ^ 10) <<< 12 |
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 8 |  10 = 10 + 14, 6 = (6 ^ 10) <<< 7
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 16 | 11 = 11 + 15, 7 = (7 ^ 11) <<< 12 |
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 8 |  11 = 11 + 15, 7 = (7 ^ 11) <<< 7
     */
    CHACHA20_ROUND %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %r9, %r10

    vpshufd  $78, %xmm2, %xmm2       // {8  9  10 11} ==> {10 11 8  9}  01 00 11 10
    vpshufd  $57, %xmm1, %xmm1       // {4  5  6   7} ==> {5  6  7  4}  00 11 10 01
    vpshufd  $147, %xmm3, %xmm3      // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    /* 0 = 0 + 5, 15 = (15 ^ 0) <<< 16 | 10 = 10 + 15, 5 = (5 ^ 10) <<< 12 |
     * 0 = 0 + 5, 15 = (15 ^ 0) <<< 8 |  10 = 10 + 15, 5 = (5 ^ 10) <<< 7
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 16 | 11 = 11 + 12, 6 = (6 ^ 11) <<< 12 |
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 8 |  11 = 11 + 12,  6 = (6 ^ 11) <<< 7
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 16 | 8 = 8 + 13, 7 = (7 ^ 8) <<< 12 |
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 8 |  8 =  8 + 13, 7 = (7 ^ 8) <<< 7
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 16 | 9 = 9 + 14, 4 = (4 ^ 9) <<< 12 |
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 8 |  9 =  9 + 14, 4 = (4 ^ 9) <<< 7
     */
    CHACHA20_ROUND %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %r9, %r10

    vpshufd  $78, %xmm2, %xmm2       // {10 11 8  9} ==> {8  9  10 11}  01 00 11 10
    vpshufd  $147, %xmm1, %xmm1      // {5  6  7  4} ==> {4  5  6   7}  10 01 00 11
    vpshufd  $57, %xmm3, %xmm3       // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq  %r8
    jnz   .Lchacha20_64_loop

    vpaddd  %xmm10, %xmm0, %xmm0
    vpaddd  %xmm11, %xmm1, %xmm1
    vpaddd  %xmm12, %xmm2, %xmm2
    vpaddd  %xmm13, %xmm3, %xmm3

    add     $1, %r11d
    vpxor   0(IN),  %xmm0, %xmm4
    vpxor   16(IN), %xmm1, %xmm5
    vpxor   32(IN), %xmm2, %xmm6
    vpxor   48(IN), %xmm3, %xmm7

    vmovdqu %xmm4, 0(OUT)
    vmovdqu %xmm5, 16(OUT)
    vmovdqu %xmm6, 32(OUT)
    vmovdqu %xmm7, 48(OUT)

    add $64, IN
    add $64, OUT

    mov %r11d, 48(%rdi)
    jmp .Lchacha20_end

.Lchacha20_128_start:

    vbroadcasti128 (%rdi),   %ymm0    // {0  1  2   3  0  1  2   3}
    vbroadcasti128 16(%rdi), %ymm1    // {4  5  6   7  4  5  6   7}
    vbroadcasti128 32(%rdi), %ymm2    // {8  9  10 11  8  9  10 11}
    vbroadcasti128 48(%rdi), %ymm3    // {12 13 14 15  12 13 14 15}

    vpaddd g_addOne(%rip), %ymm3, %ymm3

    vmovdqa %ymm0, %ymm12
    vmovdqa %ymm1, %ymm13
    vmovdqa %ymm2, %ymm14
    vmovdqa %ymm3, %ymm15

    leaq    g_ror16_128(%rip), %r9
    leaq    g_ror8_128(%rip), %r10
    mov     $10, %r8

.Lchacha20_128_loop:

    CHACHA20_ROUND %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %r9, %r10

    vpshufd  $78, %ymm2, %ymm2       // {8  9  10 11} ==> {10 11 8  9}  01 00 11 10
    vpshufd  $57, %ymm1, %ymm1       // {4  5  6   7} ==> {5  6  7  4}  00 11 10 01
    vpshufd  $147, %ymm3, %ymm3      // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    CHACHA20_ROUND %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %r9, %r10

    vpshufd  $78, %ymm2, %ymm2       // {10 11 8  9} ==> {8  9  10 11}  01 00 11 10
    vpshufd  $147, %ymm1, %ymm1      // {5  6  7  4} ==> {4  5  6   7}  10 01 00 11
    vpshufd  $57, %ymm3, %ymm3       // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq %r8
    jnz  .Lchacha20_128_loop

    vpaddd  %ymm12, %ymm0, %ymm0
    vpaddd  %ymm13, %ymm1, %ymm1
    vpaddd  %ymm14, %ymm2, %ymm2
    vpaddd  %ymm15, %ymm3, %ymm3

    vextracti128 $1, %ymm0, %xmm4     // ymm0 => {xmm0 xmm4}
    vextracti128 $1, %ymm1, %xmm5     // ymm1 => {xmm1 xmm5}
    vextracti128 $1, %ymm2, %xmm6     // ymm2 => {xmm2 xmm6}
    vextracti128 $1, %ymm3, %xmm7     // ymm3 => {xmm3 xmm7}

    WRITEBACK_64_AVX2   IN, OUT, %xmm0, %xmm1, %xmm2, %xmm3
    add   $2, %r11d
    WRITEBACK_64_AVX2   IN, OUT, %xmm4, %xmm5, %xmm6, %xmm7
    mov  %r11d, 48(%rdi)

    sub $128, %rcx
    jz  .Lchacha20_end
    jmp .Lchacha20_start

.Lchacha20_256_start:

    LOAD_STATE %xmm0, %xmm1, %xmm2, %xmm3, %rdi
    STATE_TO_MATRIX %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, \
                    %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, 0, 16, g_add4block(%rip)

    /* move xmm8~11 into stack for CHACHA20_LOOP encryption */
    vmovdqa  %xmm8, 256(%rsp)
    vmovdqa  %xmm9, 256+16(%rsp)
    vmovdqa %xmm10, 256+32(%rsp)
    vmovdqa %xmm11, 256+48(%rsp)

    leaq    g_ror16(%rip), %r9
    leaq    g_ror8(%rip), %r10

    mov     $10, %r8

.Lchacha20_256_loop:

    CHACHA20_LOOP   %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10 \
                    %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, 256, 16, %rsp, %r9, %r10

    decq %r8
    jnz  .Lchacha20_256_loop

    /* xmm0~15: encrypt matrix 0 ~ 15 */
    vmovdqa 256+32(%rsp), %xmm10                                            // rsp32: encrypt matrix xmm10
    vmovdqa 256+48(%rsp), %xmm11

    LASTADD_MATRIX  %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10 \
                    %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, 16

    /* store xmm9, 10, 13, 14 in stack */
    vmovdqa %xmm9,  256(%rsp)                                               // rsp 0: encrypt matrix xmm9
    vmovdqa %xmm10, 256+32(%rsp)                                            // rsp32: encrypt matrix xmm10
    vmovdqa %xmm13, 256+16(%rsp)                                            // rsp16: encrypt matrix xmm13
    vmovdqa %xmm14, 256+48(%rsp)                                            // rsp48: encrypt matrix xmm14

    MATRIX_TO_STATE %xmm0, %xmm1, %xmm2, %xmm3, %xmm9, %xmm10               // set state 0, 3, 9, 10
    MATRIX_TO_STATE %xmm4, %xmm5, %xmm6, %xmm7, %xmm13, %xmm14              // set state 4, 7, 13, 14

    vmovdqa    256(%rsp), %xmm5
    vmovdqa 256+32(%rsp), %xmm6
    vmovdqa        %xmm9, 256(%rsp)
    vmovdqa       %xmm10, 256+32(%rsp)

    MATRIX_TO_STATE %xmm8, %xmm5, %xmm6, %xmm11, %xmm1, %xmm2               // set state 8, 11, 1, 2

    vmovdqa 256+16(%rsp), %xmm9
    vmovdqa 256+48(%rsp), %xmm10
    vmovdqa       %xmm13, 256+16(%rsp)
    vmovdqa       %xmm14, 256+48(%rsp)

    MATRIX_TO_STATE %xmm12, %xmm9, %xmm10, %xmm15, %xmm5, %xmm6             // set state 12, 15, 5, 6

    vmovdqa    256(%rsp), %xmm9                                             // rsp 0: state 9
    vmovdqa 256+32(%rsp), %xmm10                                            // rsp32: state 10
    vmovdqa 256+16(%rsp), %xmm13                                            // rsp16: state 13
    vmovdqa 256+48(%rsp), %xmm14                                            // rsp48: state 14

    /* finish state calculation, now write result to output */
    WRITEBACK_64_AVX2 IN, OUT, %xmm0, %xmm4, %xmm8, %xmm12
    WRITEBACK_64_AVX2 IN, OUT, %xmm3, %xmm7, %xmm11, %xmm15
    WRITEBACK_64_AVX2 IN, OUT, %xmm9, %xmm13, %xmm1, %xmm5
    WRITEBACK_64_AVX2 IN, OUT, %xmm10, %xmm14, %xmm2, %xmm6

    add $4, %r11d
    sub $256, %rcx
    mov %r11d, 48(%rdi)
    cmp $256, %rcx
    jz  .Lchacha20_end
    jmp .Lchacha20_start

.Lchacha20_512_start:

    LOAD_512_STATE %ymm0 %ymm1 %ymm2 %ymm3 %rdi
    STATE_TO_MATRIX %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, \
                    %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 0, 32, g_add8block(%rip)
    jmp  .Lchacha20_512_run

.Lchacha20_512_start_cont:

    LOAD_MATRIX %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, \
                %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 0, 32, g_addsecond8block(%rip)

.Lchacha20_512_run:

    /* move ymm8~11 into stack for CHACHA20_LOOP encryption */
    vmovdqa     %ymm8, 512(%rsp)
    vmovdqa     %ymm9, 512+32(%rsp)
    vmovdqa     %ymm10, 512+64(%rsp)
    vmovdqa     %ymm11, 512+96(%rsp)
    leaq        g_ror16_512(%rip), %r9
    leaq        g_ror8_512(%rip), %r10
    mov         $10, %r8
.align 32
.Lchacha20_512_loop:

    CHACHA20_LOOP   %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10 \
                    %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 512, 32, %rsp, %r9, %r10

    decq %r8
    jnz  .Lchacha20_512_loop

    /* ymm0~15: encrypt matrix 0 ~ 15 */
    vmovdqa 512+64(%rsp), %ymm10                                            // rsp64: encrypt matrix ymm10
    vmovdqu 512+96(%rsp), %ymm11

    LASTADD_MATRIX  %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10 \
                    %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 32

    /* store matrix ymm9, 10, 13, 14 in stack */
    vmovdqa %ymm9, 512(%rsp)                                                // rsp 0: encrypt matrix ymm9
    vmovdqu %ymm10, 512+32(%rsp)                                            // rsp32: encrypt matrix ymm10
    vmovdqa %ymm13, 512+64(%rsp)                                            // rsp64: encrypt matrix ymm13
    vmovdqu %ymm14, 512+96(%rsp)                                            // rsp96: encrypt matrix ymm14

    MATRIX_TO_STATE %ymm0, %ymm1, %ymm2, %ymm3, %ymm9, %ymm10               // set state 0, 3, 9, 10
    MATRIX_TO_STATE %ymm4, %ymm5, %ymm6, %ymm7, %ymm13, %ymm14              // set state 4, 7, 13, 14

    vmovdqu 512(%rsp), %ymm5
    vmovdqa 512+32(%rsp), %ymm6
    vmovdqu %ymm9, 512(%rsp)
    vmovdqa %ymm10, 512+32(%rsp)

    MATRIX_TO_STATE %ymm8, %ymm5, %ymm6, %ymm11, %ymm1, %ymm2               // set state 8, 11, 1, 2

    vmovdqa 512+64(%rsp), %ymm9
    vmovdqu 512+96(%rsp), %ymm10
    vmovdqa %ymm13, 512+64(%rsp)
    vmovdqu %ymm14, 512+96(%rsp)

    MATRIX_TO_STATE %ymm12, %ymm9, %ymm10, %ymm15, %ymm5, %ymm6             // set state 12, 15, 5, 6

    /*
     * {A0 A1 A2 A3 E0 E1 E2 E3}
     * {B0 B1 B2 B3 F0 F1 F2 F3}
     * {C0 C1 C2 C3 G0 G1 G2 G3}
     * {D0 D1 D2 D3 H0 H1 H2 H3}
     * ...
     * =>
     * {A0 A1 A2 A3 B0 B1 B2 B3}
     * {C0 C1 C2 C3 D0 D1 D2 D3}
     * ...
     */

    /* left half of ymm registers */
    WRITE_BACK_512_L IN, OUT, %ymm0, %ymm4, %ymm8, %ymm12, %ymm3, %ymm7, %ymm11, %ymm15, %ymm9, %ymm10, %ymm13, %ymm14
    add $256, IN
    add $256, OUT

    /* right half of ymm registers */
    WRITE_BACK_512_R IN, OUT, %ymm0, %ymm4, %ymm8, %ymm12, %ymm3, %ymm7, %ymm11, %ymm15
    sub $128, IN
    sub $128, OUT

    vmovdqa 512(%rsp), %ymm9
    vmovdqu 512+32(%rsp), %ymm10
    vmovdqa 512+64(%rsp), %ymm13
    vmovdqu 512+96(%rsp), %ymm14

    /* second left half of ymm registers */
    WRITE_BACK_512_L IN, OUT, %ymm9, %ymm13, %ymm1, %ymm5, %ymm10, %ymm14, %ymm2, %ymm6, %ymm0, %ymm4, %ymm8, %ymm12
    add $256, IN
    add $256, OUT

    /* second right half of ymm registers */
    WRITE_BACK_512_R IN, OUT, %ymm9, %ymm13, %ymm1, %ymm5, %ymm10, %ymm14, %ymm2, %ymm6
    add $128, IN
    add $128, OUT

    add $8, %r11d
    sub $512, %rcx
    mov %r11d, 48(%rdi)
    jz  .Lchacha20_end
    cmp $512, %rcx
    jae .Lchacha20_512_start_cont
    jmp .Lchacha20_start

.Lchacha20_end:
    /* clear sensitive info in stack */
    vpxor   %ymm0, %ymm0, %ymm0
    xor     %r11d, %r11d
    vmovdqa %ymm0, (%rsp)
    vmovdqa %ymm0, 32(%rsp)
    vmovdqa %ymm0, 64(%rsp)
    vmovdqa %ymm0, 96(%rsp)
    vmovdqa %ymm0, 128(%rsp)
    vmovdqa %ymm0, 160(%rsp)
    vmovdqa %ymm0, 192(%rsp)
    vmovdqa %ymm0, 224(%rsp)
    vmovdqa %ymm0, 256(%rsp)
    vmovdqa %ymm0, 288(%rsp)
    vmovdqa %ymm0, 320(%rsp)
    vmovdqa %ymm0, 352(%rsp)
    vmovdqa %ymm0, 384(%rsp)
    vmovdqa %ymm0, 416(%rsp)
    vmovdqa %ymm0, 448(%rsp)
    vmovdqa %ymm0, 480(%rsp)
    vmovdqa %ymm0, 512(%rsp)
    vmovdqa %ymm0, 512+32(%rsp)
    vmovdqa %ymm0, 512+64(%rsp)
    vmovdqa %ymm0, 512+96(%rsp)
    mov %rax, %rsp
    ret
    .cfi_endproc
.size CHACHA20_Update,.-CHACHA20_Update

#endif