• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#ifdef HITLS_CRYPTO_CHACHA20
18
.text
/*
 * Constant tables.
 * pshufb reads its mask through a 128-bit memory operand; with the legacy
 * (non-VEX) SSE encoding that operand must be 16-byte aligned. Request the
 * alignment explicitly instead of relying on these tables being the first
 * bytes emitted into the section.
 */
.align 16
.LAndBlock:                     // {1, 0, 0, 0}; not referenced in the visible part of this file
.long 1, 0, 0, 0
.LRor16:                        // pshufb mask: swap the 16-bit halves of every 32-bit lane (rotate by 16)
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.LRor8:                         // pshufb mask: rotate every 32-bit lane left by 8 bits (right by 24)
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
26
/* Data cursors: IN walks the input buffer, OUT walks the output buffer. */
.equ IN,  %r9
.equ OUT, %r10

/* Original state: the four 16-byte rows loaded from the context. */
.equ O00, %xmm12   // words 0-3
.equ O01, %xmm13   // words 4-7
.equ O02, %xmm14   // words 8-11
.equ O03, %xmm15   // words 12-15, lane 0 is the block counter

/* Working state 0: used by both the 64-byte and the 128-byte path. */
.equ S00, %xmm0    // row 0 of state 0
.equ S01, %xmm1    // row 1 of state 0
.equ S02, %xmm2    // row 2 of state 0
.equ S03, %xmm3    // row 3 of state 0

/* Working state 1: second block, used by the 128-byte path only. */
.equ S10, %xmm5    // row 0 of state 1
.equ S11, %xmm6    // row 1 of state 1
.equ S12, %xmm7    // row 2 of state 1
.equ S13, %xmm8    // row 3 of state 1

47
48
/*
 * Rotate every 32-bit lane of \REG left by \LBITS with a shift pair.
 * \RBITS must be 32 - \LBITS. \TMP is overwritten.
 */
.macro CHACHA20_ROTL_SHIFT REG TMP LBITS RBITS
    movdqa \REG, \TMP
    psrld  $\RBITS, \REG
    pslld  $\LBITS, \TMP
    por    \TMP, \REG
.endm

/*
 * ChaCha20 quarter-round applied lane-wise to four row registers:
 *   S0 = row a, S1 = row b, S2 = row c, S3 = row d, CUR = scratch register.
 * Rotations by 16 and 8 use byte shuffles (.LRor16 / .LRor8), rotations by
 * 12 and 7 use a shift pair. Modifies S0-S3 and CUR; EFLAGS are not touched.
 */
.macro CHACHA20_ROUND  S0 S1 S2 S3 CUR
    /* a += b; d ^= a; d <<<= 16 */
    paddd  \S1, \S0
    pxor   \S0, \S3
    pshufb .LRor16(%rip), \S3

    /* c += d; b ^= c; b <<<= 12 */
    paddd  \S3, \S2
    pxor   \S2, \S1
    CHACHA20_ROTL_SHIFT \S1, \CUR, 12, 20

    /* a += b; d ^= a; d <<<= 8 */
    paddd  \S1, \S0
    pxor   \S0, \S3
    pshufb .LRor8(%rip), \S3

    /* c += d; b ^= c; b <<<= 7 */
    paddd  \S3, \S2
    pxor   \S2, \S1
    CHACHA20_ROTL_SHIFT \S1, \CUR, 7, 25
.endm
72
/*
 * Quarter-round for two independent states, processed one after the other:
 * first (S0, S1, S2, S3) with scratch CUR, then (S4, S5, S6, S7) with
 * scratch CUR1. Expands to exactly two CHACHA20_ROUND sequences.
 */
.macro CHACHA20_2_ROUND  S0 S1 S2 S3 CUR S4 S5 S6 S7 CUR1
    CHACHA20_ROUND \S0, \S1, \S2, \S3, \CUR
    CHACHA20_ROUND \S4, \S5, \S6, \S7, \CUR1
.endm
119
/*
 * Finish one 64-byte block: add the original state (O00-O03) to the working
 * state (S00-S03), XOR the result with 64 bytes read from \IN_POS and store
 * the 64 result bytes at \OUT_POS. All input bytes are loaded before the
 * first store. Overwrites S00-S03, %xmm4 and %xmm9-%xmm11.
 */
.macro WRITE_BACK_64 IN_POS OUT_POS
    movdqu  (\IN_POS),   %xmm4       // input bytes 0-15
    movdqu  16(\IN_POS), %xmm9       // input bytes 16-31
    movdqu  32(\IN_POS), %xmm10      // input bytes 32-47
    movdqu  48(\IN_POS), %xmm11      // input bytes 48-63

    paddd   O00, S00                 // row 0: working + original, then ^ input
    pxor    %xmm4, S00
    paddd   O01, S01                 // row 1
    pxor    %xmm9, S01
    paddd   O02, S02                 // row 2
    pxor    %xmm10, S02
    paddd   O03, S03                 // row 3
    pxor    %xmm11, S03

    movdqu  S00, (\OUT_POS)          // output bytes 0-63
    movdqu  S01, 16(\OUT_POS)
    movdqu  S02, 32(\OUT_POS)
    movdqu  S03, 48(\OUT_POS)
.endm
142
/*
 * Finish two 64-byte blocks (128 bytes).
 * Block 0 uses S00-S03 and the original state as it stands on entry.
 * Block 1 uses S10-S13 and the original state with lane 0 of O03 replaced by
 * %r11d (the counter value of block 1). O03 keeps that value on exit.
 * Memory order: load input 0, store output 0, load input 1, store output 1.
 * Overwrites S00-S03, S10-S13, O03 lane 0, %xmm4 and %xmm9-%xmm11.
 */
.macro WRITE_BACK_128 IN_POS OUT_POS
    /* ---- block 0 ---- */
    movdqu  (\IN_POS),   %xmm4         // input 0
    movdqu  16(\IN_POS), %xmm9
    movdqu  32(\IN_POS), %xmm10
    movdqu  48(\IN_POS), %xmm11

    paddd   O00, S00                   // state 0 + original state 0, then ^ input 0
    pxor    %xmm4, S00
    paddd   O01, S01
    pxor    %xmm9, S01
    paddd   O02, S02
    pxor    %xmm10, S02
    paddd   O03, S03
    pxor    %xmm11, S03

    movdqu  S00, (\OUT_POS)            // output 0
    movdqu  S01, 16(\OUT_POS)
    movdqu  S02, 32(\OUT_POS)
    movdqu  S03, 48(\OUT_POS)

    /* ---- block 1 ---- */
    pinsrd  $0, %r11d, O03             // original state 0 -> original state 1 (counter lane)

    paddd   O00, S10                   // state 1 + original state 1
    paddd   O01, S11
    paddd   O02, S12
    paddd   O03, S13

    movdqu  64(\IN_POS),  %xmm4        // input 1
    movdqu  80(\IN_POS),  %xmm9
    movdqu  96(\IN_POS),  %xmm10
    movdqu  112(\IN_POS), %xmm11

    pxor    %xmm4, S10                 // input 1 ^ state 1
    pxor    %xmm9, S11
    pxor    %xmm10, S12
    pxor    %xmm11, S13

    movdqu  S10, 64(\OUT_POS)          // output 1
    movdqu  S11, 80(\OUT_POS)
    movdqu  S12, 96(\OUT_POS)
    movdqu  S13, 112(\OUT_POS)
.endm
187
/*
 * Prepare one working state: advance the counter kept in %r11d, place it in
 * lane 0 of O03 and copy the original state O00-O03 into S00-S03.
 */
.macro GENERATE_1_STATE
    add     $1, %r11d
    pinsrd  $0, %r11d, O03          // counter lane of the original state

    movdqa  O00, S00                // working state 0 = original state
    movdqa  O01, S01
    movdqa  O02, S02
    movdqa  O03, S03
.endm
197
/*
 * Prepare two working states for consecutive blocks.
 * State 0 (S00-S03) gets counter %r11d + 1, state 1 (S10-S13) gets counter
 * %r11d + 2. On exit %r11d holds the counter of state 1 and O03 still holds
 * the counter of state 0.
 */
.macro GENERATE_2_STATE
    add     $1, %r11d
    pinsrd  $0, %r11d, O03          // counter of state 0

    movdqa  O00, S00                // row 0 of both states
    movdqa  O00, S10
    movdqa  O01, S01                // row 1 of both states
    movdqa  O01, S11
    movdqa  O02, S02                // row 2 of both states
    movdqa  O02, S12
    movdqa  O03, S03                // row 3 of both states
    movdqa  O03, S13

    add     $1, %r11d
    pinsrd  $0, %r11d, S13          // counter of state 1
.endm
214
/*
 * Register layouts. The numbers in braces are state word indices.
 * Only the 64-byte and 128-byte layouts are used by the code visible in this file.
 *
 * Processing 64 bytes: 4 xmm registers, one state row per register
 * xmm0 ~ xmm3:
 * xmm0 {0,  1,  2,  3}
 * xmm1 {4,  5,  6,  7}
 * xmm2 {8,  9,  10, 11}
 * xmm3 {12, 13, 14, 15}
 *
 * Processing 128 bytes: 8 xmm registers, two states with one row per register
 * xmm0 ~ xmm3 and xmm5 ~ xmm8:
 * xmm0 {0,  1,  2,  3}           xmm5 {0,  1,  2,  3}
 * xmm1 {4,  5,  6,  7}           xmm6 {4,  5,  6,  7}
 * xmm2 {8,  9,  10, 11}          xmm7 {8,  9,  10, 11}
 * xmm3 {12, 13, 14, 15}          xmm8 {12, 13, 14, 15}
 *
 * Processing 256 bytes: 16 xmm registers, register i holds word i of 4 blocks
 * xmm0 ~ xmm15:
 * xmm0 {0,  0,  0,  0}
 * xmm1 {1,  1,  1,  1}
 * xmm2 {2,  2,  2,  2}
 * xmm3 {3,  3,  3,  3}
 * ...
 * xmm15 {15, 15, 15, 15}
 *
 * Processing 512 bytes: 16 ymm registers, register i holds word i of 8 blocks
 * ymm0 ~ ymm15:
 * ymm0 {0,  0,  0,  0,  0,  0,  0,  0}
 * ymm1 {1,  1,  1,  1,  1,  1,  1,  1}
 * ymm2 {2,  2,  2,  2,  2,  2,  2,  2}
 * ymm3 {3,  3,  3,  3,  3,  3,  3,  3}
 * ...
 * ymm15 {15, 15, 15, 15, 15, 15, 15, 15}
 *
 */
249
/**
 * @Interconnection with the C interface:
 *     void CHACHA20_Update(CRYPT_CHACHA20_Ctx *ctx, const uint8_t *in, uint8_t *out, uint32_t len);
 * @brief ChaCha20 keystream generation and XOR using 128-bit SSE registers.
 * @param ctx [IN/OUT] %rdi. The first 64 bytes are read as the 16-word state.
 *                     The 32-bit block counter at offset 48 is advanced by the
 *                     number of 64-byte blocks processed and written back.
 * @param in  [IN]  %rsi. Data to be encrypted.
 * @param out [OUT] %rdx. Data after encryption.
 * @param len [IN]  %ecx. Length in bytes. Only whole 64-byte blocks are
 *                  processed; a trailing remainder of fewer than 64 bytes is
 *                  neither read nor written here (presumably handled by the
 *                  C caller - confirm).
 *
 * ABI: System V AMD64 argument registers.
 * Register roles:
 *   %r9  (IN), %r10 (OUT)  data cursors
 *   %r11d                  block counter of the most recently generated block
 *   %r12d                  bytes remaining (callee-saved, pushed and popped)
 *   %r8                    double-round loop counter (10 iterations = 20 rounds)
 *   %xmm12-%xmm15          original state rows
 *   %xmm0-%xmm3            working state 0
 *   %xmm5-%xmm8            working state 1 (128-byte path)
 *   %xmm4, %xmm9-%xmm11    scratch
 * Clobbers: %r8-%r11, %xmm0-%xmm15, flags.
**/

.globl CHACHA20_Update
.type CHACHA20_Update,%function
.align 64
CHACHA20_Update:
    .cfi_startproc
    push %r12
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r12, -16
    mov %ecx, %r12d               // len is uint32_t: upper half of %rcx is undefined, so zero-extend
    mov 48(%rdi), %r11d           // block counter = state[12]
    mov %rsi, IN
    mov %rdx, OUT

    movdqu  (%rdi),   O00         // state[0-3]
    movdqu  16(%rdi), O01         // state[4-7]
    movdqu  32(%rdi), O02         // state[8-11]
    movdqu  48(%rdi), O03         // state[12-15]

    sub   $1, %r11d               // GENERATE_*_STATE increments before use

.LChaCha20_start:
    cmp $128, %r12d
    jae  .LChaCha20_128_start
    cmp $64, %r12d
    jb   .LChaCha20_end           // fewer than 64 bytes left: done

.LChaCha20_64_start:
    GENERATE_1_STATE
    mov $10, %r8                  // 10 double rounds

.LChaCha20_64_loop:

    /* Column round: quarter-rounds on (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15) */
    CHACHA20_ROUND S00 S01 S02 S03 %xmm4

    /* Rotate rows 1-3 so that the diagonals line up in columns */
    pshufd  $78, S02, S02       // {8  9  10 11} ==> {10 11 8  9}   01 00 11 10
    pshufd  $57, S01, S01       // {4  5  6   7} ==> {5  6  7  4}   00 11 10 01
    pshufd  $147, S03, S03      // {12 13 14 15} ==> {15 12 13 14}  10 01 00 11

    /* Diagonal round: quarter-rounds on (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14) */
    CHACHA20_ROUND S00 S01 S02 S03 %xmm4

    /* Undo the row rotation */
    pshufd  $78, S02, S02       // {10 11 8  9}  ==> {8  9  10 11}  01 00 11 10
    pshufd  $147, S01, S01      // {5  6  7  4}  ==> {4  5  6   7}  10 01 00 11
    pshufd  $57, S03, S03       // {15 12 13 14} ==> {12 13 14 15}  00 11 10 01

    sub   $1, %r8               // kept next to the branch that consumes its flags
    jnz	.LChaCha20_64_loop

    WRITE_BACK_64 IN OUT

    add $64, IN
    add $64, OUT

    sub  $64, %r12d
    jmp .LChaCha20_start

.LChaCha20_128_start:
    GENERATE_2_STATE
    mov $10, %r8                  // 10 double rounds

.LChaCha20_128_loop:

    /* Column round on both states */
    CHACHA20_2_ROUND  S00 S01 S02 S03 %xmm4 S10 S11 S12 S13 %xmm9

    /* Rotate rows 1-3 of state 0 */
    pshufd  $78, S02, S02       // {8  9  10 11} ==> {10 11 8  9}   01 00 11 10
    pshufd  $57, S01, S01       // {4  5  6   7} ==> {5  6  7  4}   00 11 10 01
    pshufd  $147, S03, S03      // {12 13 14 15} ==> {15 12 13 14}  10 01 00 11

    /* Rotate rows 1-3 of state 1 */
    pshufd  $78, S12, S12       // {8  9  10 11} ==> {10 11 8  9}   01 00 11 10
    pshufd  $57, S11, S11       // {4  5  6   7} ==> {5  6  7  4}   00 11 10 01
    pshufd  $147, S13, S13      // {12 13 14 15} ==> {15 12 13 14}  10 01 00 11

    /* Diagonal round on both states */
    CHACHA20_2_ROUND  S00 S01 S02 S03 %xmm4 S10 S11 S12 S13 %xmm9

    /* Undo the row rotation of state 0 */
    pshufd  $78, S02, S02       // {10 11 8  9}  ==> {8  9  10 11}  01 00 11 10
    pshufd  $147, S01, S01      // {5  6  7  4}  ==> {4  5  6   7}  10 01 00 11
    pshufd  $57, S03, S03       // {15 12 13 14} ==> {12 13 14 15}  00 11 10 01

    /* Undo the row rotation of state 1 */
    pshufd  $78, S12, S12       // {10 11 8  9}  ==> {8  9  10 11}  01 00 11 10
    pshufd  $147, S11, S11      // {5  6  7  4}  ==> {4  5  6   7}  10 01 00 11
    pshufd  $57, S13, S13       // {15 12 13 14} ==> {12 13 14 15}  00 11 10 01

    sub  $1, %r8
    jnz	.LChaCha20_128_loop

    WRITE_BACK_128 IN OUT
    add $128, IN
    add $128, OUT

    sub   $128, %r12d
    jmp  .LChaCha20_start

.LChaCha20_end:
    add   $1, %r11d               // counter for the next block to be generated
    mov  %r11d, 48(%rdi)          // write the counter back to state[12]
    pop %r12
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r12
    ret
	.cfi_endproc

.size CHACHA20_Update,.-CHACHA20_Update
369
370#endif
371