/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)

#include "crypt_arm.h"

.arch   armv8-a

/**
 * Byte offsets of the members of the C structure Poly1305Ctx.
 */

.set CTX_acc, 0         /* accumulator (3 x 64-bit words or 5 x 32-bit limbs) */
.set CTX_r, 24          /* key part r (2 x 64-bit words) */
.set CTX_s, 40          /* key part s (2 x 64-bit words) */
.set CTX_table, 56      /* interleaved table of r^1..r^4 (and 5*r^n) in base 2^26 */
.set CTX_data, 200
.set CTX_lastLen, 216
.set CTX_flag, 220
.set CTX_size, 224

/* Flag bit: set while the accumulator is stored as five base-2^26 limbs
   (NEON layout); clear while it is stored as three base-2^64 words. */
.equ FLAG_BASE2_26,     1
/* 104           78            52            26            0
 * out4          out3          out2          out1          out0
 *                             in0[63:52]    in0[51:26]    in0[25:0]
 * in1[63:40]    in1[39:14]    in1[13:0]<<12
 * in2[39:0]<<24
 */
/**
 *  Macro description: converts the large-number format. Three base-2^64 words
 *                     go in, five base-2^26 limbs come out.
 *  Input register:
 *      in0: bits 0-63 of the large number in the original format
 *      in1: bits 64-127 of the large number in the original format
 *      in2: bits 128 and above of the large number in the original format
 *  Modified register: None
 *  Output register:
 *      out0: bits 0-25 of the converted large number
 *      out1: bits 26-51 of the converted large number
 *      out2: bits 52-77 of the converted large number
 *      out3: bits 78-103 of the converted large number
 *      out4: bits 104 and above of the converted large number
 *  Function/Macro Call: None
 *  Restriction: Note that the valid bits of in2 cannot exceed 40 bits.
 *               Otherwise, data will be lost.
 */
    .macro CONVERT_64TO26    out0 out1 out2 out3 out4 in0 in1 in2
    and     \out0, \in0, #0x03ffffff
    ubfx    \out1, \in0, #26, #26
    extr    \out2, \in1, \in0, #52
    and     \out2, \out2, #0x03ffffff
    ubfx    \out3, \in1, #14, #26
    extr    \out4, \in2, \in1, #40
    .endm

/* 128            64            0
 * out2           out1        out0
 *                            in0
 *               (in1>>38)    in1<<26
 *                in2>>12     in2<<52
 * (in3>>50)      in3<<14
 * in4>>24        in4<<40
 */
/**
 *  Macro description: converts the large-number format. Five base-2^26 limbs
 *                     go in, three base-2^64 words come out.
 *  Input register:
 *      in0: limb 0 of the large number in the original format
 *      in1: limb 1 of the large number in the original format
 *      in2: limb 2 of the large number in the original format
 *      in3: limb 3 of the large number in the original format
 *      in4: limb 4 of the large number in the original format
 *  Modified register: None
 *  Output register:
 *       out0: bits 0-63 of the converted large number
 *       out1: bits 64-127 of the converted large number
 *       out2: bits 128 and above of the converted large number
 *  Function/Macro Call: None
 *  Restriction: Ensure that the valid bits of each input limb in0-in4 do not
 *               exceed 38 bits. Otherwise, data will be lost.
 */
    .macro CONVERT_26TO64    out0 out1 out2 in0 in1 in2 in3 in4
    add     \out0, \in0, \in1, lsl#26
    adds    \out0, \out0, \in2, lsl#52
    lsr     \out1, \in2, #12
    add     \out1, \out1, \in3, lsl#14
    adc     \out1, \out1, xzr
    adds    \out1, \out1, \in4, lsl#40
    lsr     \out2, \in4, #24
    adc     \out2, \out2, xzr
    .endm


/*   register     |  t_0             t_1          |  t_2             |
 *   bits         |           128 bits            |     64 bits      |
 *   1            |  r0*a0(lo)       r0*a1(lo)    |  r0*a2(lo)       |
 *   2            |                  r0*a0(hi)    |  r0*a1(hi)       |
 *   3            |  s1*a1(lo)       r1*a0(lo)    |                  |
 *   4            |                  s1*a1(hi)    |  r1*a0(hi)       |
 *   5            |                  s1*a2(lo)    |                  |
 */
/**
 *  Macro description: Multiply large numbers and reduce modulo P
 *                     (a0|a1|a2) = (a0|a1|a2) * (r0|r1) mod P
 *  Input register:
 *        a_0: bits 0-63 of the large number a
 *        a_1: bits 64-127 of the large number a
 *        a_2: bits 128 and above of the large number a
 *        r_0: bits 0-63 of the large number r
 *        r_1: bits 64-127 of the large number r
 *        s_1: 5/4 times the large number r_1
 *  Change register: x11-x15
 *  Output register:
 *        a_0: bits 0-63 of the multiplication result
 *        a_1: bits 64-127 of the multiplication result
 *        a_2: bits 128 and above of the multiplication result
 *  Function/Macro Call: None
 *  Restriction: The relationship between s1 and r1 is s1 = r1 + r1 >> 2.
 */
    .macro  POLY1305_MOD_MUL  a_0, a_1, a_2, r_0, r_1, s_1
    /* 1 */
    mul     x11, \r_0, \a_0
    mul     x12, \r_0, \a_1
    mul     x13, \r_0, \a_2
    /* 2 */
    umulh   x14, \r_0, \a_0
    umulh   x15, \r_0, \a_1
    adds    x12, x12, x14
    adc     x13, x13, x15
    /* 3 */
    mul     x14, \s_1, \a_1
    mul     x15, \r_1, \a_0
    adds    x11, x11, x14
    adcs    x12, x12, x15
    adc     x13, x13, xzr
    /* 4 */
    umulh   x14, \s_1, \a_1
    umulh   x15, \r_1, \a_0
    adds    x12, x12, x14
    adc     x13, x13, x15
    /* 5 */
    mul     x15, \s_1, \a_2
    adds    x12, x12, x15
    adc     x13, x13, xzr
    /* Split x13 and add 5/4 of the high-order part to x11. */
    bic     x15, x13, #3
    and     x13, x13, #3
    add     x15, x15, x15, lsr#2
    adds    \a_0, x11, x15
    adcs    \a_1, x12, xzr
    adc     \a_2, x13, xzr
    .endm

/**
 *  Macro description: Converts the large number (r_0|r_1|r_2) into base-2^26
 *                     limbs and stores them, 16 bytes apart, into the memory
 *                     pointed to by ptr (followed by the limbs multiplied by 5).
 *  Input register:
 *       r_0: bits 0-63 of the large number
 *       r_1: bits 64-127 of the large number
 *       r_2: bits 128-191 of the large number
 *       ptr: start address of the memory to be filled
 *   Change register: x11-x15
 *   Output register: None
 *   Function/Macro call: CONVERT_64TO26
 *
 */
    .macro  Fill_TABLE r_0, r_1, r_2, ptr
    /* base 2^64 -> base 2^26 */
    /* r_0 r_1 r_2 --> x11 x12 x13 x14 x15 */
    CONVERT_64TO26 x11, x12, x13, x14, x15, \r_0, \r_1, \r_2
    /* Stores the converted value. */
    str     w11, [\ptr, #16*0]
    str     w12, [\ptr, #16*1]
    str     w13, [\ptr, #16*2]
    str     w14, [\ptr, #16*3]
    str     w15, [\ptr, #16*4]
    /* Multiply by 5 and continue to store */
    add     w12, w12, w12, lsl#2
    add     w13, w13, w13, lsl#2
    add     w14, w14, w14, lsl#2
    add     w15, w15, w15, lsl#2

    str     w12, [\ptr, #16*5]
    str     w13, [\ptr, #16*6]
    str     w14, [\ptr, #16*7]
    str     w15, [\ptr, #16*8]
    .endm

/**
 *  Function description: This function is used to initialize the pre-computation table.
 *  Function prototype: void Poly1305InitForAsm(Poly1305Ctx *ctx);
 *  Input register:
 *         x0: address of the context structure
 *  Change register: x0 and x5-x15
 *  Output register: None
 *  Function/Macro Call: POLY1305_MOD_MUL Fill_TABLE
 */
.text
.balign 64
.global Poly1305InitForAsm
.type Poly1305InitForAsm, %function
Poly1305InitForAsm:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-16]!
    add     x29, sp, #0

    /* Clearing the member flag */
    str     wzr, [x0, #CTX_flag]

    /* Initialize the r table. */
    ldp     x8, x9, [x0, #CTX_r]

#ifdef	HITLS_BIG_ENDIAN
    /* The r value needs to be reversed in the big-endian case. */
    ror     x8, x8, #32
    ror     x9, x9, #32
#endif

    add     x10, x9, x9, lsr#2      /* x10 = s1 = r1 + (r1 >> 2) */
    /* padding r^1 (highest lane offset; each lower power is 4 bytes earlier) */
    add     x0, x0, #CTX_table + 12
    mov     x5, x8
    mov     x6, x9
    mov     x7, xzr
    Fill_TABLE x5, x6, x7, x0

    /* Calculate and populate r^2 */
    sub     x0, x0, #4
    POLY1305_MOD_MUL x5, x6, x7, x8, x9, x10
    Fill_TABLE x5, x6, x7, x0

    /* Calculate and populate r^3 */
    sub     x0, x0, #4
    POLY1305_MOD_MUL x5, x6, x7, x8, x9, x10
    Fill_TABLE x5, x6, x7, x0

    /* Calculate and populate r^4 */
    sub     x0, x0, #4
    POLY1305_MOD_MUL x5, x6, x7, x8, x9, x10
    Fill_TABLE x5, x6, x7, x0

    /* Scrub key-derived values left in scratch registers. */
    eor     x5, x5, x5
    eor     x6, x6, x6
    eor     x7, x7, x7
    eor     x8, x8, x8
    eor     x9, x9, x9
    eor     x10, x10, x10

    ldp     x29, x30, [sp], #16
AARCH64_AUTIASP
    ret
.size Poly1305InitForAsm, .-Poly1305InitForAsm

/**
 *  Function description: Outputs the final result value to the specified memory.
 *  Function prototype: void Poly1305Last(Poly1305Ctx *ctx, uint8_t mac[POLY1305_TAGSIZE]);
 *  Input register:
 *         x0: address of the context structure
 *         x1: pointer to the output buffer
 *  Change register: x3-x15
 *  Output register: None
 *  Function/Macro Call: Poly1305LastNeon
 */
.text
.balign 64
.global Poly1305Last
.type Poly1305Last, %function
Poly1305Last:
AARCH64_PACIASP
    ldr     w15, [x0, #CTX_flag]
    and     w15, w15, #FLAG_BASE2_26
    cbz     w15, .Llast_base2_64
    /* Poly1305LastNeon signs LR again on entry but authenticates only once
       before its ret, so authenticate here before the tail branch; otherwise
       LR would be PAC-signed twice and the final return would fault. */
AARCH64_AUTIASP
    b       Poly1305LastNeon
.Llast_base2_64:
    ldp     x3, x4, [x0, #CTX_acc]
    ldr     x5, [x0, #CTX_acc + 16]
    ldp     x12, x13, [x0, #CTX_s]

    adds    x9, x3, #5        // Compute acc + 5
    adcs    x10, x4, xzr
    adc     x11, x5, xzr
    /* Test for more than 2 ^ 130 */
    cmp     x11, #3
    /* If yes, use the value after adding 5 (equal to the value after modulo operation).
       If no, use the original value. */
    csel    x3, x3, x9, le
    csel    x4, x4, x10, le
    /* Plus the s value */
#ifdef	HITLS_BIG_ENDIAN
    /* In the big-endian scenario, the s value needs to be reversed. */
    ror     x12, x12, #32
    ror     x13, x13, #32
#endif
    adds    x3, x3, x12
    adc     x4, x4, x13
    mov     x12, xzr // zero out.
    mov     x13, xzr
#ifdef	HITLS_BIG_ENDIAN
    /* In big-endian mode, the data is converted to little-endian and then output to the memory. */
    rev     x3, x3
    rev     x4, x4
#endif
    stp     x3, x4, [x1]
AARCH64_AUTIASP
    ret
.size Poly1305Last, .-Poly1305Last

/**
 *  Function description: Outputs the final result value to the specified memory
 *                        when the accumulator is in base-2^26 (NEON) layout.
 *  Function prototype: void Poly1305LastNeon(Poly1305Ctx *ctx, uint8_t mac[POLY1305_TAGSIZE]);
 *  Input register:
 *         x0: address of the context structure
 *         x1: pointer to the output buffer
 *  Change register: x2-x15
 *  Output register: None
 *  Function/Macro Call: CONVERT_26TO64
 */
.text
.balign 64
.type   Poly1305LastNeon, %function
Poly1305LastNeon:
AARCH64_PACIASP
    /* Load the five base-2^26 accumulator limbs. */
    ldp     w11, w12, [x0, #CTX_acc]
    ldp     w13, w14, [x0, #CTX_acc + 8]
    ldr     w15, [x0, #CTX_acc + 16]
    /* Converted to base 2^64; x11 to x15 are within 30 bits. */
    CONVERT_26TO64 x5, x6, x7, x11, x12, x13, x14, x15
    /* Load the s value. */
    ldp     x2, x3, [x0, #CTX_s]

    /* Fold bits above 130 back into the low words, multiplied by 5/4 of 4x. */
    bic     x15, x7, #3
    and     x7, x7, #3
    add     x15, x15, x15, lsr#2
    adds    x5, x5, x15
    adcs    x6, x6, xzr
    adc     x7, x7, xzr

    /* Modulo P, subtract directly */
    /* subtraction: acc - (2^130 - 5) = acc + 5 - 2^130 */
    adds    x11, x5, #5
    adcs    x12, x6, xzr
    adc     x13, x7, xzr
    /* Test for more than 2 ^ 130 */
    cmp     x13, #4
    /* If greater than or equal to P, the new (reduced) value is used. */
    csel    x5, x11, x5, ge
    csel    x6, x12, x6, ge

    /* Value of s plus acc */
#ifdef	HITLS_BIG_ENDIAN
    /* In the big-endian scenario, the s value needs to be reversed. */
    ror     x2, x2, #32
    ror     x3, x3, #32
#endif

    adds    x2, x2, x5
    adc     x3, x3, x6

#ifdef	HITLS_BIG_ENDIAN
    /* In big-endian mode, the data is converted to little-endian and then output to the memory. */
    rev     x2, x2
    rev     x3, x3
#endif

    stp     x2, x3, [x1]
AARCH64_AUTIASP
    ret
.size Poly1305LastNeon, .-Poly1305LastNeon


/**
 *  Function description: Compresses the input data and stores the state in the context structure.
 *  Function prototype: uint32_t Poly1305Block(Poly1305Ctx *ctx, const uint8_t *data,
 *                                             uint32_t dataLen, uint32_t padbit);
 *  Input register:
 *         x0: address of the context structure
 *         x1: pointer to the input data
 *         x2: length of the input data
 *         x3: padded bits, 0 or 1.
 *  Change register: x4-x15
 *  Output register:
 *         x0: length of the remaining data to be processed
 *  Function/Macro Call: CONVERT_26TO64 POLY1305_MOD_MUL Poly1305BlockNeon
 */
.text
.balign 64
.global  Poly1305Block
.type   Poly1305Block, %function
Poly1305Block:
AARCH64_PACIASP
    /* x4 is the length handled by the basic instruction set,
       x2 keeps the remaining length. */
    /* Lengths below 16 need no processing here; the part that is a multiple
       of 256 is reserved for the NEON path. */
    and     x4, x2, #0xF0     // x4 is the processing length of the basic instruction set.
    bic     x2, x2, #0xF0     // x2 is the remaining length after the basic instruction set is processed.
    cbz     x4, .Lskip_process
    /* Load the ACC value, converting from base 2^26 if the flag is set. */
    ldr     w15, [x0, #CTX_flag]
    and     w14, w15, #FLAG_BASE2_26
    cbz     w14, .Lload_acc_64
    bic     w15, w15, #FLAG_BASE2_26
    str     w15, [x0, #CTX_flag]
    ldp     w10, w11, [x0, #CTX_acc]
    ldp     w12, w13, [x0, #CTX_acc + 8]
    ldr     w14, [x0, #CTX_acc + 16]
    CONVERT_26TO64 x5, x6, x7, x10, x11, x12, x13, x14
    b       .Lend_load_acc_64
.Lload_acc_64:
    ldp     x5, x6, [x0, #CTX_acc]
    ldr     x7, [x0, #CTX_acc + 16]
.Lend_load_acc_64:

    /* Load the r value. */
    ldp     x8, x9, [x0, #CTX_r]

#ifdef	HITLS_BIG_ENDIAN
    /* The r value needs to be reversed in the big-endian case. */
    ror     x8, x8, #32
    ror     x9, x9, #32
#endif

    add     x10, x9, x9, lsr#2      /* x10 = s1 = r1 + (r1 >> 2) */

.Lloop_64:
    /* Accumulator acc plus plaintext block with padding x3 */
    ldp     x11, x12, [x1], #16

#ifdef	HITLS_BIG_ENDIAN
    rev     x11, x11
    rev     x12, x12
#endif

    adds    x5, x5, x11
    adcs    x6, x6, x12
    adc     x7, x7, x3
    /* Multiply large numbers and take modulo (x5|x6|x7) = (x5|x6|x7) * (x8|x9) mod P */
    /* x10 = x9 + x9 >> 2 */
    POLY1305_MOD_MUL x5, x6, x7, x8, x9, x10
    /* End of loop, update iteration information */
    sub     x4, x4, #16
    cbnz    x4, .Lloop_64

    stp     x5, x6, [x0, #CTX_acc]
    str     x7, [x0, #CTX_acc + 16]
.Lskip_process:
    /* If the remaining length is 256 bytes or more, the NEON path processes it. */
    bic     x4, x2, #0xFF
    cbnz    x4, Poly1305BlockNeon

    /* function returns */
    and     x0, x2, #15 // The return value is the unprocessed length.
    eor     x8, x8, x8  // Scrub the key registers before returning.
    eor     x9, x9, x9
AARCH64_AUTIASP
    ret
.size Poly1305Block, .-Poly1305Block

/**
 *  Function description: Compresses the input data, stores the state in the context structure, and uses the NEON registers.
 *  Function prototype: uint32_t Poly1305BlockNeon(Poly1305Ctx *ctx, const uint8_t *data, uint32_t dataLen, uint32_t padbit);
 *  Input register:
 *         x0: context structure address
 *         x1: pointer to the input data
 *         x2: length of the input data
 *         x3: padding bit, 0 or 1.
 *  Modified registers: x0-x15, v0-v7, v16-v31.
 *  Output register:
 *         x0: length of the remaining data to be processed
 *  Function/Macro call: CONVERT_64TO26
 *  Note: reached only by the tail branch from Poly1305Block (x4 = NEON byte
 *        count, LR already PAC-signed there; the AUTIASP below pairs with it).
 */
.text
.balign 64
.type   Poly1305BlockNeon, %function
Poly1305BlockNeon:
    stp     x29, x30, [sp, #-16]!
    /* v8-v15 are callee-saved (low 64 bits) per AAPCS64, and v8-v18 are used below. */
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!

    /* Load the acc value, which is stored in v24-v28. */
    ldr     w15, [x0, #CTX_flag]
    and     w14, w15, #FLAG_BASE2_26
    cbnz    w14, .Lload_acc_26
    orr     w15, w15, #FLAG_BASE2_26
    str     w15, [x0, #CTX_flag]
    ldp     x5, x6, [x0, #CTX_acc]
    ldr     x7, [x0, #CTX_acc + 16]
    CONVERT_64TO26 x11, x12, x13, x14, x15, x5, x6, x7
    fmov    s24, w11
    fmov    s25, w12
    fmov    s26, w13
    fmov    s27, w14
    fmov    s28, w15
    b       .Lend_load_acc_26
.Lload_acc_26:
    ldp     s24, s25, [x0, #CTX_acc]
    ldp     s26, s27, [x0, #CTX_acc + 8]
    ldr     s28, [x0, #CTX_acc + 16]
.Lend_load_acc_26:

    /* Load r-value table */
    add     x15, x0, #CTX_table
    ld1     {v0.4s}, [x15], #16                         // r^n[0] mod P, n = 1, 2, 3, 4
    ld1     {v1.4s, v2.4s, v3.4s, v4.4s}, [x15], #64    // r^n[1:4] mod P
    ld1     {v5.4s, v6.4s, v7.4s, v8.4s}, [x15], #64    // 5 * r^n[1:4] mod P

    /* Pre-treatment before start of cycle */
    add     x1, x1, #64
    sub     x4, x4, #64
    /* v31.2d is {0x3ffffff, 0x3ffffff} */
    movi    v31.16b, #0xFF
    ushr    v31.2d, v31.2d, #38

    /* Load (m[2], m[3]), convert the format, and save it to v14-v18. */
    ldp     x9, x10, [x1, #-32]
    ldp     x14, x15, [x1, #-16]

#ifdef	HITLS_BIG_ENDIAN
    rev     x9, x9
    rev     x10, x10
    rev     x14, x14
    rev     x15, x15
#endif

    and     x6, x9, #0x03ffffff
    ubfx    x7, x9, #26, #26
    extr    x8, x10, x9, #52
    and     x8, x8, #0x03ffffff
    ubfx    x9, x10, #14, #26
    extr    x10, x3, x10, #40

    and     x11, x14, #0x03ffffff
    ubfx    x12, x14, #26, #26
    extr    x13, x15, x14, #52
    and     x13, x13, #0x03ffffff
    ubfx    x14, x15, #14, #26
    extr    x15, x3, x15, #40

    add     x6, x6, x11, lsl#32
    add     x7, x7, x12, lsl#32
    add     x8, x8, x13, lsl#32
    add     x9, x9, x14, lsl#32
    add     x10, x10, x15, lsl#32

    fmov    d14, x6
    fmov    d15, x7
    fmov    d16, x8
    fmov    d17, x9
    fmov    d18, x10

    /* Load (m[0], m[1]) and save the converted format in v9-v13. */
    ldp     x9, x10, [x1, #-64]
    ldp     x14, x15, [x1, #-48]

#ifdef	HITLS_BIG_ENDIAN
    rev     x9, x9
    rev     x10, x10
    rev     x14, x14
    rev     x15, x15
#endif

    and     x6, x9, #0x03ffffff
    ubfx    x7, x9, #26, #26
    extr    x8, x10, x9, #52
    and     x8, x8, #0x03ffffff
    ubfx    x9, x10, #14, #26
    extr    x10, x3, x10, #40

    and     x11, x14, #0x03ffffff
    ubfx    x12, x14, #26, #26
    extr    x13, x15, x14, #52
    and     x13, x13, #0x03ffffff
    ubfx    x14, x15, #14, #26
    extr    x15, x3, x15, #40

    add     x6, x6, x11, lsl#32
    add     x7, x7, x12, lsl#32
    add     x8, x8, x13, lsl#32
    add     x9, x9, x14, lsl#32
    add     x10, x10, x15, lsl#32

    fmov    d9, x6
    fmov    d10, x7
    fmov    d11, x8
    fmov    d12, x9
    fmov    d13, x10

    /*
        See NEON Crypto by Daniel J. Bernstein and Peter Schwabe
        Use base 2^26 to represent a large number: f = f[0] + f[1]<<26 + f[2]<<52 + f[3]<<78 + f[4]<<104
        Calculate h = (f * g) mod (2^130 - 5), using the NEON registers
        h[0] = f[0]g[0] + 5f[1]g[4] + 5f[2]g[3] + 5f[3]g[2] + 5f[4]g[1]
        h[1] = f[0]g[1] +  f[1]g[0] + 5f[2]g[4] + 5f[3]g[3] + 5f[4]g[2]
        h[2] = f[0]g[2] +  f[1]g[1] +  f[2]g[0] + 5f[3]g[4] + 5f[4]g[3]
        h[3] = f[0]g[3] +  f[1]g[2] +  f[2]g[1] +  f[3]g[0] + 5f[4]g[4]
        h[4] = f[0]g[4] +  f[1]g[3] +  f[2]g[2] +  f[3]g[1] +  f[4]g[0]

        NEON Polynomial Calculation Process:
          ((m[0]r^4 + m[2]r^2 + m[4])*r^4 + m[6]r^2 + m[8])*r^4 + m[10]r^2
        + ((m[1]r^4 + m[3]r^2 + m[5])*r^4 + m[7]r^2 + m[9])*r^3 + m[11]r^1

        Calculated inside the loop:
            (x[0],y[0]) = (acc, 0)
            (x[1],y[1]) = (m[2],m[3])*(r^2,r^2) + ((m[0],m[1]) + (x[0],y[0]))*(r^4,r^4)
            (x[2],y[2]) = (m[6],m[7])*(r^2,r^2) + ((m[4],m[5]) + (x[1],y[1]))*(r^4,r^4)
    */
    /* Start loop; vector registers v0-v8 hold the r pre-computation table, v24-v28 hold the ACC value */
.Lloop_neon:
    add     x1, x1, #64
    sub     x4, x4, #64

    /* Compute (m[2 + 4i], m[3 + 4i])*(r^2, r^2), stored in v19-v23 */
    /* Load (m[6 + 4i], m[7 + 4i]) and save it in v14-v18. */
    ldp     x9, x10, [x1, #-32]

    umull   v19.2d, v14.2s, v0.s[2]
    umull   v20.2d, v14.2s, v1.s[2]
    umull   v21.2d, v14.2s, v2.s[2]
    umull   v22.2d, v14.2s, v3.s[2]
    umull   v23.2d, v14.2s, v4.s[2]

    ldp     x14, x15, [x1, #-16]

    umlal   v19.2d, v15.2s, v8.s[2]
    umlal   v20.2d, v15.2s, v0.s[2]
    umlal   v21.2d, v15.2s, v1.s[2]
    umlal   v22.2d, v15.2s, v2.s[2]
    umlal   v23.2d, v15.2s, v3.s[2]

#ifdef	HITLS_BIG_ENDIAN
    rev     x9, x9
    rev     x10, x10
    rev     x14, x14
    rev     x15, x15
#endif

    and     x6, x9, #0x03ffffff
    and     x11, x14, #0x03ffffff
    ubfx    x7, x9, #26, #26
    ubfx    x12, x14, #26, #26
    extr    x8, x10, x9, #52
    extr    x13, x15, x14, #52

    umlal   v19.2d, v16.2s, v7.s[2]
    umlal   v20.2d, v16.2s, v8.s[2]
    umlal   v21.2d, v16.2s, v0.s[2]
    umlal   v22.2d, v16.2s, v1.s[2]
    umlal   v23.2d, v16.2s, v2.s[2]

    and     x8, x8, #0x03ffffff
    and     x13, x13, #0x03ffffff
    ubfx    x9, x10, #14, #26
    ubfx    x14, x15, #14, #26
    extr    x10, x3, x10, #40
    extr    x15, x3, x15, #40

    umlal   v19.2d, v17.2s, v6.s[2]
    umlal   v20.2d, v17.2s, v7.s[2]
    umlal   v21.2d, v17.2s, v8.s[2]
    umlal   v22.2d, v17.2s, v0.s[2]
    umlal   v23.2d, v17.2s, v1.s[2]

    add     x6, x6, x11, lsl#32
    add     x7, x7, x12, lsl#32
    add     x8, x8, x13, lsl#32
    add     x9, x9, x14, lsl#32
    add     x10, x10, x15, lsl#32

    umlal   v19.2d, v18.2s, v5.s[2]
    umlal   v20.2d, v18.2s, v6.s[2]
    umlal   v21.2d, v18.2s, v7.s[2]
    umlal   v22.2d, v18.2s, v8.s[2]
    umlal   v23.2d, v18.2s, v0.s[2]

    fmov    d14, x6
    fmov    d15, x7
    fmov    d16, x8
    fmov    d17, x9
    fmov    d18, x10

    /* It is not placed at the beginning of the loop because it depends on v24 to v28. */
    /* Compute ((m[0 + 4i], m[1 + 4i]) + (x[i], y[i]))*(r^4, r^4), accumulated into v19-v23 */
    /* Load (m[4 + 4i], m[5 + 4i]) and save it in v9-v13. */
    add     v9.2s, v9.2s, v24.2s
    add     v10.2s, v10.2s, v25.2s
    add     v11.2s, v11.2s, v26.2s
    add     v12.2s, v12.2s, v27.2s
    add     v13.2s, v13.2s, v28.2s

    ldp     x9, x10, [x1, #-64]

    umlal   v19.2d, v9.2s, v0.s[0]
    umlal   v20.2d, v9.2s, v1.s[0]
    umlal   v21.2d, v9.2s, v2.s[0]
    umlal   v22.2d, v9.2s, v3.s[0]
    umlal   v23.2d, v9.2s, v4.s[0]

    ldp     x14, x15, [x1, #-48]

    umlal   v19.2d, v10.2s, v8.s[0]
    umlal   v20.2d, v10.2s, v0.s[0]
    umlal   v21.2d, v10.2s, v1.s[0]
    umlal   v22.2d, v10.2s, v2.s[0]
    umlal   v23.2d, v10.2s, v3.s[0]

#ifdef	HITLS_BIG_ENDIAN
    rev     x9, x9
    rev     x10, x10
    rev     x14, x14
    rev     x15, x15
#endif

    and     x6, x9, #0x03ffffff
    and     x11, x14, #0x03ffffff
    ubfx    x7, x9, #26, #26
    ubfx    x12, x14, #26, #26
    extr    x8, x10, x9, #52
    extr    x13, x15, x14, #52

    umlal   v19.2d, v11.2s, v7.s[0]
    umlal   v20.2d, v11.2s, v8.s[0]
    umlal   v21.2d, v11.2s, v0.s[0]
    umlal   v22.2d, v11.2s, v1.s[0]
    umlal   v23.2d, v11.2s, v2.s[0]

    and     x8, x8, #0x03ffffff
    and     x13, x13, #0x03ffffff
    ubfx    x9, x10, #14, #26
    ubfx    x14, x15, #14, #26
    extr    x10, x3, x10, #40
    extr    x15, x3, x15, #40

    umlal   v19.2d, v12.2s, v6.s[0]
    umlal   v20.2d, v12.2s, v7.s[0]
    umlal   v21.2d, v12.2s, v8.s[0]
    umlal   v22.2d, v12.2s, v0.s[0]
    umlal   v23.2d, v12.2s, v1.s[0]

    add     x6, x6, x11, lsl#32
    add     x7, x7, x12, lsl#32
    add     x8, x8, x13, lsl#32
    add     x9, x9, x14, lsl#32
    add     x10, x10, x15, lsl#32

    umlal   v19.2d, v13.2s, v5.s[0]
    umlal   v20.2d, v13.2s, v6.s[0]
    umlal   v21.2d, v13.2s, v7.s[0]
    umlal   v22.2d, v13.2s, v8.s[0]
    umlal   v23.2d, v13.2s, v0.s[0]

    fmov    d9, x6
    fmov    d10, x7
    fmov    d11, x8
    fmov    d12, x9
    fmov    d13, x10

    /* Because v19-v23 significant bits may exceed 56 bits, to ensure that subsequent multiplication
       does not overflow, carry is processed twice. */
    ushr    v24.2d, v19.2d, #26
    ushr    v25.2d, v20.2d, #26
    ushr    v26.2d, v21.2d, #26
    ushr    v27.2d, v22.2d, #26
    ushr    v28.2d, v23.2d, #26
    /* Bits above 130 are multiplied by 5 and folded into the low limb */
    shl     v29.2d, v28.2d, #2
    add     v28.2d, v28.2d, v29.2d
    /* Use the AND operation to truncate the lower 26 bits. */
    and     v19.16b, v19.16b, v31.16b
    and     v20.16b, v20.16b, v31.16b
    and     v21.16b, v21.16b, v31.16b
    and     v22.16b, v22.16b, v31.16b
    and     v23.16b, v23.16b, v31.16b
    /* Add the part of the low carry */
    add     v19.2d, v19.2d, v28.2d
    add     v20.2d, v20.2d, v24.2d
    add     v21.2d, v21.2d, v25.2d
    add     v22.2d, v22.2d, v26.2d
    add     v23.2d, v23.2d, v27.2d
    /* Continue carry processing */
    ushr    v24.2d, v19.2d, #26
    ushr    v25.2d, v20.2d, #26
    ushr    v26.2d, v21.2d, #26
    ushr    v27.2d, v22.2d, #26
    ushr    v28.2d, v23.2d, #26
    shl     v29.2d, v28.2d, #2
    add     v28.2d, v28.2d, v29.2d

    and     v19.16b, v19.16b, v31.16b
    and     v20.16b, v20.16b, v31.16b
    and     v21.16b, v21.16b, v31.16b
    and     v22.16b, v22.16b, v31.16b
    and     v23.16b, v23.16b, v31.16b

    add     v19.2d, v19.2d, v28.2d
    add     v20.2d, v20.2d, v24.2d
    add     v21.2d, v21.2d, v25.2d
    add     v22.2d, v22.2d, v26.2d
    add     v23.2d, v23.2d, v27.2d

    /* The calculated (x[i + 1], y[i + 1]) is stored in v24-v28 and is reserved for the next cycle. */
    xtn     v24.2s, v19.2d
    xtn     v25.2s, v20.2d
    xtn     v26.2s, v21.2d
    xtn     v27.2s, v22.2d
    xtn     v28.2s, v23.2d

    /* End of loop, skip */
    cbnz    x4, .Lloop_neon

    /* Dealing with the tail */
    /* Compute (m[6 + 4i], m[7 + 4i])*(r^2, r^1), stored in v19-v23 */
    dup     v14.2d, v14.d[0]
    dup     v15.2d, v15.d[0]
    dup     v16.2d, v16.d[0]
    dup     v17.2d, v17.d[0]
    dup     v18.2d, v18.d[0]

    umull2  v19.2d, v14.4s, v0.4s
    umull2  v20.2d, v14.4s, v1.4s
    umull2  v21.2d, v14.4s, v2.4s
    umull2  v22.2d, v14.4s, v3.4s
    umull2  v23.2d, v14.4s, v4.4s

    umlal2  v19.2d, v15.4s, v8.4s
    umlal2  v20.2d, v15.4s, v0.4s
    umlal2  v21.2d, v15.4s, v1.4s
    umlal2  v22.2d, v15.4s, v2.4s
    umlal2  v23.2d, v15.4s, v3.4s

    umlal2  v19.2d, v16.4s, v7.4s
    umlal2  v20.2d, v16.4s, v8.4s
    umlal2  v21.2d, v16.4s, v0.4s
    umlal2  v22.2d, v16.4s, v1.4s
    umlal2  v23.2d, v16.4s, v2.4s

    umlal2  v19.2d, v17.4s, v6.4s
    umlal2  v20.2d, v17.4s, v7.4s
    umlal2  v21.2d, v17.4s, v8.4s
    umlal2  v22.2d, v17.4s, v0.4s
    umlal2  v23.2d, v17.4s, v1.4s

    umlal2  v19.2d, v18.4s, v5.4s
    umlal2  v20.2d, v18.4s, v6.4s
    umlal2  v21.2d, v18.4s, v7.4s
    umlal2  v22.2d, v18.4s, v8.4s
    umlal2  v23.2d, v18.4s, v0.4s

    /* Compute (m[4 + 4i], m[5 + 4i])*(r^4, r^3), accumulated into v19-v23 */
    add     v9.2s, v9.2s, v24.2s
    add     v10.2s, v10.2s, v25.2s
    add     v11.2s, v11.2s, v26.2s
    add     v12.2s, v12.2s, v27.2s
    add     v13.2s, v13.2s, v28.2s

    umlal   v19.2d, v9.2s, v0.2s
    umlal   v20.2d, v9.2s, v1.2s
    umlal   v21.2d, v9.2s, v2.2s
    umlal   v22.2d, v9.2s, v3.2s
    umlal   v23.2d, v9.2s, v4.2s

    umlal   v19.2d, v10.2s, v8.2s
    umlal   v20.2d, v10.2s, v0.2s
    umlal   v21.2d, v10.2s, v1.2s
    umlal   v22.2d, v10.2s, v2.2s
    umlal   v23.2d, v10.2s, v3.2s

    umlal   v19.2d, v11.2s, v7.2s
    umlal   v20.2d, v11.2s, v8.2s
    umlal   v21.2d, v11.2s, v0.2s
    umlal   v22.2d, v11.2s, v1.2s
    umlal   v23.2d, v11.2s, v2.2s

    umlal   v19.2d, v12.2s, v6.2s
    umlal   v20.2d, v12.2s, v7.2s
    umlal   v21.2d, v12.2s, v8.2s
    umlal   v22.2d, v12.2s, v0.2s
    umlal   v23.2d, v12.2s, v1.2s

    umlal   v19.2d, v13.2s, v5.2s
    umlal   v20.2d, v13.2s, v6.2s
    umlal   v21.2d, v13.2s, v7.2s
    umlal   v22.2d, v13.2s, v8.2s
    umlal   v23.2d, v13.2s, v0.2s

    /* The results are added, stored in v24-v28, and base 2^26 carry. */
    ushr    v24.2d, v19.2d, #26
    ushr    v25.2d, v20.2d, #26
    ushr    v26.2d, v21.2d, #26
    ushr    v27.2d, v22.2d, #26
    ushr    v28.2d, v23.2d, #26
    shl     v29.2d, v28.2d, #2
    add     v28.2d, v28.2d, v29.2d

    and     v19.16b, v19.16b, v31.16b
    and     v20.16b, v20.16b, v31.16b
    and     v21.16b, v21.16b, v31.16b
    and     v22.16b, v22.16b, v31.16b
    and     v23.16b, v23.16b, v31.16b

    add     v19.2d, v19.2d, v28.2d
    add     v20.2d, v20.2d, v24.2d
    add     v21.2d, v21.2d, v25.2d
    add     v22.2d, v22.2d, v26.2d
    add     v23.2d, v23.2d, v27.2d
    /* Continue carry processing */
    ushr    v24.2d, v19.2d, #26
    ushr    v25.2d, v20.2d, #26
    ushr    v26.2d, v21.2d, #26
    ushr    v27.2d, v22.2d, #26
    ushr    v28.2d, v23.2d, #26
    shl     v29.2d, v28.2d, #2
    add     v28.2d, v28.2d, v29.2d

    and     v19.16b, v19.16b, v31.16b
    and     v20.16b, v20.16b, v31.16b
    and     v21.16b, v21.16b, v31.16b
    and     v22.16b, v22.16b, v31.16b
    and     v23.16b, v23.16b, v31.16b

    add     v19.2d, v19.2d, v28.2d
    add     v20.2d, v20.2d, v24.2d
    add     v21.2d, v21.2d, v25.2d
    add     v22.2d, v22.2d, v26.2d
    add     v23.2d, v23.2d, v27.2d

    addp    v24.2d, v19.2d, v19.2d
    addp    v25.2d, v20.2d, v20.2d
    addp    v26.2d, v21.2d, v21.2d
    addp    v27.2d, v22.2d, v22.2d
    addp    v28.2d, v23.2d, v23.2d
    /* After the processing is complete, save the data. Note that the carry may not be completely processed. */
    stp     s24, s25, [x0, #CTX_acc]
    stp     s26, s27, [x0, #CTX_acc + 8]
    str     s28, [x0, #CTX_acc + 16]

    /* return */
    mov     x5, xzr
    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16
    ldp     x29, x30, [sp], #16
    and     x0, x2, #15 // The return value is the unprocessed length.
AARCH64_AUTIASP
    ret
.size Poly1305BlockNeon, .-Poly1305BlockNeon

/**
 *  Function description: This function is used to clear residual sensitive information in registers.
 *  Function prototype: void Poly1305CleanRegister();
 *  Input register: None
 *  Modified registers: v0-v7, v16-v31.
 *  Output register: None
 *  Function/Macro Call: None
 */
.text
.balign 64
.global Poly1305CleanRegister
.type Poly1305CleanRegister, %function
Poly1305CleanRegister:
AARCH64_PACIASP
    movi    v0.16b, #0
    and     v1.16b, v1.16b, v0.16b
    and     v2.16b, v2.16b, v0.16b
    and     v3.16b, v3.16b, v0.16b
    and     v4.16b, v4.16b, v0.16b
    and     v5.16b, v5.16b, v0.16b
    and     v6.16b, v6.16b, v0.16b
    and     v7.16b, v7.16b, v0.16b
    /* v8 to v15 are restored from the stack on function exit and do not need to be cleared. */
    and     v16.16b, v16.16b, v0.16b
    and     v17.16b, v17.16b, v0.16b
    and     v18.16b, v18.16b, v0.16b
    and     v19.16b, v19.16b, v0.16b
    and     v20.16b, v20.16b, v0.16b
    and     v21.16b, v21.16b, v0.16b
    and     v22.16b, v22.16b, v0.16b
    and     v23.16b, v23.16b, v0.16b
    and     v24.16b, v24.16b, v0.16b
    and     v25.16b, v25.16b, v0.16b
    and     v26.16b, v26.16b, v0.16b
    and     v27.16b, v27.16b, v0.16b
    and     v28.16b, v28.16b, v0.16b
    and     v29.16b, v29.16b, v0.16b
    and     v30.16b, v30.16b, v0.16b
    and     v31.16b, v31.16b, v0.16b
AARCH64_AUTIASP
    ret
.size Poly1305CleanRegister, .-Poly1305CleanRegister

#endif