/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)

#include "poly1305_x86_64.S"

.file   "poly1305_x86_64_avx512.S"
.text

.set    ZH0, %zmm0
.set    ZH1, %zmm1
.set    ZH2, %zmm2
.set    ZH3, %zmm3
.set    ZH4, %zmm4
.set    ZT0, %zmm5
.set    ZT1, %zmm6
.set    ZT2, %zmm7
.set    ZT3, %zmm8
.set    ZT4, %zmm9
.set    ZMASK, %zmm10
.set    ZB0, %zmm11
.set    ZB1, %zmm12
.set    ZB2, %zmm13
.set    ZB3, %zmm14
.set    ZB4, %zmm15
.set    ZR0, %zmm16
.set    ZR1, %zmm17
.set    ZR2, %zmm18
.set    ZR3, %zmm19
.set    ZR4, %zmm20
.set    ZS1, %zmm21
.set    ZS2, %zmm22
.set    ZS3, %zmm23
.set    ZS4, %zmm24
.set    ZM0, %zmm25
.set    ZM1, %zmm26
.set    ZM2, %zmm27
.set    ZM3, %zmm28
.set    ZM4, %zmm29
.set    PADBIT_ZMM, %zmm30

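// vpermd index table: with this index vector, 64-bit lane i of the destination
// receives the dword pair {src[0], src[i]}; it is used below to interleave the
// precomputed powers of r into a single zmm register.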
.align  64
g_permd_avx512:
    .long  0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7
.size   g_permd_avx512, .-g_permd_avx512

/**
 *  Function description: x86_64 Poly1305 block-processing entry. The result is stored in ctx->acc.
 *  Function prototype: uint32_t Poly1305Block(Poly1305_Ctx *ctx, const uint8_t *data, uint32_t dataLen, uint32_t padbit);
 *  Input registers:
 *        CTX: address of the Poly1305_Ctx structure
 *        INP: pointer to the input data
 *        LEN: length of the input data
 *        PADBIT: padding bit, 0 or 1.
 *  Modified registers: r8-r15, rbx, rbp, rdx, rax
 *  Output register:
 *        %rax: length of the remaining data to be processed
 *  Function/Macro calls: Poly1305Block64Bit, LOAD_ACC_R, CONVERT_26TO64_PRE, CONVERT_26TO64, POLY1305_MOD_MUL, Poly1305BlockAVX512
 */
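/*
 *  Dispatch: inputs shorter than 256 bytes are handled entirely by the scalar
 *  base 2^64 routine Poly1305Block64Bit. Longer inputs are first trimmed to a
 *  16-byte multiple, then consumed 16 bytes at a time in base 2^64 until the
 *  remaining length is a multiple of 64 bytes, at which point processing falls
 *  through to Poly1305BlockAVX512.
 */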
.globl  Poly1305Block
.type   Poly1305Block, @function
Poly1305Block:
.cfi_startproc
.align  32
    cmp $256, LEN
    jae .Lblock_avx_pre
    call Poly1305Block64Bit
    ret

.Lblock_avx_pre:
    andq $-16, LEN
    test $63, LEN
    jz  Poly1305BlockAVX512

.Lbase2_64_avx_body:

    push %rbx
    push %rbp
    push %r12
    push %r13
    push %r14
    push %r15

    movq LEN, %r15
    movq (CTX), ACC1                     // load acc
    LOAD_ACC_R  CTX, R0, R1, R2, ACC1, ACC2, ACC3, %r8d, %rax
    test   %r8d, %r8d
    jz  .Lbase2_64_avx_loop

    CONVERT_26TO64_PRE  ACC1, ACC2, D1, D2, D3
    CONVERT_26TO64 ACC1, D1, ACC2, D2, D3, ACC3
    movl $0, 220(CTX)

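    // Base 2^64 loop: for each 16-byte block, h += block (with PADBIT as bit 128),
    // then h = h * r mod 2^130 - 5 via POLY1305_MOD_MUL.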
.align 32
.Lbase2_64_avx_loop:
    addq (INP), ACC1
    adcq 8(INP), ACC2
    adcq PADBIT, ACC3
    lea 16(INP), INP

    POLY1305_MOD_MUL ACC1, ACC2, ACC3, R0, R1, R2

    subq $16, %r15
    test $63, %r15
    movq R1, %rax
    jnz .Lbase2_64_avx_loop

    movq ACC1, (CTX)
    movq ACC2, 8(CTX)
    movq ACC3, 16(CTX)
    movq %r15, LEN
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbp
    pop %rbx

    jmp Poly1305BlockAVX512
    ret
.cfi_endproc
.size  Poly1305Block, .-Poly1305Block

/**
 *  Function description: AVX-512 accelerated Poly1305 block processing for x86_64
 *  Input registers:
 *      CTX: address of the Poly1305_Ctx structure
 *      INP: pointer to the input data
 *      LEN: length of the input data
 *      PADBIT: padding bit, 0 or 1.
 *  Modified registers: zmm0-31, rax, rsp, r11, rcx, rdi, k1-k3
 *  Output register:
 *      rax: length of the remaining data to be processed
 *  Function/Macro calls:
 *        CONVERT_64TO26, BLOCK4_AVX2_TAIL
 */
.globl  Poly1305BlockAVX512
.type   Poly1305BlockAVX512, @function
.align  32
Poly1305BlockAVX512:
.cfi_startproc
    push %rbx
    push %rbp
    push %r12
    push %r13
    push %r14
    push %r15

    vzeroupper
    movq (CTX), ACC1
    movq 8(CTX), ACC2
    movq 16(CTX), ACC3
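    // 220(CTX) is a radix flag: non-zero means the accumulator is currently held
    // in base 2^26 limbs, zero means it is in base 2^64 and must be converted first.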
    movl 220(CTX), %r8d
    test %r8d, %r8d
    jnz .Lblock_avx512_pre
    movq LEN, %r15
    CONVERT_64TO26  ACC1, ACC2, ACC3, %rax, %rdx
    movq %r15, LEN
    jmp .Lblock_avx512_body

.Lblock_avx512_pre:
    movd %r14, %xmm0
    movd %rbx, %xmm2
    shrq $32, %r14
    shrq $32, %rbx
    movd %r14, %xmm1
    movd %rbx, %xmm3
    movd %rbp, %xmm4

.Lblock_avx512_body:

    movl $15, %eax
    kmovw %eax, %k2
    leaq -8(%rsp), %r11
    subq $0x128, %rsp
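    // %r11 keeps the original stack pointer (restored via "lea 8(%r11), %rsp" at the
    // end); the stack is then aligned down to 512 bytes to hold the table of powers of r.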
    leaq 56(CTX), CTX
    vmovdqa g_permd_avx2(%rip), YT2                        // g_permd_avx2

    // Extend the precomputed table of powers of r up to r^8
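    // The table loaded from the context holds r^1..r^4 in base 2^26; the vpmuludq block
    // below multiplies them by r^4 to obtain r^5..r^8, so eight message blocks can be
    // folded per iteration.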
    andq $-512, %rsp
    movq $0x20, %rax
    vmovdqu (CTX), %xmm11
    vmovdqu 16(CTX), %xmm12
    vmovdqu 32(CTX), %xmm5
    vmovdqu 48(CTX), %xmm13
    vmovdqu 64(CTX), %xmm6
    vmovdqu 80(CTX), %xmm14
    vpermd  ZB0, ZT2, ZR0                                 // 00 00 34 12 -> 14 24 34 44
    vmovdqu 96(CTX), %xmm8
    vpbroadcastq     g_mask26(%rip), ZMASK                // g_mask26
    vmovdqu 112(CTX), %xmm15
    vpermd  ZB1, ZT2, ZR1
    vmovdqu 128(CTX), %xmm9
    vpermd  ZT0, ZT2, ZS1
    vpermd  ZB2, ZT2, ZR2
    vmovdqa64   ZR0, (%rsp){%k2}
    vpsrlq  $32, ZR0, ZT0                                 // 14 24 34 44 -> 01 02 03 04
    vpermd  ZT1, ZT2, ZS2
    vmovdqu64   ZR1, (%rsp, %rax){%k2}
    vpsrlq  $32, ZR1, ZT1
    vpermd  ZB3, ZT2, ZR3
    vmovdqa64   ZS1, 0x40(%rsp){%k2}
    vpermd  ZT3, ZT2, ZS3
    vmovdqu64   ZR2, 0x40(%rsp, %rax){%k2}
    vpermd  ZB4, ZT2, ZR4
    vmovdqa64   ZS2, 0x80(%rsp){%k2}
    vpermd  ZT4, ZT2, ZS4
    vmovdqu64   ZR3, 0x80(%rsp, %rax){%k2}
    vmovdqa64   ZS3, 0xc0(%rsp){%k2}
    vmovdqu64   ZR4, 0xc0(%rsp, %rax){%k2}
    vmovdqa64   ZS4, 0x100(%rsp){%k2}

    vpmuludq    ZT0, ZR0, ZB0
    vpmuludq    ZT0, ZR1, ZB1
    vpmuludq    ZT0, ZR2, ZB2
    vpmuludq    ZT0, ZR3, ZB3
    vpmuludq    ZT0, ZR4, ZB4
    vpsrlq      $32, ZR2, ZT2

    vpmuludq    ZT1, ZS4, ZM0
    vpmuludq    ZT1, ZR0, ZM1
    vpmuludq    ZT1, ZR1, ZM2
    vpmuludq    ZT1, ZR2, ZM3
    vpmuludq    ZT1, ZR3, ZM4
    vpsrlq      $32, ZR3, ZT3
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM1, ZB1, ZB1
    vpaddq      ZM2, ZB2, ZB2
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4

    vpmuludq    ZT2, ZS3, ZM0
    vpmuludq    ZT2, ZS4, ZM1
    vpmuludq    ZT2, ZR0, ZM2
    vpmuludq    ZT2, ZR1, ZM3
    vpmuludq    ZT2, ZR2, ZM4
    vpsrlq      $32, ZR4, ZT4
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM1, ZB1, ZB1
    vpaddq      ZM2, ZB2, ZB2
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4

    vpmuludq    ZT3, ZS2, ZM0
    vpmuludq    ZT3, ZS3, ZM1
    vpmuludq    ZT3, ZS4, ZM2
    vpmuludq    ZT3, ZR0, ZM3
    vpmuludq    ZT3, ZR1, ZM4
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM1, ZB1, ZB1
    vpaddq      ZM2, ZB2, ZB2
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4

    vpmuludq    ZT4, ZS1, ZM0
    vpmuludq    ZT4, ZS2, ZM1
    vpmuludq    ZT4, ZS3, ZM2
    vpmuludq    ZT4, ZS4, ZM3
    vpmuludq    ZT4, ZR0, ZM4
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM1, ZB1, ZB1
    vpaddq      ZM2, ZB2, ZB2
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4

    // reduction
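    // Carry propagation over the five 26-bit limbs: each limb is reduced below 2^26
    // and its carry is added into the next one; the carry out of limb 4 wraps into
    // limb 0 multiplied by 5 (add once, then shift left by 2 and add again), since
    // 2^130 ≡ 5 (mod 2^130 - 5).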
    vpsrlq      $26, ZB3, ZM3
    vpandq      ZMASK, ZB3, ZB3
    vpaddq      ZM3, ZB4, ZB4                               // d3 -> d4
    vpsrlq      $26, ZB0, ZM0
    vpandq      ZMASK, ZB0, ZB0
    vpaddq      ZM0, ZB1, ZB1                               // d0 -> d1
    vpsrlq      $26, ZB4, ZM4
    vpandq      ZMASK, ZB4, ZB4
    vmovdqu64   (INP), ZT3
    vmovdqu64   64(INP), ZT4
    leaq    128(INP), INP
    vpsrlq      $26, ZB1, ZM1
    vpandq      ZMASK, ZB1, ZB1
    vpaddq      ZM1, ZB2, ZB2                               // d1 -> d2
    vpaddq      ZM4, ZB0, ZB0
    vpsllq      $2, ZM4, ZM4
    vpaddq      ZM4, ZB0, ZB0                               // d4 -> d0
    vpsrlq      $26, ZB2, ZM2
    vpandq      ZMASK, ZB2, ZB2
    vpaddq      ZM2, ZB3, ZB3                               // d2 -> d3
    vpsrlq      $26, ZB0, ZM0
    vpandq      ZMASK, ZB0, ZB0
    vpaddq      ZM0, ZB1, ZB1                               // d0 -> d1
    vpsrlq      $26, ZB3, ZM3
    vpandq      ZMASK, ZB3, ZB3
    vpaddq      ZM3, ZB4, ZB4                               // d3 -> d4

    vpunpcklqdq ZT4, ZT3, ZT0
    vpunpckhqdq ZT4, ZT3, ZT4

    // Rearrange R and S into the interleaved layout used by the vector loop.
    vmovdqu32   g_permd_avx512(%rip), ZM0                   // g_permd_avx512
    movl    $0x7777, %eax
    kmovw   %eax, %k1
    vpermd      ZR0, ZM0, ZR0                               // 14 24 34 44 -> 1444 2444 3444 4444
    vpermd      ZR1, ZM0, ZR1
    vpermd      ZR2, ZM0, ZR2
    vpermd      ZR3, ZM0, ZR3
    vpermd      ZR4, ZM0, ZR4
    vpermd      ZB0, ZM0, ZR0{%k1}                          // 05 06 07 08 and 1444 2444 3444 4444 -> 1858 2868 3878 4888
    vpermd      ZB1, ZM0, ZR1{%k1}
    vpermd      ZB2, ZM0, ZR2{%k1}
    vpermd      ZB3, ZM0, ZR3{%k1}
    vpermd      ZB4, ZM0, ZR4{%k1}

    vpslld      $2, ZR1, ZS1
    vpslld      $2, ZR2, ZS2
    vpslld      $2, ZR3, ZS3
    vpslld      $2, ZR4, ZS4
    vpaddd      ZR1, ZS1, ZS1
    vpaddd      ZR2, ZS2, ZS2
    vpaddd      ZR3, ZS3, ZS3
    vpaddd      ZR4, ZS4, ZS4

    // Split the input message blocks into 26-bit limbs for the vector loop.
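    // Each 16-byte block is held as two 64-bit halves (low halves in ZT0, high halves
    // in ZT4 after the unpack above): limb0 = lo & mask26, limb1 = (lo >> 26) & mask26,
    // limb2 = ((lo >> 52) | (hi << 12)) & mask26, limb3 = (hi >> 14) & mask26,
    // limb4 = (hi >> 40) | padbit constant, implementing the m + 2^128 padding of Poly1305.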
    vpbroadcastq    g_129(%rip), PADBIT_ZMM                 // g_129
    vpsrlq      $52, ZT0, ZT2
    vpsllq      $12, ZT4, ZT3
    vporq       ZT3, ZT2, ZT2
    vpsrlq      $26, ZT0, ZT1
    vpsrlq      $14, ZT4, ZT3
    vpsrlq      $40, ZT4, ZT4                               // 4
    vpandq      ZMASK, ZT0, ZT0                             // 0
    vpandq      ZMASK, ZT2, ZT2                             // 2

    vpaddq      ZH2, ZT2, ZH2
    subq    $192, LEN
    jbe     .Lblock_avx512_tail
    jmp     .Lblock_avx512_loop

.align  32
.Lblock_avx512_loop:

    // ((inp[0] * r^8 + inp[ 8]) * r^8 + inp[16]) * r^8
    // ((inp[1] * r^8 + inp[ 9]) * r^8 + inp[17]) * r^7
    // ((inp[2] * r^8 + inp[10]) * r^8 + inp[18]) * r^6
    // ((inp[3] * r^8 + inp[11]) * r^8 + inp[19]) * r^5
    // ((inp[4] * r^8 + inp[12]) * r^8 + inp[20]) * r^4
    // ((inp[5] * r^8 + inp[13]) * r^8 + inp[21]) * r^3
    // ((inp[6] * r^8 + inp[14]) * r^8 + inp[22]) * r^2
    // ((inp[7] * r^8 + inp[15]) * r^8 + inp[23]) * r^1

    // b3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
    // b4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
    // b0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
    // b1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
    // b2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
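    // ZS1..ZS4 hold 5*r1..5*r4 (computed above with vpslld $2 + vpaddd); they supply
    // the 5*r terms in the formulas above.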

    vpmuludq    ZH2, ZR1, ZB3
    vpandq      ZMASK, ZT1, ZT1                             // 1
    vpmuludq    ZH2, ZR2, ZB4
    vpandq      ZMASK, ZT3, ZT3                             // 3
    vpmuludq    ZH2, ZS3, ZB0
    vporq       PADBIT_ZMM, ZT4, ZT4
    vpmuludq    ZH2, ZS4, ZB1
    vpaddq      ZH0, ZT0, ZH0
    vpmuludq    ZH2, ZR0, ZB2
    vpaddq      ZH1, ZT1, ZH1
    vpaddq      ZH3, ZT3, ZH3
    vpaddq      ZH4, ZT4, ZH4
    vmovdqu64   (INP), ZT3
    vmovdqu64   64(INP), ZT4
    lea   128(INP), INP

    vpmuludq    ZH0, ZR3, ZM3
    vpmuludq    ZH0, ZR4, ZM4
    vpmuludq    ZH0, ZR0, ZM0
    vpmuludq    ZH0, ZR1, ZM1
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM1, ZB1, ZB1

    vpmuludq    ZH1, ZR2, ZM3
    vpmuludq    ZH1, ZR3, ZM4
    vpmuludq    ZH1, ZS4, ZM0
    vpmuludq    ZH0, ZR2, ZM2
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM2, ZB2, ZB2
    vpunpcklqdq ZT4, ZT3, ZT0
    vpunpckhqdq ZT4, ZT3, ZT4

    vpmuludq    ZH3, ZR0, ZM3
    vpmuludq    ZH3, ZR1, ZM4
    vpmuludq    ZH1, ZR0, ZM1
    vpmuludq    ZH1, ZR1, ZM2
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4
    vpaddq      ZM1, ZB1, ZB1
    vpaddq      ZM2, ZB2, ZB2

    vpmuludq    ZH4, ZS4, ZM3
    vpmuludq    ZH4, ZR0, ZM4
    vpmuludq    ZH3, ZS2, ZM0
    vpmuludq    ZH3, ZS3, ZM1
    vpmuludq    ZH3, ZS4, ZM2
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM1, ZB1, ZB1
    vpaddq      ZM2, ZB2, ZB2

    vpmuludq    ZH4, ZS1, ZM0
    vpmuludq    ZH4, ZS2, ZM1
    vpmuludq    ZH4, ZS3, ZM2
    vpaddq      ZM0, ZB0, ZH0
    vpaddq      ZM1, ZB1, ZH1
    vpaddq      ZM2, ZB2, ZH2
    vpsrlq    $52, ZT0, ZT2
    vpsllq    $12, ZT4, ZT3

    // reduction
    vpsrlq      $26, ZB3, ZH3
    vpandq      ZMASK, ZB3, ZB3
    vpaddq      ZH3, ZB4, ZH4
    vporq       ZT3, ZT2, ZT2

    vpsrlq      $26, ZH0, ZB0
    vpandq      ZMASK, ZH0, ZH0
    vpaddq      ZB0, ZH1, ZH1
    vpandq      ZMASK, ZT2, ZT2

    vpsrlq      $26, ZH4, ZB4
    vpandq      ZMASK, ZH4, ZH4
    vpsrlq      $26, ZH1, ZB1
    vpandq      ZMASK, ZH1, ZH1
    vpaddq      ZB1, ZH2, ZH2

    vpaddq      ZB4, ZH0, ZH0
    vpsllq      $2, ZB4, ZB4
    vpaddq      ZB4, ZH0, ZH0
    vpaddq      ZT2, ZH2, ZH2
    vpsrlq      $26, ZT0, ZT1

    vpsrlq      $26, ZH2, ZB2
    vpandq      ZMASK, ZH2, ZH2
    vpaddq      ZB2, ZB3, ZH3
    vpsrlq      $14, ZT4, ZT3
    vpsrlq      $40, ZT4, ZT4
    vpandq      ZMASK, ZT0, ZT0

    vpsrlq      $26, ZH0, ZB0
    vpandq      ZMASK, ZH0, ZH0
    vpaddq      ZB0, ZH1, ZH1

    vpsrlq      $26, ZH3, ZB3
    vpandq      ZMASK, ZH3, ZH3
    vpaddq      ZB3, ZH4, ZH4

    subq    $128, LEN
    ja  .Lblock_avx512_loop

.align  32
.Lblock_avx512_tail:

    vpsrlq      $32, ZR0, ZR0                               // 1858286838784888 -> 0105020603070408
    vpsrlq      $32, ZR1, ZR1
    vpsrlq      $32, ZS1, ZS1
    vpsrlq      $32, ZR2, ZR2
    vpsrlq      $32, ZS2, ZS2
    vpsrlq      $32, ZR3, ZR3
    vpsrlq      $32, ZS3, ZS3
    vpsrlq      $32, ZR4, ZR4
    vpsrlq      $32, ZS4, ZS4

    lea (INP, LEN), INP
    vpaddq      ZH0, ZT0, ZH0
    vpmuludq    ZH2, ZR1, ZB3
    vpandq      ZMASK, ZT1, ZT1
    vpmuludq    ZH2, ZR2, ZB4
    vpandq      ZMASK, ZT3, ZT3
    vpmuludq    ZH2, ZS3, ZB0
    vporq       PADBIT_ZMM, ZT4, ZT4
    vpmuludq    ZH2, ZS4, ZB1
    vpaddq      ZH1, ZT1, ZH1
    vpmuludq    ZH2, ZR0, ZB2
    vpaddq      ZH3, ZT3, ZH3
    vpaddq      ZH4, ZT4, ZH4

    vmovdqu     (INP), %xmm5
    vmovdqu     16(INP), %xmm6
    vpmuludq    ZH0, ZR3, ZM3
    vpmuludq    ZH0, ZR4, ZM4
    vpmuludq    ZH0, ZR0, ZM0
    vpmuludq    ZH0, ZR1, ZM1
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM1, ZB1, ZB1

    vinserti128    $1, 32(INP), YT0, YT0
    vinserti128    $1, 48(INP), YT1, YT1
    vpmuludq    ZH1, ZR2, ZM3
    vpmuludq    ZH1, ZR3, ZM4
    vpmuludq    ZH1, ZS4, ZM0
    vpmuludq    ZH0, ZR2, ZM2
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM2, ZB2, ZB2

    vpmuludq    ZH3, ZR0, ZM3
    vpmuludq    ZH3, ZR1, ZM4
    vpmuludq    ZH1, ZR0, ZM1
    vpmuludq    ZH1, ZR1, ZM2
    vpaddq      ZM3, ZB3, ZB3
    vpaddq      ZM4, ZB4, ZB4
    vpaddq      ZM1, ZB1, ZB1
    vpaddq      ZM2, ZB2, ZB2

    vpmuludq    ZH4, ZS4, ZM3
    vpmuludq    ZH4, ZR0, ZM4
    vpmuludq    ZH3, ZS2, ZM0
    vpmuludq    ZH3, ZS3, ZM1
    vpmuludq    ZH3, ZS4, ZM2
    vpaddq      ZM3, ZB3, ZH3
    vpaddq      ZM4, ZB4, ZB4
    vpaddq      ZM0, ZB0, ZB0
    vpaddq      ZM1, ZB1, ZB1
    vpaddq      ZM2, ZB2, ZB2

    vpmuludq    ZH4, ZS1, ZM0
    vpmuludq    ZH4, ZS2, ZM1
    vpmuludq    ZH4, ZS3, ZM2
    vpaddq      ZM0, ZB0, ZH0
    vpaddq      ZM1, ZB1, ZH1
    vpaddq      ZM2, ZB2, ZH2

    // Horizontally sum the per-lane accumulators into a single result.
    movl    $1, %eax
    kmovw   %eax, %k3
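    // Lane folding: vpermq $0xb1 swaps adjacent 64-bit lanes and the add combines them;
    // vpermq $0x2 brings the other 128-bit half's sum into lane 0; vextracti64x4 then
    // adds the upper 256-bit half, and mask k3 = 1 keeps only the lowest 64-bit lane.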
    vpermq      $0xb1, ZH0, ZB0
    vpermq      $0xb1, ZH1, ZB1
    vpermq      $0xb1, ZH2, ZB2
    vpermq      $0xb1, ZH3, ZB3
    vpermq      $0xb1, ZB4, ZH4
    vpaddq      ZB0, ZH0, ZH0
    vpaddq      ZB1, ZH1, ZH1
    vpaddq      ZB2, ZH2, ZH2
    vpaddq      ZB3, ZH3, ZH3
    vpaddq      ZB4, ZH4, ZH4
    vpermq      $0x2, ZH0, ZB0
    vpermq      $0x2, ZH1, ZB1
    vpermq      $0x2, ZH2, ZB2
    vpermq      $0x2, ZH3, ZB3
    vpermq      $0x2, ZH4, ZB4
    vpaddq      ZB0, ZH0, ZH0
    vpaddq      ZB1, ZH1, ZH1
    vpaddq      ZB2, ZH2, ZH2
    vpaddq      ZB3, ZH3, ZH3
    vpaddq      ZB4, ZH4, ZH4

    vextracti64x4   $0x1, ZH0, YB0
    vextracti64x4   $0x1, ZH1, YB1
    vextracti64x4   $0x1, ZH2, YB2
    vextracti64x4   $0x1, ZH3, YB3
    vextracti64x4   $0x1, ZH4, YB4
    vpaddq      ZB0, ZH0, ZH0{%k3}{z}
    vpaddq      ZB1, ZH1, ZH1{%k3}{z}
    vpaddq      ZB2, ZH2, ZH2{%k3}{z}
    vpaddq      ZB3, ZH3, ZH3{%k3}{z}
    vpaddq      ZB4, ZH4, ZH4{%k3}{z}

    // reduction
    vpsrlq      $26, YH3, YB3
    vpandq      YMASK, YH3, YH3
    vpaddq      YB3, YH4, YH4
    vpsrldq     $6, YT0, YT2
    vpsrldq     $6, YT1, YT3

    vpsrlq      $26, YH0, YB0
    vpandq      YMASK, YH0, YH0
    vpaddq      YB0, YH1, YH1
    vpunpckhqdq    YT1, YT0, YT4
    vpunpcklqdq    YT1, YT0, YT0
    vpunpcklqdq    YT3, YT2, YT2

    vpsrlq      $26, YH4, YB4
    vpandq      YMASK, YH4, YH4
    vpsrlq      $26, YH1, YB1
    vpandq      YMASK, YH1, YH1
    vpaddq      YB1, YH2, YH2
    vpsrlq      $30, YT2, YT3
    vpsrlq      $4, YT2, YT2

    vpaddq      YB4, YH0, YH0
    vpsllq      $2, YB4, YB4
    vpaddq      YB4, YH0, YH0
    vpsrlq      $26, YT0, YT1
    vpsrlq      $40, YT4, YT4

    vpsrlq      $26, YH2, YB2
    vpandq      YMASK, YH2, YH2
    vpaddq      YB2, YH3, YH3
    vpand       YMASK, YT2, YT2
    vpand       YMASK, YT3, YT3

    vpsrlq      $26, YH0, YB0
    vpandq      YMASK, YH0, YH0
    vpaddq      YB0, YH1, YH1
    vpaddq      YH2, YT2, YH2
    vpand       YMASK, YT1, YT1

    vpsrlq      $26, YH3, YB3
    vpand       YMASK, YH3, YH3
    vpaddq      YB3, YH4, YH4
    vpand       YMASK, YT0, YT0
    vpor        g_129(%rip), YT4, YT4

    addq    $64, LEN
    jnz     .Lblock_4_tail

    vpsubq      YT2, YH2, YH2
    jmp     .Lblock_avx512_end

.align  32
.Lblock_4_tail:
    BLOCK4_AVX2_TAIL YT0, YT1, YT2, YT3, YT4, YH0, YH1, YH2, YH3, YH4, YB0, YB1, YB2, YB3, YB4, YMASK, %rsp

.Lblock_avx512_end:
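    // Store the five 26-bit accumulator limbs back to ctx->acc; CTX still points
    // 56 bytes past the structure base, so -56..-40 address the start of the context.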
    vmovd       %xmm0, -56(CTX)
    vmovd       %xmm1, -52(CTX)
    vmovd       %xmm2, -48(CTX)
    vmovd       %xmm3, -44(CTX)
    vmovd       %xmm4, -40(CTX)
    vzeroall
    lea    8(%r11),%rsp

    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbp
    pop %rbx
    movq    LEN, %rax
    ret
.cfi_endproc
.size  Poly1305BlockAVX512, .-Poly1305BlockAVX512

/**
 *  Function description: Clears residual sensitive data from the vector registers.
 *  Function prototype: void Poly1305CleanRegister();
 *  Input registers: None
 *  Modified registers: zmm0-zmm29
 *  Output registers: None
 *  Function/Macro calls: None
 */
.globl  Poly1305CleanRegister
.type   Poly1305CleanRegister,@function
Poly1305CleanRegister:
.cfi_startproc
    vzeroall
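    // vzeroall clears zmm0-zmm15 only; zmm16-zmm29, which are outside its range,
    // are zeroed explicitly below.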
    vpxorq   ZR0, ZR0, ZR0
    vpxorq   ZR1, ZR1, ZR1
    vpxorq   ZR2, ZR2, ZR2
    vpxorq   ZR3, ZR3, ZR3
    vpxorq   ZR4, ZR4, ZR4
    vpxorq   ZS1, ZS1, ZS1
    vpxorq   ZS2, ZS2, ZS2
    vpxorq   ZS3, ZS3, ZS3
    vpxorq   ZS4, ZS4, ZS4
    vpxorq   ZM0, ZM0, ZM0
    vpxorq   ZM1, ZM1, ZM1
    vpxorq   ZM2, ZM2, ZM2
    vpxorq   ZM3, ZM3, ZM3
    vpxorq   ZM4, ZM4, ZM4
    ret
.cfi_endproc
.size  Poly1305CleanRegister, .-Poly1305CleanRegister

#endif
