/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)

#include "poly1305_x86_64.S"

.file   "poly1305_x86_64_avx2.S"
.text

/**
 *  Function description: x86_64 Poly1305 block processing. The result is stored in ctx->acc.
 *  Function prototype: uint32_t Poly1305Block(Poly1305_Ctx *ctx, const uint8_t *data,
 *                                             uint32_t dataLen, uint32_t padbit);
 *  Input registers:
 *        CTX: address of the Poly1305_Ctx structure
 *        INP: pointer to the input data
 *        LEN: length of the input data
 *        PADBIT: padding bit, 0 or 1.
 *  Clobbered registers: r8-r14, rbx, rbp
 *  Output register:
 *        %rax: length of the remaining data to be processed
 *  Function/Macro Call: POLY1305_MOD_MUL
 */
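/*
 * Illustrative call-site sketch (hypothetical C caller, not part of this file),
 * based only on the prototype above: the return value in %rax is the number of
 * input bytes that were not consumed and must be carried over by the caller.
 *
 *     uint32_t rest = Poly1305Block(ctx, data, dataLen, 1);  // padbit = 1 for full blocks
 *     // 'rest' bytes of 'data' remain unprocessed for a later call.
 */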
.globl  Poly1305Block
.type   Poly1305Block,@function
Poly1305Block:
.cfi_startproc
.align  32
    cmp $256, LEN
    jae .Lblock_avx_pre
    jmp Poly1305Block64Bit

.Lblock_avx_pre:
    andq $-16, LEN
    test $63, LEN
    jz  Poly1305BlockAVX2
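    /*
     * Dispatch note: inputs shorter than 256 bytes are handled entirely by the
     * scalar base-2^64 routine Poly1305Block64Bit. For longer inputs, LEN is
     * first rounded down to a multiple of 16; if that length is already a
     * multiple of 64 the code jumps straight to the AVX2 path, otherwise the
     * scalar loop below consumes 16-byte blocks until the remaining length is
     * 64-byte aligned.
     */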

.Lbase2_64_avx_body:

    push %rbx
    push %rbp
    push %r12
    push %r13
    push %r14
    push %r15

    movq LEN, %r15
    LOAD_ACC_R  CTX, R0, R1, R2, ACC1, ACC2, ACC3, %r8d, %rax
    test   %r8d, %r8d
    jz  .Lbase2_64_avx_loop

    CONVERT_26TO64_PRE  ACC1, ACC2, D1, D2, D3
    CONVERT_26TO64 ACC1, D1, ACC2, D2, D3, ACC3
    movl $0, 220(CTX)

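    /*
     * Scalar loop below: for each 16-byte block m the accumulator is updated as
     *     acc = (acc + m + padbit * 2^128) * r  mod (2^130 - 5)
     * with acc held as three 64-bit words (ACC1, ACC2, ACC3) and the modular
     * multiplication performed by the POLY1305_MOD_MUL macro.
     */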
.align 32
.Lbase2_64_avx_loop:
    addq (INP), ACC1
    adcq 8(INP), ACC2
    adcq PADBIT, ACC3
    lea 16(INP), INP

    POLY1305_MOD_MUL ACC1, ACC2, ACC3, R0, R1, R2

    subq $16, %r15
    movq R1, %rax
    test $63, %r15
    jnz .Lbase2_64_avx_loop

    movq ACC1, (CTX)
    movq ACC2, 8(CTX)
    movq ACC3, 16(CTX)
    movq %r15, LEN
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbp
    pop %rbx

    jmp Poly1305BlockAVX2
    ret
.cfi_endproc
.size  Poly1305Block, .-Poly1305Block

/**
 *  Function description: x86_64 Poly1305 AVX2 implementation
 *  Input registers:
 *        CTX: address of the Poly1305_Ctx structure
 *        INP: pointer to the input data
 *        LEN: length of the input data
 *        PADBIT: padding bit, 0 or 1.
 *  Clobbered registers: ymm0-15, r8, r9, r14, r15, rax, rbx, rdx, rbp
 *  Output register:
 *        rax: length of the remaining data to be processed
 *  Function/Macro Call:
 *         CONVERT_64TO26
 */
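/*
 * State note (inferred from the code below): the flag at 220(CTX) records whether
 * the accumulator is currently held as five 26-bit limbs (base 2^26) or as three
 * 64-bit words (base 2^64). The AVX2 path works in base 2^26 so that each limb
 * product fits in a 64-bit lane of vpmuludq.
 */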
.globl  Poly1305BlockAVX2
.type   Poly1305BlockAVX2, @function
.align  32
Poly1305BlockAVX2:
.cfi_startproc
    push %rbx
    push %rbp
    push %r14
    push %r15

    vzeroupper
    movq (CTX), ACC1                                    // load acc
    movq 8(CTX), ACC2
    movq 16(CTX), ACC3
    movl 220(CTX), %r8d
    test %r8d, %r8d
    jnz  .Lblock_avx2_pre
    movq LEN, %r15
    CONVERT_64TO26  ACC1, ACC2, ACC3, %rax, %rdx        // base2_64 --> base2_26
    movq %r15, LEN
    jmp  .Lblock_avx2_body
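    /*
     * base 2^64 -> base 2^26: the 130-bit accumulator h is re-expressed as five
     * 26-bit limbs,
     *     h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104
     * which is the representation the vectorized loop works on; the limb
     * placement itself is handled by the CONVERT_64TO26 macro from
     * poly1305_x86_64.S.
     */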

.Lblock_avx2_pre:
    movd %r14, %xmm0
    movd %rbx, %xmm2
    movd %rbp, %xmm4
    shrq $32, %r14
    shrq $32, %rbx
    movd %r14, %xmm1
    movd %rbx, %xmm3

.align  32
.Lblock_avx2_body:

    leaq 56(CTX), CTX                                   // 56(CTX)
    vmovdqu g_permd_avx2(%rip), YT0                     // g_permd_avx2
    leaq -8(%rsp), %r11

    /* Transform the precomputation table into a computable form and spill it to the stack. */
    vmovdqu (CTX), %xmm7
    vmovdqu 16(CTX), %xmm8
    subq $0x128, %rsp
    vmovdqu 32(CTX), %xmm9
    vmovdqu 48(CTX), %xmm11
    andq $-512, %rsp
    vmovdqu 64(CTX), %xmm12
    vmovdqu 80(CTX), %xmm13
    vpermd  YT2, YT0, YT2                               // 00 00 34 12 --> 14 24 34 44
    vmovdqu 96(CTX), %xmm14
    vpermd  YT3, YT0, YT3
    vmovdqu 112(CTX), %xmm15
    vpermd  YT4, YT0, YT4
    vmovdqu 128(CTX), %xmm10
    vpermd  YB0, YT0, YB0
    vmovdqa YT2, (%rsp)                                 // r0
    vpermd  YB1, YT0, YB1
    vmovdqa YT3, 0x20(%rsp)                             // r1
    vpermd  YB2, YT0, YB2
    vmovdqa YT4, 0x40(%rsp)                             // s1
    vpermd  YB3, YT0, YB3
    vmovdqa YB0, 0x60(%rsp)                             // r2
    vpermd  YB4, YT0, YB4
    vmovdqa YB1, 0x80(%rsp)                             // s2
    vpermd  YMASK, YT0, YMASK
    vmovdqa YB2, 0xa0(%rsp)                             // r3
    vmovdqa YB3, 0xc0(%rsp)                             // s3
    vmovdqa YB4, 0xe0(%rsp)                             // r4
    vmovdqa YMASK, 0x100(%rsp)                          // s4
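    /*
     * Stack layout after the spill (32-byte slots, as labelled above): r0, r1, s1,
     * r2, s2, r3, s3, r4, s4 at offsets 0x00-0x100, where r0..r4 are the 26-bit
     * limbs of the precomputed powers of r and si = 5*ri. The si entries exist
     * because the modulus is 2^130 - 5: a term that overflows past 2^130 wraps
     * around as a multiple of 5, so pre-scaling by 5 keeps each partial product a
     * single vpmuludq inside the loop.
     */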

    /* Load 4 blocks of data and convert them to base2_26 */
    vmovdqu g_mask26(%rip), YMASK                       // g_mask26
    vmovdqu (INP), %xmm5
    vmovdqu 16(INP), %xmm6
    vinserti128 $1, 32(INP), YT0, YT0
    vinserti128 $1, 48(INP), YT1, YT1
    leaq 64(INP), INP

    vpsrldq     $6, YT0, YT2
    vpsrldq     $6, YT1, YT3
    vpunpckhqdq YT1, YT0, YT4
    vpunpcklqdq YT1, YT0, YT0
    vpunpcklqdq YT3, YT2, YT2

    vpsrlq  $26, YT0, YT1
    vpsrlq  $30, YT2, YT3
    vpsrlq  $4, YT2, YT2
    vpsrlq  $40, YT4, YT4                               // 4
    vpand   YMASK, YT3, YT3                             // 3
    vpand   YMASK, YT2, YT2                             // 2
    vpor    g_129(%rip), YT4, YT4                       // padbit
    vpand   YMASK, YT1, YT1                             // 1
    vpand   YMASK, YT0, YT0                             // 0
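    /*
     * Each 16-byte block m is split into five 26-bit limbs
     *     t0 = m[0:26], t1 = m[26:52], t2 = m[52:78], t3 = m[78:104], t4 = m[104:128]
     * using the shifts and g_mask26 (2^26 - 1 per lane) above. g_129 then ORs the
     * 2^128 padding bit into the top limb; it is expected to hold 1 << 24 per
     * lane, since 2^128 = 2^104 * 2^24.
     */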

    vpaddq  YH2, YT2, YH2
    sub     $64, LEN
    jz  .Lblock_avx2_tail
    jmp .Lblock_avx2_loop

.align  32
.Lblock_avx2_loop:

    // ((inp[0]*r^4 + inp[4])*r^4 + inp[ 8])*r^4
    // ((inp[1]*r^4 + inp[5])*r^4 + inp[ 9])*r^3
    // ((inp[2]*r^4 + inp[6])*r^4 + inp[10])*r^2
    // ((inp[3]*r^4 + inp[7])*r^4 + inp[11])*r^1
    vpaddq  YH0, YT0, YH0
    vpaddq  YH1, YT1, YH1
    vpaddq  YH3, YT3, YH3
    vpaddq  YH4, YT4, YH4
    vmovdqa (%rsp), YT0                              // r0^4
    vmovdqa 0x20(%rsp), YT1                          // r1^4
    vmovdqa 0x60(%rsp), YT2                          // r2^4
    vmovdqa 0xc0(%rsp), YT3                          // s3^4
    vmovdqa 0x100(%rsp), YMASK                       // s4^4

    // b4 = h4*r0^4 + h3*r1^4 + h2*r2^4 + h1*r3^4 + h0*r4^4
    // b3 = h3*r0^4 + h2*r1^4 + h1*r2^4 + h0*r3^4 + h4*s4^4
    // b2 = h2*r0^4 + h1*r1^4 + h0*r2^4 + h4*s3^4 + h3*s4^4
    // b1 = h1*r0^4 + h0*r1^4 + h4*s2^4 + h3*s3^4 + h2*s4^4
    // b0 = h0*r0^4 + h4*s1^4 + h3*s2^4 + h2*s3^4 + h1*s4^4
    //
    // First calculate h2; the above formulas can be rearranged as
    //
    // b4 = h2*r2^4 + h4*r0^4 + h3*r1^4 +         + h1*r3^4 + h0*r4^4
    // b3 = h2*r1^4 + h3*r0^4 +         + h1*r2^4 + h0*r3^4 + h4*s4^4
    // b2 = h2*r0^4 +         + h1*r1^4 + h0*r2^4 + h4*s3^4 + h3*s4^4
    // b1 = h2*s4^4 + h1*r0^4 + h0*r1^4 + h4*s2^4 + h3*s3^4 +
    // b0 = h2*s3^4 + h0*r0^4 + h4*s1^4 + h3*s2^4 +         + h1*s4^4
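    //
    // Overflow note (standard radix-2^26 argument): ri < 2^26, si = 5*ri < 2^29,
    // and the hi limbs stay within a couple of bits of 2^26 thanks to the lazy
    // carry pass below, so each vpmuludq product is around 2^56 and the sum of
    // five such products remains well below 2^64.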

    vpmuludq    YH2, YT0, YB2                          // b2 = h2 * r0^4
    vpmuludq    YH2, YT1, YB3                          // b3 = h2 * r1^4
    vpmuludq    YH2, YT2, YB4                          // b4 = h2 * r2^4
    vpmuludq    YH2, YT3, YB0                          // b0 = h2 * s3^4
    vpmuludq    YH2, YMASK, YB1                        // b1 = h2 * s4^4

    vpmuludq    YH1, YT1, YT4                          // h1 * r1^4     (available scratch registers: T4, H2)
    vpmuludq    YH0, YT1, YH2                          // h0 * r1^4
    vpaddq      YT4, YB2, YB2                          // b2 += h1 * r1^4
    vpaddq      YH2, YB1, YB1                          // b1 += h0 * r1^4
    vpmuludq    YH3, YT1, YT4                          // h3 * r1^4
    vpmuludq    0x40(%rsp), YH4, YH2                   // h4 * s1^4
    vpaddq      YT4, YB4, YB4                          // b4 += h3 * r1^4
    vpaddq      YH2, YB0, YB0                          // b0 += h4 * s1^4
    vmovdqa     0x80(%rsp), YT1                        // load s2^4

    vpmuludq    YH4, YT0, YT4                          // h4 * r0^4     (available scratch registers: T4, H2)
    vpmuludq    YH3, YT0, YH2                          // h3 * r0^4
    vpaddq      YT4, YB4, YB4                          // b4 += h4 * r0^4
    vpaddq      YH2, YB3, YB3                          // b3 += h3 * r0^4
    vpmuludq    YH0, YT0, YT4                          // h0 * r0^4
    vpmuludq    YH1, YT0, YH2                          // h1 * r0^4
    vpaddq      YT4, YB0, YB0                          // b0 += h0 * r0^4
    vpaddq      YH2, YB1, YB1                          // b1 += h1 * r0^4
    vmovdqu     (INP), %xmm5                           // load input    (YT0)

    vpmuludq    YH4, YT1, YT4                          // h4 * s2^4
    vpmuludq    YH3, YT1, YH2                          // h3 * s2^4
    vinserti128    $1, 32(INP), YT0, YT0
    vpaddq      YT4, YB1, YB1                          // b1 += h4 * s2^4
    vpaddq      YH2, YB0, YB0                          // b0 += h3 * s2^4
    vpmuludq    YH1, YT2, YT4                          // h1 * r2^4     (available scratch registers: T4, H2)
    vpmuludq    YH0, YT2, YH2                          // h0 * r2^4
    vmovdqu     16(INP), %xmm6                         // load input    (YT1)
    vpaddq      YT4, YB3, YB3                          // b3 += h1 * r2^4
    vpaddq      YH2, YB2, YB2                          // b2 += h0 * r2^4
    vinserti128    $1, 48(INP), YT1, YT1
    vmovdqa     0xa0(%rsp), YH2                        // load r3^4
    leaq    64(INP), INP

    vpmuludq    YH1, YH2, YT4                          // h1 * r3^4     (available scratch registers: T4, H2)
    vpmuludq    YH0, YH2, YH2                          // h0 * r3^4
    vpsrldq     $6, YT0, YT2
    vpaddq      YT4, YB4, YB4                          // b4 += h1 * r3^4
    vpaddq      YH2, YB3, YB3                          // b3 += h0 * r3^4
    vpmuludq    YH4, YT3, YT4                          // h4 * s3^4
    vpmuludq    YH3, YT3, YH2                          // h3 * s3^4
    vpsrldq     $6, YT1, YT3
    vpaddq      YT4, YB2, YB2                          // b2 += h4 * s3^4
    vpaddq      YH2, YB1, YB1                          // b1 += h3 * s3^4   (finish)
    vpunpckhqdq YT1, YT0, YT4

    vpmuludq    YH3, YMASK, YH3                        // h3 * s4^4
    vpmuludq    YH4, YMASK, YH4                        // h4 * s4^4
    vpunpcklqdq YT1, YT0, YT0
    vpaddq  YB2, YH3, YH2                              // h2 += h3 * s4^4   (finish)
    vpaddq  YB3, YH4, YH3                              // h3 += h4 * s4^4   (finish)
    vpunpcklqdq YT3, YT2, YT3
    vpmuludq    0xe0(%rsp), YH0, YH4                   // h0 * r4^4
    vpmuludq    YH1, YMASK, YH0                        // h1 * s4^4
    vmovdqu     g_mask26(%rip), YMASK
    vpaddq  YH4, YB4, YH4                              // h4 += h0 * r4^4   (finish)
    vpaddq  YH0, YB0, YH0                              // h0 += h1 * s4^4   (finish)

    // reduction
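    // Lazy carry chain: each limb is trimmed to 26 bits and its carry is added
    // into the next limb (h0->h1->h2->h3->h4). The carry out of h4 represents a
    // multiple of 2^130, and since 2^130 = 5 (mod 2^130 - 5) it is folded back
    // into h0 as carry + 4*carry (the vpsllq $2 / vpaddq pair), interleaved with
    // the base-2^26 split of the next four input blocks.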
    vpsrlq      $26, YH3, YB3
    vpand       YMASK, YH3, YH3
    vpaddq      YB3, YH4, YH4                          // h3 -> h4
    vpsrlq      $26, YH0, YB0
    vpand       YMASK, YH0, YH0
    vpaddq      YB0, YB1, YH1                          // h0 -> h1
    vpsrlq      $26, YH4, YB4
    vpand       YMASK, YH4, YH4
    vpsrlq      $4, YT3, YT2
    vpsrlq      $26, YH1, YB1
    vpand       YMASK, YH1, YH1
    vpaddq      YB1, YH2, YH2                          // h1 -> h2
    vpaddq      YB4, YH0, YH0
    vpsllq      $2, YB4, YB4
    vpaddq      YB4, YH0, YH0                          // h4 -> h0
    vpand       YMASK, YT2, YT2
    vpsrlq      $26, YT0, YT1
    vpsrlq      $26, YH2, YB2
    vpand       YMASK, YH2, YH2
    vpaddq      YB2, YH3, YH3                          // h2 -> h3
    vpaddq      YT2, YH2, YH2                          // prepare next 4 blocks
    vpsrlq      $30, YT3, YT3
    vpsrlq      $26, YH0, YB0
    vpand       YMASK, YH0, YH0
    vpaddq      YB0, YH1, YH1                          // h0 -> h1
    vpsrlq      $40, YT4, YT4
    vpsrlq      $26, YH3, YB3
    vpand       YMASK, YH3, YH3
    vpaddq      YB3, YH4, YH4                          // h3 -> h4

    vpand      YMASK, YT0, YT0                         // new input 0
    vpand      YMASK, YT1, YT1                         // new input 1
    vpand      YMASK, YT3, YT3                         // new input 3
    vpor       g_129(%rip), YT4, YT4                   // new input 4, padbit

    subq $64, LEN
    jnz .Lblock_avx2_loop

.Lblock_avx2_tail:
    BLOCK4_AVX2_TAIL   YT0, YT1, YT2, YT3, YT4, YH0, YH1, YH2, YH3, YH4, YB0, YB1, YB2, YB3, YB4, YMASK, %rsp

    vmovd       %xmm0, -56(CTX)
    vmovd       %xmm1, -52(CTX)
    vmovd       %xmm2, -48(CTX)
    vmovd       %xmm3, -44(CTX)
    vmovd       %xmm4, -40(CTX)
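    // CTX was advanced by 56 bytes before the table spill, so the five vmovd
    // stores above write the reduced base-2^26 accumulator limbs back to the
    // start of the Poly1305_Ctx structure (ctx->acc), 4 bytes per limb.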
    vzeroupper
    leaq     8(%r11), %rsp
    pop %r15
    pop %r14
    pop %rbp
    pop %rbx
    movq LEN, %rax
    ret
.cfi_endproc
.size  Poly1305BlockAVX2, .-Poly1305BlockAVX2

/**
 *  Function description: This function clears residual sensitive information from registers.
 *  Function prototype: void Poly1305CleanRegister();
 */
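/*
 * Note: vzeroall clears only the vector registers (ymm0-ymm15 and their xmm
 * halves); general-purpose registers are not affected by this function.
 */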
.globl  Poly1305CleanRegister
.type   Poly1305CleanRegister, @function
Poly1305CleanRegister:
.cfi_startproc
    vzeroall
    ret
.cfi_endproc
.size  Poly1305CleanRegister, .-Poly1305CleanRegister

#endif
