• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)
18
19#include "poly1305_x86_64_macro.s"
20
21.file   "poly1305_x86_64.S"
22.text
23
/**
 *  Function description: Clears the flag word at ctx+220 and fills the pre-computation table that
 *                        starts at ctx+56 with r, r^2, r^3 and r^4. Every power is split into five
 *                        26-bit limbs r0..r4 and stored together with s1..s4 = 5 * r1..r4.
 *  Function prototype: void Poly1305InitForAsm(Poly1305_Ctx *ctx);
 *  Input register:
 *         CTX: address of the Poly1305_Ctx structure
 *              (r is read as two 64-bit words from ctx+24 and ctx+32)
 *  Table layout (byte offsets relative to ctx+56, 32-bit entries):
 *         row:    r0 +0, r1 +16, s1 +32, r2 +48, s2 +64, r3 +80, s3 +96, r4 +112, s4 +128
 *         column: +0 r^2, +4 r^1, +8 r^4, +12 r^3
 *  Modify the register: rax, rdx, r8, r9, r11 (rbx, rbp, r12-r14 are used but saved and restored).
 *         NOTE(review): POLY1305_MOD_MUL lives in poly1305_x86_64_macro.s and may clobber further
 *         scratch registers - confirm against the macro.
 *  Output register: None
 *  Function/Macro Call: POLY1305_MOD_MUL
 *  Notes:
 *         - The code mixes the aliases D1/D2/ACC1/R0 with the raw 32-bit names r8d/r9d/r14d/r11d;
 *           it therefore assumes D1=%r8, D2=%r9, ACC1=%r14, R0=%r11 in the macro file (TODO confirm).
 *         - CFI annotations follow every push/pop so that unwinders see a correct CFA and the
 *           locations of the saved callee-saved registers.
 */
.globl  Poly1305InitForAsm
.type   Poly1305InitForAsm, @function
.align  32
Poly1305InitForAsm:
.cfi_startproc
    push %rbx
    .cfi_adjust_cfa_offset 8
    .cfi_offset %rbx, -16
    push %rbp
    .cfi_adjust_cfa_offset 8
    .cfi_offset %rbp, -24
    push %r12
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r12, -32
    push %r13
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r13, -40
    push %r14
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r14, -48

    movl $0, 220(CTX)                                    // clear the flag word
    movq 24(CTX), R0                                     // low 64 bits of r
    movq 32(CTX), R1                                     // high 64 bits of r
    movq R1, R2
    shrq $2, R2
    addq R1, R2                                          // R2 = R1 + (R1 >> 2)
    lea 56(CTX), CTX                                     // CTX now points at the table; undone before return
    movq R0, ACC1                                        // accumulator = r
    movq R1, ACC2
    xorq ACC3, ACC3

    // rax is loaded with R1 ahead of every POLY1305_MOD_MUL; presumably the macro expects it there.
    movq R1, %rax
    POLY1305_MOD_MUL    ACC1, ACC2, ACC3, R0, R1, R2     // accumulator = r^2

    // r^2 (from ACC1:ACC2:ACC3) and r^1 (from R0:R1) are split side by side: eax/D1 track r^2, edx/D2 track r^1.
    movl $0x3ffffff, %eax                                // 26-bit limb mask
    movl $0x3ffffff, %edx
    movq ACC1, D1
    andl %r14d, %eax                                     // limb 0 = bits 0..25
    movq R0, D2
    andl %r11d, %edx
    movl %eax, (CTX)                                     // r0^2
    shrq $26, D1
    movl %edx, 4(CTX)                                    // r0
    shrq $26, D2
    movl $0x3ffffff, %eax
    movl $0x3ffffff, %edx
    andl %r8d, %eax                                      // limb 1 = bits 26..51
    andl %r9d, %edx
    movl %eax, 16(CTX)                                   // r1^2
    lea (%rax, %rax, 4), %eax                            // s = 5 * r
    movl %edx, 20(CTX)                                   // r1
    lea (%rdx, %rdx, 4), %edx
    movl %eax, 32(CTX)                                   // s1^2
    shrq $26, D1                                         // D1 = low word >> 52
    movl %edx, 36(CTX)                                   // s1
    shrq $26, D2

    movq ACC2, %rax
    movq R1, %rdx
    shlq $12, %rax                                       // limb 2 straddles the two 64-bit words
    shlq $12, %rdx
    orq  D1, %rax
    orq  D2, %rdx
    andl $0x3ffffff, %eax                                // limb 2 = bits 52..77
    andl $0x3ffffff, %edx
    movl %eax, 48(CTX)                                   // r2^2
    lea (%rax, %rax, 4), %eax
    movl %edx, 52(CTX)                                   // r2
    lea (%rdx, %rdx, 4), %edx
    movl %eax, 64(CTX)                                   // s2^2
    movq ACC2, D1
    movl %edx, 68(CTX)                                   // s2
    movq R1, D2

    shrq $14, D1                                         // limb 3 = bits 78..103
    movl $0x3ffffff, %eax
    shrq $14, D2
    movl $0x3ffffff, %edx
    andl %r8d, %eax
    andl %r9d, %edx
    movl %eax, 80(CTX)                                   // r3^2
    lea (%rax, %rax, 4), %eax
    movl %edx, 84(CTX)                                   // r3
    lea (%rdx, %rdx, 4), %edx
    movl %eax, 96(CTX)                                   // s3^2
    shrq $26, D1                                         // D1 = high word >> 40
    movl %edx, 100(CTX)                                  // s3
    shrq $26, D2

    movq ACC3, %rax
    shlq $24, %rax
    orq  %rax, D1                                        // limb 4 of r^2 also takes the bits held in ACC3
    movl %r8d, 112(CTX)                                  // r4^2
    lea (D1, D1, 4), D1
    movl %r9d, 116(CTX)                                  // r4
    lea (D2, D2, 4), D2
    movl %r8d, 128(CTX)                                  // s4^2
    movl %r9d, 132(CTX)                                  // s4

    movq R1, %rax
    POLY1305_MOD_MUL    ACC1, ACC2, ACC3, R0, R1, R2     // accumulator = r^3
    movq ACC1, D1
    movl $0x3ffffff, %edx
    andl %r8d, %edx
    movl %edx, 12(CTX)                                   // r0^3
    shrq $26, D1
    movl $0x3ffffff, %edx
    andl %r8d, %edx
    movl %edx, 28(CTX)                                   // r1^3
    lea (%rdx, %rdx, 4), %edx
    shrq $26, D1
    movl %edx, 44(CTX)                                   // s1^3
    movq ACC2, %rax
    shlq $12, %rax
    orq  D1, %rax
    andl $0x3ffffff, %eax
    movl %eax, 60(CTX)                                   // r2^3
    lea (%rax, %rax, 4), %eax
    movq ACC2, D1
    movl %eax, 76(CTX)                                   // s2^3
    shrq $14, D1
    movl $0x3ffffff, %eax
    andl %r8d, %eax
    movl %eax, 92(CTX)                                   // r3^3
    lea (%rax, %rax, 4), %eax
    shrq $26, D1
    movl %eax, 108(CTX)                                  // s3^3
    movq ACC3, %rdx
    shlq $24, %rdx
    orq  %rdx, D1
    movl %r8d, 124(CTX)                                  // r4^3
    lea (D1, D1, 4), D1
    movl %r8d, 140(CTX)                                  // s4^3

    movq R1, %rax
    POLY1305_MOD_MUL    ACC1, ACC2, ACC3, R0, R1, R2     // accumulator = r^4
    movq ACC1, D1
    movl $0x3ffffff, %edx
    andl %r8d, %edx
    movl %edx, 8(CTX)                                    // r0^4
    shrq $26, D1
    movl $0x3ffffff, %edx
    andl %r8d, %edx
    movl %edx, 24(CTX)                                   // r1^4
    lea (%rdx, %rdx, 4), %edx
    shrq $26, D1
    movl %edx, 40(CTX)                                   // s1^4
    movq ACC2, %rax
    shlq $12, %rax
    orq  D1, %rax
    andl $0x3ffffff, %eax
    movl %eax, 56(CTX)                                   // r2^4
    lea (%rax, %rax, 4), %eax
    movq ACC2, D1
    movl %eax, 72(CTX)                                   // s2^4
    shrq $14, D1
    movl $0x3ffffff, %eax
    andl %r8d, %eax
    movl %eax, 88(CTX)                                   // r3^4
    lea (%rax, %rax, 4), %eax
    shrq $26, D1
    movl %eax, 104(CTX)                                  // s3^4
    movq ACC3, %rdx
    shlq $24, %rdx
    orq  %rdx, D1
    movl %r8d, 120(CTX)                                  // r4^4
    lea (D1, D1, 4), D1
    movl %r8d, 136(CTX)                                  // s4^4

    lea -56(CTX), CTX                                    // restore the caller's ctx pointer
    pop %r14
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r14
    pop %r13
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r13
    pop %r12
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r12
    pop %rbp
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbp
    pop %rbx
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbx
    ret
.cfi_endproc
.size  Poly1305InitForAsm, .-Poly1305InitForAsm
201
/**
 *  Function description: x86_64 poly1305 64-bit basic instruction implementation.
 *                        For every complete 16-byte block: acc = (acc + block + PADBIT * 2^128) * r,
 *                        with the multiplication/reduction done by POLY1305_MOD_MUL.
 *  Input register:
 *      CTX: address of the Poly1305_Ctx structure
 *      INP: data pointer
 *      LEN: data length in bytes
 *      PADBIT: padding data (added into the third accumulator word of every block)
 *  Change register: r8-r11, rax, rdx (rbx, rbp, r12-r15 are used but saved and restored); INP is
 *      advanced past the consumed blocks.
 *      NOTE(review): the exact scratch set depends on the macros in poly1305_x86_64_macro.s - confirm.
 *  Output register:
 *        rax: length of the remaining data to be processed (LEN minus the bytes consumed, i.e. 0..15)
 *  Macro invoking: LOAD_ACC_R, CONVERT_26TO64_PRE, CONVERT_26TO64, POLY1305_MOD_MUL
 *  Notes:
 *      - When LEN < 16 nothing is read from INP and the context is left untouched; rax = LEN.
 *      - The loop runs while at least 16 bytes remain, so a length that is not a multiple of 16
 *        can no longer run past the end of the input. Behaviour for non-zero multiples of 16 is
 *        the same as before (rax = 0 on return).
 *      - If the flag word at ctx+220 is non-zero the accumulator is first passed through
 *        CONVERT_26TO64_PRE/CONVERT_26TO64 and the flag is cleared (presumably the flag marks a
 *        base 2^26 accumulator - confirm against the macro file).
 */
.globl  Poly1305Block64Bit
.type   Poly1305Block64Bit, @function
.align  32
Poly1305Block64Bit:
.cfi_startproc
.Lblock_start:
    push %rbx
    .cfi_adjust_cfa_offset 8
    .cfi_offset %rbx, -16
    push %rbp
    .cfi_adjust_cfa_offset 8
    .cfi_offset %rbp, -24
    push %r12
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r12, -32
    push %r13
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r13, -40
    push %r14
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r14, -48
    push %r15
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r15, -56

    movq LEN, %r15                                      // r15 = bytes still to process
    cmpq $16, %r15
    jb  .Lblock64_done                                  // no complete block: return LEN unchanged

    LOAD_ACC_R  CTX, R0, R1, R2, ACC1, ACC2, ACC3, %r8d, %rax
    test %r8d, %r8d                                     // r8d receives the flag word from LOAD_ACC_R
    jz  .Lblock64_loop

    CONVERT_26TO64_PRE  ACC1, ACC2, D1, D2, D3
    CONVERT_26TO64 ACC1, D1, ACC2, D2, D3, ACC3
    movl $0, 220(CTX)                                   // accumulator is written back as 3 x 64-bit words below

.align 32
.Lblock64_loop:

    addq (INP), ACC1                                    // acc += 16-byte block
    adcq 8(INP), ACC2
    adcq PADBIT, ACC3                                   // ... plus the padding bit(s)
    lea 16(INP), INP

    POLY1305_MOD_MUL ACC1, ACC2, ACC3, R0, R1, R2       // acc *= r (reduced by the macro)

    subq $16, %r15
    movq R1, %rax                                       // reload rax = R1 for the next POLY1305_MOD_MUL
    cmpq $16, %r15
    jae .Lblock64_loop                                  // continue while a complete block remains

    movq ACC1, (CTX)                                    // store the accumulator back into the context
    movq ACC2, 8(CTX)
    movq ACC3, 16(CTX)

.Lblock64_done:
    movq %r15, %rax                                     // return the number of unprocessed bytes

    pop %r15
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r15
    pop %r14
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r14
    pop %r13
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r13
    pop %r12
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r12
    pop %rbp
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbp
    pop %rbx
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbx
    ret
.cfi_endproc
.size  Poly1305Block64Bit, .-Poly1305Block64Bit
264
/**
 *  Function description: Performs the final reduction of the accumulator, adds the 128-bit value
 *                        stored at ctx+40 / ctx+48 modulo 2^128 and writes the 16-byte result to
 *                        the specified memory.
 *  Function prototype: void Poly1305Last(Poly1305_Ctx *ctx, uint8_t mac[POLY1305_TAGSIZE]);
 *  Input register:
 *         rdi: address of the Poly1305_Ctx structure
 *         rsi: pointer to the output buffer
 *  Modify the register: rax, rcx, r8-r10 (r14, rbx, rbp are used but saved and restored).
 *         NOTE(review): r8-r10 are touched through D1-D3 inside the conversion macros - confirm
 *         against poly1305_x86_64_macro.s.
 *  Output register: None
 *  Function/Macro Call:
 *         CONVERT_26TO64_PRE
 *         CONVERT_26TO64
 *  Notes:
 *         - If the flag word at ctx+220 is non-zero the accumulator is converted with the macros
 *           above and the flag is cleared; the converted accumulator is NOT written back to ctx.
 *         - CFI annotations follow every push/pop so that unwinders see a correct CFA and the
 *           locations of the saved callee-saved registers.
 */
.globl  Poly1305Last
.type   Poly1305Last, @function
.align  32
Poly1305Last:
.cfi_startproc
    push %rbx
    .cfi_adjust_cfa_offset 8
    .cfi_offset %rbx, -16
    push %rbp
    .cfi_adjust_cfa_offset 8
    .cfi_offset %rbp, -24
    push %r14
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r14, -32
    movl 220(CTX), %r8d                                 // flag word
    movq (CTX), ACC1                                    // load the three accumulator words
    movq 8(CTX), ACC2
    movq 16(CTX), ACC3

    test %r8d, %r8d
    jz  .Lblock_last_body
    CONVERT_26TO64_PRE  ACC1, ACC2, D1, D2, D3
    CONVERT_26TO64 ACC1, D1, ACC2, D2, D3, ACC3
    movl $0, 220(CTX)

.Lblock_last_body:
    movq ACC1, %rax                                     // rcx:rax = low 128 bits of acc
    addq $5, ACC1                                       // ACC3:ACC2:ACC1 = acc + 5
    movq ACC2, %rcx
    adcq $0, ACC2
    adcq $0, ACC3
    shrq $2, ACC3                                       // non-zero iff acc + 5 >= 2^130, i.e. acc >= 2^130 - 5
    cmovnz  ACC1, %rax                                  // then use acc + 5 (= acc - (2^130 - 5) in the low 128 bits)
    cmovnz  ACC2, %rcx

    addq 40(CTX), %rax                                  // add the 128-bit value from the context;
    adcq 48(CTX), %rcx                                  // the carry out of bit 127 is dropped (mod 2^128)
    movq %rax, (%rsi)                                   // write the 16-byte result
    movq %rcx, 8(%rsi)

    pop %r14
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r14
    pop %rbp
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbp
    pop %rbx
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbx
    ret
.cfi_endproc
.size  Poly1305Last, .-Poly1305Last
316
317#endif
318