/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)

#include "poly1305_x86_64_macro.s"

/*
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 *
 * Register aliases (CTX, INP, LEN, PADBIT, ACC1-ACC3, R0-R2, D1-D3) and the
 * macros POLY1305_MOD_MUL, LOAD_ACC_R, CONVERT_26TO64_PRE and CONVERT_26TO64
 * come from poly1305_x86_64_macro.s.
 * NOTE(review): the code below mixes aliases with raw registers, which implies
 * ACC1 = r14, R0 = r11, D1 = r8 and D2 = r9; confirm against the macro file.
 *
 * Context offsets used in this file:
 *     0 / 8 / 16   accumulator
 *     24 / 32      r (two 64-bit words)
 *     40 / 48      s (two 64-bit words)
 *     56 ..        table of powers of r in 26-bit limbs
 *     220          flag; when non-zero the accumulator is converted with
 *                  CONVERT_26TO64 before being used
 */

.file "poly1305_x86_64.S"
.text

/**
 *  Function description: Initializes the pre-computation table and clears the flag.
 *  Function prototype: void Poly1305InitForAsm(Poly1305_Ctx *ctx);
 *  Input register:
 *         CTX: address of the Poly1305_Ctx structure
 *  Modify the register: rax, rdx, rbx, rbp, r8, r9, r11-r14.
 *  Output register: None
 *  Function/Macro Call: POLY1305_MOD_MUL
 *
 *  Table layout, offsets relative to ctx + 56. Every limb occupies a 16-byte
 *  row holding four 32-bit values in the order r^2, r, r^4, r^3:
 *         row   0: limb 0
 *         row  16: limb 1        row  32: 5 * limb 1
 *         row  48: limb 2        row  64: 5 * limb 2
 *         row  80: limb 3        row  96: 5 * limb 3
 *         row 112: limb 4        row 128: 5 * limb 4
 */
.globl Poly1305InitForAsm
.type Poly1305InitForAsm, @function
.align 32
Poly1305InitForAsm:
.cfi_startproc
    push %rbx
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %rbx, 0
    push %rbp
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %rbp, 0
    push %r12
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %r12, 0
    push %r13
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %r13, 0
    push %r14
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %r14, 0

    movl $0, 220(CTX)                                   // flag bit Clear
    movq 24(CTX), R0
    movq 32(CTX), R1
    movq R1, R2
    shrq $2, R2
    addq R1, R2                                         // R2 = R1 + (R1 >> 2)
    lea 56(CTX), CTX                                    // CTX now points at the table
    movq R0, ACC1                                       // ACC = r, so the first multiply yields r^2
    movq R1, ACC2
    xorq ACC3, ACC3

    movq R1, %rax                                       // presumably an input of the macro; confirm
    POLY1305_MOD_MUL ACC1, ACC2, ACC3, R0, R1, R2       // r^2

    /* Split r^2 (ACC) and r (R0:R1) into 26-bit limbs side by side. */
    movl $0x3ffffff, %eax
    movl $0x3ffffff, %edx
    movq ACC1, D1
    andl %r14d, %eax
    movq R0, D2
    andl %r11d, %edx
    movl %eax, (CTX)                                    // r0^2
    shrq $26, D1
    movl %edx, 4(CTX)                                   // r0
    shrq $26, D2
    movl $0x3ffffff, %eax
    movl $0x3ffffff, %edx
    andl %r8d, %eax
    andl %r9d, %edx
    movl %eax, 16(CTX)                                  // r1^2
    lea (%rax, %rax, 4), %eax                           // * 5
    movl %edx, 20(CTX)                                  // r1
    lea (%rdx, %rdx, 4), %edx                           // * 5
    movl %eax, 32(CTX)                                  // s1^2
    shrq $26, D1
    movl %edx, 36(CTX)                                  // s1
    shrq $26, D2

    /* Limb 2 straddles the two 64-bit words: 12 bits from the low word, 14 from the high. */
    movq ACC2, %rax
    movq R1, %rdx
    shlq $12, %rax
    shlq $12, %rdx
    orq D1, %rax
    orq D2, %rdx
    andl $0x3ffffff, %eax
    andl $0x3ffffff, %edx
    movl %eax, 48(CTX)                                  // r2^2
    lea (%rax, %rax, 4), %eax
    movl %edx, 52(CTX)                                  // r2
    lea (%rdx, %rdx, 4), %edx
    movl %eax, 64(CTX)                                  // s2^2
    movq ACC2, D1
    movl %edx, 68(CTX)                                  // s2
    movq R1, D2

    shrq $14, D1
    movl $0x3ffffff, %eax
    shrq $14, D2
    movl $0x3ffffff, %edx
    andl %r8d, %eax
    andl %r9d, %edx
    movl %eax, 80(CTX)                                  // r3^2
    lea (%rax, %rax, 4), %eax
    movl %edx, 84(CTX)                                  // r3
    lea (%rdx, %rdx, 4), %edx
    movl %eax, 96(CTX)                                  // s3^2
    shrq $26, D1
    movl %edx, 100(CTX)                                 // s3
    shrq $26, D2

    /* Only the r^2 column merges ACC3 into the top limb; the r column uses D2 as is. */
    movq ACC3, %rax
    shlq $24, %rax
    orq %rax, D1
    movl %r8d, 112(CTX)                                 // r4^2
    lea (D1, D1, 4), D1
    movl %r9d, 116(CTX)                                 // r4
    lea (D2, D2, 4), D2
    movl %r8d, 128(CTX)                                 // s4^2
    movl %r9d, 132(CTX)                                 // s4

    movq R1, %rax
    POLY1305_MOD_MUL ACC1, ACC2, ACC3, R0, R1, R2       // r^3
    movq ACC1, D1
    movl $0x3ffffff, %edx
    andl %r8d, %edx
    movl %edx, 12(CTX)                                  // r0^3
    shrq $26, D1
    movl $0x3ffffff, %edx
    andl %r8d, %edx
    movl %edx, 28(CTX)                                  // r1^3
    lea (%rdx, %rdx, 4), %edx
    shrq $26, D1
    movl %edx, 44(CTX)                                  // s1^3
    movq ACC2, %rax
    shlq $12, %rax
    orq D1, %rax
    andl $0x3ffffff, %eax
    movl %eax, 60(CTX)                                  // r2^3
    lea (%rax, %rax, 4), %eax
    movq ACC2, D1
    movl %eax, 76(CTX)                                  // s2^3
    shrq $14, D1
    movl $0x3ffffff, %eax
    andl %r8d, %eax
    movl %eax, 92(CTX)                                  // r3^3
    lea (%rax, %rax, 4), %eax
    shrq $26, D1
    movl %eax, 108(CTX)                                 // s3^3
    movq ACC3, %rdx
    shlq $24, %rdx
    orq %rdx, D1
    movl %r8d, 124(CTX)                                 // r4^3
    lea (D1, D1, 4), D1
    movl %r8d, 140(CTX)                                 // s4^3

    movq R1, %rax
    POLY1305_MOD_MUL ACC1, ACC2, ACC3, R0, R1, R2       // r^4
    movq ACC1, D1
    movl $0x3ffffff, %edx
    andl %r8d, %edx
    movl %edx, 8(CTX)                                   // r0^4
    shrq $26, D1
    movl $0x3ffffff, %edx
    andl %r8d, %edx
    movl %edx, 24(CTX)                                  // r1^4
    lea (%rdx, %rdx, 4), %edx
    shrq $26, D1
    movl %edx, 40(CTX)                                  // s1^4
    movq ACC2, %rax
    shlq $12, %rax
    orq D1, %rax
    andl $0x3ffffff, %eax
    movl %eax, 56(CTX)                                  // r2^4
    lea (%rax, %rax, 4), %eax
    movq ACC2, D1
    movl %eax, 72(CTX)                                  // s2^4
    shrq $14, D1
    movl $0x3ffffff, %eax
    andl %r8d, %eax
    movl %eax, 88(CTX)                                  // r3^4
    lea (%rax, %rax, 4), %eax
    shrq $26, D1
    movl %eax, 104(CTX)                                 // s3^4
    movq ACC3, %rdx
    shlq $24, %rdx
    orq %rdx, D1
    movl %r8d, 120(CTX)                                 // r4^4
    lea (D1, D1, 4), D1
    movl %r8d, 136(CTX)                                 // s4^4

    lea -56(CTX), CTX                                   // restore the original ctx pointer
    pop %r14
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r14
    pop %r13
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r13
    pop %r12
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r12
    pop %rbp
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbp
    pop %rbx
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbx
    ret
.cfi_endproc
.size Poly1305InitForAsm, .-Poly1305InitForAsm

/**
 *  Function description: x86_64 poly1305 64-bit basic instruction implementation
 *  Input register:
 *         CTX: address of the Poly1305_Ctx structure
 *         INP: data pointer
 *         LEN: data length
 *      PADBIT: padding data
 *  Change register: r8-r15, rax, rbx, rdx, rbp
 *  Output register:
 *         rax: length of the remaining data to be processed
 *  Macro invoking: POLY1305_MOD_MUL, LOAD_ACC_R, CONVERT_26TO64_PRE, CONVERT_26TO64
 *
 *  Only whole 16-byte blocks are consumed. When LEN is below 16 the function
 *  returns LEN at once and leaves the context untouched; otherwise rax holds
 *  the tail length (LEN mod 16) on return. For a non-zero multiple of 16 the
 *  block count and the returned value (0) are unchanged from earlier behavior.
 */
.globl Poly1305Block64Bit
.type Poly1305Block64Bit, @function
.align 32
Poly1305Block64Bit:
.cfi_startproc
.Lblock_start:
    push %rbx
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %rbx, 0
    push %rbp
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %rbp, 0
    push %r12
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %r12, 0
    push %r13
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %r13, 0
    push %r14
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %r14, 0
    push %r15
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %r15, 0

    movq LEN, %r15                                      // r15 = bytes still to process
    cmpq $16, %r15
    jb .Lblock64_return                                 // no complete block: do not read INP at all

    LOAD_ACC_R CTX, R0, R1, R2, ACC1, ACC2, ACC3, %r8d, %rax
    test %r8d, %r8d                                     // r8d is compared against 0 as the flag
    jz .Lblock64_loop

    CONVERT_26TO64_PRE ACC1, ACC2, D1, D2, D3
    CONVERT_26TO64 ACC1, D1, ACC2, D2, D3, ACC3
    movl $0, 220(CTX)                                   // accumulator is converted; clear the flag

.align 32
.Lblock64_loop:
    /* ACC += 16 input bytes, with PADBIT added into the third word. */
    addq (INP), ACC1
    adcq 8(INP), ACC2
    adcq PADBIT, ACC3
    lea 16(INP), INP

    POLY1305_MOD_MUL ACC1, ACC2, ACC3, R0, R1, R2

    subq $16, %r15
    movq R1, %rax                                       // reload rax before the next macro expansion
    cmpq $16, %r15
    jae .Lblock64_loop                                  // continue while a whole block remains

    movq ACC1, (CTX)
    movq ACC2, 8(CTX)
    movq ACC3, 16(CTX)

.Lblock64_return:
    movq %r15, %rax                                     // return the unprocessed byte count

    pop %r15
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r15
    pop %r14
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r14
    pop %r13
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r13
    pop %r12
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r12
    pop %rbp
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbp
    pop %rbx
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbx
    ret
.cfi_endproc
.size Poly1305Block64Bit, .-Poly1305Block64Bit

/**
 *  Function description: Calculates (acc + s) mod 2^128 and outputs the final result to the specified memory.
 *  Function prototype: void Poly1305Last(Poly1305_Ctx *ctx, uint8_t mac[POLY1305_TAGSIZE]);
 *  Input register:
 *         rdi: address of the Poly1305_Ctx structure
 *         rsi: pointer to the output buffer
 *  Modify the register: rax, rcx, r14, rbx, rbp, r8-r10.
 *  Output register: None
 *  Function/Macro Call:
 *         CONVERT_26TO64_PRE
 *         CONVERT_26TO64
 *
 *  NOTE(review): when the flag at 220(ctx) is set it is cleared here, but the
 *  converted accumulator is never written back to the context. A second call
 *  on the same context would therefore skip the conversion; confirm that
 *  callers never reuse the context without re-initializing it.
 */
.globl Poly1305Last
.type Poly1305Last, @function
.align 32
Poly1305Last:
.cfi_startproc
    push %rbx
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %rbx, 0
    push %rbp
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %rbp, 0
    push %r14
    .cfi_adjust_cfa_offset 8
    .cfi_rel_offset %r14, 0

    movl 220(CTX), %r8d
    movq (CTX), ACC1
    movq 8(CTX), ACC2
    movq 16(CTX), ACC3

    test %r8d, %r8d
    jz .Lblock_last_body
    CONVERT_26TO64_PRE ACC1, ACC2, D1, D2, D3
    CONVERT_26TO64 ACC1, D1, ACC2, D2, D3, ACC3
    movl $0, 220(CTX)

.Lblock_last_body:
    /*
     * Keep acc in rcx:rax and compute acc + 5 in ACC. If acc + 5 reaches
     * bit 130 (ACC3 >> 2 is non-zero), the low 128 bits of acc + 5 are
     * selected instead of acc.
     */
    movq ACC1, %rax
    addq $5, ACC1
    movq ACC2, %rcx
    adcq $0, ACC2
    adcq $0, ACC3
    shrq $2, ACC3
    cmovnz ACC1, %rax
    cmovnz ACC2, %rcx

    addq 40(CTX), %rax                                  // + s, carry out of bit 127 is dropped
    adcq 48(CTX), %rcx
    movq %rax, (%rsi)
    movq %rcx, 8(%rsi)

    pop %r14
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r14
    pop %rbp
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbp
    pop %rbx
    .cfi_adjust_cfa_offset -8
    .cfi_restore %rbx
    ret
.cfi_endproc
.size Poly1305Last, .-Poly1305Last

#endif