/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

/*
 * Poly1305 helper macros for x86_64 (GNU as, AT&T operand order: op src, dst).
 * The file is run through the C preprocessor and is compiled only when both
 * HITLS_CRYPTO_CHACHA20 and HITLS_CRYPTO_CHACHA20POLY1305 are defined.
 * It contains constants, register aliases and macros only; no function entry
 * point is defined here.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)

.file   "poly1305_x86_64_macro.s"
.text

/*
 * Read-only vector constants, each 32 bytes (four 64-bit elements) and
 * 32-byte aligned. They are emitted into .text.
 */

/* 1 << 24 in the low dword of every 64-bit element. Not referenced in this
 * file; presumably the 2^128 padding bit expressed in limb 4 of the base 2^26
 * form (4 * 26 + 24 = 128) - TODO confirm against the users of this file. */
.align 32
g_129:
    .long 1<<24, 0, 1<<24, 0, 1<<24, 0, 1<<24, 0
.size g_129, .-g_129

/* 26-bit mask (0x3ffffff) in every 64-bit element; loaded by BLOCK4_AVX2_TAIL. */
.align 32
g_mask26:
    .long 0x3ffffff, 0, 0x3ffffff, 0, 0x3ffffff, 0, 0x3ffffff, 0
.size g_mask26, .-g_mask26

/* Eight dword indices. Not referenced in this file; presumably a vpermd
 * control vector - TODO confirm against the users of this file. */
.align 32
g_permd_avx2:
    .long 2, 2, 2, 3, 2, 0, 2, 1
.size g_permd_avx2, .-g_permd_avx2

/*
 * Register aliases. Only CTX, D1, D2 and D3 are referenced directly by the
 * macros below; every other register reaches the macros through parameters.
 * CTX/INP/LEN/PADBIT map to rdi/rsi/rdx/rcx, presumably the first four
 * System V integer arguments of the functions that use this file.
 */
.set CTX, %rdi
.set INP, %rsi
.set LEN, %rdx
.set PADBIT, %rcx

/* Scalar (base 2^64) working registers. */
.set ACC1, %r14
.set ACC2, %rbx
.set ACC3, %rbp
.set D1, %r8
.set D2, %r9
.set D3, %r10
.set R0, %r11
.set R1, %r12
.set R2, %r13

/* AVX2 working registers. */
.set YH0, %ymm0
.set YH1, %ymm1
.set YH2, %ymm2
.set YH3, %ymm3
.set YH4, %ymm4
.set YT0, %ymm5
.set YT1, %ymm6
.set YT2, %ymm7
.set YT3, %ymm8
.set YT4, %ymm9
.set YMASK, %ymm10
.set YB0, %ymm11
.set YB1, %ymm12
.set YB2, %ymm13
.set YB3, %ymm14
.set YB4, %ymm15

/**
 * Macro description: x86_64 poly1305 big number multiplication modulo basic instruction
 *                    implementation: (acc1|acc2|acc3) = (acc1|acc2|acc3) * (r0|r1) mod P
 * Input register:
 *     acc1-3: accumulator (acc1 = bits 0-63, acc2 = bits 64-127, acc3 = bits 128 and up)
 *        r0: low 64 bits of key r
 *        r1: high 64 bits of key r. This parameter is NOT referenced in the body:
 *            the first mulq takes r1 from %rax, so %rax must hold r1 on entry
 *            (LOAD_ACC_R provides such a copy through its mul output).
 *        r2: r1 + (r1 >> 2)
 *      %rax: copy of r1 (implicit input, see above)
 * Change register: acc1-3, D1-D3 (r8-r10), rax, rdx and the flags.
 *                  (The original note lists r8-r14, rbx, rbp, rax - presumably the
 *                  registers the callers bind to the parameters.)
 * Output register:
 *     acc1-3: result of the one block operation. acc3 holds the low two bits of the
 *             top limb plus a possible carry, so the value is only partially reduced.
 * Notes:
 *     - acc3 is multiplied with imulq (low 64 bits of the product only); this
 *       presumably relies on acc3 being a few bits wide - TODO confirm with callers.
 *     - Using r2 for the wrap-around products presumably relies on r1 having its low
 *       two bits clear (Poly1305 key clamping) - TODO confirm with callers.
 *     - The instruction order is carry-flag sensitive (addq/adcq pairs are separated
 *       only by flag-preserving mov instructions); do not reorder.
 */
.macro POLY1305_MOD_MUL acc1 acc2 acc3 r0 r1 r2
    mulq \acc1                      // rdx:rax = acc1 * r1 (r1 is taken from rax)
    movq %rax, D2
    movq \r0, %rax
    movq %rdx, D3

    mulq \acc1                      // acc1 * r0
    movq %rax, \acc1                // acc1 now holds the low word of the result
    movq \r0, %rax
    movq %rdx, D1

    mulq \acc2                      // acc2 * r0
    addq %rax, D2
    movq \r2, %rax
    adcq %rdx, D3

    mulq \acc2                      // acc2 * (r1 + (r1 >> 2))
    movq \acc3, \acc2               // acc2 is free now, reuse it as a copy of acc3
    addq %rax, \acc1
    adcq %rdx, D1

    imulq \r2, \acc2                // acc3 * (r1 + (r1 >> 2))
    addq \acc2, D2
    movq D1, \acc2
    adcq $0, D3

    imulq \r0, \acc3                // acc3 * r0
    mov $-4, %rax                   // mask value; mov leaves the flags untouched
    addq D2, \acc2
    adcq \acc3, D3

    // reduction: D3 holds bits 128 and up. Keep its low two bits in acc3 and fold
    // the rest back into the low word as (D3 & ~3) + (D3 >> 2) = 5 * (D3 >> 2).
    andq D3, %rax
    movq D3, \acc3
    shrq $2, D3
    andq $3, \acc3
    addq D3, %rax
    addq %rax, \acc1
    adcq $0, \acc2
    adcq $0, \acc3
.endm

/**
 * Macro description: converts 130-bit base 2^26 data into base 2^64 data.
 *                    The shift counts match the register contents produced by
 *                    CONVERT_26TO64_PRE.
 * Input register:
 *     a1: limb 0 (bits 0-25 of the big number)
 *     d1: limb 1, positioned at bit 32 of the register (shifted right by 6 to reach bit 26)
 *     a2: limb 2, positioned at bit 0
 *     d2: limb 3, positioned at bit 32 of the register (shifted right by 18 to reach bit 14)
 *     r2: word holding limb 2 at bit 0; shifted left by 52, so only the low 12 bits of
 *         limb 2 remain and land in bits 52-63 of the first output word
 *     a3: limb 4
 * Modify the register: all six parameters and the flags.
 *                      (The original note lists r8, r9, r13, r14, rbx, rbp - presumably the
 *                      registers the callers bind to the parameters.)
 * Output register:
 *     a1: bits 0 to 63 of the converted big number
 *     a2: bits 64 to 127 of the converted big number
 *     a3: bits 128 and up of the converted big number
 * Function/Macro Call: None
 */
.macro CONVERT_26TO64 a1 d1 a2 d2 r2 a3
    shrq $6, \d1                    // limb 1 -> bit 26
    shlq $52, \r2                   // low 12 bits of limb 2 -> bit 52
    shrq $12, \a2                   // remaining bits of limb 2 -> bit 0 of word 2
    addq \d1, \a1
    shrq $18, \d2                   // limb 3 -> bit 14 of word 2
    addq \r2, \a1                   // 1st 64bit; sets the carry consumed by adcq below

    adcq \d2, \a2
    movq \a3, \d1
    shlq $40, \d1                   // low part of limb 4 -> bit 40 of word 2
    shrq $24, \a3                   // high part of limb 4 -> bit 0 of word 3
    addq \d1, \a2                   // 2nd 64bit
    adcq $0, \a3                    // 3rd 64bit
.endm

/**
 * Macro description: converts 130-bit base 2^64 data to base 2^26 data.
 * Input register:
 *     a1: bits 0 to 63 of the big number
 *     a2: bits 64 to 127 of the big number
 *     a3: bits 128 and up of the big number
 * Modify the register: a1-a5, D1 and D2 (r8, r9), xmm0-xmm4, the flags, and the 32-bit
 *                      word at offset 220 of CTX (rdi).
 *                      (The original note lists r8, r9, r14, rax, rdx, rbp, rbx.)
 *                      The parameters must not be bound to D1/D2, which are used as scratch.
 * Output register:
 *     a4: limb 0 (bits 0 to 25), also copied to xmm0
 *     a5: limb 1 (bits 26 to 51), also copied to xmm1
 *     a1: limb 2 (bits 52 to 77), also copied to xmm2
 *     a2: limb 3 (bits 78 to 103), also copied to xmm3
 *     a3: limb 4 (bits 104 and up), also copied to xmm4; this limb is not masked
 * Side effect: stores 1 into the dword at 220(CTX). LOAD_ACC_R reads the same offset as the
 *              accumulator organization flag, so this presumably marks the accumulator as
 *              being in base 2^26 form - TODO confirm against the context layout.
 * Function/Macro Call: None
 */
.macro CONVERT_64TO26 a1 a2 a3 a4 a5
    movq \a1, \a4
    movq \a1, \a5
    andq $0x3ffffff, \a4            // 1st 26bit
    shrq $26, \a5
    movd \a4, %xmm0
    andq $0x3ffffff, \a5            // 2nd 26bit
    shrq $52, \a1                   // top 12 bits of word 1
    movd \a5, %xmm1
    movq \a2, D1
    movq \a2, D2
    shlq $12, D1                    // word 2 placed above those 12 bits
    orq D1, \a1
    andq $0x3ffffff, \a1            // 3rd 26bit
    shrq $14, \a2
    movd \a1, %xmm2
    shlq $24, \a3                   // word 3 placed above the top 24 bits of word 2
    andq $0x3ffffff, \a2            // 4th 26bit
    shrq $40, D2
    movd \a2, %xmm3
    orq D2, \a3                     // 5th 26bit
    movl $1, 220(CTX)               // set the flag word in the context
    movd \a3, %xmm4

.endm

/**
 * Macro description: preprocessing of converting base 2^26 data to base 2^64.
 *                    Splits two 64-bit words into the pieces consumed by CONVERT_26TO64.
 * Input register:
 *     acc1, acc2: two 64-bit words. Presumably each packs two 26-bit limbs as a pair of
 *                 32-bit values (limbs 0/1 in acc1, limbs 2/3 in acc2) - TODO confirm
 *                 against the context layout.
 * Change register: acc1, acc2, d1, d2, d3.
 *                  (The original note lists r8-r10, r14 and rbx - presumably the registers
 *                  the callers bind to the parameters.)
 * Output register:
 *     acc1: low 32 bits of the original acc1
 *     acc2: low 32 bits of the original acc2
 *       d1: original acc1 with bits 0-30 cleared
 *       d2: original acc2 with bits 0-30 cleared
 *       d3: unmodified copy of the original acc2
 * Note: the mask -1*(1<<31) keeps bit 31, so d1/d2 equal the upper 32-bit half only when
 *       bit 31 of the input is zero.
 */
.macro CONVERT_26TO64_PRE acc1 acc2 d1 d2 d3
    movq $0xffffffff, \d3           // base2_26 --> base2_64; d3 is a temporary mask here
    movq \acc1, \d1
    movq \acc2, \d2
    andq \d3, \acc1
    andq \d3, \acc2
    andq $-1*(1<<31), \d1
    movq \d2, \d3                   // d2 is still the full copy of acc2 at this point
    andq $-1*(1<<31), \d2
.endm

/**
 * Macro description: load accumulator data and key r from the context.
 * Input register:
 *     inctx: base address of the context. Offsets read by this macro:
 *              0,  8, 16: accumulator words 1-3 (64-bit each)
 *             24, 32    : key words r0 and r1 (64-bit each)
 *            220        : accumulator organization flag (32-bit)
 * Modify the register: every output parameter and the flags.
 *                      (The original note lists r8, r11-r14, rax, rbp, rbx - presumably the
 *                      registers the callers bind to the parameters.)
 * Output register:
 *     r0, r1: key r
 *         r2: r1 + (r1 >> 2)
 *     acc1 - acc3: accumulator data
 *     flag: data organization flag of the current accumulator; loaded with movl, so the
 *           argument must be a 32-bit register
 *     mul: copy of r1
 */
.macro LOAD_ACC_R inctx r0 r1 r2 acc1 acc2 acc3 flag mul
    movq 24(\inctx), \r0            // load r
    movq 32(\inctx), \r1
    movl 220(\inctx), \flag         // judge the ACC organization form.
    movq \r1, \r2
    movq (\inctx), \acc1            // load acc
    shrq $2, \r2
    movq 8(\inctx), \acc2
    addq \r1, \r2                   // R2 = R1 + (R1 >> 2)
    movq 16(\inctx), \acc3
    movq \r1, \mul
.endm

/**
 * Macro description: The avx2 instruction set implements parallel operation of the last
 *                    four blocks: a five-limb (base 2^26) multiplication in each of the four
 *                    64-bit elements, followed by a horizontal sum and a carry propagation.
 * Input register:
 *     yh0 - yh4: five limbs of the running value, one limb per register.
 *     yt0, yt1, yt3, yt4: values added to yh0, yh1, yh3, yh4 before the multiplication.
 *                yt2 is NOT added to yh2 here; presumably the caller has already done so -
 *                TODO confirm. All of yt0 - yt4 are then reused as temporaries.
 *     yb0 - yb4: temporary storage of intermediate results (input values are ignored).
 *     ymask: input value is ignored; used as a temporary for the s4 table entry and
 *            reloaded with g_mask26 before the macro ends.
 *     addr: base address of the multiplier table. Offsets read (32 bytes each):
 *             0x04 r0    0x24 r1    0x44 s1    0x64 r2    0x84 s2
 *             0xa4 r3    0xc4 s3    0xe4 r4    0x104 s4
 *           The r/s names follow the comments in the code; sK is the entry used for the
 *           products that wrap around the top limb (presumably 5 * rK - TODO confirm
 *           against the code that fills the table). The ^i suffix in the comments denotes
 *           the per-element table value.
 * Output register:
 *     yh0 - yh4: store operation results. After the horizontal sum only the lowest 64-bit
 *                element of each register holds the total over all four elements; the other
 *                elements hold partial sums. The limbs are carried and masked to 26 bits,
 *                except that yh1 and yh4 receive a final carry after their last mask.
 *     ymask: g_mask26.
 * Change register: yt0 - yt4, yh0 - yh4, yb0 - yb4, ymask.
 */
.macro BLOCK4_AVX2_TAIL yt0 yt1 yt2 yt3 yt4 yh0 yh1 yh2 yh3 yh4 yb0 yb1 yb2 yb3 yb4 ymask addr
    vpaddq \yt0, \yh0, \yh0
    vpaddq \yt1, \yh1, \yh1
    vpaddq \yt3, \yh3, \yh3
    vpaddq \yt4, \yh4, \yh4
    vmovdqu 0x4(\addr), \yt0        // r0^i
    vmovdqu 0x24(\addr), \yt1       // r1^i
    vmovdqu 0x64(\addr), \yt2       // r2^i
    vmovdqu 0xc4(\addr), \yt3       // s3^i
    vmovdqu 0x104(\addr), \ymask    // s4^i

    // All five products of h2 come first so that yh2 can serve as a temporary below.
    vpmuludq \yh2, \yt0, \yb2       // b2 = h2 * r0^i
    vpmuludq \yh2, \yt1, \yb3       // b3 = h2 * r1^i
    vpmuludq \yh2, \yt2, \yb4       // b4 = h2 * r2^i
    vpmuludq \yh2, \yt3, \yb0       // b0 = h2 * s3^i
    vpmuludq \yh2, \ymask, \yb1     // b1 = h2 * s4^i

    vpmuludq \yh1, \yt1, \yt4       // h1 * r1^i
    vpmuludq \yh0, \yt1, \yh2       // h0 * r1^i
    vpaddq \yt4, \yb2, \yb2         // b2 += h1 * r1^i
    vpaddq \yh2, \yb1, \yb1         // b1 += h0 * r1^i
    vpmuludq \yh3, \yt1, \yt4       // h3 * r1^i
    vpmuludq 0x44(\addr), \yh4, \yh2 // h4 * s1^i
    vpaddq \yt4, \yb4, \yb4         // b4 += h3 * r1^i
    vpaddq \yh2, \yb0, \yb0         // b0 += h4 * s1^i
    vmovdqu 0x84(\addr), \yt1       // load s2^i (r1^i is no longer needed)

    vpmuludq \yh4, \yt0, \yt4       // h4 * r0^i
    vpmuludq \yh3, \yt0, \yh2       // h3 * r0^i
    vpaddq \yt4, \yb4, \yb4         // b4 += h4 * r0^i
    vpaddq \yh2, \yb3, \yb3         // b3 += h3 * r0^i
    vpmuludq \yh0, \yt0, \yt4       // h0 * r0^i
    vpmuludq \yh1, \yt0, \yh2       // h1 * r0^i
    vpaddq \yt4, \yb0, \yb0         // b0 += h0 * r0^i
    vpaddq \yh2, \yb1, \yb1         // b1 += h1 * r0^i

    vpmuludq \yh1, \yt2, \yt4       // h1 * r2^i
    vpmuludq \yh0, \yt2, \yh2       // h0 * r2^i
    vpaddq \yt4, \yb3, \yb3         // b3 += h1 * r2^i
    vpaddq \yh2, \yb2, \yb2         // b2 += h0 * r2^i
    vpmuludq \yh4, \yt1, \yt4       // h4 * s2^i
    vpmuludq \yh3, \yt1, \yh2       // h3 * s2^i
    vpaddq \yt4, \yb1, \yb1         // b1 += h4 * s2^i
    vpaddq \yh2, \yb0, \yb0         // b0 += h3 * s2^i
    vmovdqu 0xa4(\addr), \yh2       // load r3^i

    vpmuludq \yh1, \yh2, \yt4       // h1 * r3^i
    vpmuludq \yh0, \yh2, \yh2       // h0 * r3^i
    vpaddq \yt4, \yb4, \yb4         // b4 += h1 * r3^i
    vpaddq \yh2, \yb3, \yb3         // b3 += h0 * r3^i
    vpmuludq \yh4, \yt3, \yt4       // h4 * s3^i
    vpmuludq \yh3, \yt3, \yh2       // h3 * s3^i
    vpaddq \yt4, \yb2, \yb2         // b2 += h4 * s3^i
    vpaddq \yh2, \yb1, \yb1         // b1 += h3 * s3^i (finish)

    vpmuludq \yh3, \ymask, \yh3     // h3 * s4^i
    vpmuludq \yh4, \ymask, \yh4     // h4 * s4^i
    vpaddq \yb2, \yh3, \yh2         // h2 = b2 + h3 * s4^i (finish)
    vpaddq \yb3, \yh4, \yh3         // h3 = b3 + h4 * s4^i (finish)
    vpmuludq 0xe4(\addr), \yh0, \yh4 // h0 * r4^i
    vpmuludq \yh1, \ymask, \yh0     // h1 * s4^i
    vmovdqu g_mask26(%rip), \ymask  // s4^i is no longer needed; ymask = 26-bit mask
    vpaddq \yh4, \yb4, \yh4         // h4 = b4 + h0 * r4^i (finish)
    vpaddq \yh0, \yb0, \yh0         // h0 = b0 + h1 * s4^i (finish)

    // Summary of calculation results of different blocks.
    // Step 1: inside each 128-bit half, add the upper 64-bit element to the lower one.
    // Limb 1 is still held in yb1 at this point.
    vpsrldq $8, \yh0, \yt0
    vpsrldq $8, \yb1, \yt1
    vpaddq \yt0, \yh0, \yh0
    vpsrldq $8, \yh2, \yt2
    vpaddq \yt1, \yb1, \yb1
    vpsrldq $8, \yh3, \yt3
    vpaddq \yt2, \yh2, \yh2
    vpsrldq $8, \yh4, \yt4
    vpaddq \yt3, \yh3, \yh3
    vpaddq \yt4, \yh4, \yh4

    // Step 2: vpermq $0x2 copies 64-bit element 2 into element 0, so after the add
    // element 0 holds the sum of all four elements.
    vpermq $0x2, \yh0, \yt0
    vpermq $0x2, \yb1, \yt1
    vpaddq \yt0, \yh0, \yh0
    vpermq $0x2, \yh2, \yt2
    vpaddq \yt1, \yb1, \yb1
    vpermq $0x2, \yh3, \yt3
    vpaddq \yt2, \yh2, \yh2
    vpermq $0x2, \yh4, \yt4
    vpaddq \yt3, \yh3, \yh3
    vpaddq \yt4, \yh4, \yh4

    // reduction: propagate the carries above bit 26 from limb to limb.
    vpsrlq $26, \yh3, \yb3
    vpand \ymask, \yh3, \yh3
    vpaddq \yb3, \yh4, \yh4         // h3 -> h4
    vpsrlq $26, \yh0, \yb0
    vpand \ymask, \yh0, \yh0
    vpaddq \yb0, \yb1, \yh1         // h0 -> h1 (limb 1 moves from yb1 into yh1)
    vpsrlq $26, \yh4, \yb4
    vpand \ymask, \yh4, \yh4
    vpsrlq $26, \yh1, \yb1
    vpand \ymask, \yh1, \yh1
    vpaddq \yb1, \yh2, \yh2         // h1 -> h2
    vpaddq \yb4, \yh0, \yh0
    vpsllq $2, \yb4, \yb4
    vpaddq \yb4, \yh0, \yh0         // h4 -> h0 (carry + 4 * carry = 5 * carry)
    vpsrlq $26, \yh2, \yb2
    vpand \ymask, \yh2, \yh2
    vpaddq \yb2, \yh3, \yh3         // h2 -> h3
    vpsrlq $26, \yh0, \yb0
    vpand \ymask, \yh0, \yh0
    vpaddq \yb0, \yh1, \yh1         // h0 -> h1
    vpsrlq $26, \yh3, \yb3
    vpand \ymask, \yh3, \yh3
    vpaddq \yb3, \yh4, \yh4         // h3 -> h4
.endm

#endif