/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

#include "chacha20_x8664_common.S"
.text
.align 64
g_ror16_128:
    .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd, \
          0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size g_ror16_128, .-g_ror16_128
.align 64
g_ror8_128:
    .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe, \
          0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size g_ror8_128, .-g_ror8_128
.align 64
g_ror16:
    .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size g_ror16, .-g_ror16
.align 64
g_ror8:
    .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size g_ror8, .-g_ror8
.align 64
g_ror16_512:
    .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd, \
          0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size g_ror16_512, .-g_ror16_512
.align 64
g_ror8_512:
    .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe, \
          0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size g_ror8_512, .-g_ror8_512
.align 64
g_add4block:
    .long 0, 1, 2, 3
    .size g_add4block, .-g_add4block
.align 64
g_addsecond4block:
    .long 4, 4, 4, 4
    .size g_addsecond4block, .-g_addsecond4block
.align 64
g_add8block:
    .long 0, 1, 2, 3, 4, 5, 6, 7
    .size g_add8block, .-g_add8block
.align 64
g_addsecond8block:
    .long 8, 8, 8, 8, 8, 8, 8, 8
    .size g_addsecond8block, .-g_addsecond8block
.align 64
g_addOne:
    .long 0, 0, 0, 0, 1, 0, 0, 0
    .size g_addOne, .-g_addOne

.set IN, %rsi
.set OUT, %rdx

/* QUARTERROUND for one state */
.macro CHACHA20_ROUND s0 s1 s2 s3 cur ror16 ror8
    vpaddd \s1, \s0, \s0
    vpxor \s0, \s3, \s3
    vpshufb (\ror16), \s3, \s3

    vpaddd \s3, \s2, \s2
    vpxor \s2, \s1, \s1
    vmovdqa \s1, \cur
    vpsrld $20, \s1, \s1
    vpslld $12, \cur, \cur
    vpor \cur, \s1, \s1

    vpaddd \s1, \s0, \s0
    vpxor \s0, \s3, \s3
    vpshufb (\ror8), \s3, \s3

    vpaddd \s3, \s2, \s2
    vpxor \s2, \s1, \s1
    vmovdqa \s1, \cur
    vpsrld $25, \s1, \s1
    vpslld $7, \cur, \cur
    vpor \cur, \s1, \s1
.endm

/* QUARTERROUND for two states */
.macro CHACHA20_2_ROUND s0 s1 s2 s3 cur s4 s5 s6 s7 cur1 ror16 ror8
    vpaddd \s1, \s0, \s0
    vpxor \s0, \s3, \s3
    vpshufb (\ror16), \s3, \s3

    vpaddd \s3, \s2, \s2
    vpxor \s2, \s1, \s1
    vmovdqa \s1, \cur
    vpsrld $20, \s1, \s1
    vpslld $12, \cur, \cur
    vpor \cur, \s1, \s1

    vpaddd \s1, \s0, \s0
    vpxor \s0, \s3, \s3
    vpshufb (\ror8), \s3, \s3

    vpaddd \s3, \s2, \s2
    vpxor \s2, \s1, \s1
    vmovdqa \s1, \cur
    vpsrld $25, \s1, \s1
    vpslld $7, \cur, \cur
    vpor \cur, \s1, \s1

    vpaddd \s5, \s4, \s4
    vpxor \s4, \s7, \s7
    vpshufb (\ror16), \s7, \s7

    vpaddd \s7, \s6, \s6
    vpxor \s6, \s5, \s5
    vmovdqa \s5, \cur1
    vpsrld $20, \s5, \s5
    vpslld $12, \cur1, \cur1
    vpor \cur1, \s5, \s5

    vpaddd \s5, \s4, \s4
    vpxor \s4, \s7, \s7
    vpshufb (\ror8), \s7, \s7

    vpaddd \s7, \s6, \s6
    vpxor \s6, \s5, \s5
    vmovdqa \s5, \cur1
    vpsrld $25, \s5, \s5
    vpslld $7, \cur1, \cur1
    vpor \cur1, \s5, \s5
.endm
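
/*
 * Reference only: a plain-C sketch of the ChaCha20 quarter-round (RFC 8439) that the
 * CHACHA20_ROUND / CHACHA20_2_ROUND macros above vectorize. The >>>16 and >>>8 rotations
 * are done with vpshufb byte shuffles (the g_ror16* / g_ror8* masks); the >>>12 and >>>7
 * rotations use the shift/shift/or sequence. The helper names below are illustrative and
 * are not part of this file.
 *
 *     static inline uint32_t Rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }
 *
 *     static inline void QuarterRound(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
 *     {
 *         *a += *b; *d ^= *a; *d = Rotl32(*d, 16);
 *         *c += *d; *b ^= *c; *b = Rotl32(*b, 12);
 *         *a += *b; *d ^= *a; *d = Rotl32(*d, 8);
 *         *c += *d; *b ^= *c; *b = Rotl32(*b, 7);
 *     }
 */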

/* current matrix add original matrix */
.macro LASTADD_MATRIX S0 S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 S11 S12 S13 S14 S15 PER
    vpaddd (%rsp), \S0, \S0
    vpaddd 1*\PER(%rsp), \S1, \S1
    vpaddd 2*\PER(%rsp), \S2, \S2
    vpaddd 3*\PER(%rsp), \S3, \S3
    vpaddd 4*\PER(%rsp), \S4, \S4
    vpaddd 5*\PER(%rsp), \S5, \S5
    vpaddd 6*\PER(%rsp), \S6, \S6
    vpaddd 7*\PER(%rsp), \S7, \S7
    vpaddd 8*\PER(%rsp), \S8, \S8
    vpaddd 9*\PER(%rsp), \S9, \S9
    vpaddd 10*\PER(%rsp), \S10, \S10
    vpaddd 11*\PER(%rsp), \S11, \S11
    vpaddd 12*\PER(%rsp), \S12, \S12
    vpaddd 13*\PER(%rsp), \S13, \S13
    vpaddd 14*\PER(%rsp), \S14, \S14
    vpaddd 15*\PER(%rsp), \S15, \S15
.endm

/* write output for left part of 512 bytes (ymm) */
.macro WRITE_BACK_512_L inpos outpos s0 s1 s2 s3 s4 s5 s6 s7 out0 out1 out2 out3

    /* {A0 B0 C0 D0 E0 F0 G0 H0} {A1 B1 C1 D1 E1 F1 G1 H1} => {A0 B0 C0 D0 A1 B1 C1 D1} */
    vperm2i128 $0x20, \s1, \s0, \out0
    vpxor (\inpos), \out0, \out0
    vmovdqu \out0, (\outpos)            // write back output

    vperm2i128 $0x20, \s3, \s2, \out1
    vpxor 32(\inpos), \out1, \out1
    vmovdqu \out1, 32(\outpos)

    vperm2i128 $0x20, \s5, \s4, \out2
    vpxor 64(\inpos), \out2, \out2
    vmovdqu \out2, 64(\outpos)          // write back output

    vperm2i128 $0x20, \s7, \s6, \out3
    vpxor 96(\inpos), \out3, \out3
    vmovdqu \out3, 96(\outpos)
.endm

/* write output for right part of 512 bytes (ymm) */
.macro WRITE_BACK_512_R inpos outpos s0 s1 s2 s3 s4 s5 s6 s7

    /* {A0 B0 C0 D0 E0 F0 G0 H0} {A1 B1 C1 D1 E1 F1 G1 H1} => {E0 F0 G0 H0 E1 F1 G1 H1} */
    vperm2i128 $0x31, \s1, \s0, \s1
    vpxor (\inpos), \s1, \s1
    vmovdqu \s1, (\outpos)              // write back output

    vperm2i128 $0x31, \s3, \s2, \s3
    vpxor 32(\inpos), \s3, \s3
    vmovdqu \s3, 32(\outpos)

    vperm2i128 $0x31, \s5, \s4, \s5
    vpxor 64(\inpos), \s5, \s5
    vmovdqu \s5, 64(\outpos)            // write back output

    vperm2i128 $0x31, \s7, \s6, \s7
    vpxor 96(\inpos), \s7, \s7
    vmovdqu \s7, 96(\outpos)
.endm

/*
 * Processing 64 bytes: 4 xmm registers
 * xmm0 ~ xmm3:
 *     xmm0 {0, 1, 2, 3}
 *     xmm1 {4, 5, 6, 7}
 *     xmm2 {8, 9, 10, 11}
 *     xmm3 {12, 13, 14, 15}
 *
 * Processing 128 bytes: 8 xmm registers
 * xmm0 ~ xmm7:
 *     xmm0 {0, 1, 2, 3}        xmm4 {0, 1, 2, 3}
 *     xmm1 {4, 5, 6, 7}        xmm5 {4, 5, 6, 7}
 *     xmm2 {8, 9, 10, 11}      xmm6 {8, 9, 10, 11}
 *     xmm3 {12, 13, 14, 15}    xmm7 {12, 13, 14, 15}
 *
 * Processing 256 bytes: 16 xmm registers
 * xmm0 ~ xmm15:
 *     xmm0 {0, 0, 0, 0}
 *     xmm1 {1, 1, 1, 1}
 *     xmm2 {2, 2, 2, 2}
 *     xmm3 {3, 3, 3, 3}
 *     ...
 *     xmm15 {15, 15, 15, 15}
 *
 * Processing 512 bytes: 16 ymm registers
 * ymm0 ~ ymm15:
 *     ymm0 {0, 0, 0, 0, 0, 0, 0, 0}
 *     ymm1 {1, 1, 1, 1, 1, 1, 1, 1}
 *     ymm2 {2, 2, 2, 2, 2, 2, 2, 2}
 *     ymm3 {3, 3, 3, 3, 3, 3, 3, 3}
 *     ...
 *     ymm15 {15, 15, 15, 15, 15, 15, 15, 15}
 */

/*
 * @Interconnection with the C interface:
 *     void CHACHA20_Update(CRYPT_CHACHA20_Ctx *ctx, const uint8_t *in, uint8_t *out, uint32_t len);
 * @brief ChaCha20 algorithm
 * @param ctx [IN] Algorithm context, set up and passed in by the C interface
 * @param in  [IN] Data to be encrypted
 * @param out [OUT] Data after encryption
 * @param len [IN] Length of the data to be encrypted
 *
 * Note: %rsp cannot be used, which leaves 15 general-purpose registers for ctx, in, out, len
 *       and the loop state. A full round needs all 16 state rows plus a temporary register,
 *       so the quarter-rounds are processed in two groups:
 *       {0, 1, 4, 5, 8, 9, 12, 13}
 *       {2, 3, 6, 7, 10, 11, 14, 15}
 */
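
/*
 * Reference only: under the System V AMD64 calling convention the arguments arrive as
 * ctx in %rdi, in in %rsi (aliased IN), out in %rdx (aliased OUT) and len in %rcx; the
 * 32-bit block counter is kept at ctx offset 48 and is reloaded into %r11d on entry.
 * A hedged sketch of a C caller (CRYPT_CHACHA20_Ctx setup is assumed to be done by the
 * surrounding C code):
 *
 *     extern void CHACHA20_Update(CRYPT_CHACHA20_Ctx *ctx, const uint8_t *in,
 *                                 uint8_t *out, uint32_t len);
 *
 *     // XOR the keystream into the buffer; encryption and decryption are the same operation
 *     CHACHA20_Update(ctx, input, output, (uint32_t)len);
 */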

.globl CHACHA20_Update
.type CHACHA20_Update,%function
.align 64
CHACHA20_Update:
    .cfi_startproc
    mov 48(%rdi), %r11d
    mov %rsp, %rax
    subq $1024, %rsp
    andq $-512, %rsp

.Lchacha20_start:
    cmp $512, %rcx
    jae .Lchacha20_512_start
    cmp $256, %rcx
    jae .Lchacha20_256_start
    cmp $128, %rcx
    jae .Lchacha20_128_start
    cmp $64, %rcx
    jae .Lchacha20_64_start
    jmp .Lchacha20_end

.Lchacha20_64_start:

    LOAD_STATE %xmm0, %xmm1, %xmm2, %xmm3, %rdi

    vmovdqa %xmm0, %xmm10
    vmovdqa %xmm1, %xmm11
    vmovdqa %xmm2, %xmm12
    vmovdqa %xmm3, %xmm13

    leaq g_ror16(%rip), %r9
    leaq g_ror8(%rip), %r10
    mov $10, %r8

.Lchacha20_64_loop:

    /* 0 = 0 + 4, 12 = (12 ^ 0) >>> 16 | 8 = 8 + 12, 4 = (4 ^ 8) >>> 12 |
     * 0 = 0 + 4, 12 = (12 ^ 0) >>> 8  | 8 = 8 + 12, 4 = (4 ^ 8) >>> 7
     * 1 = 1 + 5, 13 = (13 ^ 1) >>> 16 | 9 = 9 + 13, 5 = (5 ^ 9) >>> 12 |
     * 1 = 1 + 5, 13 = (13 ^ 1) >>> 8  | 9 = 9 + 13, 5 = (5 ^ 9) >>> 7
     * 2 = 2 + 6, 14 = (14 ^ 2) >>> 16 | 10 = 10 + 14, 6 = (6 ^ 10) >>> 12 |
     * 2 = 2 + 6, 14 = (14 ^ 2) >>> 8  | 10 = 10 + 14, 6 = (6 ^ 10) >>> 7
     * 3 = 3 + 7, 15 = (15 ^ 3) >>> 16 | 11 = 11 + 15, 7 = (7 ^ 11) >>> 12 |
     * 3 = 3 + 7, 15 = (15 ^ 3) >>> 8  | 11 = 11 + 15, 7 = (7 ^ 11) >>> 7
     */
    CHACHA20_ROUND %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %r9, %r10

    vpshufd $78, %xmm2, %xmm2       // {8 9 10 11}   ==> {10 11 8 9}   01 00 11 10
    vpshufd $57, %xmm1, %xmm1       // {4 5 6 7}     ==> {5 6 7 4}     00 11 10 01
    vpshufd $147, %xmm3, %xmm3      // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    /* 0 = 0 + 5, 15 = (15 ^ 0) >>> 16 | 10 = 10 + 15, 5 = (5 ^ 10) >>> 12 |
     * 0 = 0 + 5, 15 = (15 ^ 0) >>> 8  | 10 = 10 + 15, 5 = (5 ^ 10) >>> 7
     * 1 = 1 + 6, 12 = (12 ^ 1) >>> 16 | 11 = 11 + 12, 6 = (6 ^ 11) >>> 12 |
     * 1 = 1 + 6, 12 = (12 ^ 1) >>> 8  | 11 = 11 + 12, 6 = (6 ^ 11) >>> 7
     * 2 = 2 + 7, 13 = (13 ^ 2) >>> 16 | 8 = 8 + 13, 7 = (7 ^ 8) >>> 12 |
     * 2 = 2 + 7, 13 = (13 ^ 2) >>> 8  | 8 = 8 + 13, 7 = (7 ^ 8) >>> 7
     * 3 = 3 + 4, 14 = (14 ^ 3) >>> 16 | 9 = 9 + 14, 4 = (4 ^ 9) >>> 12 |
     * 3 = 3 + 4, 14 = (14 ^ 3) >>> 8  | 9 = 9 + 14, 4 = (4 ^ 9) >>> 7
     */
    CHACHA20_ROUND %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %r9, %r10

    vpshufd $78, %xmm2, %xmm2       // {10 11 8 9}   ==> {8 9 10 11}   01 00 11 10
    vpshufd $147, %xmm1, %xmm1      // {5 6 7 4}     ==> {4 5 6 7}     10 01 00 11
    vpshufd $57, %xmm3, %xmm3       // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq %r8
    jnz .Lchacha20_64_loop

    vpaddd %xmm10, %xmm0, %xmm0
    vpaddd %xmm11, %xmm1, %xmm1
    vpaddd %xmm12, %xmm2, %xmm2
    vpaddd %xmm13, %xmm3, %xmm3

    add $1, %r11d
    vpxor 0(IN), %xmm0, %xmm4
    vpxor 16(IN), %xmm1, %xmm5
    vpxor 32(IN), %xmm2, %xmm6
    vpxor 48(IN), %xmm3, %xmm7

    vmovdqu %xmm4, 0(OUT)
    vmovdqu %xmm5, 16(OUT)
    vmovdqu %xmm6, 32(OUT)
    vmovdqu %xmm7, 48(OUT)

    add $64, IN
    add $64, OUT

    mov %r11d, 48(%rdi)
    jmp .Lchacha20_end
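
/*
 * Reference only: the two CHACHA20_ROUND calls plus the vpshufd row rotations in
 * .Lchacha20_64_loop above implement one ChaCha20 double round (column round followed by
 * diagonal round), repeated 10 times for 20 rounds. A plain-C sketch over the 16-word
 * state x[16], using the QuarterRound() helper sketched earlier:
 *
 *     for (int i = 0; i < 10; i++) {
 *         // column round
 *         QuarterRound(&x[0], &x[4], &x[8],  &x[12]);
 *         QuarterRound(&x[1], &x[5], &x[9],  &x[13]);
 *         QuarterRound(&x[2], &x[6], &x[10], &x[14]);
 *         QuarterRound(&x[3], &x[7], &x[11], &x[15]);
 *         // diagonal round
 *         QuarterRound(&x[0], &x[5], &x[10], &x[15]);
 *         QuarterRound(&x[1], &x[6], &x[11], &x[12]);
 *         QuarterRound(&x[2], &x[7], &x[8],  &x[13]);
 *         QuarterRound(&x[3], &x[4], &x[9],  &x[14]);
 *     }
 */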

.Lchacha20_128_start:

    vbroadcasti128 (%rdi), %ymm0        // {0 1 2 3 0 1 2 3}
    vbroadcasti128 16(%rdi), %ymm1      // {4 5 6 7 4 5 6 7}
    vbroadcasti128 32(%rdi), %ymm2      // {8 9 10 11 8 9 10 11}
    vbroadcasti128 48(%rdi), %ymm3      // {12 13 14 15 12 13 14 15}

    vpaddd g_addOne(%rip), %ymm3, %ymm3

    vmovdqa %ymm0, %ymm12
    vmovdqa %ymm1, %ymm13
    vmovdqa %ymm2, %ymm14
    vmovdqa %ymm3, %ymm15

    leaq g_ror16_128(%rip), %r9
    leaq g_ror8_128(%rip), %r10
    mov $10, %r8

.Lchacha20_128_loop:

    CHACHA20_ROUND %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %r9, %r10

    vpshufd $78, %ymm2, %ymm2       // {8 9 10 11}   ==> {10 11 8 9}   01 00 11 10
    vpshufd $57, %ymm1, %ymm1       // {4 5 6 7}     ==> {5 6 7 4}     00 11 10 01
    vpshufd $147, %ymm3, %ymm3      // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    CHACHA20_ROUND %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %r9, %r10

    vpshufd $78, %ymm2, %ymm2       // {10 11 8 9}   ==> {8 9 10 11}   01 00 11 10
    vpshufd $147, %ymm1, %ymm1      // {5 6 7 4}     ==> {4 5 6 7}     10 01 00 11
    vpshufd $57, %ymm3, %ymm3       // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq %r8
    jnz .Lchacha20_128_loop

    vpaddd %ymm12, %ymm0, %ymm0
    vpaddd %ymm13, %ymm1, %ymm1
    vpaddd %ymm14, %ymm2, %ymm2
    vpaddd %ymm15, %ymm3, %ymm3

    vextracti128 $1, %ymm0, %xmm4   // ymm0 => {xmm0 xmm4}
    vextracti128 $1, %ymm1, %xmm5   // ymm1 => {xmm1 xmm5}
    vextracti128 $1, %ymm2, %xmm6   // ymm2 => {xmm2 xmm6}
    vextracti128 $1, %ymm3, %xmm7   // ymm3 => {xmm3 xmm7}

    WRITEBACK_64_AVX2 IN, OUT, %xmm0, %xmm1, %xmm2, %xmm3
    add $2, %r11d
    WRITEBACK_64_AVX2 IN, OUT, %xmm4, %xmm5, %xmm6, %xmm7
    mov %r11d, 48(%rdi)

    sub $128, %rcx
    jz .Lchacha20_end
    jmp .Lchacha20_start

.Lchacha20_256_start:

    LOAD_STATE %xmm0, %xmm1, %xmm2, %xmm3, %rdi
    STATE_TO_MATRIX %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, \
                    %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, 0, 16, g_add4block(%rip)

    /* move xmm8~11 into stack for CHACHA20_LOOP encryption */
    vmovdqa %xmm8, 256(%rsp)
    vmovdqa %xmm9, 256+16(%rsp)
    vmovdqa %xmm10, 256+32(%rsp)
    vmovdqa %xmm11, 256+48(%rsp)

    leaq g_ror16(%rip), %r9
    leaq g_ror8(%rip), %r10

    mov $10, %r8

.Lchacha20_256_loop:

    CHACHA20_LOOP %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10 \
                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, 256, 16, %rsp, %r9, %r10

    decq %r8
    jnz .Lchacha20_256_loop

    /* xmm0~15: encrypt matrix 0 ~ 15 */
    vmovdqa 256+32(%rsp), %xmm10    // rsp32: encrypt matrix xmm10
    vmovdqa 256+48(%rsp), %xmm11

    LASTADD_MATRIX %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10 \
                   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, 16

    /* store xmm9, 10, 13, 14 in stack */
    vmovdqa %xmm9, 256(%rsp)        // rsp 0: encrypt matrix xmm9
    vmovdqa %xmm10, 256+32(%rsp)    // rsp32: encrypt matrix xmm10
    vmovdqa %xmm13, 256+16(%rsp)    // rsp16: encrypt matrix xmm13
    vmovdqa %xmm14, 256+48(%rsp)    // rsp48: encrypt matrix xmm14

    MATRIX_TO_STATE %xmm0, %xmm1, %xmm2, %xmm3, %xmm9, %xmm10      // set state 0, 3, 9, 10
    MATRIX_TO_STATE %xmm4, %xmm5, %xmm6, %xmm7, %xmm13, %xmm14     // set state 4, 7, 13, 14

    vmovdqa 256(%rsp), %xmm5
    vmovdqa 256+32(%rsp), %xmm6
    vmovdqa %xmm9, 256(%rsp)
    vmovdqa %xmm10, 256+32(%rsp)

    MATRIX_TO_STATE %xmm8, %xmm5, %xmm6, %xmm11, %xmm1, %xmm2      // set state 8, 11, 1, 2

    vmovdqa 256+16(%rsp), %xmm9
    vmovdqa 256+48(%rsp), %xmm10
    vmovdqa %xmm13, 256+16(%rsp)
    vmovdqa %xmm14, 256+48(%rsp)

    MATRIX_TO_STATE %xmm12, %xmm9, %xmm10, %xmm15, %xmm5, %xmm6    // set state 12, 15, 5, 6

    vmovdqa 256(%rsp), %xmm9        // rsp 0: state 9
    vmovdqa 256+32(%rsp), %xmm10    // rsp32: state 10
    vmovdqa 256+16(%rsp), %xmm13    // rsp16: state 13
    vmovdqa 256+48(%rsp), %xmm14    // rsp48: state 14

    /* finish state calculation, now write result to output */
    WRITEBACK_64_AVX2 IN, OUT, %xmm0, %xmm4, %xmm8, %xmm12
    WRITEBACK_64_AVX2 IN, OUT, %xmm3, %xmm7, %xmm11, %xmm15
    WRITEBACK_64_AVX2 IN, OUT, %xmm9, %xmm13, %xmm1, %xmm5
    WRITEBACK_64_AVX2 IN, OUT, %xmm10, %xmm14, %xmm2, %xmm6

    add $4, %r11d
    sub $256, %rcx
    mov %r11d, 48(%rdi)
    cmp $256, %rcx
    jz .Lchacha20_end
    jmp .Lchacha20_start
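
/*
 * Reference only: the 256-byte path above (and the 512-byte path below) keep state word i
 * of 4 (resp. 8) consecutive blocks in xmm_i (resp. ymm_i), so one SIMD quarter-round
 * advances all blocks at once; g_add4block / g_add8block give each lane its own counter.
 * A scalar sketch of what one 4-block pass produces, where chacha20_block() is an assumed
 * helper that generates a single 64-byte keystream block:
 *
 *     for (uint32_t blk = 0; blk < 4; blk++) {
 *         uint8_t ks[64];
 *         chacha20_block(key, nonce, counter + blk, ks);      // keystream for block blk
 *         for (int i = 0; i < 64; i++) {
 *             out[64 * blk + i] = in[64 * blk + i] ^ ks[i];
 *         }
 *     }
 *     counter += 4;
 */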

.Lchacha20_512_start:

    LOAD_512_STATE %ymm0 %ymm1 %ymm2 %ymm3 %rdi
    STATE_TO_MATRIX %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, \
                    %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 0, 32, g_add8block(%rip)
    jmp .Lchacha20_512_run

.Lchacha20_512_start_cont:

    LOAD_MATRIX %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, \
                %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 0, 32, g_addsecond8block(%rip)

.Lchacha20_512_run:

    /* move ymm8~11 into stack for CHACHA20_LOOP encryption */
    vmovdqa %ymm8, 512(%rsp)
    vmovdqa %ymm9, 512+32(%rsp)
    vmovdqa %ymm10, 512+64(%rsp)
    vmovdqa %ymm11, 512+96(%rsp)
    leaq g_ror16_512(%rip), %r9
    leaq g_ror8_512(%rip), %r10
    mov $10, %r8
.align 32
.Lchacha20_512_loop:

    CHACHA20_LOOP %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10 \
                  %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 512, 32, %rsp, %r9, %r10

    decq %r8
    jnz .Lchacha20_512_loop

    /* ymm0~15: encrypt matrix 0 ~ 15 */
    vmovdqa 512+64(%rsp), %ymm10    // rsp64: encrypt matrix ymm10
    vmovdqu 512+96(%rsp), %ymm11

    LASTADD_MATRIX %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10 \
                   %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 32

    /* store matrix ymm9, 10, 13, 14 in stack */
    vmovdqa %ymm9, 512(%rsp)        // rsp 0: encrypt matrix ymm9
    vmovdqu %ymm10, 512+32(%rsp)    // rsp32: encrypt matrix ymm10
    vmovdqa %ymm13, 512+64(%rsp)    // rsp64: encrypt matrix ymm13
    vmovdqu %ymm14, 512+96(%rsp)    // rsp96: encrypt matrix ymm14

    MATRIX_TO_STATE %ymm0, %ymm1, %ymm2, %ymm3, %ymm9, %ymm10      // set state 0, 3, 9, 10
    MATRIX_TO_STATE %ymm4, %ymm5, %ymm6, %ymm7, %ymm13, %ymm14     // set state 4, 7, 13, 14

    vmovdqu 512(%rsp), %ymm5
    vmovdqa 512+32(%rsp), %ymm6
    vmovdqu %ymm9, 512(%rsp)
    vmovdqa %ymm10, 512+32(%rsp)

    MATRIX_TO_STATE %ymm8, %ymm5, %ymm6, %ymm11, %ymm1, %ymm2      // set state 8, 11, 1, 2

    vmovdqa 512+64(%rsp), %ymm9
    vmovdqu 512+96(%rsp), %ymm10
    vmovdqa %ymm13, 512+64(%rsp)
    vmovdqu %ymm14, 512+96(%rsp)

    MATRIX_TO_STATE %ymm12, %ymm9, %ymm10, %ymm15, %ymm5, %ymm6    // set state 12, 15, 5, 6

    /*
     * {A0 A1 A2 A3 E0 E1 E2 E3}
     * {B0 B1 B2 B3 F0 F1 F2 F3}
     * {C0 C1 C2 C3 G0 G1 G2 G3}
     * {D0 D1 D2 D3 H0 H1 H2 H3}
     * ...
     * =>
     * {A0 A1 A2 A3 B0 B1 B2 B3}
     * {C0 C1 C2 C3 D0 D1 D2 D3}
     * ...
     */
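
    /*
     * Note on the pointer arithmetic below: each WRITE_BACK_512_x call handles 128 bytes,
     * and the add/sub adjustments make the four calls cover bytes 0..127, 256..383,
     * 128..255 and 384..511 of the 512-byte chunk in that order (the _L calls write the
     * low 128-bit lanes, the _R calls the high lanes), for a net advance of 512 bytes on
     * IN and OUT.
     */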

    /* left half of ymm registers */
    WRITE_BACK_512_L IN, OUT, %ymm0, %ymm4, %ymm8, %ymm12, %ymm3, %ymm7, %ymm11, %ymm15, %ymm9, %ymm10, %ymm13, %ymm14
    add $256, IN
    add $256, OUT

    /* right half of ymm registers */
    WRITE_BACK_512_R IN, OUT, %ymm0, %ymm4, %ymm8, %ymm12, %ymm3, %ymm7, %ymm11, %ymm15
    sub $128, IN
    sub $128, OUT

    vmovdqa 512(%rsp), %ymm9
    vmovdqu 512+32(%rsp), %ymm10
    vmovdqa 512+64(%rsp), %ymm13
    vmovdqu 512+96(%rsp), %ymm14

    /* second left half of ymm registers */
    WRITE_BACK_512_L IN, OUT, %ymm9, %ymm13, %ymm1, %ymm5, %ymm10, %ymm14, %ymm2, %ymm6, %ymm0, %ymm4, %ymm8, %ymm12
    add $256, IN
    add $256, OUT

    /* second right half of ymm registers */
    WRITE_BACK_512_R IN, OUT, %ymm9, %ymm13, %ymm1, %ymm5, %ymm10, %ymm14, %ymm2, %ymm6
    add $128, IN
    add $128, OUT

    add $8, %r11d
    sub $512, %rcx
    mov %r11d, 48(%rdi)
    jz .Lchacha20_end
    cmp $512, %rcx
    jae .Lchacha20_512_start_cont
    jmp .Lchacha20_start

.Lchacha20_end:
    /* clear sensitive info in stack */
    vpxor %ymm0, %ymm0, %ymm0
    xor %r11d, %r11d
    vmovdqa %ymm0, (%rsp)
    vmovdqa %ymm0, 32(%rsp)
    vmovdqa %ymm0, 64(%rsp)
    vmovdqa %ymm0, 96(%rsp)
    vmovdqa %ymm0, 128(%rsp)
    vmovdqa %ymm0, 160(%rsp)
    vmovdqa %ymm0, 192(%rsp)
    vmovdqa %ymm0, 224(%rsp)
    vmovdqa %ymm0, 256(%rsp)
    vmovdqa %ymm0, 288(%rsp)
    vmovdqa %ymm0, 320(%rsp)
    vmovdqa %ymm0, 352(%rsp)
    vmovdqa %ymm0, 384(%rsp)
    vmovdqa %ymm0, 416(%rsp)
    vmovdqa %ymm0, 448(%rsp)
    vmovdqa %ymm0, 480(%rsp)
    vmovdqa %ymm0, 512(%rsp)
    vmovdqa %ymm0, 512+32(%rsp)
    vmovdqa %ymm0, 512+64(%rsp)
    vmovdqa %ymm0, 512+96(%rsp)
    mov %rax, %rsp
    .cfi_endproc
    ret
.size CHACHA20_Update,.-CHACHA20_Update

#endif