/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

#include "chacha20_x8664_common.S"
.text
.align 64
g_ror16:
    .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size g_ror16, .-g_ror16
.align 64
g_ror8:
    .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size g_ror8, .-g_ror8
.align 64
g_ror16_128:
    .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd, \
          0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    .size g_ror16_128, .-g_ror16_128
.align 64
g_ror8_128:
    .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe, \
          0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    .size g_ror8_128, .-g_ror8_128
.align 64
g_addOne:
    .long 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
    .size g_addOne, .-g_addOne
.align 64
g_add4block:
    .long 0, 1, 2, 3
    .size g_add4block, .-g_add4block
.align 64
g_addsecond4block:
    .long 4, 4, 4, 4
    .size g_addsecond4block, .-g_addsecond4block
.align 64
g_add8block:
    .long 0, 1, 2, 3, 4, 5, 6, 7
    .size g_add8block, .-g_add8block
.align 64
g_addsecond8block:
    .long 8, 8, 8, 8, 8, 8, 8, 8
    .size g_addsecond8block, .-g_addsecond8block
.align 64
g_add16block:
    .long 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    .size g_add16block, .-g_add16block
.align 64
g_addsecond16block:
    .long 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .size g_addsecond16block, .-g_addsecond16block

.set IN, %rsi
.set OUT, %rdx

/*
 * Processing 64 bytes: 4 x registers, number of instructions in a single loop: 21*2 = 42
 * xmm0 ~ xmm3:
 * xmm0 {0, 1, 2, 3}
 * xmm1 {4, 5, 6, 7}
 * xmm2 {8, 9, 10, 11}
 * xmm3 {12, 13, 14, 15}
 *
 * Processing 128-256 bytes: 4 y registers, number of instructions in a single loop: 30
 * ymm0 ~ ymm3:
 * ymm0 {0, 1, 2, 3, 0, 1, 2, 3}
 * ymm1 {4, 5, 6, 7, 4, 5, 6, 7}
 * ymm2 {8, 9, 10, 11, 8, 9, 10, 11}
 * ymm3 {12, 13, 14, 15, 12, 13, 14, 15}
 *
 * Processing 512 bytes: y registers 0-15, 128 bytes of stack space and y registers 16-31,
 * number of instructions in a single loop: 12*8 = 96
 * Processing 1024 bytes: z registers 0-15, 256 bytes of stack space and z registers 16-31,
 * number of instructions in a single loop: 12*8 = 96
 * ymm0 ~ ymm15:
 * ymm0 {0, 0, 0, 0, 0, 0, 0, 0}
 * ymm1 {1, 1, 1, 1, 1, 1, 1, 1}
 * ymm2 {2, 2, 2, 2, 2, 2, 2, 2}
 * ymm3 {3, 3, 3, 3, 3, 3, 3, 3}
 * ......
 * ymm15 {15, 15, 15, 15, 15, 15, 15, 15}
 *
 * zmm0 ~ zmm31:
 * zmm0 {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
 * zmm1 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
 * zmm2 {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}
 * zmm3 {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}
 * ...
 * zmm15 {15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}
 */
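/*
 * Reference sketch (C, not part of the build): the scalar quarter round and double round that the
 * CHACHA20_ROUND macro below vectorizes. Packing state rows {0-3}, {4-7}, {8-11} and {12-15} into
 * one register each lets a single vpaddd/vpxord/vprold sequence perform four quarter rounds at
 * once (vprold is the 32-bit left rotation); the vpshufd instructions after each column round
 * rotate rows 1-3 so the same macro also covers the diagonals. Helper names (RotL32, QuarterRound,
 * DoubleRound) are illustrative only and do not exist in openHiTLS.
 *
 *     #include <stdint.h>
 *
 *     static inline uint32_t RotL32(uint32_t x, int n)
 *     {
 *         return (x << n) | (x >> (32 - n));
 *     }
 *
 *     // One ChaCha20 quarter round on four state words.
 *     static inline void QuarterRound(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
 *     {
 *         *a += *b; *d ^= *a; *d = RotL32(*d, 16);
 *         *c += *d; *b ^= *c; *b = RotL32(*b, 12);
 *         *a += *b; *d ^= *a; *d = RotL32(*d, 8);
 *         *c += *d; *b ^= *c; *b = RotL32(*b, 7);
 *     }
 *
 *     // One double round: a column round followed by a diagonal round.
 *     static void DoubleRound(uint32_t s[16])
 *     {
 *         QuarterRound(&s[0], &s[4], &s[8],  &s[12]);   // columns
 *         QuarterRound(&s[1], &s[5], &s[9],  &s[13]);
 *         QuarterRound(&s[2], &s[6], &s[10], &s[14]);
 *         QuarterRound(&s[3], &s[7], &s[11], &s[15]);
 *         QuarterRound(&s[0], &s[5], &s[10], &s[15]);   // diagonals
 *         QuarterRound(&s[1], &s[6], &s[11], &s[12]);
 *         QuarterRound(&s[2], &s[7], &s[8],  &s[13]);
 *         QuarterRound(&s[3], &s[4], &s[9],  &s[14]);
 *     }
 */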
.macro CHACHA20_ROUND s0 s1 s2 s3
    vpaddd \s1, \s0, \s0
    vpxord \s0, \s3, \s3
    vprold $16, \s3, \s3

    vpaddd \s3, \s2, \s2
    vpxord \s2, \s1, \s1
    vprold $12, \s1, \s1

    vpaddd \s1, \s0, \s0
    vpxord \s0, \s3, \s3
    vprold $8, \s3, \s3

    vpaddd \s3, \s2, \s2
    vpxord \s2, \s1, \s1
    vprold $7, \s1, \s1
.endm

/* convert y registers and write back */
.macro CONVERT_Y s0 s1 pos inpos outpos
    /* ymm16 => {xmm16, xmm17} */
    vextracti32x4 \pos, \s0, %xmm16
    vextracti32x4 \pos, \s1, %xmm17
    vinserti32x4 $1, %xmm17, %ymm16, %ymm16

    vpxord (IN), %ymm16, %ymm16
    vmovdqu64 %ymm16, (OUT)
    add $32, \inpos
    add $32, \outpos
.endm

/* convert z registers and write back */
.macro CONVERT_Z s0 s1 s2 s3 pos inpos outpos

    /* zmm16 => {xmm16, xmm17, xmm18, xmm19} */
    vextracti64x2 \pos, \s0, %xmm16
    vextracti64x2 \pos, \s1, %xmm17
    vextracti64x2 \pos, \s2, %xmm18
    vextracti64x2 \pos, \s3, %xmm19
    vinserti64x2 $1, %xmm17, %zmm16, %zmm16
    vinserti64x2 $2, %xmm18, %zmm16, %zmm16
    vinserti64x2 $3, %xmm19, %zmm16, %zmm16

    vpxord (IN), %zmm16, %zmm16
    vmovdqu64 %zmm16, (OUT)
    add $64, \inpos
    add $64, \outpos
.endm
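/*
 * Reference sketch (C, not part of the build) of the writeback performed by CONVERT_Y/CONVERT_Z:
 * take 128-bit lane `pos` from each keystream register, concatenate the lanes into one contiguous
 * piece of keystream, XOR it with the input, store the result and advance both pointers by 32 or
 * 64 bytes. The helper name ConvertWriteback and its signature are illustrative only.
 *
 *     #include <stddef.h>
 *     #include <stdint.h>
 *
 *     // lanes[i] points to the 16-byte lane extracted from keystream register i.
 *     static void ConvertWriteback(const uint8_t *lanes[], size_t laneCnt,
 *                                  const uint8_t **in, uint8_t **out)
 *     {
 *         for (size_t i = 0; i < laneCnt; i++) {      // laneCnt = 2 (CONVERT_Y) or 4 (CONVERT_Z)
 *             for (size_t j = 0; j < 16; j++) {
 *                 (*out)[i * 16 + j] = (*in)[i * 16 + j] ^ lanes[i][j];
 *             }
 *         }
 *         *in += laneCnt * 16;                        // add $32/$64, inpos / outpos
 *         *out += laneCnt * 16;
 *     }
 */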
/**
 * @Interconnection with the C interface: void CHACHA20_Update(CRYPT_CHACHA20_Ctx *ctx, const uint8_t *in, uint8_t *out, uint32_t len);
 * @brief chacha20 algorithm
 * @param ctx [IN] Algorithm context, which is set by the C interface and transferred.
 * @param in [IN] Data to be encrypted
 * @param out [OUT] Data after encryption
 * @param len [IN] Length of the data to be encrypted
 * rsp cannot be used; the 15 remaining general-purpose registers hold ctx, in, out, len and temporaries.
 * 16 registers are needed in one cycle, so the state is processed in two groups:
 * {0, 1, 4, 5, 8, 9, 12, 13}
 * {2, 3, 6, 7, 10, 11, 14, 15}
 **/

.globl CHACHA20_Update
.type CHACHA20_Update,%function
.align 64
CHACHA20_Update:
    .cfi_startproc
    mov 48(%rdi), %r11d
    mov %rsp, %r9
    subq $2048, %rsp
    andq $-1024, %rsp

.Lchacha20_start:
    cmp $1024, %rcx
    jae .Lchacha20_1024_start
    cmp $512, %rcx
    jae .Lchacha20_512_start
    cmp $256, %rcx
    jae .Lchacha20_256_start
    cmp $128, %rcx
    jae .Lchacha20_128_start
    cmp $64, %rcx
    jae .Lchacha20_64_start
    jmp .Lchacha20_end

.Lchacha20_64_start:
    LOAD_STATE %xmm0, %xmm1, %xmm2, %xmm3, %rdi

    vmovdqa %xmm0, %xmm10
    vmovdqa %xmm1, %xmm11
    vmovdqa %xmm2, %xmm12
    vmovdqa %xmm3, %xmm13
    mov $10, %r8

.Lchacha20_64_loop:
    /* 0 = 0 + 4, 12 = (12 ^ 0) >>> 16 | 8 = 8 + 12, 4 = (4 ^ 8) >>> 12 |
     * 0 = 0 + 4, 12 = (12 ^ 0) >>> 8  | 8 = 8 + 12, 4 = (4 ^ 8) >>> 7
     * 1 = 1 + 5, 13 = (13 ^ 1) >>> 16 | 9 = 9 + 13, 5 = (5 ^ 9) >>> 12 |
     * 1 = 1 + 5, 13 = (13 ^ 1) >>> 8  | 9 = 9 + 13, 5 = (5 ^ 9) >>> 7
     * 2 = 2 + 6, 14 = (14 ^ 2) >>> 16 | 10 = 10 + 14, 6 = (6 ^ 10) >>> 12 |
     * 2 = 2 + 6, 14 = (14 ^ 2) >>> 8  | 10 = 10 + 14, 6 = (6 ^ 10) >>> 7
     * 3 = 3 + 7, 15 = (15 ^ 3) >>> 16 | 11 = 11 + 15, 7 = (7 ^ 11) >>> 12 |
     * 3 = 3 + 7, 15 = (15 ^ 3) >>> 8  | 11 = 11 + 15, 7 = (7 ^ 11) >>> 7
     */
    CHACHA20_ROUND %xmm0, %xmm1, %xmm2, %xmm3

    vpshufd $78, %xmm2, %xmm2      // {8 9 10 11} ==> {10 11 8 9}     01 00 11 10
    vpshufd $57, %xmm1, %xmm1      // {4 5 6 7} ==> {5 6 7 4}         00 11 10 01
    vpshufd $147, %xmm3, %xmm3     // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    /* 0 = 0 + 5, 15 = (15 ^ 0) >>> 16 | 10 = 10 + 15, 5 = (5 ^ 10) >>> 12 |
     * 0 = 0 + 5, 15 = (15 ^ 0) >>> 8  | 10 = 10 + 15, 5 = (5 ^ 10) >>> 7
     * 1 = 1 + 6, 12 = (12 ^ 1) >>> 16 | 11 = 11 + 12, 6 = (6 ^ 11) >>> 12 |
     * 1 = 1 + 6, 12 = (12 ^ 1) >>> 8  | 11 = 11 + 12, 6 = (6 ^ 11) >>> 7
     * 2 = 2 + 7, 13 = (13 ^ 2) >>> 16 | 8 = 8 + 13, 7 = (7 ^ 8) >>> 12 |
     * 2 = 2 + 7, 13 = (13 ^ 2) >>> 8  | 8 = 8 + 13, 7 = (7 ^ 8) >>> 7
     * 3 = 3 + 4, 14 = (14 ^ 3) >>> 16 | 9 = 9 + 14, 4 = (4 ^ 9) >>> 12 |
     * 3 = 3 + 4, 14 = (14 ^ 3) >>> 8  | 9 = 9 + 14, 4 = (4 ^ 9) >>> 7
     */
    CHACHA20_ROUND %xmm0, %xmm1, %xmm2, %xmm3

    vpshufd $78, %xmm2, %xmm2      // {10 11 8 9} ==> {8 9 10 11}     01 00 11 10
    vpshufd $147, %xmm1, %xmm1     // {5 6 7 4} ==> {4 5 6 7}         10 01 00 11
    vpshufd $57, %xmm3, %xmm3      // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq %r8
    jnz .Lchacha20_64_loop

    vpaddd %xmm10, %xmm0, %xmm0
    vpaddd %xmm11, %xmm1, %xmm1
    vpaddd %xmm12, %xmm2, %xmm2
    vpaddd %xmm13, %xmm3, %xmm3

    add $1, %r11d
    WRITEBACK_64_AVX512 IN, OUT, %xmm0, %xmm1, %xmm2, %xmm3
    mov %r11d, 48(%rdi)
    jmp .Lchacha20_end
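/*
 * Reference sketch (C, not part of the build) of the single-block path above: run 10 double rounds
 * on a copy of the state, add the original state back, XOR the 64-byte keystream block with the
 * input, and advance the 32-bit block counter (state word 12, kept at offset 48 of the context).
 * It reuses the DoubleRound sketch near the top of this file; ChaCha20Block is an illustrative
 * name, and x86 is little-endian, so the byte view of x[] matches the serialized keystream.
 *
 *     static void ChaCha20Block(uint32_t state[16], const uint8_t *in, uint8_t *out)
 *     {
 *         uint32_t x[16];
 *         for (int i = 0; i < 16; i++) {
 *             x[i] = state[i];                        // vmovdqa to xmm10~xmm13
 *         }
 *         for (int i = 0; i < 10; i++) {              // mov $10, %r8; .Lchacha20_64_loop
 *             DoubleRound(x);
 *         }
 *         for (int i = 0; i < 16; i++) {
 *             x[i] += state[i];                       // vpaddd with the saved matrix
 *         }
 *         const uint8_t *ks = (const uint8_t *)x;
 *         for (int i = 0; i < 64; i++) {
 *             out[i] = in[i] ^ ks[i];                 // WRITEBACK_64_AVX512
 *         }
 *         state[12] += 1;                             // add $1, %r11d; mov %r11d, 48(%rdi)
 *     }
 */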
.Lchacha20_128_start:

    vbroadcasti128 (%rdi), %ymm0               // {0 1 2 3 0 1 2 3}
    vbroadcasti128 16(%rdi), %ymm1             // {4 5 6 7 4 5 6 7}
    vbroadcasti128 32(%rdi), %ymm2             // {8 9 10 11 8 9 10 11}
    vbroadcasti128 48(%rdi), %ymm3             // {12 13 14 15 12 13 14 15}
    vpaddd g_addOne(%rip), %ymm3, %ymm3

    vmovdqa32 %ymm0, %ymm16
    vmovdqa32 %ymm1, %ymm17
    vmovdqa32 %ymm2, %ymm18
    vmovdqa32 %ymm3, %ymm19
    mov $10, %r8

.Lchacha20_128_loop:

    CHACHA20_ROUND %ymm0, %ymm1, %ymm2, %ymm3

    vpshufd $78, %ymm2, %ymm2      // {8 9 10 11} ==> {10 11 8 9}     01 00 11 10
    vpshufd $57, %ymm1, %ymm1      // {4 5 6 7} ==> {5 6 7 4}         00 11 10 01
    vpshufd $147, %ymm3, %ymm3     // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    CHACHA20_ROUND %ymm0, %ymm1, %ymm2, %ymm3

    vpshufd $78, %ymm2, %ymm2      // {10 11 8 9} ==> {8 9 10 11}     01 00 11 10
    vpshufd $147, %ymm1, %ymm1     // {5 6 7 4} ==> {4 5 6 7}         10 01 00 11
    vpshufd $57, %ymm3, %ymm3      // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq %r8
    jnz .Lchacha20_128_loop

    vpaddd %ymm16, %ymm0, %ymm0
    vpaddd %ymm17, %ymm1, %ymm1
    vpaddd %ymm18, %ymm2, %ymm2
    vpaddd %ymm19, %ymm3, %ymm3

    vextracti32x4 $1, %ymm0, %xmm5             // ymm0 => {xmm0 xmm5}
    vextracti32x4 $1, %ymm1, %xmm6             // ymm1 => {xmm1 xmm6}
    vextracti32x4 $1, %ymm2, %xmm7             // ymm2 => {xmm2 xmm7}
    vextracti32x4 $1, %ymm3, %xmm8             // ymm3 => {xmm3 xmm8}

    WRITEBACK_64_AVX512 IN, OUT, %xmm0, %xmm1, %xmm2, %xmm3
    WRITEBACK_64_AVX512 IN, OUT, %xmm5, %xmm6, %xmm7, %xmm8

    add $2, %r11d
    sub $128, %rcx
    mov %r11d, 48(%rdi)
    jz .Lchacha20_end
    jmp .Lchacha20_start

.Lchacha20_256_start:

    LOAD_1024_STATE %zmm0 %zmm1 %zmm2 %zmm3 %rdi
    vpaddd g_addOne(%rip), %zmm3, %zmm3

    vmovdqa64 %zmm0, %zmm16
    vmovdqa64 %zmm1, %zmm17
    vmovdqa64 %zmm2, %zmm18
    vmovdqa64 %zmm3, %zmm19
    mov $10, %r8

.Lchacha20_256_loop:

    CHACHA20_ROUND %zmm0, %zmm1, %zmm2, %zmm3

    vpshufd $78, %zmm2, %zmm2      // {8 9 10 11} ==> {10 11 8 9}     01 00 11 10
    vpshufd $57, %zmm1, %zmm1      // {4 5 6 7} ==> {5 6 7 4}         00 11 10 01
    vpshufd $147, %zmm3, %zmm3     // {12 13 14 15} ==> {15 12 13 14} 10 01 00 11

    CHACHA20_ROUND %zmm0, %zmm1, %zmm2, %zmm3

    vpshufd $78, %zmm2, %zmm2      // {10 11 8 9} ==> {8 9 10 11}     01 00 11 10
    vpshufd $147, %zmm1, %zmm1     // {5 6 7 4} ==> {4 5 6 7}         10 01 00 11
    vpshufd $57, %zmm3, %zmm3      // {15 12 13 14} ==> {12 13 14 15} 00 11 10 01

    decq %r8
    jnz .Lchacha20_256_loop

    vpaddd %zmm16, %zmm0, %zmm0
    vpaddd %zmm17, %zmm1, %zmm1
    vpaddd %zmm18, %zmm2, %zmm2
    vpaddd %zmm19, %zmm3, %zmm3

    vextracti64x2 $1, %zmm0, %xmm4
    vextracti64x2 $1, %zmm1, %xmm5
    vextracti64x2 $1, %zmm2, %xmm6
    vextracti64x2 $1, %zmm3, %xmm7

    vextracti64x2 $2, %zmm0, %xmm8
    vextracti64x2 $2, %zmm1, %xmm9
    vextracti64x2 $2, %zmm2, %xmm10
    vextracti64x2 $2, %zmm3, %xmm11

    vextracti64x2 $3, %zmm0, %xmm12
    vextracti64x2 $3, %zmm1, %xmm13
    vextracti64x2 $3, %zmm2, %xmm14
    vextracti64x2 $3, %zmm3, %xmm15

    WRITEBACK_64_AVX512 IN, OUT, %xmm0, %xmm1, %xmm2, %xmm3
    WRITEBACK_64_AVX512 IN, OUT, %xmm4, %xmm5, %xmm6, %xmm7
    WRITEBACK_64_AVX512 IN, OUT, %xmm8, %xmm9, %xmm10, %xmm11
    WRITEBACK_64_AVX512 IN, OUT, %xmm12, %xmm13, %xmm14, %xmm15

    add $4, %r11d
    sub $256, %rcx
    mov %r11d, 48(%rdi)
    jz .Lchacha20_end
    jmp .Lchacha20_start
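/*
 * Reference sketch (C, not part of the build) of the wide-block layout used by the 512/1024-byte
 * paths below: each vector register holds one state WORD replicated across all lanes (8 lanes for
 * ymm, 16 for zmm), so lane k of every register belongs to keystream block k. Only the counter
 * word differs per lane, which is what g_add8block / g_add16block supply. BuildLanes and LANES are
 * illustrative names only.
 *
 *     #define LANES 8                                  // 8 for ymm, 16 for zmm
 *
 *     static void BuildLanes(const uint32_t state[16], uint32_t lanes[16][LANES])
 *     {
 *         for (int w = 0; w < 16; w++) {               // vpshufd broadcast of word w
 *             for (int k = 0; k < LANES; k++) {
 *                 lanes[w][k] = state[w];
 *             }
 *         }
 *         for (int k = 0; k < LANES; k++) {            // vpaddd g_add8block / g_add16block
 *             lanes[12][k] += (uint32_t)k;             // per-lane block counter
 *         }
 *     }
 */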
.Lchacha20_512_start:
    LOAD_512_STATE %ymm0, %ymm1, %ymm2, %ymm3, %rdi

    vpshufd $0b00000000, %ymm3, %ymm12
    vpshufd $0b01010101, %ymm3, %ymm13

    vpaddd g_add8block(%rip), %ymm12, %ymm12   // 0, 1, 2, 3, 4, 5, 6, 7
    vmovdqa32 %ymm12, %ymm28
    vpshufd $0b10101010, %ymm3, %ymm14
    vmovdqa32 %ymm13, %ymm29
    vpshufd $0b11111111, %ymm3, %ymm15
    vmovdqa32 %ymm14, %ymm30

    vpshufd $0b00000000, %ymm2, %ymm8
    vmovdqa32 %ymm15, %ymm31
    vpshufd $0b01010101, %ymm2, %ymm9
    vmovdqa32 %ymm8, %ymm24
    vpshufd $0b10101010, %ymm2, %ymm10
    vmovdqa32 %ymm9, %ymm25
    vpshufd $0b11111111, %ymm2, %ymm11
    vmovdqa32 %ymm10, %ymm26

    vpshufd $0b00000000, %ymm1, %ymm4
    vmovdqa32 %ymm11, %ymm27
    vpshufd $0b01010101, %ymm1, %ymm5
    vmovdqa32 %ymm4, %ymm20
    vpshufd $0b10101010, %ymm1, %ymm6
    vmovdqa32 %ymm5, %ymm21
    vpshufd $0b11111111, %ymm1, %ymm7
    vmovdqa32 %ymm6, %ymm22

    vpshufd $0b11111111, %ymm0, %ymm3
    vmovdqa32 %ymm7, %ymm23
    vpshufd $0b10101010, %ymm0, %ymm2
    vmovdqa32 %ymm3, %ymm19
    vpshufd $0b01010101, %ymm0, %ymm1
    vmovdqa32 %ymm2, %ymm18
    vpshufd $0b00000000, %ymm0, %ymm0
    vmovdqa32 %ymm1, %ymm17
    vmovdqa32 %ymm0, %ymm16
    mov $10, %r8

.Lchacha20_512_loop:

    CHACHA20_LOOP_AVX512 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, \
                         %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15

    decq %r8
    jnz .Lchacha20_512_loop

    /* ymm16~31: original matrix */
    vpaddd %ymm16, %ymm0, %ymm0
    vpaddd %ymm17, %ymm1, %ymm1
    vpaddd %ymm18, %ymm2, %ymm2
    vpaddd %ymm19, %ymm3, %ymm3
    vpaddd %ymm20, %ymm4, %ymm4
    vpaddd %ymm21, %ymm5, %ymm5
    vpaddd %ymm22, %ymm6, %ymm6
    vpaddd %ymm23, %ymm7, %ymm7
    vpaddd %ymm24, %ymm8, %ymm8
    vpaddd %ymm25, %ymm9, %ymm9
    vpaddd %ymm26, %ymm10, %ymm10
    vpaddd %ymm27, %ymm11, %ymm11
    vpaddd %ymm28, %ymm12, %ymm12
    vpaddd %ymm29, %ymm13, %ymm13
    vpaddd %ymm30, %ymm14, %ymm14
    vpaddd %ymm31, %ymm15, %ymm15

    MATRIX_TO_STATE %ymm0, %ymm1, %ymm2, %ymm3, %ymm20, %ymm21      // set state 0, 3, 9, 10
    MATRIX_TO_STATE %ymm4, %ymm5, %ymm6, %ymm7, %ymm22, %ymm23      // set state 4, 7, 13, 14
    MATRIX_TO_STATE %ymm8, %ymm9, %ymm10, %ymm11, %ymm1, %ymm2      // set state 8, 11, 1, 2
    MATRIX_TO_STATE %ymm12, %ymm13, %ymm14, %ymm15, %ymm5, %ymm6    // set state 12, 15, 5, 6

    /*
     * {A0 A1 A2 A3 E0 E1 E2 E3}
     * {B0 B1 B2 B3 F0 F1 F2 F3}
     * {C0 C1 C2 C3 G0 G1 G2 G3}
     * {D0 D1 D2 D3 H0 H1 H2 H3}
     * ...
     * =>
     * {A0 A1 A2 A3 B0 B1 B2 B3}
     * {C0 C1 C2 C3 D0 D1 D2 D3}
     * ....
     */

    CONVERT_Y %ymm0, %ymm4, $0 IN OUT
    CONVERT_Y %ymm8, %ymm12, $0 IN OUT
    CONVERT_Y %ymm3, %ymm7, $0 IN OUT
    CONVERT_Y %ymm11, %ymm15, $0 IN OUT
    CONVERT_Y %ymm20, %ymm22, $0 IN OUT
    CONVERT_Y %ymm1, %ymm5, $0 IN OUT
    CONVERT_Y %ymm21, %ymm23, $0 IN OUT
    CONVERT_Y %ymm2, %ymm6, $0 IN OUT
    CONVERT_Y %ymm0, %ymm4, $1 IN OUT
    CONVERT_Y %ymm8, %ymm12, $1 IN OUT
    CONVERT_Y %ymm3, %ymm7, $1 IN OUT
    CONVERT_Y %ymm11, %ymm15, $1 IN OUT
    CONVERT_Y %ymm20, %ymm22, $1 IN OUT
    CONVERT_Y %ymm1, %ymm5, $1 IN OUT
    CONVERT_Y %ymm21, %ymm23, $1 IN OUT
    CONVERT_Y %ymm2, %ymm6, $1 IN OUT

    add $8, %r11d
    sub $512, %rcx
    mov %r11d, 48(%rdi)
    jz .Lchacha20_end
    jmp .Lchacha20_start
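/*
 * Reference sketch (C, not part of the build) of the reordering done by MATRIX_TO_STATE and the
 * CONVERT_Y/CONVERT_Z calls: after the rounds, register w holds word w of LANES different blocks,
 * so the data must be transposed back to block-major order before it can be XORed with the input
 * as contiguous 64-byte blocks. LanesToBlocks is an illustrative name; LANES is the constant from
 * the sketch before .Lchacha20_512_start.
 *
 *     static void LanesToBlocks(const uint32_t lanes[16][LANES], uint32_t blocks[LANES][16])
 *     {
 *         for (int k = 0; k < LANES; k++) {
 *             for (int w = 0; w < 16; w++) {
 *                 blocks[k][w] = lanes[w][k];
 *             }
 *         }
 *     }
 */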
.Lchacha20_1024_start:

    LOAD_1024_STATE %zmm0 %zmm1 %zmm2 %zmm3 %rdi

    STATE_TO_MATRIX_Z_AVX512 %zmm0, %zmm16, %zmm17, %zmm18, %zmm19
    STATE_TO_MATRIX_Z_AVX512 %zmm1, %zmm20, %zmm21, %zmm22, %zmm23
    STATE_TO_MATRIX_Z_AVX512 %zmm2, %zmm24, %zmm25, %zmm26, %zmm27
    STATE_TO_MATRIX_Z_AVX512 %zmm3, %zmm28, %zmm29, %zmm30, %zmm31
    vpaddd g_add16block(%rip), %zmm28, %zmm28

    vmovdqa64 %zmm16, %zmm0
    vmovdqa64 %zmm17, %zmm1
    vmovdqa64 %zmm18, %zmm2
    vmovdqa64 %zmm19, %zmm3
    vmovdqa64 %zmm20, %zmm4
    vmovdqa64 %zmm21, %zmm5
    vmovdqa64 %zmm22, %zmm6
    vmovdqa64 %zmm23, %zmm7
    vmovdqa64 %zmm24, %zmm8
    vmovdqa64 %zmm25, %zmm9
    vmovdqa64 %zmm26, %zmm10
    vmovdqa64 %zmm27, %zmm11
    vmovdqa64 %zmm28, %zmm12
    vmovdqa64 %zmm29, %zmm13
    vmovdqa64 %zmm30, %zmm14
    vmovdqa64 %zmm31, %zmm15
    mov $10, %r8
    jmp .Lchacha20_1024_loop

.Lchacha20_1024_start_cont:

    vmovdqa32 %zmm16, %zmm0
    vmovdqa32 %zmm17, %zmm1
    vmovdqa32 %zmm18, %zmm2
    vmovdqa32 %zmm19, %zmm3
    vmovdqa32 %zmm20, %zmm4
    vmovdqa32 %zmm21, %zmm5
    vmovdqa32 %zmm22, %zmm6
    vmovdqa32 %zmm23, %zmm7
    vmovdqa32 %zmm24, %zmm8
    vmovdqa32 %zmm25, %zmm9
    vmovdqa32 %zmm26, %zmm10
    vmovdqa32 %zmm27, %zmm11
    vmovdqa32 %zmm28, %zmm12
    vmovdqa32 %zmm29, %zmm13
    vpaddd g_addsecond16block(%rip), %zmm12, %zmm12    // advance the per-lane block counters by 16
    vmovdqa32 %zmm30, %zmm14
    vmovdqa32 %zmm31, %zmm15
    vmovdqa32 %zmm12, %zmm28
    mov $10, %r8

.Lchacha20_1024_loop:

    CHACHA20_LOOP_AVX512 %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, %zmm8, %zmm9, \
                         %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, %zmm15
    decq %r8
    jnz .Lchacha20_1024_loop

    vpaddd %zmm16, %zmm0, %zmm0
    vpaddd %zmm17, %zmm1, %zmm1
    vpaddd %zmm18, %zmm2, %zmm2
    vpaddd %zmm19, %zmm3, %zmm3
    vpaddd %zmm20, %zmm4, %zmm4
    vpaddd %zmm21, %zmm5, %zmm5
    vpaddd %zmm22, %zmm6, %zmm6
    vpaddd %zmm23, %zmm7, %zmm7
    vpaddd %zmm24, %zmm8, %zmm8
    vpaddd %zmm25, %zmm9, %zmm9
    vpaddd %zmm26, %zmm10, %zmm10
    vpaddd %zmm27, %zmm11, %zmm11
    vpaddd %zmm28, %zmm12, %zmm12
    vpaddd %zmm29, %zmm13, %zmm13
    vpaddd %zmm30, %zmm14, %zmm14
    vpaddd %zmm31, %zmm15, %zmm15

    /* store matrix rows 16, 17, 18, 19 on the stack */
    vmovdqa64 %zmm16, (%rsp)
    vmovdqa64 %zmm17, 64(%rsp)
    vmovdqa64 %zmm18, 128(%rsp)
    vmovdqa64 %zmm19, 192(%rsp)

    /* store matrix 9, 10, 13, 14 in zmm16, 17, 18, 19 */
    vmovdqa64 %zmm9, %zmm16     // zmm16: encrypt matrix zmm9
    vmovdqa64 %zmm10, %zmm17    // zmm17: encrypt matrix zmm10
    vmovdqa64 %zmm13, %zmm18    // zmm18: encrypt matrix zmm13
    vmovdqa64 %zmm14, %zmm19    // zmm19: encrypt matrix zmm14

    /* zmm0~15: encrypt matrix 0 ~ 15 */
    MATRIX_TO_STATE %zmm0, %zmm1, %zmm2, %zmm3, %zmm9, %zmm10       // set state 0, 3, 9, 10
    MATRIX_TO_STATE %zmm4, %zmm5, %zmm6, %zmm7, %zmm13, %zmm14      // set state 4, 7, 13, 14
    MATRIX_TO_STATE %zmm8, %zmm16, %zmm17, %zmm11, %zmm1, %zmm2     // set state 8, 11, 1, 2
    MATRIX_TO_STATE %zmm12, %zmm18, %zmm19, %zmm15, %zmm5, %zmm6    // set state 12, 15, 5, 6

    /*
     * {A0 A1 A2 A3 E0 E1 E2 E3 I0 I1 I2 I3 M0 M1 M2 M3}
     * {B0 B1 B2 B3 F0 F1 F2 F3 J0 J1 J2 J3 N0 N1 N2 N3}
     * {C0 C1 C2 C3 G0 G1 G2 G3 K0 K1 K2 K3 O0 O1 O2 O3}
     * {D0 D1 D2 D3 H0 H1 H2 H3 L0 L1 L2 L3 P0 P1 P2 P3}
     * ...
     * =>
     * {A0 A1 A2 A3 B0 B1 B2 B3 C0 C1 C2 C3 D0 D1 D2 D3}
     * {E0 E1 E2 E3 F0 F1 F2 F3 G0 G1 G2 G3 H0 H1 H2 H3}
     * {I0 I1 I2 I3 J0 J1 J2 J3 K0 K1 K2 K3 L0 L1 L2 L3}
     * ....
     */

    CONVERT_Z %zmm0, %zmm4, %zmm8, %zmm12, $0 IN OUT
    CONVERT_Z %zmm3, %zmm7, %zmm11, %zmm15, $0 IN OUT
    CONVERT_Z %zmm9, %zmm13, %zmm1, %zmm5, $0 IN OUT
    CONVERT_Z %zmm10, %zmm14, %zmm2, %zmm6, $0 IN OUT
    CONVERT_Z %zmm0, %zmm4, %zmm8, %zmm12, $1 IN OUT
    CONVERT_Z %zmm3, %zmm7, %zmm11, %zmm15, $1 IN OUT
    CONVERT_Z %zmm9, %zmm13, %zmm1, %zmm5, $1 IN OUT
    CONVERT_Z %zmm10, %zmm14, %zmm2, %zmm6, $1 IN OUT
    CONVERT_Z %zmm0, %zmm4, %zmm8, %zmm12, $2 IN OUT
    CONVERT_Z %zmm3, %zmm7, %zmm11, %zmm15, $2 IN OUT
    CONVERT_Z %zmm9, %zmm13, %zmm1, %zmm5, $2 IN OUT
    CONVERT_Z %zmm10, %zmm14, %zmm2, %zmm6, $2 IN OUT
    CONVERT_Z %zmm0, %zmm4, %zmm8, %zmm12, $3 IN OUT
    CONVERT_Z %zmm3, %zmm7, %zmm11, %zmm15, $3 IN OUT
    CONVERT_Z %zmm9, %zmm13, %zmm1, %zmm5, $3 IN OUT
    CONVERT_Z %zmm10, %zmm14, %zmm2, %zmm6, $3 IN OUT

    /* restore zmm16~19 from the stack */
    vmovdqa64 (%rsp), %zmm16
    vmovdqa64 64(%rsp), %zmm17
    vmovdqa64 128(%rsp), %zmm18
    vmovdqa64 192(%rsp), %zmm19

    add $16, %r11d
    sub $1024, %rcx
    mov %r11d, 48(%rdi)
    jz .Lchacha20_clear
    cmp $1024, %rcx
    jae .Lchacha20_1024_start_cont
    jmp .Lchacha20_start

.Lchacha20_clear:
    /* clear sensitive info on the stack */
    vpxord %zmm0, %zmm0, %zmm0
    vmovdqa64 %zmm0, (%rsp)
    vmovdqa64 %zmm0, 64(%rsp)
    vmovdqa64 %zmm0, 128(%rsp)
    vmovdqa64 %zmm0, 192(%rsp)

.Lchacha20_end:
    xor %r11d, %r11d
    mov %r9, %rsp
    .cfi_endproc
    ret
.size CHACHA20_Update,.-CHACHA20_Update

#endif