1// Copyright 2016 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. 6 7//go:build gc && !purego 8 9#include "textflag.h" 10// General register allocation 11#define oup DI 12#define inp SI 13#define inl BX 14#define adp CX // free to reuse, after we hash the additional data 15#define keyp R8 // free to reuse, when we copy the key to stack 16#define itr2 R9 // general iterator 17#define itr1 CX // general iterator 18#define acc0 R10 19#define acc1 R11 20#define acc2 R12 21#define t0 R13 22#define t1 R14 23#define t2 R15 24#define t3 R8 25// Register and stack allocation for the SSE code 26#define rStore (0*16)(BP) 27#define sStore (1*16)(BP) 28#define state1Store (2*16)(BP) 29#define state2Store (3*16)(BP) 30#define tmpStore (4*16)(BP) 31#define ctr0Store (5*16)(BP) 32#define ctr1Store (6*16)(BP) 33#define ctr2Store (7*16)(BP) 34#define ctr3Store (8*16)(BP) 35#define A0 X0 36#define A1 X1 37#define A2 X2 38#define B0 X3 39#define B1 X4 40#define B2 X5 41#define C0 X6 42#define C1 X7 43#define C2 X8 44#define D0 X9 45#define D1 X10 46#define D2 X11 47#define T0 X12 48#define T1 X13 49#define T2 X14 50#define T3 X15 51#define A3 T0 52#define B3 T1 53#define C3 T2 54#define D3 T3 55// Register and stack allocation for the AVX2 code 56#define rsStoreAVX2 (0*32)(BP) 57#define state1StoreAVX2 (1*32)(BP) 58#define state2StoreAVX2 (2*32)(BP) 59#define ctr0StoreAVX2 (3*32)(BP) 60#define ctr1StoreAVX2 (4*32)(BP) 61#define ctr2StoreAVX2 (5*32)(BP) 62#define ctr3StoreAVX2 (6*32)(BP) 63#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack 64#define AA0 Y0 65#define AA1 Y5 66#define AA2 Y6 67#define AA3 Y7 68#define BB0 Y14 69#define BB1 Y9 70#define BB2 Y10 71#define BB3 Y11 72#define CC0 Y12 73#define CC1 Y13 74#define CC2 Y8 75#define CC3 Y15 76#define DD0 Y4 77#define DD1 Y1 78#define DD2 Y2 79#define DD3 Y3 80#define TT0 DD3 81#define TT1 AA3 82#define TT2 BB3 83#define TT3 CC3 84// ChaCha20 constants 85DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 86DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e 87DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 88DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 89DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 90DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e 91DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 92DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 93// <<< 16 with PSHUFB 94DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 95DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A 96DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 97DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A 98// <<< 8 with PSHUFB 99DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 100DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B 101DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 102DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B 103 104DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 105DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 106DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 107DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 108 109DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 110DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 111DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 112DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 113// Poly1305 key clamp 114DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF 115DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC 116DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF 117DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF 118 119DATA ·sseIncMask<>+0x00(SB)/8, $0x1 120DATA ·sseIncMask<>+0x08(SB)/8, $0x0 121// To load/store the last < 16 bytes in a buffer 122DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff 123DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 124DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff 125DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 126DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff 127DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 128DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff 129DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 130DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff 131DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 132DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff 133DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 134DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff 135DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 136DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff 137DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 138DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff 139DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff 140DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff 141DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff 142DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff 143DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff 144DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff 145DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff 146DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff 147DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff 148DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff 149DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff 150DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff 151DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff 152 153GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 154GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 155GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 156GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 157GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 158GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 159GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 160GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 161// No PALIGNR in Go ASM yet (but VPALIGNR is present). 162#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 163#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 164#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 165#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 166#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 167#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 168#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 169#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 170#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 171#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 172#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 173#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 174#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 175#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 176#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 177#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 178#define shiftC0Right shiftC0Left 179#define shiftC1Right shiftC1Left 180#define shiftC2Right shiftC2Left 181#define shiftC3Right shiftC3Left 182#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 183#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 184#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 185#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 186 187// Some macros 188 189// ROL rotates the uint32s in register R left by N bits, using temporary T. 190#define ROL(N, R, T) \ 191 MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R 192 193// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. 194#ifdef GOAMD64_v2 195#define ROL16(R, T) PSHUFB ·rol16<>(SB), R 196#else 197#define ROL16(R, T) ROL(16, R, T) 198#endif 199 200// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. 201#ifdef GOAMD64_v2 202#define ROL8(R, T) PSHUFB ·rol8<>(SB), R 203#else 204#define ROL8(R, T) ROL(8, R, T) 205#endif 206 207#define chachaQR(A, B, C, D, T) \ 208 PADDD B, A; PXOR A, D; ROL16(D, T) \ 209 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ 210 PADDD B, A; PXOR A, D; ROL8(D, T) \ 211 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B 212 213#define chachaQR_AVX2(A, B, C, D, T) \ 214 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ 215 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ 216 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ 217 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B 218 219#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 220#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 221#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX 222#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 223#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 224 225#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 226#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 227#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 228 229#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage 230#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage 231// ---------------------------------------------------------------------------- 232TEXT polyHashADInternal<>(SB), NOSPLIT, $0 233 // adp points to beginning of additional data 234 // itr2 holds ad length 235 XORQ acc0, acc0 236 XORQ acc1, acc1 237 XORQ acc2, acc2 238 CMPQ itr2, $13 239 JNE hashADLoop 240 241openFastTLSAD: 242 // Special treatment for the TLS case of 13 bytes 243 MOVQ (adp), acc0 244 MOVQ 5(adp), acc1 245 SHRQ $24, acc1 246 MOVQ $1, acc2 247 polyMul 248 RET 249 250hashADLoop: 251 // Hash in 16 byte chunks 252 CMPQ itr2, $16 253 JB hashADTail 254 polyAdd(0(adp)) 255 LEAQ (1*16)(adp), adp 256 SUBQ $16, itr2 257 polyMul 258 JMP hashADLoop 259 260hashADTail: 261 CMPQ itr2, $0 262 JE hashADDone 263 264 // Hash last < 16 byte tail 265 XORQ t0, t0 266 XORQ t1, t1 267 XORQ t2, t2 268 ADDQ itr2, adp 269 270hashADTailLoop: 271 SHLQ $8, t0, t1 272 SHLQ $8, t0 273 MOVB -1(adp), t2 274 XORQ t2, t0 275 DECQ adp 276 DECQ itr2 277 JNE hashADTailLoop 278 279hashADTailFinish: 280 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 281 polyMul 282 283 // Finished AD 284hashADDone: 285 RET 286 287// ---------------------------------------------------------------------------- 288// func chacha20Poly1305Open(dst, key, src, ad []byte) bool 289TEXT ·chacha20Poly1305Open(SB), 0, $288-97 290 // For aligned stack access 291 MOVQ SP, BP 292 ADDQ $32, BP 293 ANDQ $-32, BP 294 MOVQ dst+0(FP), oup 295 MOVQ key+24(FP), keyp 296 MOVQ src+48(FP), inp 297 MOVQ src_len+56(FP), inl 298 MOVQ ad+72(FP), adp 299 300 // Check for AVX2 support 301 CMPB ·useAVX2(SB), $1 302 JE chacha20Poly1305Open_AVX2 303 304 // Special optimization, for very short buffers 305 CMPQ inl, $128 306 JBE openSSE128 // About 16% faster 307 308 // For long buffers, prepare the poly key first 309 MOVOU ·chacha20Constants<>(SB), A0 310 MOVOU (1*16)(keyp), B0 311 MOVOU (2*16)(keyp), C0 312 MOVOU (3*16)(keyp), D0 313 MOVO D0, T1 314 315 // Store state on stack for future use 316 MOVO B0, state1Store 317 MOVO C0, state2Store 318 MOVO D0, ctr3Store 319 MOVQ $10, itr2 320 321openSSEPreparePolyKey: 322 chachaQR(A0, B0, C0, D0, T0) 323 shiftB0Left; shiftC0Left; shiftD0Left 324 chachaQR(A0, B0, C0, D0, T0) 325 shiftB0Right; shiftC0Right; shiftD0Right 326 DECQ itr2 327 JNE openSSEPreparePolyKey 328 329 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 330 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 331 332 // Clamp and store the key 333 PAND ·polyClampMask<>(SB), A0 334 MOVO A0, rStore; MOVO B0, sStore 335 336 // Hash AAD 337 MOVQ ad_len+80(FP), itr2 338 CALL polyHashADInternal<>(SB) 339 340openSSEMainLoop: 341 CMPQ inl, $256 342 JB openSSEMainLoopDone 343 344 // Load state, increment counter blocks 345 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 346 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 347 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 348 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 349 350 // Store counters 351 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 352 353 // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 354 MOVQ $4, itr1 355 MOVQ inp, itr2 356 357openSSEInternalLoop: 358 MOVO C3, tmpStore 359 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 360 MOVO tmpStore, C3 361 MOVO C1, tmpStore 362 chachaQR(A3, B3, C3, D3, C1) 363 MOVO tmpStore, C1 364 polyAdd(0(itr2)) 365 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 366 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 367 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 368 polyMulStage1 369 polyMulStage2 370 LEAQ (2*8)(itr2), itr2 371 MOVO C3, tmpStore 372 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 373 MOVO tmpStore, C3 374 MOVO C1, tmpStore 375 polyMulStage3 376 chachaQR(A3, B3, C3, D3, C1) 377 MOVO tmpStore, C1 378 polyMulReduceStage 379 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 380 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 381 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 382 DECQ itr1 383 JGE openSSEInternalLoop 384 385 polyAdd(0(itr2)) 386 polyMul 387 LEAQ (2*8)(itr2), itr2 388 389 CMPQ itr1, $-6 390 JG openSSEInternalLoop 391 392 // Add in the state 393 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 394 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 395 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 396 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 397 398 // Load - xor - store 399 MOVO D3, tmpStore 400 MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) 401 MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) 402 MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) 403 MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) 404 MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) 405 MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) 406 MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) 407 MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) 408 MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) 409 MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) 410 MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) 411 MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) 412 MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) 413 MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) 414 MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) 415 MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) 416 LEAQ 256(inp), inp 417 LEAQ 256(oup), oup 418 SUBQ $256, inl 419 JMP openSSEMainLoop 420 421openSSEMainLoopDone: 422 // Handle the various tail sizes efficiently 423 TESTQ inl, inl 424 JE openSSEFinalize 425 CMPQ inl, $64 426 JBE openSSETail64 427 CMPQ inl, $128 428 JBE openSSETail128 429 CMPQ inl, $192 430 JBE openSSETail192 431 JMP openSSETail256 432 433openSSEFinalize: 434 // Hash in the PT, AAD lengths 435 ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 436 polyMul 437 438 // Final reduce 439 MOVQ acc0, t0 440 MOVQ acc1, t1 441 MOVQ acc2, t2 442 SUBQ $-5, acc0 443 SBBQ $-1, acc1 444 SBBQ $3, acc2 445 CMOVQCS t0, acc0 446 CMOVQCS t1, acc1 447 CMOVQCS t2, acc2 448 449 // Add in the "s" part of the key 450 ADDQ 0+sStore, acc0 451 ADCQ 8+sStore, acc1 452 453 // Finally, constant time compare to the tag at the end of the message 454 XORQ AX, AX 455 MOVQ $1, DX 456 XORQ (0*8)(inp), acc0 457 XORQ (1*8)(inp), acc1 458 ORQ acc1, acc0 459 CMOVQEQ DX, AX 460 461 // Return true iff tags are equal 462 MOVB AX, ret+96(FP) 463 RET 464 465// ---------------------------------------------------------------------------- 466// Special optimization for buffers smaller than 129 bytes 467openSSE128: 468 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks 469 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 470 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 471 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 472 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 473 MOVQ $10, itr2 474 475openSSE128InnerCipherLoop: 476 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 477 shiftB0Left; shiftB1Left; shiftB2Left 478 shiftC0Left; shiftC1Left; shiftC2Left 479 shiftD0Left; shiftD1Left; shiftD2Left 480 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 481 shiftB0Right; shiftB1Right; shiftB2Right 482 shiftC0Right; shiftC1Right; shiftC2Right 483 shiftD0Right; shiftD1Right; shiftD2Right 484 DECQ itr2 485 JNE openSSE128InnerCipherLoop 486 487 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 488 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 489 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 490 PADDL T2, C1; PADDL T2, C2 491 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 492 493 // Clamp and store the key 494 PAND ·polyClampMask<>(SB), A0 495 MOVOU A0, rStore; MOVOU B0, sStore 496 497 // Hash 498 MOVQ ad_len+80(FP), itr2 499 CALL polyHashADInternal<>(SB) 500 501openSSE128Open: 502 CMPQ inl, $16 503 JB openSSETail16 504 SUBQ $16, inl 505 506 // Load for hashing 507 polyAdd(0(inp)) 508 509 // Load for decryption 510 MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) 511 LEAQ (1*16)(inp), inp 512 LEAQ (1*16)(oup), oup 513 polyMul 514 515 // Shift the stream "left" 516 MOVO B1, A1 517 MOVO C1, B1 518 MOVO D1, C1 519 MOVO A2, D1 520 MOVO B2, A2 521 MOVO C2, B2 522 MOVO D2, C2 523 JMP openSSE128Open 524 525openSSETail16: 526 TESTQ inl, inl 527 JE openSSEFinalize 528 529 // We can safely load the CT from the end, because it is padded with the MAC 530 MOVQ inl, itr2 531 SHLQ $4, itr2 532 LEAQ ·andMask<>(SB), t0 533 MOVOU (inp), T0 534 ADDQ inl, inp 535 PAND -16(t0)(itr2*1), T0 536 MOVO T0, 0+tmpStore 537 MOVQ T0, t0 538 MOVQ 8+tmpStore, t1 539 PXOR A1, T0 540 541 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes 542openSSETail16Store: 543 MOVQ T0, t3 544 MOVB t3, (oup) 545 PSRLDQ $1, T0 546 INCQ oup 547 DECQ inl 548 JNE openSSETail16Store 549 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 550 polyMul 551 JMP openSSEFinalize 552 553// ---------------------------------------------------------------------------- 554// Special optimization for the last 64 bytes of ciphertext 555openSSETail64: 556 // Need to decrypt up to 64 bytes - prepare single block 557 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store 558 XORQ itr2, itr2 559 MOVQ inl, itr1 560 CMPQ itr1, $16 561 JB openSSETail64LoopB 562 563openSSETail64LoopA: 564 // Perform ChaCha rounds, while hashing the remaining input 565 polyAdd(0(inp)(itr2*1)) 566 polyMul 567 SUBQ $16, itr1 568 569openSSETail64LoopB: 570 ADDQ $16, itr2 571 chachaQR(A0, B0, C0, D0, T0) 572 shiftB0Left; shiftC0Left; shiftD0Left 573 chachaQR(A0, B0, C0, D0, T0) 574 shiftB0Right; shiftC0Right; shiftD0Right 575 576 CMPQ itr1, $16 577 JAE openSSETail64LoopA 578 579 CMPQ itr2, $160 580 JNE openSSETail64LoopB 581 582 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 583 584openSSETail64DecLoop: 585 CMPQ inl, $16 586 JB openSSETail64DecLoopDone 587 SUBQ $16, inl 588 MOVOU (inp), T0 589 PXOR T0, A0 590 MOVOU A0, (oup) 591 LEAQ 16(inp), inp 592 LEAQ 16(oup), oup 593 MOVO B0, A0 594 MOVO C0, B0 595 MOVO D0, C0 596 JMP openSSETail64DecLoop 597 598openSSETail64DecLoopDone: 599 MOVO A0, A1 600 JMP openSSETail16 601 602// ---------------------------------------------------------------------------- 603// Special optimization for the last 128 bytes of ciphertext 604openSSETail128: 605 // Need to decrypt up to 128 bytes - prepare two blocks 606 MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store 607 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store 608 XORQ itr2, itr2 609 MOVQ inl, itr1 610 ANDQ $-16, itr1 611 612openSSETail128LoopA: 613 // Perform ChaCha rounds, while hashing the remaining input 614 polyAdd(0(inp)(itr2*1)) 615 polyMul 616 617openSSETail128LoopB: 618 ADDQ $16, itr2 619 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 620 shiftB0Left; shiftC0Left; shiftD0Left 621 shiftB1Left; shiftC1Left; shiftD1Left 622 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 623 shiftB0Right; shiftC0Right; shiftD0Right 624 shiftB1Right; shiftC1Right; shiftD1Right 625 626 CMPQ itr2, itr1 627 JB openSSETail128LoopA 628 629 CMPQ itr2, $160 630 JNE openSSETail128LoopB 631 632 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 633 PADDL state1Store, B0; PADDL state1Store, B1 634 PADDL state2Store, C0; PADDL state2Store, C1 635 PADDL ctr1Store, D0; PADDL ctr0Store, D1 636 637 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 638 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 639 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) 640 641 SUBQ $64, inl 642 LEAQ 64(inp), inp 643 LEAQ 64(oup), oup 644 JMP openSSETail64DecLoop 645 646// ---------------------------------------------------------------------------- 647// Special optimization for the last 192 bytes of ciphertext 648openSSETail192: 649 // Need to decrypt up to 192 bytes - prepare three blocks 650 MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store 651 MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store 652 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store 653 654 MOVQ inl, itr1 655 MOVQ $160, itr2 656 CMPQ itr1, $160 657 CMOVQGT itr2, itr1 658 ANDQ $-16, itr1 659 XORQ itr2, itr2 660 661openSSLTail192LoopA: 662 // Perform ChaCha rounds, while hashing the remaining input 663 polyAdd(0(inp)(itr2*1)) 664 polyMul 665 666openSSLTail192LoopB: 667 ADDQ $16, itr2 668 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 669 shiftB0Left; shiftC0Left; shiftD0Left 670 shiftB1Left; shiftC1Left; shiftD1Left 671 shiftB2Left; shiftC2Left; shiftD2Left 672 673 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 674 shiftB0Right; shiftC0Right; shiftD0Right 675 shiftB1Right; shiftC1Right; shiftD1Right 676 shiftB2Right; shiftC2Right; shiftD2Right 677 678 CMPQ itr2, itr1 679 JB openSSLTail192LoopA 680 681 CMPQ itr2, $160 682 JNE openSSLTail192LoopB 683 684 CMPQ inl, $176 685 JB openSSLTail192Store 686 687 polyAdd(160(inp)) 688 polyMul 689 690 CMPQ inl, $192 691 JB openSSLTail192Store 692 693 polyAdd(176(inp)) 694 polyMul 695 696openSSLTail192Store: 697 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 698 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 699 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 700 PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 701 702 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 703 PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 704 MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) 705 706 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 707 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 708 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 709 710 SUBQ $128, inl 711 LEAQ 128(inp), inp 712 LEAQ 128(oup), oup 713 JMP openSSETail64DecLoop 714 715// ---------------------------------------------------------------------------- 716// Special optimization for the last 256 bytes of ciphertext 717openSSETail256: 718 // Need to decrypt up to 256 bytes - prepare four blocks 719 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 720 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 721 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 722 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 723 724 // Store counters 725 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 726 XORQ itr2, itr2 727 728openSSETail256Loop: 729 // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication 730 polyAdd(0(inp)(itr2*1)) 731 MOVO C3, tmpStore 732 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 733 MOVO tmpStore, C3 734 MOVO C1, tmpStore 735 chachaQR(A3, B3, C3, D3, C1) 736 MOVO tmpStore, C1 737 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 738 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 739 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 740 polyMulStage1 741 polyMulStage2 742 MOVO C3, tmpStore 743 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 744 MOVO tmpStore, C3 745 MOVO C1, tmpStore 746 chachaQR(A3, B3, C3, D3, C1) 747 MOVO tmpStore, C1 748 polyMulStage3 749 polyMulReduceStage 750 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 751 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 752 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 753 ADDQ $2*8, itr2 754 CMPQ itr2, $160 755 JB openSSETail256Loop 756 MOVQ inl, itr1 757 ANDQ $-16, itr1 758 759openSSETail256HashLoop: 760 polyAdd(0(inp)(itr2*1)) 761 polyMul 762 ADDQ $2*8, itr2 763 CMPQ itr2, itr1 764 JB openSSETail256HashLoop 765 766 // Add in the state 767 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 768 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 769 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 770 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 771 MOVO D3, tmpStore 772 773 // Load - xor - store 774 MOVOU (0*16)(inp), D3; PXOR D3, A0 775 MOVOU (1*16)(inp), D3; PXOR D3, B0 776 MOVOU (2*16)(inp), D3; PXOR D3, C0 777 MOVOU (3*16)(inp), D3; PXOR D3, D0 778 MOVOU A0, (0*16)(oup) 779 MOVOU B0, (1*16)(oup) 780 MOVOU C0, (2*16)(oup) 781 MOVOU D0, (3*16)(oup) 782 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 783 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 784 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 785 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 786 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 787 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) 788 LEAQ 192(inp), inp 789 LEAQ 192(oup), oup 790 SUBQ $192, inl 791 MOVO A3, A0 792 MOVO B3, B0 793 MOVO C3, C0 794 MOVO tmpStore, D0 795 796 JMP openSSETail64DecLoop 797 798// ---------------------------------------------------------------------------- 799// ------------------------- AVX2 Code ---------------------------------------- 800chacha20Poly1305Open_AVX2: 801 VZEROUPPER 802 VMOVDQU ·chacha20Constants<>(SB), AA0 803 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 804 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 805 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 806 VPADDD ·avx2InitMask<>(SB), DD0, DD0 807 808 // Special optimization, for very short buffers 809 CMPQ inl, $192 810 JBE openAVX2192 811 CMPQ inl, $320 812 JBE openAVX2320 813 814 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream 815 VMOVDQA BB0, state1StoreAVX2 816 VMOVDQA CC0, state2StoreAVX2 817 VMOVDQA DD0, ctr3StoreAVX2 818 MOVQ $10, itr2 819 820openAVX2PreparePolyKey: 821 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 822 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 823 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 824 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 825 DECQ itr2 826 JNE openAVX2PreparePolyKey 827 828 VPADDD ·chacha20Constants<>(SB), AA0, AA0 829 VPADDD state1StoreAVX2, BB0, BB0 830 VPADDD state2StoreAVX2, CC0, CC0 831 VPADDD ctr3StoreAVX2, DD0, DD0 832 833 VPERM2I128 $0x02, AA0, BB0, TT0 834 835 // Clamp and store poly key 836 VPAND ·polyClampMask<>(SB), TT0, TT0 837 VMOVDQA TT0, rsStoreAVX2 838 839 // Stream for the first 64 bytes 840 VPERM2I128 $0x13, AA0, BB0, AA0 841 VPERM2I128 $0x13, CC0, DD0, BB0 842 843 // Hash AD + first 64 bytes 844 MOVQ ad_len+80(FP), itr2 845 CALL polyHashADInternal<>(SB) 846 XORQ itr1, itr1 847 848openAVX2InitialHash64: 849 polyAdd(0(inp)(itr1*1)) 850 polyMulAVX2 851 ADDQ $16, itr1 852 CMPQ itr1, $64 853 JNE openAVX2InitialHash64 854 855 // Decrypt the first 64 bytes 856 VPXOR (0*32)(inp), AA0, AA0 857 VPXOR (1*32)(inp), BB0, BB0 858 VMOVDQU AA0, (0*32)(oup) 859 VMOVDQU BB0, (1*32)(oup) 860 LEAQ (2*32)(inp), inp 861 LEAQ (2*32)(oup), oup 862 SUBQ $64, inl 863 864openAVX2MainLoop: 865 CMPQ inl, $512 866 JB openAVX2MainLoopDone 867 868 // Load state, increment counter blocks, store the incremented counters 869 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 870 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 871 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 872 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 873 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 874 XORQ itr1, itr1 875 876openAVX2InternalLoop: 877 // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications 878 // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext 879 polyAdd(0*8(inp)(itr1*1)) 880 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 881 polyMulStage1_AVX2 882 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 883 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 884 polyMulStage2_AVX2 885 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 886 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 887 polyMulStage3_AVX2 888 VMOVDQA CC3, tmpStoreAVX2 889 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 890 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 891 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 892 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 893 VMOVDQA tmpStoreAVX2, CC3 894 polyMulReduceStage 895 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 896 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 897 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 898 polyAdd(2*8(inp)(itr1*1)) 899 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 900 polyMulStage1_AVX2 901 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 902 VMOVDQA CC3, tmpStoreAVX2 903 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 904 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 905 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 906 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 907 VMOVDQA tmpStoreAVX2, CC3 908 polyMulStage2_AVX2 909 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 910 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 911 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 912 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 913 polyMulStage3_AVX2 914 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 915 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 916 polyMulReduceStage 917 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 918 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 919 polyAdd(4*8(inp)(itr1*1)) 920 LEAQ (6*8)(itr1), itr1 921 VMOVDQA CC3, tmpStoreAVX2 922 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 923 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 924 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 925 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 926 VMOVDQA tmpStoreAVX2, CC3 927 polyMulStage1_AVX2 928 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 929 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 930 polyMulStage2_AVX2 931 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 932 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 933 polyMulStage3_AVX2 934 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 935 VMOVDQA CC3, tmpStoreAVX2 936 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 937 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 938 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 939 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 940 VMOVDQA tmpStoreAVX2, CC3 941 polyMulReduceStage 942 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 943 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 944 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 945 CMPQ itr1, $480 946 JNE openAVX2InternalLoop 947 948 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 949 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 950 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 951 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 952 VMOVDQA CC3, tmpStoreAVX2 953 954 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here 955 polyAdd(480(inp)) 956 polyMulAVX2 957 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 958 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 959 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) 960 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 961 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 962 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 963 964 // and here 965 polyAdd(496(inp)) 966 polyMulAVX2 967 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 968 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 969 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 970 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 971 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 972 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) 973 LEAQ (32*16)(inp), inp 974 LEAQ (32*16)(oup), oup 975 SUBQ $(32*16), inl 976 JMP openAVX2MainLoop 977 978openAVX2MainLoopDone: 979 // Handle the various tail sizes efficiently 980 TESTQ inl, inl 981 JE openSSEFinalize 982 CMPQ inl, $128 983 JBE openAVX2Tail128 984 CMPQ inl, $256 985 JBE openAVX2Tail256 986 CMPQ inl, $384 987 JBE openAVX2Tail384 988 JMP openAVX2Tail512 989 990// ---------------------------------------------------------------------------- 991// Special optimization for buffers smaller than 193 bytes 992openAVX2192: 993 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks 994 VMOVDQA AA0, AA1 995 VMOVDQA BB0, BB1 996 VMOVDQA CC0, CC1 997 VPADDD ·avx2IncMask<>(SB), DD0, DD1 998 VMOVDQA AA0, AA2 999 VMOVDQA BB0, BB2 1000 VMOVDQA CC0, CC2 1001 VMOVDQA DD0, DD2 1002 VMOVDQA DD1, TT3 1003 MOVQ $10, itr2 1004 1005openAVX2192InnerCipherLoop: 1006 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1007 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 1008 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 1009 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 1010 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1011 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 1012 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 1013 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 1014 DECQ itr2 1015 JNE openAVX2192InnerCipherLoop 1016 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 1017 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 1018 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 1019 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 1020 VPERM2I128 $0x02, AA0, BB0, TT0 1021 1022 // Clamp and store poly key 1023 VPAND ·polyClampMask<>(SB), TT0, TT0 1024 VMOVDQA TT0, rsStoreAVX2 1025 1026 // Stream for up to 192 bytes 1027 VPERM2I128 $0x13, AA0, BB0, AA0 1028 VPERM2I128 $0x13, CC0, DD0, BB0 1029 VPERM2I128 $0x02, AA1, BB1, CC0 1030 VPERM2I128 $0x02, CC1, DD1, DD0 1031 VPERM2I128 $0x13, AA1, BB1, AA1 1032 VPERM2I128 $0x13, CC1, DD1, BB1 1033 1034openAVX2ShortOpen: 1035 // Hash 1036 MOVQ ad_len+80(FP), itr2 1037 CALL polyHashADInternal<>(SB) 1038 1039openAVX2ShortOpenLoop: 1040 CMPQ inl, $32 1041 JB openAVX2ShortTail32 1042 SUBQ $32, inl 1043 1044 // Load for hashing 1045 polyAdd(0*8(inp)) 1046 polyMulAVX2 1047 polyAdd(2*8(inp)) 1048 polyMulAVX2 1049 1050 // Load for decryption 1051 VPXOR (inp), AA0, AA0 1052 VMOVDQU AA0, (oup) 1053 LEAQ (1*32)(inp), inp 1054 LEAQ (1*32)(oup), oup 1055 1056 // Shift stream left 1057 VMOVDQA BB0, AA0 1058 VMOVDQA CC0, BB0 1059 VMOVDQA DD0, CC0 1060 VMOVDQA AA1, DD0 1061 VMOVDQA BB1, AA1 1062 VMOVDQA CC1, BB1 1063 VMOVDQA DD1, CC1 1064 VMOVDQA AA2, DD1 1065 VMOVDQA BB2, AA2 1066 JMP openAVX2ShortOpenLoop 1067 1068openAVX2ShortTail32: 1069 CMPQ inl, $16 1070 VMOVDQA A0, A1 1071 JB openAVX2ShortDone 1072 1073 SUBQ $16, inl 1074 1075 // Load for hashing 1076 polyAdd(0*8(inp)) 1077 polyMulAVX2 1078 1079 // Load for decryption 1080 VPXOR (inp), A0, T0 1081 VMOVDQU T0, (oup) 1082 LEAQ (1*16)(inp), inp 1083 LEAQ (1*16)(oup), oup 1084 VPERM2I128 $0x11, AA0, AA0, AA0 1085 VMOVDQA A0, A1 1086 1087openAVX2ShortDone: 1088 VZEROUPPER 1089 JMP openSSETail16 1090 1091// ---------------------------------------------------------------------------- 1092// Special optimization for buffers smaller than 321 bytes 1093openAVX2320: 1094 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks 1095 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 1096 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 1097 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 1098 MOVQ $10, itr2 1099 1100openAVX2320InnerCipherLoop: 1101 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1102 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 1103 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1104 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 1105 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1106 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 1107 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1108 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 1109 DECQ itr2 1110 JNE openAVX2320InnerCipherLoop 1111 1112 VMOVDQA ·chacha20Constants<>(SB), TT0 1113 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 1114 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 1115 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 1116 VMOVDQA ·avx2IncMask<>(SB), TT0 1117 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 1118 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 1119 VPADDD TT3, DD2, DD2 1120 1121 // Clamp and store poly key 1122 VPERM2I128 $0x02, AA0, BB0, TT0 1123 VPAND ·polyClampMask<>(SB), TT0, TT0 1124 VMOVDQA TT0, rsStoreAVX2 1125 1126 // Stream for up to 320 bytes 1127 VPERM2I128 $0x13, AA0, BB0, AA0 1128 VPERM2I128 $0x13, CC0, DD0, BB0 1129 VPERM2I128 $0x02, AA1, BB1, CC0 1130 VPERM2I128 $0x02, CC1, DD1, DD0 1131 VPERM2I128 $0x13, AA1, BB1, AA1 1132 VPERM2I128 $0x13, CC1, DD1, BB1 1133 VPERM2I128 $0x02, AA2, BB2, CC1 1134 VPERM2I128 $0x02, CC2, DD2, DD1 1135 VPERM2I128 $0x13, AA2, BB2, AA2 1136 VPERM2I128 $0x13, CC2, DD2, BB2 1137 JMP openAVX2ShortOpen 1138 1139// ---------------------------------------------------------------------------- 1140// Special optimization for the last 128 bytes of ciphertext 1141openAVX2Tail128: 1142 // Need to decrypt up to 128 bytes - prepare two blocks 1143 VMOVDQA ·chacha20Constants<>(SB), AA1 1144 VMOVDQA state1StoreAVX2, BB1 1145 VMOVDQA state2StoreAVX2, CC1 1146 VMOVDQA ctr3StoreAVX2, DD1 1147 VPADDD ·avx2IncMask<>(SB), DD1, DD1 1148 VMOVDQA DD1, DD0 1149 1150 XORQ itr2, itr2 1151 MOVQ inl, itr1 1152 ANDQ $-16, itr1 1153 TESTQ itr1, itr1 1154 JE openAVX2Tail128LoopB 1155 1156openAVX2Tail128LoopA: 1157 // Perform ChaCha rounds, while hashing the remaining input 1158 polyAdd(0(inp)(itr2*1)) 1159 polyMulAVX2 1160 1161openAVX2Tail128LoopB: 1162 ADDQ $16, itr2 1163 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1164 VPALIGNR $4, BB1, BB1, BB1 1165 VPALIGNR $8, CC1, CC1, CC1 1166 VPALIGNR $12, DD1, DD1, DD1 1167 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1168 VPALIGNR $12, BB1, BB1, BB1 1169 VPALIGNR $8, CC1, CC1, CC1 1170 VPALIGNR $4, DD1, DD1, DD1 1171 CMPQ itr2, itr1 1172 JB openAVX2Tail128LoopA 1173 CMPQ itr2, $160 1174 JNE openAVX2Tail128LoopB 1175 1176 VPADDD ·chacha20Constants<>(SB), AA1, AA1 1177 VPADDD state1StoreAVX2, BB1, BB1 1178 VPADDD state2StoreAVX2, CC1, CC1 1179 VPADDD DD0, DD1, DD1 1180 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 1181 1182openAVX2TailLoop: 1183 CMPQ inl, $32 1184 JB openAVX2Tail 1185 SUBQ $32, inl 1186 1187 // Load for decryption 1188 VPXOR (inp), AA0, AA0 1189 VMOVDQU AA0, (oup) 1190 LEAQ (1*32)(inp), inp 1191 LEAQ (1*32)(oup), oup 1192 VMOVDQA BB0, AA0 1193 VMOVDQA CC0, BB0 1194 VMOVDQA DD0, CC0 1195 JMP openAVX2TailLoop 1196 1197openAVX2Tail: 1198 CMPQ inl, $16 1199 VMOVDQA A0, A1 1200 JB openAVX2TailDone 1201 SUBQ $16, inl 1202 1203 // Load for decryption 1204 VPXOR (inp), A0, T0 1205 VMOVDQU T0, (oup) 1206 LEAQ (1*16)(inp), inp 1207 LEAQ (1*16)(oup), oup 1208 VPERM2I128 $0x11, AA0, AA0, AA0 1209 VMOVDQA A0, A1 1210 1211openAVX2TailDone: 1212 VZEROUPPER 1213 JMP openSSETail16 1214 1215// ---------------------------------------------------------------------------- 1216// Special optimization for the last 256 bytes of ciphertext 1217openAVX2Tail256: 1218 // Need to decrypt up to 256 bytes - prepare four blocks 1219 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 1220 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 1221 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 1222 VMOVDQA ctr3StoreAVX2, DD0 1223 VPADDD ·avx2IncMask<>(SB), DD0, DD0 1224 VPADDD ·avx2IncMask<>(SB), DD0, DD1 1225 VMOVDQA DD0, TT1 1226 VMOVDQA DD1, TT2 1227 1228 // Compute the number of iterations that will hash data 1229 MOVQ inl, tmpStoreAVX2 1230 MOVQ inl, itr1 1231 SUBQ $128, itr1 1232 SHRQ $4, itr1 1233 MOVQ $10, itr2 1234 CMPQ itr1, $10 1235 CMOVQGT itr2, itr1 1236 MOVQ inp, inl 1237 XORQ itr2, itr2 1238 1239openAVX2Tail256LoopA: 1240 polyAdd(0(inl)) 1241 polyMulAVX2 1242 LEAQ 16(inl), inl 1243 1244 // Perform ChaCha rounds, while hashing the remaining input 1245openAVX2Tail256LoopB: 1246 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1247 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 1248 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 1249 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 1250 INCQ itr2 1251 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 1252 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 1253 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 1254 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 1255 CMPQ itr2, itr1 1256 JB openAVX2Tail256LoopA 1257 1258 CMPQ itr2, $10 1259 JNE openAVX2Tail256LoopB 1260 1261 MOVQ inl, itr2 1262 SUBQ inp, inl 1263 MOVQ inl, itr1 1264 MOVQ tmpStoreAVX2, inl 1265 1266 // Hash the remainder of data (if any) 1267openAVX2Tail256Hash: 1268 ADDQ $16, itr1 1269 CMPQ itr1, inl 1270 JGT openAVX2Tail256HashEnd 1271 polyAdd (0(itr2)) 1272 polyMulAVX2 1273 LEAQ 16(itr2), itr2 1274 JMP openAVX2Tail256Hash 1275 1276// Store 128 bytes safely, then go to store loop 1277openAVX2Tail256HashEnd: 1278 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 1279 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 1280 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 1281 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 1282 VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 1283 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 1284 1285 VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 1286 VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) 1287 LEAQ (4*32)(inp), inp 1288 LEAQ (4*32)(oup), oup 1289 SUBQ $4*32, inl 1290 1291 JMP openAVX2TailLoop 1292 1293// ---------------------------------------------------------------------------- 1294// Special optimization for the last 384 bytes of ciphertext 1295openAVX2Tail384: 1296 // Need to decrypt up to 384 bytes - prepare six blocks 1297 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 1298 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 1299 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 1300 VMOVDQA ctr3StoreAVX2, DD0 1301 VPADDD ·avx2IncMask<>(SB), DD0, DD0 1302 VPADDD ·avx2IncMask<>(SB), DD0, DD1 1303 VPADDD ·avx2IncMask<>(SB), DD1, DD2 1304 VMOVDQA DD0, ctr0StoreAVX2 1305 VMOVDQA DD1, ctr1StoreAVX2 1306 VMOVDQA DD2, ctr2StoreAVX2 1307 1308 // Compute the number of iterations that will hash two blocks of data 1309 MOVQ inl, tmpStoreAVX2 1310 MOVQ inl, itr1 1311 SUBQ $256, itr1 1312 SHRQ $4, itr1 1313 ADDQ $6, itr1 1314 MOVQ $10, itr2 1315 CMPQ itr1, $10 1316 CMOVQGT itr2, itr1 1317 MOVQ inp, inl 1318 XORQ itr2, itr2 1319 1320 // Perform ChaCha rounds, while hashing the remaining input 1321openAVX2Tail384LoopB: 1322 polyAdd(0(inl)) 1323 polyMulAVX2 1324 LEAQ 16(inl), inl 1325 1326openAVX2Tail384LoopA: 1327 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1328 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 1329 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1330 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 1331 polyAdd(0(inl)) 1332 polyMulAVX2 1333 LEAQ 16(inl), inl 1334 INCQ itr2 1335 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 1336 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 1337 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 1338 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 1339 1340 CMPQ itr2, itr1 1341 JB openAVX2Tail384LoopB 1342 1343 CMPQ itr2, $10 1344 JNE openAVX2Tail384LoopA 1345 1346 MOVQ inl, itr2 1347 SUBQ inp, inl 1348 MOVQ inl, itr1 1349 MOVQ tmpStoreAVX2, inl 1350 1351openAVX2Tail384Hash: 1352 ADDQ $16, itr1 1353 CMPQ itr1, inl 1354 JGT openAVX2Tail384HashEnd 1355 polyAdd(0(itr2)) 1356 polyMulAVX2 1357 LEAQ 16(itr2), itr2 1358 JMP openAVX2Tail384Hash 1359 1360// Store 256 bytes safely, then go to store loop 1361openAVX2Tail384HashEnd: 1362 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 1363 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 1364 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 1365 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 1366 VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 1367 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 1368 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) 1369 VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 1370 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 1371 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) 1372 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 1373 LEAQ (8*32)(inp), inp 1374 LEAQ (8*32)(oup), oup 1375 SUBQ $8*32, inl 1376 JMP openAVX2TailLoop 1377 1378// ---------------------------------------------------------------------------- 1379// Special optimization for the last 512 bytes of ciphertext 1380openAVX2Tail512: 1381 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 1382 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 1383 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 1384 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 1385 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 1386 XORQ itr1, itr1 1387 MOVQ inp, itr2 1388 1389openAVX2Tail512LoopB: 1390 polyAdd(0(itr2)) 1391 polyMulAVX2 1392 LEAQ (2*8)(itr2), itr2 1393 1394openAVX2Tail512LoopA: 1395 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1396 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1397 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 1398 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1399 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1400 VMOVDQA CC3, tmpStoreAVX2 1401 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 1402 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 1403 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 1404 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 1405 VMOVDQA tmpStoreAVX2, CC3 1406 polyAdd(0*8(itr2)) 1407 polyMulAVX2 1408 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1409 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1410 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 1411 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1412 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1413 VMOVDQA CC3, tmpStoreAVX2 1414 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 1415 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 1416 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 1417 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 1418 VMOVDQA tmpStoreAVX2, CC3 1419 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 1420 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 1421 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 1422 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1423 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1424 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 1425 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1426 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1427 polyAdd(2*8(itr2)) 1428 polyMulAVX2 1429 LEAQ (4*8)(itr2), itr2 1430 VMOVDQA CC3, tmpStoreAVX2 1431 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 1432 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 1433 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 1434 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 1435 VMOVDQA tmpStoreAVX2, CC3 1436 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 1437 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 1438 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 1439 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 1440 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 1441 VMOVDQA CC3, tmpStoreAVX2 1442 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 1443 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 1444 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 1445 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 1446 VMOVDQA tmpStoreAVX2, CC3 1447 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 1448 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 1449 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 1450 INCQ itr1 1451 CMPQ itr1, $4 1452 JLT openAVX2Tail512LoopB 1453 1454 CMPQ itr1, $10 1455 JNE openAVX2Tail512LoopA 1456 1457 MOVQ inl, itr1 1458 SUBQ $384, itr1 1459 ANDQ $-16, itr1 1460 1461openAVX2Tail512HashLoop: 1462 TESTQ itr1, itr1 1463 JE openAVX2Tail512HashEnd 1464 polyAdd(0(itr2)) 1465 polyMulAVX2 1466 LEAQ 16(itr2), itr2 1467 SUBQ $16, itr1 1468 JMP openAVX2Tail512HashLoop 1469 1470openAVX2Tail512HashEnd: 1471 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 1472 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 1473 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 1474 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 1475 VMOVDQA CC3, tmpStoreAVX2 1476 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 1477 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 1478 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) 1479 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 1480 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 1481 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 1482 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 1483 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 1484 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 1485 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 1486 1487 LEAQ (12*32)(inp), inp 1488 LEAQ (12*32)(oup), oup 1489 SUBQ $12*32, inl 1490 1491 JMP openAVX2TailLoop 1492 1493// ---------------------------------------------------------------------------- 1494// ---------------------------------------------------------------------------- 1495// func chacha20Poly1305Seal(dst, key, src, ad []byte) 1496TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 1497 // For aligned stack access 1498 MOVQ SP, BP 1499 ADDQ $32, BP 1500 ANDQ $-32, BP 1501 MOVQ dst+0(FP), oup 1502 MOVQ key+24(FP), keyp 1503 MOVQ src+48(FP), inp 1504 MOVQ src_len+56(FP), inl 1505 MOVQ ad+72(FP), adp 1506 1507 CMPB ·useAVX2(SB), $1 1508 JE chacha20Poly1305Seal_AVX2 1509 1510 // Special optimization, for very short buffers 1511 CMPQ inl, $128 1512 JBE sealSSE128 // About 15% faster 1513 1514 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration 1515 MOVOU ·chacha20Constants<>(SB), A0 1516 MOVOU (1*16)(keyp), B0 1517 MOVOU (2*16)(keyp), C0 1518 MOVOU (3*16)(keyp), D0 1519 1520 // Store state on stack for future use 1521 MOVO B0, state1Store 1522 MOVO C0, state2Store 1523 1524 // Load state, increment counter blocks 1525 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 1526 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 1527 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 1528 1529 // Store counters 1530 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 1531 MOVQ $10, itr2 1532 1533sealSSEIntroLoop: 1534 MOVO C3, tmpStore 1535 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1536 MOVO tmpStore, C3 1537 MOVO C1, tmpStore 1538 chachaQR(A3, B3, C3, D3, C1) 1539 MOVO tmpStore, C1 1540 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 1541 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 1542 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 1543 1544 MOVO C3, tmpStore 1545 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1546 MOVO tmpStore, C3 1547 MOVO C1, tmpStore 1548 chachaQR(A3, B3, C3, D3, C1) 1549 MOVO tmpStore, C1 1550 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 1551 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 1552 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 1553 DECQ itr2 1554 JNE sealSSEIntroLoop 1555 1556 // Add in the state 1557 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 1558 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 1559 PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 1560 PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 1561 1562 // Clamp and store the key 1563 PAND ·polyClampMask<>(SB), A0 1564 MOVO A0, rStore 1565 MOVO B0, sStore 1566 1567 // Hash AAD 1568 MOVQ ad_len+80(FP), itr2 1569 CALL polyHashADInternal<>(SB) 1570 1571 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 1572 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 1573 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) 1574 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 1575 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 1576 MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) 1577 1578 MOVQ $128, itr1 1579 SUBQ $128, inl 1580 LEAQ 128(inp), inp 1581 1582 MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 1583 1584 CMPQ inl, $64 1585 JBE sealSSE128SealHash 1586 1587 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 1588 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 1589 MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) 1590 1591 ADDQ $64, itr1 1592 SUBQ $64, inl 1593 LEAQ 64(inp), inp 1594 1595 MOVQ $2, itr1 1596 MOVQ $8, itr2 1597 1598 CMPQ inl, $64 1599 JBE sealSSETail64 1600 CMPQ inl, $128 1601 JBE sealSSETail128 1602 CMPQ inl, $192 1603 JBE sealSSETail192 1604 1605sealSSEMainLoop: 1606 // Load state, increment counter blocks 1607 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 1608 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 1609 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 1610 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 1611 1612 // Store counters 1613 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store 1614 1615sealSSEInnerLoop: 1616 MOVO C3, tmpStore 1617 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1618 MOVO tmpStore, C3 1619 MOVO C1, tmpStore 1620 chachaQR(A3, B3, C3, D3, C1) 1621 MOVO tmpStore, C1 1622 polyAdd(0(oup)) 1623 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left 1624 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left 1625 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left 1626 polyMulStage1 1627 polyMulStage2 1628 LEAQ (2*8)(oup), oup 1629 MOVO C3, tmpStore 1630 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) 1631 MOVO tmpStore, C3 1632 MOVO C1, tmpStore 1633 polyMulStage3 1634 chachaQR(A3, B3, C3, D3, C1) 1635 MOVO tmpStore, C1 1636 polyMulReduceStage 1637 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right 1638 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right 1639 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right 1640 DECQ itr2 1641 JGE sealSSEInnerLoop 1642 polyAdd(0(oup)) 1643 polyMul 1644 LEAQ (2*8)(oup), oup 1645 DECQ itr1 1646 JG sealSSEInnerLoop 1647 1648 // Add in the state 1649 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 1650 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 1651 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 1652 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 1653 MOVO D3, tmpStore 1654 1655 // Load - xor - store 1656 MOVOU (0*16)(inp), D3; PXOR D3, A0 1657 MOVOU (1*16)(inp), D3; PXOR D3, B0 1658 MOVOU (2*16)(inp), D3; PXOR D3, C0 1659 MOVOU (3*16)(inp), D3; PXOR D3, D0 1660 MOVOU A0, (0*16)(oup) 1661 MOVOU B0, (1*16)(oup) 1662 MOVOU C0, (2*16)(oup) 1663 MOVOU D0, (3*16)(oup) 1664 MOVO tmpStore, D3 1665 1666 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 1667 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 1668 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 1669 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 1670 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 1671 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) 1672 ADDQ $192, inp 1673 MOVQ $192, itr1 1674 SUBQ $192, inl 1675 MOVO A3, A1 1676 MOVO B3, B1 1677 MOVO C3, C1 1678 MOVO D3, D1 1679 CMPQ inl, $64 1680 JBE sealSSE128SealHash 1681 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 1682 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 1683 MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) 1684 LEAQ 64(inp), inp 1685 SUBQ $64, inl 1686 MOVQ $6, itr1 1687 MOVQ $4, itr2 1688 CMPQ inl, $192 1689 JG sealSSEMainLoop 1690 1691 MOVQ inl, itr1 1692 TESTQ inl, inl 1693 JE sealSSE128SealHash 1694 MOVQ $6, itr1 1695 CMPQ inl, $64 1696 JBE sealSSETail64 1697 CMPQ inl, $128 1698 JBE sealSSETail128 1699 JMP sealSSETail192 1700 1701// ---------------------------------------------------------------------------- 1702// Special optimization for the last 64 bytes of plaintext 1703sealSSETail64: 1704 // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes 1705 MOVO ·chacha20Constants<>(SB), A1 1706 MOVO state1Store, B1 1707 MOVO state2Store, C1 1708 MOVO ctr3Store, D1 1709 PADDL ·sseIncMask<>(SB), D1 1710 MOVO D1, ctr0Store 1711 1712sealSSETail64LoopA: 1713 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext 1714 polyAdd(0(oup)) 1715 polyMul 1716 LEAQ 16(oup), oup 1717 1718sealSSETail64LoopB: 1719 chachaQR(A1, B1, C1, D1, T1) 1720 shiftB1Left; shiftC1Left; shiftD1Left 1721 chachaQR(A1, B1, C1, D1, T1) 1722 shiftB1Right; shiftC1Right; shiftD1Right 1723 polyAdd(0(oup)) 1724 polyMul 1725 LEAQ 16(oup), oup 1726 1727 DECQ itr1 1728 JG sealSSETail64LoopA 1729 1730 DECQ itr2 1731 JGE sealSSETail64LoopB 1732 PADDL ·chacha20Constants<>(SB), A1 1733 PADDL state1Store, B1 1734 PADDL state2Store, C1 1735 PADDL ctr0Store, D1 1736 1737 JMP sealSSE128Seal 1738 1739// ---------------------------------------------------------------------------- 1740// Special optimization for the last 128 bytes of plaintext 1741sealSSETail128: 1742 // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes 1743 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store 1744 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store 1745 1746sealSSETail128LoopA: 1747 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext 1748 polyAdd(0(oup)) 1749 polyMul 1750 LEAQ 16(oup), oup 1751 1752sealSSETail128LoopB: 1753 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 1754 shiftB0Left; shiftC0Left; shiftD0Left 1755 shiftB1Left; shiftC1Left; shiftD1Left 1756 polyAdd(0(oup)) 1757 polyMul 1758 LEAQ 16(oup), oup 1759 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) 1760 shiftB0Right; shiftC0Right; shiftD0Right 1761 shiftB1Right; shiftC1Right; shiftD1Right 1762 1763 DECQ itr1 1764 JG sealSSETail128LoopA 1765 1766 DECQ itr2 1767 JGE sealSSETail128LoopB 1768 1769 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 1770 PADDL state1Store, B0; PADDL state1Store, B1 1771 PADDL state2Store, C0; PADDL state2Store, C1 1772 PADDL ctr0Store, D0; PADDL ctr1Store, D1 1773 1774 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 1775 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 1776 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) 1777 1778 MOVQ $64, itr1 1779 LEAQ 64(inp), inp 1780 SUBQ $64, inl 1781 1782 JMP sealSSE128SealHash 1783 1784// ---------------------------------------------------------------------------- 1785// Special optimization for the last 192 bytes of plaintext 1786sealSSETail192: 1787 // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes 1788 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store 1789 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store 1790 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store 1791 1792sealSSETail192LoopA: 1793 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext 1794 polyAdd(0(oup)) 1795 polyMul 1796 LEAQ 16(oup), oup 1797 1798sealSSETail192LoopB: 1799 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1800 shiftB0Left; shiftC0Left; shiftD0Left 1801 shiftB1Left; shiftC1Left; shiftD1Left 1802 shiftB2Left; shiftC2Left; shiftD2Left 1803 1804 polyAdd(0(oup)) 1805 polyMul 1806 LEAQ 16(oup), oup 1807 1808 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1809 shiftB0Right; shiftC0Right; shiftD0Right 1810 shiftB1Right; shiftC1Right; shiftD1Right 1811 shiftB2Right; shiftC2Right; shiftD2Right 1812 1813 DECQ itr1 1814 JG sealSSETail192LoopA 1815 1816 DECQ itr2 1817 JGE sealSSETail192LoopB 1818 1819 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 1820 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 1821 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 1822 PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 1823 1824 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 1825 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 1826 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) 1827 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 1828 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 1829 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) 1830 1831 MOVO A2, A1 1832 MOVO B2, B1 1833 MOVO C2, C1 1834 MOVO D2, D1 1835 MOVQ $128, itr1 1836 LEAQ 128(inp), inp 1837 SUBQ $128, inl 1838 1839 JMP sealSSE128SealHash 1840 1841// ---------------------------------------------------------------------------- 1842// Special seal optimization for buffers smaller than 129 bytes 1843sealSSE128: 1844 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks 1845 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 1846 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 1847 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 1848 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 1849 MOVQ $10, itr2 1850 1851sealSSE128InnerCipherLoop: 1852 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1853 shiftB0Left; shiftB1Left; shiftB2Left 1854 shiftC0Left; shiftC1Left; shiftC2Left 1855 shiftD0Left; shiftD1Left; shiftD2Left 1856 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) 1857 shiftB0Right; shiftB1Right; shiftB2Right 1858 shiftC0Right; shiftC1Right; shiftC2Right 1859 shiftD0Right; shiftD1Right; shiftD2Right 1860 DECQ itr2 1861 JNE sealSSE128InnerCipherLoop 1862 1863 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 1864 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 1865 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 1866 PADDL T2, C1; PADDL T2, C2 1867 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 1868 PAND ·polyClampMask<>(SB), A0 1869 MOVOU A0, rStore 1870 MOVOU B0, sStore 1871 1872 // Hash 1873 MOVQ ad_len+80(FP), itr2 1874 CALL polyHashADInternal<>(SB) 1875 XORQ itr1, itr1 1876 1877sealSSE128SealHash: 1878 // itr1 holds the number of bytes encrypted but not yet hashed 1879 CMPQ itr1, $16 1880 JB sealSSE128Seal 1881 polyAdd(0(oup)) 1882 polyMul 1883 1884 SUBQ $16, itr1 1885 ADDQ $16, oup 1886 1887 JMP sealSSE128SealHash 1888 1889sealSSE128Seal: 1890 CMPQ inl, $16 1891 JB sealSSETail 1892 SUBQ $16, inl 1893 1894 // Load for decryption 1895 MOVOU (inp), T0 1896 PXOR T0, A1 1897 MOVOU A1, (oup) 1898 LEAQ (1*16)(inp), inp 1899 LEAQ (1*16)(oup), oup 1900 1901 // Extract for hashing 1902 MOVQ A1, t0 1903 PSRLDQ $8, A1 1904 MOVQ A1, t1 1905 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 1906 polyMul 1907 1908 // Shift the stream "left" 1909 MOVO B1, A1 1910 MOVO C1, B1 1911 MOVO D1, C1 1912 MOVO A2, D1 1913 MOVO B2, A2 1914 MOVO C2, B2 1915 MOVO D2, C2 1916 JMP sealSSE128Seal 1917 1918sealSSETail: 1919 TESTQ inl, inl 1920 JE sealSSEFinalize 1921 1922 // We can only load the PT one byte at a time to avoid read after end of buffer 1923 MOVQ inl, itr2 1924 SHLQ $4, itr2 1925 LEAQ ·andMask<>(SB), t0 1926 MOVQ inl, itr1 1927 LEAQ -1(inp)(inl*1), inp 1928 XORQ t2, t2 1929 XORQ t3, t3 1930 XORQ AX, AX 1931 1932sealSSETailLoadLoop: 1933 SHLQ $8, t2, t3 1934 SHLQ $8, t2 1935 MOVB (inp), AX 1936 XORQ AX, t2 1937 LEAQ -1(inp), inp 1938 DECQ itr1 1939 JNE sealSSETailLoadLoop 1940 MOVQ t2, 0+tmpStore 1941 MOVQ t3, 8+tmpStore 1942 PXOR 0+tmpStore, A1 1943 MOVOU A1, (oup) 1944 MOVOU -16(t0)(itr2*1), T0 1945 PAND T0, A1 1946 MOVQ A1, t0 1947 PSRLDQ $8, A1 1948 MOVQ A1, t1 1949 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 1950 polyMul 1951 1952 ADDQ inl, oup 1953 1954sealSSEFinalize: 1955 // Hash in the buffer lengths 1956 ADDQ ad_len+80(FP), acc0 1957 ADCQ src_len+56(FP), acc1 1958 ADCQ $1, acc2 1959 polyMul 1960 1961 // Final reduce 1962 MOVQ acc0, t0 1963 MOVQ acc1, t1 1964 MOVQ acc2, t2 1965 SUBQ $-5, acc0 1966 SBBQ $-1, acc1 1967 SBBQ $3, acc2 1968 CMOVQCS t0, acc0 1969 CMOVQCS t1, acc1 1970 CMOVQCS t2, acc2 1971 1972 // Add in the "s" part of the key 1973 ADDQ 0+sStore, acc0 1974 ADCQ 8+sStore, acc1 1975 1976 // Finally store the tag at the end of the message 1977 MOVQ acc0, (0*8)(oup) 1978 MOVQ acc1, (1*8)(oup) 1979 RET 1980 1981// ---------------------------------------------------------------------------- 1982// ------------------------- AVX2 Code ---------------------------------------- 1983chacha20Poly1305Seal_AVX2: 1984 VZEROUPPER 1985 VMOVDQU ·chacha20Constants<>(SB), AA0 1986 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 1987 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 1988 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 1989 VPADDD ·avx2InitMask<>(SB), DD0, DD0 1990 1991 // Special optimizations, for very short buffers 1992 CMPQ inl, $192 1993 JBE seal192AVX2 // 33% faster 1994 CMPQ inl, $320 1995 JBE seal320AVX2 // 17% faster 1996 1997 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream 1998 VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 1999 VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 2000 VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 2001 VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 2002 VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 2003 VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 2004 VMOVDQA DD3, ctr3StoreAVX2 2005 MOVQ $10, itr2 2006 2007sealAVX2IntroLoop: 2008 VMOVDQA CC3, tmpStoreAVX2 2009 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2010 VMOVDQA tmpStoreAVX2, CC3 2011 VMOVDQA CC1, tmpStoreAVX2 2012 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2013 VMOVDQA tmpStoreAVX2, CC1 2014 2015 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 2016 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 2017 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 2018 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 2019 2020 VMOVDQA CC3, tmpStoreAVX2 2021 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2022 VMOVDQA tmpStoreAVX2, CC3 2023 VMOVDQA CC1, tmpStoreAVX2 2024 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2025 VMOVDQA tmpStoreAVX2, CC1 2026 2027 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 2028 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 2029 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 2030 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 2031 DECQ itr2 2032 JNE sealAVX2IntroLoop 2033 2034 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 2035 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 2036 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 2037 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 2038 2039 VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 2040 VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key 2041 VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 2042 2043 // Clamp and store poly key 2044 VPAND ·polyClampMask<>(SB), DD0, DD0 2045 VMOVDQA DD0, rsStoreAVX2 2046 2047 // Hash AD 2048 MOVQ ad_len+80(FP), itr2 2049 CALL polyHashADInternal<>(SB) 2050 2051 // Can store at least 320 bytes 2052 VPXOR (0*32)(inp), AA0, AA0 2053 VPXOR (1*32)(inp), CC0, CC0 2054 VMOVDQU AA0, (0*32)(oup) 2055 VMOVDQU CC0, (1*32)(oup) 2056 2057 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 2058 VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 2059 VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) 2060 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 2061 VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 2062 VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) 2063 2064 MOVQ $320, itr1 2065 SUBQ $320, inl 2066 LEAQ 320(inp), inp 2067 2068 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 2069 CMPQ inl, $128 2070 JBE sealAVX2SealHash 2071 2072 VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 2073 VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) 2074 SUBQ $128, inl 2075 LEAQ 128(inp), inp 2076 2077 MOVQ $8, itr1 2078 MOVQ $2, itr2 2079 2080 CMPQ inl, $128 2081 JBE sealAVX2Tail128 2082 CMPQ inl, $256 2083 JBE sealAVX2Tail256 2084 CMPQ inl, $384 2085 JBE sealAVX2Tail384 2086 CMPQ inl, $512 2087 JBE sealAVX2Tail512 2088 2089 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop 2090 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 2091 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 2092 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 2093 VMOVDQA ctr3StoreAVX2, DD0 2094 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 2095 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 2096 2097 VMOVDQA CC3, tmpStoreAVX2 2098 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2099 VMOVDQA tmpStoreAVX2, CC3 2100 VMOVDQA CC1, tmpStoreAVX2 2101 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2102 VMOVDQA tmpStoreAVX2, CC1 2103 2104 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 2105 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 2106 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 2107 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 2108 2109 VMOVDQA CC3, tmpStoreAVX2 2110 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) 2111 VMOVDQA tmpStoreAVX2, CC3 2112 VMOVDQA CC1, tmpStoreAVX2 2113 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) 2114 VMOVDQA tmpStoreAVX2, CC1 2115 2116 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 2117 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 2118 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 2119 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 2120 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2121 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2122 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2123 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2124 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2125 VMOVDQA CC3, tmpStoreAVX2 2126 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2127 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2128 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2129 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2130 VMOVDQA tmpStoreAVX2, CC3 2131 2132 SUBQ $16, oup // Adjust the pointer 2133 MOVQ $9, itr1 2134 JMP sealAVX2InternalLoopStart 2135 2136sealAVX2MainLoop: 2137 // Load state, increment counter blocks, store the incremented counters 2138 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 2139 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 2140 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 2141 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 2142 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 2143 MOVQ $10, itr1 2144 2145sealAVX2InternalLoop: 2146 polyAdd(0*8(oup)) 2147 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2148 polyMulStage1_AVX2 2149 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2150 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2151 polyMulStage2_AVX2 2152 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2153 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2154 polyMulStage3_AVX2 2155 VMOVDQA CC3, tmpStoreAVX2 2156 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2157 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2158 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2159 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2160 VMOVDQA tmpStoreAVX2, CC3 2161 polyMulReduceStage 2162 2163sealAVX2InternalLoopStart: 2164 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2165 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2166 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 2167 polyAdd(2*8(oup)) 2168 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2169 polyMulStage1_AVX2 2170 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2171 VMOVDQA CC3, tmpStoreAVX2 2172 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2173 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2174 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 2175 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2176 VMOVDQA tmpStoreAVX2, CC3 2177 polyMulStage2_AVX2 2178 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 2179 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2180 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 2181 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2182 polyMulStage3_AVX2 2183 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2184 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2185 polyMulReduceStage 2186 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2187 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2188 polyAdd(4*8(oup)) 2189 LEAQ (6*8)(oup), oup 2190 VMOVDQA CC3, tmpStoreAVX2 2191 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2192 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2193 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2194 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2195 VMOVDQA tmpStoreAVX2, CC3 2196 polyMulStage1_AVX2 2197 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2198 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2199 polyMulStage2_AVX2 2200 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 2201 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2202 polyMulStage3_AVX2 2203 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2204 VMOVDQA CC3, tmpStoreAVX2 2205 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2206 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2207 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 2208 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2209 VMOVDQA tmpStoreAVX2, CC3 2210 polyMulReduceStage 2211 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 2212 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2213 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 2214 DECQ itr1 2215 JNE sealAVX2InternalLoop 2216 2217 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 2218 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 2219 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 2220 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 2221 VMOVDQA CC3, tmpStoreAVX2 2222 2223 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here 2224 polyAdd(0*8(oup)) 2225 polyMulAVX2 2226 LEAQ (4*8)(oup), oup 2227 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 2228 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 2229 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) 2230 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 2231 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 2232 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 2233 2234 // and here 2235 polyAdd(-2*8(oup)) 2236 polyMulAVX2 2237 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 2238 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 2239 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 2240 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 2241 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 2242 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) 2243 LEAQ (32*16)(inp), inp 2244 SUBQ $(32*16), inl 2245 CMPQ inl, $512 2246 JG sealAVX2MainLoop 2247 2248 // Tail can only hash 480 bytes 2249 polyAdd(0*8(oup)) 2250 polyMulAVX2 2251 polyAdd(2*8(oup)) 2252 polyMulAVX2 2253 LEAQ 32(oup), oup 2254 2255 MOVQ $10, itr1 2256 MOVQ $0, itr2 2257 CMPQ inl, $128 2258 JBE sealAVX2Tail128 2259 CMPQ inl, $256 2260 JBE sealAVX2Tail256 2261 CMPQ inl, $384 2262 JBE sealAVX2Tail384 2263 JMP sealAVX2Tail512 2264 2265// ---------------------------------------------------------------------------- 2266// Special optimization for buffers smaller than 193 bytes 2267seal192AVX2: 2268 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks 2269 VMOVDQA AA0, AA1 2270 VMOVDQA BB0, BB1 2271 VMOVDQA CC0, CC1 2272 VPADDD ·avx2IncMask<>(SB), DD0, DD1 2273 VMOVDQA AA0, AA2 2274 VMOVDQA BB0, BB2 2275 VMOVDQA CC0, CC2 2276 VMOVDQA DD0, DD2 2277 VMOVDQA DD1, TT3 2278 MOVQ $10, itr2 2279 2280sealAVX2192InnerCipherLoop: 2281 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2282 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 2283 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2284 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 2285 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2286 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 2287 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2288 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 2289 DECQ itr2 2290 JNE sealAVX2192InnerCipherLoop 2291 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 2292 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 2293 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 2294 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 2295 VPERM2I128 $0x02, AA0, BB0, TT0 2296 2297 // Clamp and store poly key 2298 VPAND ·polyClampMask<>(SB), TT0, TT0 2299 VMOVDQA TT0, rsStoreAVX2 2300 2301 // Stream for up to 192 bytes 2302 VPERM2I128 $0x13, AA0, BB0, AA0 2303 VPERM2I128 $0x13, CC0, DD0, BB0 2304 VPERM2I128 $0x02, AA1, BB1, CC0 2305 VPERM2I128 $0x02, CC1, DD1, DD0 2306 VPERM2I128 $0x13, AA1, BB1, AA1 2307 VPERM2I128 $0x13, CC1, DD1, BB1 2308 2309sealAVX2ShortSeal: 2310 // Hash aad 2311 MOVQ ad_len+80(FP), itr2 2312 CALL polyHashADInternal<>(SB) 2313 XORQ itr1, itr1 2314 2315sealAVX2SealHash: 2316 // itr1 holds the number of bytes encrypted but not yet hashed 2317 CMPQ itr1, $16 2318 JB sealAVX2ShortSealLoop 2319 polyAdd(0(oup)) 2320 polyMul 2321 SUBQ $16, itr1 2322 ADDQ $16, oup 2323 JMP sealAVX2SealHash 2324 2325sealAVX2ShortSealLoop: 2326 CMPQ inl, $32 2327 JB sealAVX2ShortTail32 2328 SUBQ $32, inl 2329 2330 // Load for encryption 2331 VPXOR (inp), AA0, AA0 2332 VMOVDQU AA0, (oup) 2333 LEAQ (1*32)(inp), inp 2334 2335 // Now can hash 2336 polyAdd(0*8(oup)) 2337 polyMulAVX2 2338 polyAdd(2*8(oup)) 2339 polyMulAVX2 2340 LEAQ (1*32)(oup), oup 2341 2342 // Shift stream left 2343 VMOVDQA BB0, AA0 2344 VMOVDQA CC0, BB0 2345 VMOVDQA DD0, CC0 2346 VMOVDQA AA1, DD0 2347 VMOVDQA BB1, AA1 2348 VMOVDQA CC1, BB1 2349 VMOVDQA DD1, CC1 2350 VMOVDQA AA2, DD1 2351 VMOVDQA BB2, AA2 2352 JMP sealAVX2ShortSealLoop 2353 2354sealAVX2ShortTail32: 2355 CMPQ inl, $16 2356 VMOVDQA A0, A1 2357 JB sealAVX2ShortDone 2358 2359 SUBQ $16, inl 2360 2361 // Load for encryption 2362 VPXOR (inp), A0, T0 2363 VMOVDQU T0, (oup) 2364 LEAQ (1*16)(inp), inp 2365 2366 // Hash 2367 polyAdd(0*8(oup)) 2368 polyMulAVX2 2369 LEAQ (1*16)(oup), oup 2370 VPERM2I128 $0x11, AA0, AA0, AA0 2371 VMOVDQA A0, A1 2372 2373sealAVX2ShortDone: 2374 VZEROUPPER 2375 JMP sealSSETail 2376 2377// ---------------------------------------------------------------------------- 2378// Special optimization for buffers smaller than 321 bytes 2379seal320AVX2: 2380 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks 2381 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 2382 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 2383 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 2384 MOVQ $10, itr2 2385 2386sealAVX2320InnerCipherLoop: 2387 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2388 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 2389 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2390 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 2391 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2392 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 2393 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2394 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 2395 DECQ itr2 2396 JNE sealAVX2320InnerCipherLoop 2397 2398 VMOVDQA ·chacha20Constants<>(SB), TT0 2399 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 2400 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 2401 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 2402 VMOVDQA ·avx2IncMask<>(SB), TT0 2403 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 2404 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 2405 VPADDD TT3, DD2, DD2 2406 2407 // Clamp and store poly key 2408 VPERM2I128 $0x02, AA0, BB0, TT0 2409 VPAND ·polyClampMask<>(SB), TT0, TT0 2410 VMOVDQA TT0, rsStoreAVX2 2411 2412 // Stream for up to 320 bytes 2413 VPERM2I128 $0x13, AA0, BB0, AA0 2414 VPERM2I128 $0x13, CC0, DD0, BB0 2415 VPERM2I128 $0x02, AA1, BB1, CC0 2416 VPERM2I128 $0x02, CC1, DD1, DD0 2417 VPERM2I128 $0x13, AA1, BB1, AA1 2418 VPERM2I128 $0x13, CC1, DD1, BB1 2419 VPERM2I128 $0x02, AA2, BB2, CC1 2420 VPERM2I128 $0x02, CC2, DD2, DD1 2421 VPERM2I128 $0x13, AA2, BB2, AA2 2422 VPERM2I128 $0x13, CC2, DD2, BB2 2423 JMP sealAVX2ShortSeal 2424 2425// ---------------------------------------------------------------------------- 2426// Special optimization for the last 128 bytes of ciphertext 2427sealAVX2Tail128: 2428 // Need to decrypt up to 128 bytes - prepare two blocks 2429 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed 2430 // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed 2431 VMOVDQA ·chacha20Constants<>(SB), AA0 2432 VMOVDQA state1StoreAVX2, BB0 2433 VMOVDQA state2StoreAVX2, CC0 2434 VMOVDQA ctr3StoreAVX2, DD0 2435 VPADDD ·avx2IncMask<>(SB), DD0, DD0 2436 VMOVDQA DD0, DD1 2437 2438sealAVX2Tail128LoopA: 2439 polyAdd(0(oup)) 2440 polyMul 2441 LEAQ 16(oup), oup 2442 2443sealAVX2Tail128LoopB: 2444 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 2445 polyAdd(0(oup)) 2446 polyMul 2447 VPALIGNR $4, BB0, BB0, BB0 2448 VPALIGNR $8, CC0, CC0, CC0 2449 VPALIGNR $12, DD0, DD0, DD0 2450 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) 2451 polyAdd(16(oup)) 2452 polyMul 2453 LEAQ 32(oup), oup 2454 VPALIGNR $12, BB0, BB0, BB0 2455 VPALIGNR $8, CC0, CC0, CC0 2456 VPALIGNR $4, DD0, DD0, DD0 2457 DECQ itr1 2458 JG sealAVX2Tail128LoopA 2459 DECQ itr2 2460 JGE sealAVX2Tail128LoopB 2461 2462 VPADDD ·chacha20Constants<>(SB), AA0, AA1 2463 VPADDD state1StoreAVX2, BB0, BB1 2464 VPADDD state2StoreAVX2, CC0, CC1 2465 VPADDD DD1, DD0, DD1 2466 2467 VPERM2I128 $0x02, AA1, BB1, AA0 2468 VPERM2I128 $0x02, CC1, DD1, BB0 2469 VPERM2I128 $0x13, AA1, BB1, CC0 2470 VPERM2I128 $0x13, CC1, DD1, DD0 2471 JMP sealAVX2ShortSealLoop 2472 2473// ---------------------------------------------------------------------------- 2474// Special optimization for the last 256 bytes of ciphertext 2475sealAVX2Tail256: 2476 // Need to decrypt up to 256 bytes - prepare two blocks 2477 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed 2478 // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed 2479 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 2480 VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 2481 VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 2482 VMOVDQA ctr3StoreAVX2, DD0 2483 VPADDD ·avx2IncMask<>(SB), DD0, DD0 2484 VPADDD ·avx2IncMask<>(SB), DD0, DD1 2485 VMOVDQA DD0, TT1 2486 VMOVDQA DD1, TT2 2487 2488sealAVX2Tail256LoopA: 2489 polyAdd(0(oup)) 2490 polyMul 2491 LEAQ 16(oup), oup 2492 2493sealAVX2Tail256LoopB: 2494 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2495 polyAdd(0(oup)) 2496 polyMul 2497 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 2498 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2499 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 2500 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) 2501 polyAdd(16(oup)) 2502 polyMul 2503 LEAQ 32(oup), oup 2504 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 2505 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 2506 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 2507 DECQ itr1 2508 JG sealAVX2Tail256LoopA 2509 DECQ itr2 2510 JGE sealAVX2Tail256LoopB 2511 2512 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 2513 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 2514 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 2515 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 2516 VPERM2I128 $0x02, AA0, BB0, TT0 2517 VPERM2I128 $0x02, CC0, DD0, TT1 2518 VPERM2I128 $0x13, AA0, BB0, TT2 2519 VPERM2I128 $0x13, CC0, DD0, TT3 2520 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 2521 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) 2522 MOVQ $128, itr1 2523 LEAQ 128(inp), inp 2524 SUBQ $128, inl 2525 VPERM2I128 $0x02, AA1, BB1, AA0 2526 VPERM2I128 $0x02, CC1, DD1, BB0 2527 VPERM2I128 $0x13, AA1, BB1, CC0 2528 VPERM2I128 $0x13, CC1, DD1, DD0 2529 2530 JMP sealAVX2SealHash 2531 2532// ---------------------------------------------------------------------------- 2533// Special optimization for the last 384 bytes of ciphertext 2534sealAVX2Tail384: 2535 // Need to decrypt up to 384 bytes - prepare two blocks 2536 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed 2537 // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed 2538 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 2539 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 2540 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 2541 VMOVDQA ctr3StoreAVX2, DD0 2542 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 2543 VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 2544 2545sealAVX2Tail384LoopA: 2546 polyAdd(0(oup)) 2547 polyMul 2548 LEAQ 16(oup), oup 2549 2550sealAVX2Tail384LoopB: 2551 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2552 polyAdd(0(oup)) 2553 polyMul 2554 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 2555 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2556 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 2557 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) 2558 polyAdd(16(oup)) 2559 polyMul 2560 LEAQ 32(oup), oup 2561 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 2562 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 2563 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 2564 DECQ itr1 2565 JG sealAVX2Tail384LoopA 2566 DECQ itr2 2567 JGE sealAVX2Tail384LoopB 2568 2569 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 2570 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 2571 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 2572 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 2573 VPERM2I128 $0x02, AA0, BB0, TT0 2574 VPERM2I128 $0x02, CC0, DD0, TT1 2575 VPERM2I128 $0x13, AA0, BB0, TT2 2576 VPERM2I128 $0x13, CC0, DD0, TT3 2577 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 2578 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) 2579 VPERM2I128 $0x02, AA1, BB1, TT0 2580 VPERM2I128 $0x02, CC1, DD1, TT1 2581 VPERM2I128 $0x13, AA1, BB1, TT2 2582 VPERM2I128 $0x13, CC1, DD1, TT3 2583 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 2584 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) 2585 MOVQ $256, itr1 2586 LEAQ 256(inp), inp 2587 SUBQ $256, inl 2588 VPERM2I128 $0x02, AA2, BB2, AA0 2589 VPERM2I128 $0x02, CC2, DD2, BB0 2590 VPERM2I128 $0x13, AA2, BB2, CC0 2591 VPERM2I128 $0x13, CC2, DD2, DD0 2592 2593 JMP sealAVX2SealHash 2594 2595// ---------------------------------------------------------------------------- 2596// Special optimization for the last 512 bytes of ciphertext 2597sealAVX2Tail512: 2598 // Need to decrypt up to 512 bytes - prepare two blocks 2599 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed 2600 // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed 2601 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 2602 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 2603 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 2604 VMOVDQA ctr3StoreAVX2, DD0 2605 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 2606 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 2607 2608sealAVX2Tail512LoopA: 2609 polyAdd(0(oup)) 2610 polyMul 2611 LEAQ 16(oup), oup 2612 2613sealAVX2Tail512LoopB: 2614 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2615 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2616 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2617 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2618 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2619 VMOVDQA CC3, tmpStoreAVX2 2620 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2621 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2622 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2623 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2624 VMOVDQA tmpStoreAVX2, CC3 2625 polyAdd(0*8(oup)) 2626 polyMulAVX2 2627 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2628 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2629 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 2630 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2631 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2632 VMOVDQA CC3, tmpStoreAVX2 2633 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2634 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2635 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 2636 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2637 VMOVDQA tmpStoreAVX2, CC3 2638 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 2639 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2640 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 2641 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2642 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2643 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 2644 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2645 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2646 polyAdd(2*8(oup)) 2647 polyMulAVX2 2648 LEAQ (4*8)(oup), oup 2649 VMOVDQA CC3, tmpStoreAVX2 2650 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 2651 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 2652 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 2653 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 2654 VMOVDQA tmpStoreAVX2, CC3 2655 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 2656 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 2657 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 2658 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 2659 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 2660 VMOVDQA CC3, tmpStoreAVX2 2661 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 2662 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 2663 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 2664 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 2665 VMOVDQA tmpStoreAVX2, CC3 2666 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 2667 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 2668 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 2669 2670 DECQ itr1 2671 JG sealAVX2Tail512LoopA 2672 DECQ itr2 2673 JGE sealAVX2Tail512LoopB 2674 2675 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 2676 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 2677 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 2678 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 2679 VMOVDQA CC3, tmpStoreAVX2 2680 VPERM2I128 $0x02, AA0, BB0, CC3 2681 VPXOR (0*32)(inp), CC3, CC3 2682 VMOVDQU CC3, (0*32)(oup) 2683 VPERM2I128 $0x02, CC0, DD0, CC3 2684 VPXOR (1*32)(inp), CC3, CC3 2685 VMOVDQU CC3, (1*32)(oup) 2686 VPERM2I128 $0x13, AA0, BB0, CC3 2687 VPXOR (2*32)(inp), CC3, CC3 2688 VMOVDQU CC3, (2*32)(oup) 2689 VPERM2I128 $0x13, CC0, DD0, CC3 2690 VPXOR (3*32)(inp), CC3, CC3 2691 VMOVDQU CC3, (3*32)(oup) 2692 2693 VPERM2I128 $0x02, AA1, BB1, AA0 2694 VPERM2I128 $0x02, CC1, DD1, BB0 2695 VPERM2I128 $0x13, AA1, BB1, CC0 2696 VPERM2I128 $0x13, CC1, DD1, DD0 2697 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 2698 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) 2699 2700 VPERM2I128 $0x02, AA2, BB2, AA0 2701 VPERM2I128 $0x02, CC2, DD2, BB0 2702 VPERM2I128 $0x13, AA2, BB2, CC0 2703 VPERM2I128 $0x13, CC2, DD2, DD0 2704 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 2705 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) 2706 2707 MOVQ $384, itr1 2708 LEAQ 384(inp), inp 2709 SUBQ $384, inl 2710 VPERM2I128 $0x02, AA3, BB3, AA0 2711 VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 2712 VPERM2I128 $0x13, AA3, BB3, CC0 2713 VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 2714 2715 JMP sealAVX2SealHash 2716